我想要绘制火车和使用下面的代码测试学习的学习曲线:绘制图形使用matplotlib
import numpy as np
from sklearn import cross_validation
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn.linear_model as lm
import pandas as pd
from sklearn.learning_curve import learning_curve
def main():
print("loading data..")
train_data = list(np.array(pd.read_table('train.tsv'))[:, 2])
test_data = list(np.array(pd.read_table('test.tsv'))[:, 2])
tr = np.array(pd.read_table('train.tsv'))
tfv = TfidfVectorizer(min_df=3, max_features=None,
strip_accents='unicode',analyzer='word',
token_pattern=r'\w{1,}',ngram_rang(1,2),
use_idf=1, smooth_idf=1, sublinear_tf=1)
rd = lm.LogisticRegression(penalty='l2', dual=True, tol=0.0001,
C=1, fit_intercept=True, intercept_scaling=1.0,
class_weight=None, random_state=None)
y=tr[:,-1].astype(int)
X_all = train_data + test_data
len_train = len(train_data)
print("fitting pipeline")
tfv.fit(X_all)
print("transforming data")
X_all = tfv.transform(X_all)
X = X_all[:len_train]
X_test = X_all[len_train:]
print("20 Fold CV Score: " +
str(np.mean(cross_validation.cross_val_score(rd, X, y, cv=20,
scoring='roc_auc'))))
print("training on full data")
rd.fit(X,y)
pred = rd.predict_proba(X_test)[:, 1]
test_file = pd.read_csv('test.tsv', sep="\t", na_values=['?'],index_col=1)
pred_df = pd.DataFrame(pred, index=test_file.index, columns=['label'])
pred_df.to_csv('benchmark.csv')
print("submission file created..")
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
n_jobs=1, train_sizes=np.linspace(1,7000,10)):
"""
Generate a simple plot of the test and traning learning curve.
Parameters
----------
estimator : object type that implements the "fit" and "predict" methods
An object of that type which is cloned for each validation.
title : string
Title for the chart.
X : array-like, shape (n_samples, n_features)
Training vector, where n_samples is the number of samples and
n_features is the number of features.
y : array-like, shape (n_samples) or (n_samples, n_features), optional
Target relative to X for classification or regression;
ylim : tuple, shape (ymin, ymax), optional
Defines minimum and maximum yvalues plotted.
cv : integer, cross-validation generator, optional
n_jobs : integer, optional
Number of jobs to run in parallel (default 1).
"""
plt.figure()
plt.title(title)
if ylim is not None:
plt.ylim(*ylim)
plt.xlabel("Training examples")
plt.ylabel("Score")
train_sizes, train_scores, test_scores = learning_curve(estimator, X, y,
cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
plt.grid()
plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
train_scores_mean + train_scores_std, alpha=0.1, color="r")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
test_scores_mean + test_scores_std, alpha=0.1, color="g")
plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-
validation score")
plt.legend(loc="best")
return plt
X,Y = y,pred
title = "Learning Curves (tf-idf)"
cv = cross_validation.ShuffleSplit(pred_df, n_iter=100, test_size=0.20,
random_state=0)
estimator = TfidfVectorizer()
plot_learning_curve(estimator, title, X, Y, ylim=(0.1, 1.01), cv=cv,
n_jobs=4)
title = "Learning Curves (lr)"
cv = cross_validation.ShuffleSplit(pred_df, n_iter=10, test_size=0.20,
random_state=0)
estimator = lm()
plot_learning_curve(estimator, title, X, Y, (0.1, 1.01), cv=cv, n_jobs=4)
plt.show()
if __name__ == "__main__":
main()
它给了以下错误:
Traceback (most recent call last):
File "<ipython-input-17-fe9e40bbce16>", line 1, in <module>
runfile('C:/Users/Maitri/Documents/Python Scripts/first.py', wdir='C:/Users/Maitri/Documents/Python Scripts')
File "C:\Users\Maitri\Anaconda\lib\site-packages\spyderlib\widgets\externalshell\sitecustomize.py", line 580, in runfile
execfile(filename, namespace)
File "C:/Users/Maitri/Documents/Python Scripts/first.py", line 136, in <module>
main()
File "C:/Users/Maitri/Documents/Python Scripts/first.py", line 122, in main
cv = cross_validation.ShuffleSplit(pred_df, n_iter=100, test_size=0.20, random_state=0)
File "C:\Users\Maitri\Anaconda\lib\site-packages\sklearn\cross_validation.py", line 771, in __init__
train_size)
File "C:\Users\Maitri\Anaconda\lib\site-packages\sklearn\cross_validation.py", line 922, in _validate_shuffle_split
n_test = ceil(test_size * n)
TypeError: a float is required
有没有更好的办法预测结果的曲线图?
输入文件加载正确。而改变数据类型并不能解决问题。实际上,cross_validation.ShuffleSplit()允许int和float参数。 http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.ShuffleSplit.html – Incognito 2015-02-23 19:12:32
您传递给'ShuffleSplit' *的数据*仅*浮点数?熊猫数据框可以包含混淆的数据,可能会让它感到困惑。例如,它是否有助于使用'pred_df.values'(删除索引)? – mfitzp 2015-02-23 19:31:33