2015-02-23 135 views
0

我想要绘制火车和使用下面的代码测试学习的学习曲线:绘制图形使用matplotlib

import numpy as np 
from sklearn import cross_validation 
import matplotlib.pyplot as plt 
from sklearn.feature_extraction.text import TfidfVectorizer 
import sklearn.linear_model as lm 
import pandas as pd 
from sklearn.learning_curve import learning_curve 

def main(): 
    print("loading data..") 
    train_data = list(np.array(pd.read_table('train.tsv'))[:, 2]) 
    test_data = list(np.array(pd.read_table('test.tsv'))[:, 2]) 
    tr = np.array(pd.read_table('train.tsv')) 
    tfv = TfidfVectorizer(min_df=3, max_features=None, 
    strip_accents='unicode',analyzer='word', 
    token_pattern=r'\w{1,}',ngram_rang(1,2), 
         use_idf=1, smooth_idf=1, sublinear_tf=1) 

    rd = lm.LogisticRegression(penalty='l2', dual=True, tol=0.0001, 
          C=1, fit_intercept=True, intercept_scaling=1.0, 
          class_weight=None, random_state=None) 
    y=tr[:,-1].astype(int) 
    X_all = train_data + test_data 
    len_train = len(train_data) 

    print("fitting pipeline") 
    tfv.fit(X_all) 
    print("transforming data") 
    X_all = tfv.transform(X_all) 

    X = X_all[:len_train] 
    X_test = X_all[len_train:] 

    print("20 Fold CV Score: " +     
    str(np.mean(cross_validation.cross_val_score(rd, X, y, cv=20, 
    scoring='roc_auc')))) 

    print("training on full data") 
    rd.fit(X,y) 
    pred = rd.predict_proba(X_test)[:, 1] 
    test_file = pd.read_csv('test.tsv', sep="\t", na_values=['?'],index_col=1) 
    pred_df = pd.DataFrame(pred, index=test_file.index, columns=['label']) 
    pred_df.to_csv('benchmark.csv') 
    print("submission file created..") 

    def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,   
    n_jobs=1, train_sizes=np.linspace(1,7000,10)): 
    """ 
    Generate a simple plot of the test and traning learning curve. 

    Parameters 
    ---------- 
    estimator : object type that implements the "fit" and "predict" methods 
    An object of that type which is cloned for each validation. 

    title : string 
    Title for the chart. 

    X : array-like, shape (n_samples, n_features) 
    Training vector, where n_samples is the number of samples and 
    n_features is the number of features. 

    y : array-like, shape (n_samples) or (n_samples, n_features), optional 
    Target relative to X for classification or regression; 

    ylim : tuple, shape (ymin, ymax), optional 
    Defines minimum and maximum yvalues plotted. 

    cv : integer, cross-validation generator, optional 

    n_jobs : integer, optional 
    Number of jobs to run in parallel (default 1). 
    """ 
    plt.figure() 
    plt.title(title) 
    if ylim is not None: 
     plt.ylim(*ylim) 
    plt.xlabel("Training examples") 
    plt.ylabel("Score") 
    train_sizes, train_scores, test_scores = learning_curve(estimator, X, y,  
     cv=cv, n_jobs=n_jobs, train_sizes=train_sizes) 
    train_scores_mean = np.mean(train_scores, axis=1) 
    train_scores_std = np.std(train_scores, axis=1) 
    test_scores_mean = np.mean(test_scores, axis=1) 
    test_scores_std = np.std(test_scores, axis=1) 
    plt.grid() 

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,  
    train_scores_mean + train_scores_std, alpha=0.1, color="r") 
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std, 
    test_scores_mean + test_scores_std, alpha=0.1, color="g") 
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", 
     label="Training score") 
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross- 
    validation score") 

    plt.legend(loc="best") 
    return plt 

    X,Y = y,pred 


    title = "Learning Curves (tf-idf)" 

    cv = cross_validation.ShuffleSplit(pred_df, n_iter=100, test_size=0.20,  
    random_state=0) 

    estimator = TfidfVectorizer() 
    plot_learning_curve(estimator, title, X, Y, ylim=(0.1, 1.01), cv=cv, 
    n_jobs=4) 

    title = "Learning Curves (lr)" 

    cv = cross_validation.ShuffleSplit(pred_df, n_iter=10, test_size=0.20, 
    random_state=0) 
    estimator = lm() 
    plot_learning_curve(estimator, title, X, Y, (0.1, 1.01), cv=cv, n_jobs=4) 

    plt.show()  

if __name__ == "__main__": 
    main() 

它给了以下错误:

Traceback (most recent call last): 

    File "<ipython-input-17-fe9e40bbce16>", line 1, in <module> 
    runfile('C:/Users/Maitri/Documents/Python Scripts/first.py', wdir='C:/Users/Maitri/Documents/Python Scripts') 

    File "C:\Users\Maitri\Anaconda\lib\site-packages\spyderlib\widgets\externalshell\sitecustomize.py", line 580, in runfile 
    execfile(filename, namespace) 

    File "C:/Users/Maitri/Documents/Python Scripts/first.py", line 136, in <module> 
    main() 

    File "C:/Users/Maitri/Documents/Python Scripts/first.py", line 122, in main 
    cv = cross_validation.ShuffleSplit(pred_df, n_iter=100, test_size=0.20, random_state=0) 

    File "C:\Users\Maitri\Anaconda\lib\site-packages\sklearn\cross_validation.py", line 771, in __init__ 
    train_size) 

    File "C:\Users\Maitri\Anaconda\lib\site-packages\sklearn\cross_validation.py", line 922, in _validate_shuffle_split 
    n_test = ceil(test_size * n) 

TypeError: a float is required 

有没有更好的办法预测结果的曲线图?

回答

0

您正在接收的错误的追踪显示问题的根源。

File "C:/Users/Maitri/Documents/Python Scripts/first.py", line 122, in main cv = cross_validation.ShuffleSplit(pred_df, n_iter=100, test_size=0.20, random_state=0) 

File "C:\Users\Maitri\Anaconda\lib\site-packages\sklearn\cross_validation.py", line 771, in init train_size) 

File "C:\Users\Maitri\Anaconda\lib\site-packages\sklearn\cross_validation.py", line 922, in _validate_shuffle_split n_test = ceil(test_size * n) 

TypeError: a float is required 

的误差在sklearn包提高,但是从line 122脚本中的最终起源。在这里你传入一个包含你的数据的Pandas DataFrame。给出的错误TypeError: a float is required指示ShuffleSplit正期待float类型。

的第一步将是检查你是否正确加载输入文件,以下行:

test_file = pd.read_csv('test.tsv', sep="\t", na_values=['?'],index_col=1) 
pred_df = pd.DataFrame(pred, index=test_file.index, columns=['label']) 

如果数据加载OK,但类型是只是不是float,你可以将数据转换为正确的类型,使用numpy或astypeas_dtype使用熊猫。例如

pred_df = pd.DataFrame(pred, index=test_file.index, columns=['label']).as_dtype(np.float) 

# ...or... 

pred_df = pd.DataFrame(pred.astype(np.float), index=test_file.index, columns=['label']) 
+0

输入文件加载正确。而改变数据类型并不能解决问题。实际上,cross_validation.ShuffleSplit()允许int和float参数。 http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.ShuffleSplit.html – Incognito 2015-02-23 19:12:32

+0

您传递给'ShuffleSplit' *的数据*仅*浮点数?熊猫数据框可以包含混淆的数据,可能会让它感到困惑。例如,它是否有助于使用'pred_df.values'(删除索引)? – mfitzp 2015-02-23 19:31:33