我试图使用从某些变量派生的特征来预测族裔。从我以前的问题 How to interpret this triangular shape ROC AUC curve? 中,我学会了使用 decision_function 或 predict_proba 的输出(而不是实际的预测标签)来计算 ROC 曲线。本题的问题是:无法从朴素贝叶斯分类器生成 ROC-AUC 曲线。


# coding=utf-8 
import pandas as pd 
from pandas import DataFrame, Series 
import numpy as np 
import nltk 
import re 
import random 
from random import randint 
import csv 
import sys 

from sklearn.metrics import classification_report 
from sklearn.svm import LinearSVC 
from sklearn.naive_bayes import MultinomialNB 
from sklearn.feature_extraction import DictVectorizer 
from sklearn.metrics import confusion_matrix as sk_confusion_matrix 
from sklearn.metrics import roc_curve, auc 
import matplotlib.pyplot as plt 

# LogisticRegression is instantiated below but was missing from the import
# list above, which made the script fail with a NameError.
from sklearn.linear_model import LogisticRegression

# Candidate classifiers; every fit/predict call in this listing uses `svm`,
# and the follow-up snippet switches to `nb`.
lr = LogisticRegression()
# Alternative configuration -- multi_class : str, {'ovr', 'multinomial'}
#lr = LogisticRegression(penalty='l2', class_weight='auto', solver='lbfgs', multi_class='multinomial')
nb = MultinomialNB(fit_prior=False)
# NOTE(review): newer scikit-learn releases replaced class_weight='auto' with
# 'balanced' -- confirm against the installed version before upgrading.
svm = LinearSVC(class_weight='auto')

# Converts the per-name feature dicts into the feature matrix fed to the models.
dv = DictVectorizer()

# Load the name/ethnicity table; the first CSV row supplies the column names.
data = pd.read_csv(
    "FamilySearchData_All_OCT2015_newEthnicity_filledEthnicity_processedName_trimmedCol.csv",
    header=0,
    encoding="utf-8",
)
# Working frame used (and later overwritten) by the per-class loop.
df = DataFrame(data)

# Class list
# Ten base class codes; the two longer schemes below extend this same list.
ethnicity2 = ['fr', 'en', 'ir', 'sc', 'others', 'ab', 'rus', 'ch', 'it', 'ja']
# Base codes plus three extra group codes.
Ab_group = ethnicity2 + ['fn', 'metis', 'inuit']
# Base codes plus two extra codes.
Ab_lang = ethnicity2 + ['x', 'y']

########## CONTROL ROOM ####################
# Run-level settings, edited by hand between runs.
# change-tag: '#$$'
# Output file name decoration
# Total N = 5031794

# Column of the data frame that holds the class label to predict.
ethnicity_var = 'ethnicity2'  # Ab_group, Ab_tribe, Ab_lang
# Number of rows drawn at random from the full table for one run.
subsample_size = 50000
# Reported in the run banner as "No. features".
featureUsed = 8
# Counter used to index the colour list when plotting.
count = 0

# Declaration: echo the run configuration.
# Single-argument print(...) with %-formatting emits the same text under
# Python 2 and Python 3.
print('No. features= %s' % featureUsed)
print('N= %s Training_N= %s Test_N= %s'
      % (subsample_size, subsample_size // 2, subsample_size // 2))
print('ethnicity_var: %s' % ethnicity_var)
print(ethnicity2)
print('ML classifier: svm = LinearSVC(class_weight=\'auto\')')
print('')
print('//////////////////////////////////////////////////////')
print('')

# One line colour per class. The original list was missing a comma after
# 'lawngreen' (fusing two names into one invalid colour) and contained
# 'drive', which is not a colour name -- 'olive' is assumed to be the intent.
colorSet = ['navy', 'greenyellow', 'deepskyblue', 'darkviolet', 'crimson',
            'darkslategray', 'indigo', 'brown', 'orange', 'palevioletred', 'mediumseagreen',
            'k', 'darkgoldenrod', 'g', 'midnightblue', 'c', 'y', 'r', 'b', 'm', 'lawngreen',
            'mediumturquoise', 'lime', 'teal', 'olive', 'sienna', 'sandybrown']

# Feature extraction functions
# def feature_full_name(nameString): ... codes omitted
# NOTE: the listing omits the helper definitions. The loop below relies on
# feature_full_last_name, feature_full_first_name, feature_twoLetters,
# feature_threeLetters, feature_fourLetters and list_to_dict being defined
# before this point.

# One-vs-rest run per class: build a binary target, train the linear SVM on
# the first half of a random subsample, evaluate on the second half and add
# the class's ROC curve to the shared figure. Any failure is reported and
# ends the run (best-effort behaviour of the original script).
try:
    for class_idx, label in enumerate(ethnicity2):
        ethnicity_tar = str(label)  # fr, en, ir, sc, others, ab, rus, ch, it, ja
        # fn, metis, inuit; algonquian, iroquoian, athapaskan, wakashan, siouan, salish, tsimshian, kootenay

        # Random sampling a smaller dataframe for debugging.
        # Warning: this deliberately overwrites the original df, as before, so
        # later classes re-shuffle the same subsample. Sampling happens before
        # labelling so the target is only computed for the rows that are kept.
        rows = random.sample(list(df.index), subsample_size)
        df = df.loc[rows].copy()

        # Binary target: 1 for the current class, 0 for everything else.
        df['ethnicity_scan'] = (df[ethnicity_var] == ethnicity_tar).astype(int)
        print('1= %s' % ethnicity_tar)
        print('0= non-%s' % ethnicity_tar)
        print('Class count:')
        print(df['ethnicity_scan'].value_counts())

        # Assign X and y variables
        X = df.raw_name.values
        y = df.ethnicity_scan.values

        # Build one merged feature dict per name. The original computed the
        # merged dict but never appended it, leaving all_dict empty.
        # Assumes the feature helpers are pure functions of the name string,
        # so each name part is extracted once and reused.
        all_dict = []
        for raw_name in X:
            last_name = feature_full_last_name(raw_name)
            first_name = feature_full_first_name(raw_name)
            parts = [
                {'last-name': last_name},
                list_to_dict(feature_twoLetters(last_name)),
                list_to_dict(feature_threeLetters(last_name)),
                list_to_dict(feature_fourLetters(last_name)),
                {'first-name': first_name},
                list_to_dict(feature_twoLetters(first_name)),
                list_to_dict(feature_threeLetters(first_name)),
                list_to_dict(feature_fourLetters(first_name)),
            ]
            merged = {}
            for part in parts:
                # Later parts win on key collisions, matching the original
                # dict(items + items + ...) construction.
                merged.update(part)
            all_dict.append(merged)

        newX = dv.fit_transform(all_dict)

        # Separate the training and testing data sets: the last half (rounded
        # down) is the test set, everything before it is the training set.
        half_cut = -(len(df) // 2)
        X_train = newX[:half_cut]
        X_test = newX[half_cut:]
        y_train = y[:half_cut]
        y_test = y[half_cut:]

        # Fitting X and y into model, using training data
        svm.fit(X_train, y_train)

        # Making predictions using trained data
        y_train_predictions = svm.predict(X_train)
        y_test_predictions = svm.predict(X_test)

        accuracy = (y_test_predictions == y_test).sum().astype(float) / (y_test.shape[0])
        print('Accuracy: %s' % accuracy)

        print('Classification report:')
        print(classification_report(y_test, y_test_predictions))
        print('Confusion matrix:')
        print(sk_confusion_matrix(y_test, y_test_predictions))

        # Find and plot AUC from continuous decision scores. Hard 0/1
        # predictions give only a single ROC operating point, and the model
        # fitted above is reused instead of being trained a second time.
        y_score = svm.decision_function(X_test)
        false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_score)
        roc_auc = auc(false_positive_rate, true_positive_rate)
        print('AUC-%s= %s' % (ethnicity_tar, roc_auc))

        # Get different color each graph line. The original indexed with the
        # never-incremented `count`, so every curve used the same colour.
        color = colorSet[class_idx % len(colorSet)]

        # Plotting
        plt.plot(false_positive_rate, true_positive_rate, c=color,
                 label='AUC-%s= %0.2f' % (ethnicity_tar, roc_auc))
        plt.legend(loc='lower right', prop={'size': 8})
        plt.plot([0, 1], [0, 1], color='lightgrey', linestyle='--')
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        # Save ROC graphs (no save call is present in this listing)

        print('')
        print('//////////////////////////////////////////////////////')
        print('')
except Exception as e:
    # Report the failure instead of raising; remaining classes are skipped.
    print('Error: %s' % str(e))
    print('')
    print('//////////////////////////////////////////////////////')
    print('')


# Naive Bayes variant of the SVM steps; each trailing comment shows the SVM
# line it replaces.
nb.fit(X_train, y_train) # from svm.fit(X_train, y_train)

y_train_predictions = nb.predict(X_train) # from y_train_predictions = svm.predict(X_train)

y_test_predictions = nb.predict(X_test) # from y_test_predictions = svm.predict(X_test)

# Reuse the model fitted above rather than fitting it a second time.
# predict_proba returns one column per class, so for this binary target
# y_score has two columns; roc_curve needs a single score per sample, i.e.
# the positive-class column y_score[:, 1].
y_score = nb.predict_proba(X_test) # from y_score = svm.fit(X_train, y_train).decision_function(X_test)


Error: bad input shape (25000L, 2L) 


这个错误具体出现在哪一行?X_test 和 X_train 的形状是什么?另外,为什么要对 nb(以及 svm)第二次调用 fit?你已经有了在这些数据上训练好的模型,最后一行直接调用 nb.predict_proba(X_test) 即可。 –


你的示例中导入了 nltk,这个导入仍然必要吗? – colidyre


@Olologin 这个错误发生在 predict_proba 语句之后,具体在 'false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_score)' 这一行。X_test 和 X_train 是形状为 (25000, 63470) 的稀疏矩阵。 – KubiK888




# roc_curve expects one score per sample: take column 1 of the two-column
# predict_proba output, i.e. the probability of the positive class (label 1,
# given the 0/1 target).
positive_class_scores = y_score[:, 1]
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, positive_class_scores)




On the flip side, although naive Bayes is known as a decent classifier, it is known to be a bad estimator, so the probability outputs from predict_proba are not to be taken too seriously.


我已添加它(请参阅上面的编辑)。它确实生成ROC图和AUC,但看起来很奇怪(有点像我以前的问题)。 – KubiK888


所以,尽管 NB 的曲线看起来很奇怪,这可能并不是我的代码写错导致的?我担心的是自己在某个没有意识到的地方犯了错。 – KubiK888


另外还有一个问题:尽管我故意只使用非常少的数据(1k,训练/测试各占 50%),'fr' 总是得到非常高的 AUC。这在预期之内吗?('fr' 是最常见的类别之一,约占总数据的 1/3) – KubiK888