2017-10-09 108 views
0

比较 Python 中 RF(随机森林)模型的准确性:我想计算模型在测试数据集上的准确度。该模型给出了以下预测值:

[0 1 0 1 1 1 1 0 1 0 1 0 1 1 0 0 0 1 0 1 0 1 0 0 0 1 1 0 0 0 0 0 0 0 1 1 0 
1 1 1 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0] 

我怎样才能把这些预测值与测试数据中的实际值(在本例中为 B 或 M)进行比较,从而得到模型的准确度?这种做法对其他数据集的取值也应该通用。下面是我使用随机森林模型的代码:

import numpy as np
import pandas as pd
# Load scikit's random forest classifier library
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# The file has no header row, so columns are addressed by position:
# column 1 holds the class label (B or M) and columns 2.. are used as features.
file_path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data'
dataset2 = pd.read_csv(file_path, header=None, sep=',')

# Hold out 10% of the rows for testing. The split is not seeded, so the
# rows in each part differ from run to run.
train, test = train_test_split(dataset2, test_size=0.1)

# factorize() numbers the labels in order of first appearance, so which of
# B/M becomes 0 depends on the row order of `train`; the second return value
# (the uniques) is discarded here, which is why the predictions below cannot
# be mapped back to B/M directly.
y = pd.factorize(train[1])[0]

features = train.columns[2:]
clf = RandomForestClassifier(n_jobs=2, random_state=0)
clf.fit(train[features], y)
# An interactive session echoes the fitted estimator here, e.g.:
# RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
#       max_depth=None, max_features='auto', max_leaf_nodes=None,
#       min_impurity_split=1e-07, min_samples_leaf=1,
#       min_samples_split=2, min_weight_fraction_leaf=0.0,
#       n_estimators=10, n_jobs=2, oob_score=False, random_state=0,
#       verbose=0, warm_start=False)

# Apply the Classifier we trained to the test data
print(clf.predict(test[features]))
+0

下面的回答是你想要的吗?我把问题理解为:希望用原始标签 B、M 来评估准确性。 – Keiku

回答

0

您可以像下面这样使用 sklearn 的 preprocessing.LabelEncoder() 对 B 和 M 进行编码,再用 inverse_transform() 还原为原始标签。另外,准确度评估可以使用 pandas_ml 包的 ConfusionMatrix() 和 sklearn 的 accuracy_score() 完成。

import numpy as np
import pandas as pd
from pandas_ml import ConfusionMatrix
from sklearn import preprocessing
# Load scikit's random forest classifier library
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# The file has no header row, so columns are addressed by position:
# column 1 holds the class label (B or M) and columns 2.. are used as features.
file_path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data'
dataset2 = pd.read_csv(file_path, header=None, sep=',')

# Encode B, M to 0, 1.
# The encoder is fitted on the whole label column *before* splitting so that
# train and test share one mapping and `le` can decode predictions later.
le = preprocessing.LabelEncoder()
dataset2[1] = le.fit_transform(dataset2[1])

# Hold out 10% of the rows for testing. The split is not seeded, so the
# sample figures quoted in the comments below vary from run to run.
train, test = train_test_split(dataset2, test_size=0.1)
y = train[1]
y_test = test[1]

features = train.columns[2:]
clf = RandomForestClassifier(n_jobs=2, random_state=0)
clf.fit(train[features], y)
# An interactive session echoes the fitted estimator here, e.g.:
# RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
#       max_depth=None, max_features='auto', max_leaf_nodes=None,
#       min_impurity_split=1e-07, min_samples_leaf=1,
#       min_samples_split=2, min_weight_fraction_leaf=0.0,
#       n_estimators=10, n_jobs=2, oob_score=False, random_state=0,
#       verbose=0, warm_start=False)

# Apply the Classifier we trained to the test data
y_pred = clf.predict(test[features])

# Decode from 0, 1 to B, M
y_test_label = le.inverse_transform(y_test)
y_pred_label = le.inverse_transform(y_pred)

confusion_matrix = ConfusionMatrix(y_test_label, y_pred_label)
print("Confusion matrix:\n%s" % confusion_matrix)
# Confusion matrix:
# Predicted B M __all__
# Actual
# B   35 1  36
# M   4 17  21
# __all__ 39 18  57

# Accuracy is the fraction of test rows whose predicted label equals the
# actual label, i.e. the diagonal of the confusion matrix over its total.
accuracy = accuracy_score(y_test_label, y_pred_label)
print("Accuracy: %s" % accuracy)
# For the sample confusion matrix above: (35 + 17) / 57 ~= 0.912

请注意,pandas_ml可以通过pip轻松安装,如下所示。

pip install pandas_ml