2017-04-27 29 views
0
# -*- coding: utf-8 -*- 
""" 
Created on Wed Apr 26 21:28:31 2017 

@author: Chirantan 
""" 

import pandas 
from pandas.tools.plotting import scatter_matrix 
import matplotlib.pyplot as plt 
from sklearn import model_selection 
from sklearn.metrics import classification_report 
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score 
from sklearn.linear_model import LogisticRegression 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
from sklearn.naive_bayes import GaussianNB 
from sklearn.svm import SVC 

# Load dataset
# Per the UCI description, each row of yeast.data has 10 whitespace-separated
# fields: a sequence identifier, eight numeric attributes and the class label.
# Supplying only 9 names makes pandas silently promote the first field to the
# index and shift every label by one, so all 10 fields are named here and the
# identifier is made the index explicitly (it is not a feature).
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/yeast/yeast.data"
names = ['Sequence Name', 'mcg', 'gvh', 'alm', 'mit', 'erl', 'pox', 'vac', 'nuc', 'class']
# sep=r'\s+' is the non-deprecated spelling of delim_whitespace=True.
dataset = pandas.read_csv(url, names=names, sep=r'\s+', index_col=0)


# shape
print(dataset.shape)

# head
print(dataset.head(20))

# descriptions
print(dataset.describe())

# class distribution
# The label column is addressed by position (last column) rather than by name.
# NOTE(review): assumes the class label is the last column of the frame.
print(dataset.groupby(dataset.columns[-1]).size())

# box and whisker plots
# Only numeric columns are drawn; a 3x3 grid holds the eight numeric
# attributes without shrinking every subplot to 1/100th of the figure.
dataset.plot(kind='box', subplots=True, layout=(3, 3), sharex=False, sharey=False)
plt.show()

# histograms (the original repeated this and the next section twice)
dataset.hist()
plt.show()

# scatter plot matrix
scatter_matrix(dataset)
plt.show()
# Split-out validation dataset
array = dataset.values
# The array has 9 columns (valid indices 0-8), so array[:, 9] raises
# IndexError. Slice relative to the end instead: the last column is the
# target and everything before it is a feature.
# astype(float) makes a stray non-numeric feature column fail here, not deep
# inside an estimator's fit().
X = array[:, :-1].astype(float)
Y = array[:, -1]
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
    X, Y, test_size=validation_size, random_state=seed)
# Test options and evaluation metric
seed = 7
scoring = 'accuracy'

# Spot Check Algorithms: (short label, unfitted estimator) pairs
models = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
]

# One splitter shared by every model so they are all scored on the same folds.
# random_state only has an effect when shuffle=True; newer scikit-learn
# releases reject a random_state passed without shuffling.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)

# evaluate each model in turn
results = []  # per-model arrays of fold scores, consumed by the comparison plot
names = []    # matching model labels (rebinds the column-name list used at load time)
for name, model in models:
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # label: mean fold accuracy (standard deviation across folds)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

# Compare Algorithms
# One box per model, drawn from its cross-validation scores and labelled with
# the model's short name.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.boxplot(results)
ax.set_xticklabels(names)
fig.suptitle('Algorithm Comparison')
plt.show()

# Make predictions on validation dataset
# Fit an SVC on the training split and score it on the held-out split.
# (A KNeighborsClassifier can be substituted here in the same way.)
svm = SVC()
svm.fit(X_train, Y_train)
predictions = svm.predict(X_validation)

# Report accuracy, the confusion matrix and the per-class report, in that order.
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(Y_validation, predictions))

我试图对 UCI 仓库中的多分类酵母(yeast)数据集使用不同的分类器。上面的代码只需做如下改动,就能在鸢尾花(iris)数据集上正常运行;但在 Python 中改用酵母数据集时却出现了索引越界错误。为什么?

# Split-out validation dataset 
# Variant reported as working for the iris data: columns 0-3 are taken as the
# features and column 4 as the target.
array = dataset.values 
X = array[:,0:4] 
Y = array[:,4] 
validation_size = 0.20 

但当我改成下面这样时,它就无法正常工作:

# Split-out validation dataset 
# Failing variant for the yeast data: the array has 9 columns, so the valid
# column indices are 0-8.
array = dataset.values 
X = array[:,0:9] 
Y = array[:,9]  # raises IndexError: index 9 is out of bounds for axis 1 with size 9
validation_size = 0.20 

下面是使用酵母数据集时的错误信息:

File "<ipython-input-40-707d4eef8576>", line 55, in <module> 
    Y = array[:,9] 

IndexError: index 9 is out of bounds for axis 1 with size 9 

我不明白这一点。array 存储了数据集的值,所以 array[:,9] 应该会给我最后一列。我哪里弄错了吗?请帮忙。

+0

没有索引9。 – bhansa

回答

0

array 没有索引为 9 的列。它共有 9 列,最后一列的索引是 8(因为第一列的索引为 0)。

0

我已经修复了这个错误,但又出现了另一个错误:

# Collect the cross-validation scores and the label of every candidate model.
results = [] 
names = [] 
for name, model in models: 
    # 10-fold splitter; seed, scoring and models are defined earlier in the script.
    kfold = model_selection.KFold(n_splits=10, random_state=seed) 
    # NOTE: per the traceback that follows, this call fails inside
    # estimator.fit() because a feature value is the string 'CISZ_YEAST'.
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring) 
    results.append(cv_results) 
    names.append(name) 
    # label: mean score (standard deviation across folds)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()) 
    print(msg) 

以下是错误:

File "<ipython-input-43-df643b71a66e>", line 75, in <module> 
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring) 

    File "C:\Users\Chirantan\Downloads\WinPython-64bit-3.5.2.3\python-3.5.2.amd64\lib\site-packages\sklearn\model_selection\_validation.py", line 140, in cross_val_score 
    for train, test in cv.split(X, y, groups)) 

    File "C:\Users\Chirantan\Downloads\WinPython-64bit-3.5.2.3\python-3.5.2.amd64\lib\site-packages\sklearn\externals\joblib\parallel.py", line 758, in __call__ 
    while self.dispatch_one_batch(iterator): 

    File "C:\Users\Chirantan\Downloads\WinPython-64bit-3.5.2.3\python-3.5.2.amd64\lib\site-packages\sklearn\externals\joblib\parallel.py", line 608, in dispatch_one_batch 
    self._dispatch(tasks) 

    File "C:\Users\Chirantan\Downloads\WinPython-64bit-3.5.2.3\python-3.5.2.amd64\lib\site-packages\sklearn\externals\joblib\parallel.py", line 571, in _dispatch 
    job = self._backend.apply_async(batch, callback=cb) 

    File "C:\Users\Chirantan\Downloads\WinPython-64bit-3.5.2.3\python-3.5.2.amd64\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 109, in apply_async 
    result = ImmediateResult(func) 

    File "C:\Users\Chirantan\Downloads\WinPython-64bit-3.5.2.3\python-3.5.2.amd64\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 322, in __init__ 
    self.results = batch() 

    File "C:\Users\Chirantan\Downloads\WinPython-64bit-3.5.2.3\python-3.5.2.amd64\lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in __call__ 
    return [func(*args, **kwargs) for func, args, kwargs in self.items] 

    File "C:\Users\Chirantan\Downloads\WinPython-64bit-3.5.2.3\python-3.5.2.amd64\lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in <listcomp> 
    return [func(*args, **kwargs) for func, args, kwargs in self.items] 

    File "C:\Users\Chirantan\Downloads\WinPython-64bit-3.5.2.3\python-3.5.2.amd64\lib\site-packages\sklearn\model_selection\_validation.py", line 238, in _fit_and_score 
    estimator.fit(X_train, y_train, **fit_params) 

    File "C:\Users\Chirantan\Downloads\WinPython-64bit-3.5.2.3\python-3.5.2.amd64\lib\site-packages\sklearn\linear_model\logistic.py", line 1174, in fit 
    order="C") 

    File "C:\Users\Chirantan\Downloads\WinPython-64bit-3.5.2.3\python-3.5.2.amd64\lib\site-packages\sklearn\utils\validation.py", line 521, in check_X_y 
    ensure_min_features, warn_on_dtype, estimator) 

    File "C:\Users\Chirantan\Downloads\WinPython-64bit-3.5.2.3\python-3.5.2.amd64\lib\site-packages\sklearn\utils\validation.py", line 382, in check_array 
    array = np.array(array, dtype=dtype, order=order, copy=copy) 

ValueError: could not convert string to float: 'CISZ_YEAST' 

现在我该怎样解决这个问题?

相关问题