import numpy as np
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# Toy corpus: each sentence is paired with the intent label at the same index.
sentences = [
    "i want take a photo",
    "i go to take a photo",
    "i go to use my camera",
    "i go to eat something",
    "i like my food",
]
labels = ["photo", "photo", "photo", "eat", "eat"]

# Learn the TF-IDF vocabulary from the sentences and vectorize them in one step.
tfv = TfidfVectorizer()
X = tfv.fit_transform(sentences)

# Encode the string labels as integers; lbl.classes_ keeps the original names.
lbl = LabelEncoder()
y = lbl.fit_transform(labels)

# Stratified split so both classes appear in the train and the test part.
xtrain, xtest, ytrain, ytest = train_test_split(
    X, y, stratify=y, random_state=42
)

clf = LogisticRegression()
clf.fit(xtrain, ytrain)

predictions = clf.predict(xtest)
# Single-argument print() behaves the same on Python 2 and Python 3.
print("Accuracy Score = %s" % metrics.accuracy_score(ytest, predictions))
新的数据:
# Score an unseen sentence with the already-fitted vectorizer and classifier.
new_sentence = ["this is a new sentence"]
X_Test = tfv.transform(new_sentence)
probabilities = clf.predict_proba(X_Test)
print(probabilities)

# The columns of predict_proba follow clf.classes_ (the encoded labels), so
# decode them with the label encoder to show which probability is which class.
class_names = lbl.inverse_transform(clf.classes_)
for class_name, probability in zip(class_names, probabilities[0]):
    print("%s: %.3f" % (class_name, probability))
好的,但我该如何针对所有标签来检查一个新的随机句子? – esemve
请查看更新后的答案 –
非常感谢,不过我还有最后一个问题:代码可以运行,但如果我用训练数据中已有的句子来测试,例如:“我去吃东西”,它返回的是:0.55 0.44,这是为什么?这句话本身就是“吃饭”类别的训练数据 :\ 是不是第一个数字对应照片类别,第二个数字对应吃饭类别?如果不是,我怎样才能知道每个数字对应哪个类别? – esemve