我试图创建一个使用规范化的tf-idf作为功能的K-means模型。我得到'ZeroDivisionError:浮动除以零'。以下是代码。请让我知道是否需要更多细节。Python 2.7 - GraphLab:ZeroDivisionError:float除零
import graphlab as gl
data = gl.SFrame(data)
data['tf_idf'] = gl.text_analytics.tf_idf(data['text'])
def normalize(d, target=1.0):
raw = sum(d.values())
try:
factor = target/raw
except ZeroDivisionError:
print 'Zero Error in file ', d['file_name']
return {key:value*factor for key,value in d.iteritems()}
data['tf_idf_norm'] = data['tf_idf'].apply(normalize)
print data[0]
{'file_name': 'ap-4081.pdf.txt',
'tf_idf_norm': {'september': 0.006612160101629999, 'issued': 0.004914445160361691, 'declaration': 0.018380116959675345, 'pursuant': 0.0015236875459684344, 'held': 0.006661456734504585, 'edt': 0.01993691915396277, 'its': 0.0018031759056466382, 'before': 0.000628458524686868, 'prehearing': 0.03603879166997583, 'mail': 0.010151784816687623, 'administrative': 0.0038202023891634983, 'scheduled': 0.02675584978978817, 'division': 0.003714090891806661, 'greater': 0.014401469513143977, 'express': 0.015442536961835316, 'judge': 0.0056255693137487245, 'postal': 0.01448758917369378, 'timely': 0.009547408004733245, 'postponing': 0.023771054663189694, 'james': 0.013851907261754153, 'establishing': 0.015525275482306627, 'securities': 0.0009534453943686734, 'release': 0.000994876598539298, 'served': 0.007549203784675396, 'cliffs': 0.040192712832365154, 'realty': 0.027963087504615878, 'financial': 0.006067940095145491, 'instituting': 0.0047412698163078965, 'are': 0.0024740930507990123, 'fails': 0.013316168540074749, 'proceeding': 0.0031604332608950046, 'appear': 0.010272410560940281, 'for': 0.00039400705802880267, 'defend': 0.01741380246078061, 'asia': 0.028632052997003922, 'deemed': 0.008702433747892771, 'august': 0.04977168128599991, 'exchange': 0.0005249518349832802, 'answers': 0.015575372135328506, 'respondents': 0.022052537701443286, 'corp': 0.010689517733349321, 'eastland': 0.04688688851929564, 'revoked': 0.013668927632973267, 'against': 0.00388458245265655, 'telephonic': 0.016974699222910256, 'incorporated': 0.014218288205022512, 'ltd': 0.014589362228277223, 'postponed': 0.0209823757486391, 'otherwise': 0.007582793422109354, 'respondent': 0.0054516655648239295, 'washington': 0.005091825331541719, 'hearing': 0.02249732139496055, 'registration': 0.00852794089032502, 'conference': 0.03047929459424198, 'america': 0.001440915940409403, 'service': 0., 'holdings': 0.013681846704902636, 'due': 0.006408671617186687, 'commission': 0.0003703379846648254, 'enforcement': 0.0040660794190181156, 'that': 0.0008014938443938574, 'filed': 0.004601893518537033, 'with': 0.0011834598692978579, 'commence': 0.01644634375933502, 'accordance': 0.006995433441162778, 'default': 0.010296776564423474, 'rulings': 0.00878134432538458, 'will': 0.004454693796954499, 'matter': 0.0008886177351788676, 'were': 0.00495349246481796, 'grimes': 0.019846163536283806, 'and': 0.00019120815327678883, 'states': 0.0006306247910547858, 'file': 0.0014093253240757065, 'scheduling': 0.015608961772762466, 'any': 0.0024796720760966906, 'united': 0.0007573801483105002, 'answer': 0.010898516012467644, 'granite': 0.03675899168009445, 'practice': 0.005139863202847838, 'shall': 0.004531342051741035, 'act': 0.0, 'law': 0.003763823781424106, 'oip': 0.051638100949650705, 'rule': 0.0022896158932922445, 'order': 0.0013019082720686934, 'proceedings': 0.005189982291505062, 'the': 0.0003944745275206197, 'section': 0.0023716422033868588},
'tf_idf': {'september': 0.9050645370154226, 'issued': 0.6726833539094077, 'declaration': 2.515848344672878, 'pursuant': 0.20856052215192625, 'held': 0.9118122009441979, 'edt': 2.7289415601335865, 'its': 0.24681655330746285, 'before': 0.08602264841392737, 'prehearing': 4.93294654032065, 'mail': 1.3895641188014474, 'administrative': 0.5229047169927326, 'scheduled': 3.662308599647982, 'division': 0.5083803026181314, 'greater': 1.97125585843607, 'express': 2.113755921043353, 'judge': 0.7700211743422583, 'postal': 1.9830438141881122, 'timely': 1.306837746316985, 'postponing': 3.253753425874317, 'james': 1.8960324371984822, 'establishing': 2.125081070400406, 'securities': 0.1305064609991892, 'release': 0.1361775144891726, 'served': 1.0333259514584336, 'cliffs': 5.501530282373367, 'financial': 0.8305723558472731, 'registration': 1.1672943115358345, 'are': 0.3386508867204352, 'fails': 1.8227011641129327, 'proceeding': 0.43259631074797295, 'appear': 1.4060752258672398, 'for': 0.053931213109576576, 'section': 0.3246275377157642, 'asia': 3.9191210423271863, 'deemed': 1.191178684406377, 'august': 6.812687985048638, 'exchange': 0.07185477698391116, 'answers': 2.1319382401265434, 'respondents': 3.018524887177448, 'corp': 1.4631683549022034, 'eastland': 6.417821014247522, 'revoked': 1.8709864052723948, 'against': 0.531716982797367, 'telephonic': 2.323476452025422, 'incorporated': 1.9461822208839539, 'ltd': 1.9969743877240589, 'postponed': 2.87204240377426, 'otherwise': 1.037923660707063, 'respondent': 0.7462174379555567, 'washington': 0.696962938801067, 'hearing': 3.07940634519689, 'instituting': 0.648979320707021, 'conference': 4.171969254600169, 'law': 0.5151876808461564, 'service': 1.686270670295957, 'holdings': 1.8727547523291366, 'due': 0.8772112775465153, 'commission': 0.050691418761513035, 'enforcement': 0.5565600696713887, 'that': 0.10970751525181831, 'realty': 3.827553848801696, 'with': 0.16199056620216346, 'commence': 2.2511557904457957, 'accordance': 0.9575265316227637, 'default': 1.4094104151733782, 'rulings': 1.2019798695237245, 'will': 0.6097531392036623, 'matter': 0.12163292883290797, 'were': 0.6780281020682886, 'grimes': 2.7165190401350294, 'and': 0.026172342480994967, 'states': 0.08631916435383152, 'file': 0.19290675850759448, 'scheduling': 2.136535949375173, 'any': 0.33941453700573243, 'united': 0.10366928548906945, 'proceedings': 0.7103985456464252, 'answer': 1.4917757884518863, 'granite': 5.031526653127632, 'practice': 0.7035383049574554, 'shall': 0.6202446603049608, 'act': 0.1691336021724683, 'america': 0.19723084414778347, 'oip': 7.068161268029516, 'rule': 0.31339987486008647, 'order': 0.17820364137975558, 'filed': 0.629901660385122, 'the': 0.05399519977243369, 'defend': 2.3835803760951273},
'text': 'united states america before the securities and exchange commission washington administrative proceedings rulings release august administrative proceeding file the matter eastland financial corp granite cliffs incorporated and greater asia realty holdings ltd order postponing hearing and scheduling prehearing conference august the securities and exchange commission issued order instituting proceedings oip against respondents pursuant section the securities exchange act the hearing scheduled commence august august the division enforcement filed declaration service establishing that respondents were served with the oip postal service express mail august accordance with rule practice respondents answers are due august oip order that the hearing scheduled for august postponed and telephonic prehearing conference shall held edt september any respondent that fails timely file answer appear the prehearing conference otherwise defend the proceeding will deemed default and the registration its securities will revoked oip james grimes administrative law judge'}
model = gl.clustering.kmeans.create(data,num_clusters=4,features=['tf_idf_norm'])
[ERROR] graphlab.toolkits._main: Toolkit error: Exception in python callback function evaluation:
ZeroDivisionError('float division by zero',):
Traceback (most recent call last):
File "graphlab/cython/cy_pylambda_workers.pyx", line 426, in graphlab.cython.cy_pylambda_workers._eval_lambda
File "graphlab/cython/cy_pylambda_workers.pyx", line 169, in graphlab.cython.cy_pylambda_workers.lambda_evaluator.eval_simple
File "<ipython-input-14-405770e0af16>", line 3, in normalize
ZeroDivisionError: float division by zero
ERROR:graphlab.toolkits._main:Toolkit error: Exception in python callback function evaluation:
ZeroDivisionError('float division by zero',):
Traceback (most recent call last):
File "graphlab/cython/cy_pylambda_workers.pyx", line 426, in graphlab.cython.cy_pylambda_workers._eval_lambda
File "graphlab/cython/cy_pylambda_workers.pyx", line 169, in graphlab.cython.cy_pylambda_workers.lambda_evaluator.eval_simple
File "<ipython-input-14-405770e0af16>", line 3, in normalize
ZeroDivisionError: float division by zero
---------------------------------------------------------------------------
ToolkitError Traceback (most recent call last)
<ipython-input-17-1591a6c5df9a> in <module>()
----> 1 model = gl.clustering.kmeans.create(data,num_clusters=4,features=['tf_idf_norm'])
/home/praveen/anaconda/lib/python2.7/site-packages/graphlab/toolkits/clustering/kmeans.pyc in create(dataset, num_clusters, features, label, initial_centers, max_iterations, batch_size, verbose)
659
660 ## Create and return the model
--> 661 params = _gl.toolkits._main.run('kmeans_train', opts, verbose)
662 return KmeansModel(params['model'])
/home/praveen/anaconda/lib/python2.7/site-packages/graphlab/toolkits/_main.pyc in run(toolkit_name, options, verbose, show_progress)
87 _get_metric_tracker().track(metric_name, value=1, properties=track_props, send_sys_info=False)
88
---> 89 raise ToolkitError(str(message))
ToolkitError: Exception in python callback function evaluation:
ZeroDivisionError('float division by zero',):
Traceback (most recent call last):
File "graphlab/cython/cy_pylambda_workers.pyx", line 426, in graphlab.cython.cy_pylambda_workers._eval_lambda
File "graphlab/cython/cy_pylambda_workers.pyx", line 169, in graphlab.cython.cy_pylambda_workers.lambda_evaluator.eval_simple
File "<ipython-input-14-405770e0af16>", line 3, in normalize
ZeroDivisionError: float division by zero
UPDATE:
下面的线工作正常。
data['tf_idf_norm'] = data['tf_idf'].apply(lambda x: {key:value*(1.0/sum(x.values())) for key, value in x.iteritems()})
非常感谢您的回答!没有意识到sFrame中的一行实际上是一本字典。 – Praveen
但是当我使用标准化列作为训练K均值模型的特征时,我仍然会得到相同的错误。相应地更新了问题。 – Praveen
我没有你的原始数据,所以很难重新创建,但我会尝试在我回家的时候嘲笑一些东西。上面的字典示例中的tf_idf是否表示所有输入的外观?你知道具体哪个输入导致中断或者是否有输入吗? – Lost