# Download every link extracted from `words` and persist new ones to the
# Url table; URLs already present (matched by their shortened form) are
# skipped. Sleeps 3s between network hits to be polite to remote servers.
links_list = char.getLinks(words)
for source_url in links_list:
    try:
        print('Downloading URL: ' + source_url)
        # hash_url returns both a shortened form of the URL and its hash;
        # the shortened form is the duplicate-detection key in the Url table.
        urldict = hash_url(source_url)
        source_url_short = urldict['url_short']
        source_url_hash = urldict['url_short_hash']
        # .exists() short-circuits at the first match; .count() == 0 forced
        # the database to count every matching row.
        if Url.objects.filter(source_url_short=source_url_short).exists():
            print('\tAlready in database')
        else:
            try:
                htmlSource = getSource(source_url)
            except Exception:
                # Best-effort: store a placeholder so this URL is recorded
                # and not retried forever. (Was a bare except:, which also
                # swallowed KeyboardInterrupt/SystemExit.)
                htmlSource = '-'
                print('\thtmlSource got an error...')
            new_u = Url(source_url=source_url,
                        source_url_short=source_url_short,
                        source_url_hash=source_url_hash,
                        html=htmlSource)
            new_u.save()
            time.sleep(3)
    except Exception:
        # Catch per-URL failures (hashing, DB, etc.) so one bad URL does
        # not abort the whole crawl. Narrowed from a bare except:.
        print('\tError with downloading URL..')
        time.sleep(3)
def getSource(theurl, unicode = 1, moved = 0):
    """Fetch *theurl* and return its body as a UTF-8 byte string.

    Parameters:
        theurl  -- URL to download.
        unicode -- kept for backward compatibility with existing callers;
                   the body is always round-tripped through UTF-8 (below).
        moved   -- when 1, resolve redirects first and fetch the final URL.

    Raises urllib2.URLError / UnicodeDecodeError on network or encoding
    problems; the calling loop is expected to handle them.
    """
    if moved == 1:
        # Follow redirects once so the request below hits the final location.
        theurl = urllib2.urlopen(theurl).geturl()
    urlReq = urllib2.Request(theurl)
    # Rotate User-Agent strings to look less like an automated client.
    urlReq.add_header('User-Agent', random.choice(agents))
    urlResponse = urllib2.urlopen(urlReq)
    try:
        htmlSource = urlResponse.read()
    finally:
        # Bug fix: the response was never closed. Leaked responses keep the
        # socket and its buffers alive until GC gets around to them, which
        # inflates memory use over a long crawl.
        urlResponse.close()
    # decode/encode round-trip is a no-op for valid UTF-8 but raises
    # UnicodeDecodeError early on malformed input; preserved from the
    # original as a cheap validity check.
    htmlSource = htmlSource.decode('utf-8').encode('utf-8')
    return htmlSource
基本上这段代码的作用是:它接受一个URL列表,逐个下载,并将结果保存到数据库中。就这样。我的代码是否存在内存泄漏(Python)?
有什么理由让你认为你的代码泄漏内存吗? – Jehiah 2009-11-28 03:09:42
有发生任何错误吗?还是运行时间过长?另外 `htmlSource.decode('utf-8').encode('utf-8')` 这个写法很奇怪:它把内容从 UTF-8 解码后又立即编码回 UTF-8。 – YOU 2009-11-28 03:10:45
没有错误发生。但是,我的脚本随机被“杀死”。之前有人建议这是内存泄漏,导致我的内存过载。 – TIMEX 2009-11-28 03:12:14