我做了一个简单的小应用:它调用 Twitter 的 API 扫描最近的 x 条推文,找出其中的短网址(tiny URL),解析每个短网址实际指向的地址,统计这些目标地址所属域名出现的频率,并根据频率输出一个 HTML 标签云(tag cloud)页面。(标题:Python、Google App Engine 和 Twitter:我的程序不能正常工作)
但它不能正常工作。其中有一些小问题我会自己处理,它们不是我最关心的。我真正想解决的是以下两件事:
有时应用程序会崩溃(尤其是当我选择扫描大量推文时),报出的错误通常是“DownloadError ApplicationError 2”和“DownloadError ApplicationError 5”。另一个我无法解决的问题是运行所需的时间……它非常慢。我尝试过设置较短的超时时间,但如果扫描的推文很多,它仍然要运行很久。
任何想法?谢谢!
import logging
import wsgiref.handlers
from google.appengine.ext import webapp
import urllib2
from urllib import urlencode
from urllib2 import urlopen
from BeautifulSoup import BeautifulStoneSoup
import socket
import re
from urlparse import urlparse
from google.appengine.api import urlfetch
#from google.appengine.api.urlfetch import DownloadError
#timeout = 3
#socket.setdefaulttimeout(timeout)
class Link:
    """A URL paired with a numeric value.

    The constructor stores its first argument as ``link`` and its second
    argument as ``number``.
    """

    def __init__(self, a, b):
        self.link, self.number = a, b

    def __str__(self):
        """Render the pair as ``"<link> ; <number>"``."""
        fields = (self.link, self.number)
        return "%s ; %s" % fields
def getFeed(i):
    """Open page *i* of the Twitter search Atom feed.

    The query string is fixed apart from the page number. The file-like
    response object produced by ``urlopen`` is returned unread.
    """
    feed_url = 'http://search.twitter.com/search.atom?q=twitter&since=2010-02-28&rpp=100&page=%i' % i
    return urlopen(feed_url)
# Matches the first http(s) URL inside a tweet's text.
_URL_RE = re.compile(r"(?P<url>https?://[^\s]+)")


def _extract_links(feed_xml):
    """Return the first http(s) URL found in the title of each feed entry."""
    soup = BeautifulStoneSoup(feed_xml, selfClosingTags=["link"])
    links = []
    for entry in soup.findAll("entry"):
        title_tag = entry.find('title')
        if title_tag is None or not title_tag.contents:
            continue
        # A title can mention "http" without containing a full URL, so the
        # match must be checked before it is used.
        match = _URL_RE.search(title_tag.contents[0])
        if match is not None:
            links.append(match.group("url"))
    return links


def _resolve(url):
    """Follow redirects for *url*; return the final URL, or *url* on failure."""
    try:
        response = urllib2.urlopen(url)
    except (urllib2.URLError, urlfetch.DownloadError, ValueError):
        # URLError also covers HTTPError (its subclass). DownloadError is what
        # App Engine's URL Fetch layer raises for timeouts and connection
        # failures (ApplicationError 5 / 2). ValueError covers malformed URLs
        # and the Unicode errors raised for non-ASCII addresses.
        logging.warning("Could not resolve %s", url, exc_info=True)
        return url
    try:
        return response.geturl()
    finally:
        response.close()


def _domain_key(url):
    """Return the last two labels of the host in *url*, or None if it has none.

    The port and any credentials are ignored. Multi-part public suffixes
    (for example ``co.uk``) are not treated specially.
    """
    host = urlparse(url).hostname
    if not host:
        return None
    labels = [label for label in host.split('.') if label]
    if not labels:
        return None
    return '.'.join(labels[-2:])


def processFeed(f):
    """Count how often each domain is linked from the entries of a feed.

    f -- file-like object whose ``read()`` returns the Atom feed XML.

    For every entry the first URL in its title is resolved (so that short
    URLs are replaced by their destination) and reduced to its domain. A link
    that cannot be resolved is counted under its own, unresolved domain; a
    link without a usable host name is skipped.

    Returns a dict mapping each domain to the number of times it was seen.
    """
    counts = {}
    for link in _extract_links(f.read()):
        domain = _domain_key(_resolve(link))
        if domain is None:
            continue
        counts[domain] = counts.get(domain, 0) + 1
    return counts
def TagCloudDivHeader(txt):
    """Return the opening markup of the tag cloud, with *txt* as its title."""
    template = "<div class = 'tagcloud'>\n<div class = 'tagcloudtitle'>%s</div>\n"
    return template % txt
def TagCloudDivFooter():
    """Return the markup that closes the tag cloud container."""
    closing_tag = "</div>\n"
    return closing_tag
def size(freq):
    """Map a term frequency to a font size; the frequency is returned as is."""
    font_size = freq
    return font_size
def writeTerm(term, freq):
    """Return the HTML for one tag-cloud entry.

    term -- host name to display and link to (as ``http://<term>``).
    freq -- occurrence count; ``size(freq)`` gives the font size in em.

    The term originates from tweet text, so markup-significant characters are
    escaped before it is placed in the attribute and in the link text. The
    finished markup is encoded once, as a whole, so that text and bytes are
    never concatenated; characters outside ISO-8859-1 become numeric
    character references instead of being dropped.
    """
    from xml.sax.saxutils import escape

    # escape() handles &, < and >; the quotes are added because the term is
    # also used inside a quoted attribute value.
    safe_term = escape(term, {"'": "&#39;", '"': "&quot;"})
    markup = (" <span class='term' style='font-size:%sem'>"
              "<a href = 'http://%s'>%s</a></span> \n") % (size(freq), safe_term, safe_term)
    return markup.encode('ISO-8859-1', 'xmlcharrefreplace')
def genForm(prompt = ""):
    """Build the HTML form used to choose how many tweets to scan.

    prompt -- optional heading placed above the form; omitted when empty.

    The select box offers 100 to 1500 in steps of 100. Returns the markup as
    a single string.
    """
    pieces = []
    if prompt:
        pieces.append("<div class= 'formtitle'>%s</div>" % (prompt))
    pieces.append("""<form action="index.py" method="post">""")
    pieces.append("<label for=\"Tweets\">Number of Tweets to scan:</label>\n<select id=\"Tweets\" name=\"Tweets\">")
    for hundreds in range(1, 16):
        count = hundreds * 100
        pieces.append("<option value = \"%i\">%i</option>" % (count, count))
    pieces.append("</select>")
    pieces.append('<input type="submit" value="Go" name="gobtn"/> </form>')
    pieces.append("</br>WARNING!!!! The fewer Tweets you scan, the more stable this program is!!!!")
    return "".join(pieces)
def makeTagCloud(cloudtitle, items):
    """Render the tag cloud for *items* and close the HTML page.

    cloudtitle -- heading shown at the top of the cloud.
    items      -- mapping of term to frequency; terms are emitted in the
                  mapping's own iteration order.

    The returned string ends with the page footer.
    """
    pieces = [TagCloudDivHeader(cloudtitle)]
    pieces.extend(writeTerm(term, items[term]) for term in items)
    pieces.append(TagCloudDivFooter())
    pieces.append(HTMLFooter())
    return ''.join(pieces)
def HTMLHeader(pageheader = ""):
    """Return the start of the HTML page, up to and including ``<body>``.

    pageheader -- text placed in the page's ``<title>`` element.
    """
    doctype = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">"
    title_block = "\n<html><head>\n <title>%s</title>\n" % pageheader
    pieces = [
        doctype,
        title_block,
        "<link rel='stylesheet' href='/assets/mystyles.css' type='text/css' />\n",
        "</head>\n",
        "<body>\n",
    ]
    return "".join(pieces)
def HTMLFooter():
    """Return the markup that closes the page body and the document."""
    closing_markup = "</body>\n</html>"
    return closing_markup
def generateLinks():
    """Return a complete HTML page containing a single "Proceed?" link."""
    header = HTMLHeader("Who's getting the most traffic from Twitter?")
    body = "<p>" + "<a href = 'results/'>Proceed?</a>" + "</p>\n"
    return header + body + HTMLFooter()
class MainHandler(webapp.RequestHandler):
    """Serves the tweet-count form (GET) and the resulting tag cloud (POST)."""

    def get(self):
        """Write the page containing the form for choosing a tweet count."""
        self.response.headers['Content-Type'] = 'text/html'
        logging.info("path is %s", self.request.path)
        out = self.response.out
        out.write(HTMLHeader("Who's getting the most traffic from Twitter?"))
        out.write(genForm())
        out.write(HTMLFooter())

    def post(self):
        """Write the domain tag cloud for the submitted ``Tweets`` value.

        Responds with status 400 when ``Tweets`` is missing, is not an
        integer, or is below 100, and with status 502 when the feed cannot
        be downloaded.
        """
        try:
            requested = int(self.request.get('Tweets'))
        except ValueError:
            self.error(400)
            return
        page = requested // 100
        if page < 1:
            self.error(400)
            return

        # NOTE(review): only feed page number `page` is fetched (one page of
        # results), not every page up to it -- confirm this is intended.
        try:
            counts = processFeed(getFeed(page))
        except (urllib2.URLError, urlfetch.DownloadError):
            logging.exception("Could not fetch Twitter search page %s", page)
            self.error(502)
            return

        # Everything is computed before anything is written, so a failure
        # above never leaves a half-written page, and the page header comes
        # first in the output.
        self.response.headers['Content-Type'] = 'text/html'
        out = self.response.out
        out.write(HTMLHeader("Domain cloud for Twitter Tweets"))
        out.write("Where are links are Twitter taking you?")
        out.write(makeTagCloud("Domains most linked to by Tweets", counts))
def main():
    """Create the WSGI application and run it through the CGI handler."""
    routes = [('/.*', MainHandler)]
    app = webapp.WSGIApplication(routes, debug=True)
    cgi_handler = wsgiref.handlers.CGIHandler()
    cgi_handler.run(app)


# Start serving only when this file is executed directly.
if __name__ == '__main__':
    main()
你最终解决这个问题了吗? – theheadofabroom 2011-07-03 14:43:56