
I have written a program in Python, but it runs much more slowly than I would like. Is there a way to make my Python program run faster?

Here is the code:

from gzip import GzipFile
from cStringIO import StringIO
import re
import webbrowser
import time
from difflib import SequenceMatcher
import os
import sys
from BeautifulSoup import BeautifulSoup
import eventlet
from eventlet.green import urllib2
import urllib
import urllib2  # note: this rebinds urllib2, shadowing the green version imported above
import cookielib

TITLE_MATCH = re.compile(r'(.*) \(\d{1,10}.{1,100}\)$') 
ADDRESS_MATCH = re.compile(r'.{1,100}\((.*), .{4,14}, United States\)$') 
LOCATION_LISTING = re.compile(r'http://www\.locationary\.com/place/en/US/.{1,50}/.{1,50}/.{1,100}\.jsp') 

def download(url):
    print "Downloading:", url
    s = urllib2.urlopen(url).read()
    if s[:2] == '\x1f\x8b':  # gzip magic bytes: assume it's gzipped data
        ifh = GzipFile(mode='rb', fileobj=StringIO(s))
        s = ifh.read()
    print "Downloaded: ", url
    return s

def replace_chars(text, replacements): 
    return ''.join(replacements.get(x,x) for x in text) 

def handle_listing(listing_url):
    listing_document = BeautifulSoup(download(listing_url))

    # ignore pages that already link to yellowpages
    if not listing_document.find("a", href=re.compile(re.escape("http://www.yellowpages.com/") + ".*")):
        listing_title = listing_document.title.text
        reps = {' ': '-', ',': '', '\'': '', '[': '', ']': ''}
        if TITLE_MATCH.match(listing_title) is not None:
            title, = TITLE_MATCH.match(listing_title).groups()
            address, = ADDRESS_MATCH.match(listing_title).groups()

            yellow_page_url = "http://www.yellowpages.com/%s/%s?order=distance" % (
                replace_chars(address, reps),
                replace_chars(title, reps),
            )

            yellow_page = BeautifulSoup(download(yellow_page_url))

            page_url = yellow_page.find("h3", {"class": "business-name fn org"})
            if page_url:
                page_url = page_url.a["href"]

                business_name = title[:title.index(",")]

                page = BeautifulSoup(download(page_url))
                yellow_page_address = page.find("span", {"class": "street-address"})
                if yellow_page_address:
                    # only accept the match if the addresses are roughly similar
                    if SequenceMatcher(None, address, yellow_page_address.text).ratio() >= 0.5:
                        pid, = re.search(r'p(\d{5,20})\.jsp', listing_url).groups()
                        page_escaped = replace_chars(page_url, {':': '%3A', '/': '%2F', '?': '%3F', '=': '%3D'})

                        final_url = "http://www.locationary.com/access/proxy.jsp?ACTION_TOKEN=proxy_jsp$JspView$SaveAction&inPlaceID=%s&xxx_c_1_f_987=%s" % (
                            pid, page_escaped)
                        return final_url

def log_in(final_url):
    data = urllib.urlencode({"inUserName": "[email protected]", "inUserPass": "secretword"})
    jar = cookielib.FileCookieJar("cookies")
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))
    opener.addheaders.append(('User-agent', 'Mozilla/4.0'))
    opener.addheaders.append(('Referer', 'http://www.locationary.com/'))
    opener.addheaders.append(('Cookie', 'site_version=REGULAR; __utma=47547066.912030359.1322003402.1324959960.1325009956.58; __utmz=47547066.1324655802.52.13.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=cache:dr23PN5fUj4J:www.locationary.com/%20locationary; nickname=jacob501; jforumUserId=1; PMS=1; locaCountry=1033; locaState=1786; locaCity=Vancouver; JSESSIONID=5CDDA2D527C20A6CDD04936115DE3FA2; PSESSIONID=c677beb4e6b8d58f1443d9b9585b225f579ef29a; Locacookie=enable; __utmb=47547066.1.10.1325009956; __utmc=47547066'))
    request = urllib2.Request("https://www.locationary.com/index.jsp?ACTION_TOKEN=tile_loginBar_jsp$JspView$LoginAction", data)
    opener.open(request)  # log in first so the session cookies are set
    opener.open(str(final_url)).read()  # then visit the final URL with that session

States = [#'Alabama', 
      #'Alaska', 
      'Arizona', 
      'Arkansas', 
      'California', 
      'Colorado', 
      'Connecticut', 
      'Delaware', 
      'Florida', 
      'Georgia', 
      'Hawaii', 
      'Idaho', 
      'Illinois', 
      'Indiana', 
      'Iowa', 
      'Kansas', 
      'Kentucky', 
      'Louisiana', 
      'Maine', 
      'Maryland', 
      'Massachusetts', 
      'Michigan', 
      'Minnesota', 
      'Mississippi', 
      'Missouri', 
      'Montana', 
      'Nebraska', 
      'Nevada', 
      'New_Hampshire', 
      'New_Jersey', 
      'New_Mexico', 
      'New_York', 
      'North_Carolina', 
      'North_Dakota', 
      'Ohio', 
      'Oklahoma', 
      'Oregon', 
      'Pennsylvania', 
      'Rhode_Island', 
      'South_Carolina', 
      'South_Dakota', 
      'Tennessee', 
      'Texas', 
      'Utah', 
      'Vermont', 
      'Virginia', 
      'Washington', 
      'West_Virginia', 
      'Wisconsin', 
      'Wyoming'] 

Cities = [] 

def find_cities(state):
    state_url = 'http://www.locationary.com/place/en/US/' + str(state)
    state_document = download(state_url)
    find_cities_re = re.compile('<b>(.*)</b>')  # city names appear in bold on the state page
    for city in find_cities_re.findall(state_document):
        Cities.append(str(replace_chars(city, {' ': '_'})))

bestworst = ['0','1'] 

def main():
    for state in States:
        find_cities(state)
        for city in Cities:
            for num in range(0, 1):
                for pagenum in range(15, 16):
                    print '-' * 160
                    print '-' * 160
                    if num == 0:
                        print str(state) + ', ' + str(city) + ', ' + 'Best Profiles' + ', ' + 'Page ' + str(pagenum)
                    else:
                        print str(state) + ', ' + str(city) + ', ' + 'Worst Profiles' + ', ' + 'Page ' + str(pagenum)
                    START_URL = ('http://www.locationary.com/place/en/US/' + str(state) + '/' +
                                 city + '-page' + str(pagenum) + '/?ACTION_TOKEN=NumericAction&order=' + str(num))
                    pool = eventlet.GreenPool()
                    listings_document = BeautifulSoup(download(START_URL))
                    listings = listings_document.findAll("a", href=LOCATION_LISTING)
                    listings = [listing['href'] for listing in listings]

                    # run handle_listing over all listings on the green thread pool
                    for final_url in pool.imap(handle_listing, listings):
                        print final_url
                        if final_url is not None:
                            log_in(final_url)

if __name__ == '__main__': 
    main() 

Is there a way to make it faster, or is that impossible? It has to download a lot of URLs from the internet, but I am fairly sure my network connection is 10 to 50 times faster than the program currently runs, and my computer is not particularly slow. So, is there any way to make my program 10-50 times faster? I know that probably sounds ridiculous, but how do professional programmers make their programs faster?

+2

This belongs on http://codereview.stackexchange.com/ – 2011-12-30 16:45:39

+0

You could use multiple threads to fetch the different pages. – 2011-12-30 16:46:14
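A minimal sketch of that suggestion with the standard threading module; the URLs and the fetch helper are placeholders for illustration, not part of the question's code.

import threading
import urllib2

def fetch(url, results, i):
    # Each thread stores its page body in the shared results list
    results[i] = urllib2.urlopen(url).read()

urls = ['http://example.com/a', 'http://example.com/b']  # placeholder URLs
results = [None] * len(urls)
threads = [threading.Thread(target=fetch, args=(u, results, i))
           for i, u in enumerate(urls)]
for t in threads:
    t.start()
for t in threads:
    t.join()  # wait for all downloads to finish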

+0

The way professional programmers make their programs faster is by profiling. Look at Python's cProfile module. – nmichaels 2011-12-30 16:47:14

Answers

6

The first step in speeding up any program is understanding why it is slow, that is, where the time is going. The tool programmers use to find this out is called a profiler. The standard Python distribution includes several; you can learn about them here.

Once you have learned to use a profiler, run it on your program to identify the hotspots: the places where your program spends most of its time. Then try to speed the program up in one of two ways:

  1. Try to make each hotspot take less time; or
  2. Try to make each hotspot execute fewer times.

Usually #2 is more fruitful: choosing a better or more appropriate algorithm can reduce the amount of code that has to execute at all.
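As one concrete sketch of #2 applied to this script: if the same URL is ever requested twice, a memoizing wrapper avoids the repeat download. This wrapper is an assumption for illustration, not something in the original code.

_download_cache = {}

def cached_download(url):
    # Fetch each URL at most once; repeat requests are served from memory
    if url not in _download_cache:
        _download_cache[url] = download(url)
    return _download_cache[url]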

Don't waste time guessing why the program is slow; measure it, and then put your effort into fixing the real problem. Programmers are notoriously bad at guessing where the performance problems are.
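For example, a minimal way to measure this script with the standard cProfile and pstats modules (the file name profile.out is arbitrary):

import cProfile
import pstats

# Run main() under the profiler and save the statistics to a file
cProfile.run('main()', 'profile.out')

# Show the 10 functions with the largest cumulative time
stats = pstats.Stats('profile.out')
stats.sort_stats('cumulative').print_stats(10)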

2

The way programmers optimize code is by using a profiler, and Python makes several available. Here is a great article to get you started.

You can invoke timeit from the command line; note that it times a small statement, not a whole script:

python -m timeit "'-'.join(str(n) for n in range(100))"
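You can also call timeit from Python. The sketch below times one of the question's helper functions; the module name myprogram is an assumption.

import timeit

# Time 1000 calls to replace_chars; the setup statement is not counted
t = timeit.timeit("replace_chars('New York', {' ': '_'})",
                  setup="from myprogram import replace_chars",
                  number=1000)
print t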

The link above has a bunch of examples of using timeit. Once you have figured out where your bottlenecks are, you can think about ways to fix them. If your program spends an inordinate amount of time in the download() function, you could introduce some kind of concurrency: download pages in the background while the program continues parsing the already-downloaded ones with BeautifulSoup.
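A minimal sketch of that idea using the eventlet package the question already imports; the URL list and the pool size are illustrative assumptions.

import eventlet
from eventlet.green import urllib2  # cooperative, non-blocking urllib2

def fetch(url):
    return url, urllib2.urlopen(url).read()

urls = ['http://example.com/1', 'http://example.com/2']  # placeholder URLs
pool = eventlet.GreenPool(20)  # at most 20 downloads in flight

# imap runs fetch concurrently and yields results in input order
for url, body in pool.imap(fetch, urls):
    print '%s: %d bytes' % (url, len(body))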

The key here is to look at:

  1. Where your program spends most of its time.
  2. Which of those spots is easiest to optimize.

As a hypothetical example for step #1: if your regular expressions were particularly badly written, they could take a long time, and then you could work on optimizing them. I say "hypothetical" because in practice your regexes are unlikely to be a significant bottleneck unless you are executing them millions of times or doing something similarly odd.
