2009-01-19 50 views

回答

19

urllib模块中有urlretrieve(url, filename=None, reporthook=None, data=None)函数。 如果您将reporthook功能/对象作为token bucket或泄漏存储桶实施,则可以使用全局速率限制。

编辑:经过仔细检查,我发现用reporthook实现全局速率限制并不像我想象的那样容易。reporthook只给出已下载量和总大小,仅凭这些还不足以驱动令牌桶。解决这个问题的一种方法是:在每个速率限制器中记录上一次的已下载量,但让它们共用同一个全局令牌桶。


编辑2:将两段代码组合成一个完整的例子。

"""Rate limiters with shared token bucket.""" 

import os 
import sys 
import threading 
import time 
import urllib 
import urlparse 

class TokenBucket(object):
    """An implementation of the token bucket algorithm.
    source: http://code.activestate.com/recipes/511490/
    (adapted: consume() reports the wait time instead of True/False)

    >>> bucket = TokenBucket(80, 0.5)
    >>> bucket.consume(10)
    0
    >>> bucket.consume(90) > 0
    True
    """
    def __init__(self, tokens, fill_rate):
        """tokens is the total tokens in the bucket. fill_rate is the
        rate in tokens/second that the bucket will be refilled."""
        self.capacity = float(tokens)
        self._tokens = float(tokens)
        self.fill_rate = float(fill_rate)
        self.timestamp = time.time()
        # RLock: `consume` reads the `tokens` property, which also locks.
        self.lock = threading.RLock()

    def consume(self, tokens):
        """Consume *tokens* tokens from the bucket.

        Returns 0 if there were sufficient tokens (and removes them),
        otherwise the expected time in seconds until enough tokens
        become available (and removes nothing)."""
        self.lock.acquire()
        try:
            # Bug fix: the original did `tokens = max(tokens, self.tokens)`,
            # which drained the whole bucket whenever the request fit.
            # Only the requested amount must be consumed.
            expected_time = (tokens - self.tokens) / self.fill_rate
            if expected_time <= 0:
                self._tokens -= tokens
        finally:
            self.lock.release()
        return max(0, expected_time)

    @property
    def tokens(self):
        """Current token count, refilled according to elapsed time."""
        self.lock.acquire()
        try:
            if self._tokens < self.capacity:
                now = time.time()
                delta = self.fill_rate * (now - self.timestamp)
                self._tokens = min(self.capacity, self._tokens + delta)
                self.timestamp = now
            value = self._tokens
        finally:
            self.lock.release()
        return value

class RateLimit(object): 
    """Rate limit a url fetch. 
    source: http://mail.python.org/pipermail/python-list/2008-January/472859.html 
    (but mostly rewritten) 
    """ 
    def __init__(self, bucket, filename): 
     self.bucket = bucket 
     self.last_update = 0 
     self.last_downloaded_kb = 0 

     self.filename = filename 
     self.avg_rate = None 

    def __call__(self, block_count, block_size, total_size): 
     total_kb = total_size/1024. 

     downloaded_kb = (block_count * block_size)/1024. 
     just_downloaded = downloaded_kb - self.last_downloaded_kb 
     self.last_downloaded_kb = downloaded_kb 

     predicted_size = block_size/1024. 

     wait_time = self.bucket.consume(predicted_size) 
     while wait_time > 0: 
      time.sleep(wait_time) 
      wait_time = self.bucket.consume(predicted_size) 

     now = time.time() 
     delta = now - self.last_update 
     if self.last_update != 0: 
      if delta > 0: 
       rate = just_downloaded/delta 
       if self.avg_rate is not None: 
        rate = 0.9 * self.avg_rate + 0.1 * rate 
       self.avg_rate = rate 
      else: 
       rate = self.avg_rate or 0. 
      print "%20s: %4.1f%%, %5.1f KiB/s, %.1f/%.1f KiB" % (
        self.filename, 100. * downloaded_kb/total_kb, 
        rate, downloaded_kb, total_kb, 
       ) 
     self.last_update = now 


def main(): 
    """Fetch the contents of urls""" 
    if len(sys.argv) < 4: 
     print 'Syntax: %s rate url1 url2 ...' % sys.argv[0] 
     raise SystemExit(1) 
    rate_limit = float(sys.argv[1]) 
    urls = sys.argv[2:] 
    bucket = TokenBucket(10*rate_limit, rate_limit) 

    print "rate limit = %.1f" % (rate_limit,) 

    threads = [] 
    for url in urls: 
     path = urlparse.urlparse(url,'http')[2] 
     filename = os.path.basename(path) 
     print 'Downloading "%s" to "%s"...' % (url,filename) 
     rate_limiter = RateLimit(bucket, filename) 
     t = threading.Thread(
      target=urllib.urlretrieve, 
      args=(url, filename, rate_limiter)) 
     t.start() 
     threads.append(t) 

    for t in threads: 
     t.join() 

    print 'All downloads finished' 

if __name__ == "__main__": 
    main() 
+0

谢谢MizardX。这不是我正在寻找的,因为我需要urllib2而不是urllib的实现,但是我认为这肯定指向了正确的方向。 – 2009-01-19 09:21:40