2017-09-26 139 views
1

我想知道我这段解析 Python 队列对象的代码,问题出在哪里

from queue import Queue 
from threading import Thread 
from html.parser import HTMLParser 
import urllib.request 

# URLs that consumer() places on the work queue.
hosts = [
    "http://yahoo.com",
    "http://google.com",
    "http://ibm.com",
]

# Module-level work queue handed to every ThreadUrl worker.
queue = Queue()

class ThreadUrl(Thread):
    """Worker thread that downloads every URL it takes from a queue."""

    def __init__(self, queue):
        """Store the queue this worker pulls URLs from.

        Args:
            queue: object providing ``get()`` and ``task_done()``
                (a ``queue.Queue``) whose items are passed to ``urlopen``.
        """
        super().__init__()
        self.queue = queue

    def run(self):
        """Fetch URLs from the queue forever, reading at most 4096 bytes each.

        The downloaded bytes are read and then discarded.  Every item taken
        from the queue is acknowledged with ``task_done()`` even when the
        download fails.
        """
        while True:
            host = self.queue.get()
            try:
                # The ``with`` block closes the response even if read() fails.
                with urllib.request.urlopen(host) as response:
                    response.read(4096)
            except (OSError, ValueError) as exc:
                # URLError/HTTPError derive from OSError; urlopen raises
                # ValueError for a URL without a recognised scheme.
                print("Failed to fetch", host, "-", exc)
            finally:
                # Without this, one failed download would leave the item
                # unacknowledged and queue.join() would block forever.
                self.queue.task_done()


class MyHTMLParser(HTMLParser):
    """HTML parser that echoes every opening tag and its attributes to stdout."""

    def handle_starttag(self, tag, attrs):
        """Print the tag name, then one line per attribute pair in ``attrs``."""
        report = [("Start tag:", tag)]
        report.extend(("  attr:", pair) for pair in attrs)
        for label, value in report:
            print(label, value)



def consumer():
    """Start three daemon ThreadUrl workers, queue every host and wait.

    Returns once every queued host has been acknowledged with
    ``task_done()``.
    """
    for _ in range(3):
        worker = ThreadUrl(queue)
        # Thread.setDaemon() is deprecated; assign the attribute instead.
        worker.daemon = True
        worker.start()

    for host in hosts:
        parser = MyHTMLParser()
        # NOTE: this feeds the URL string itself, not the downloaded page,
        # so the parser never sees any HTML tags here.
        parser.feed(host)
        queue.put(host)
    # Block until the workers have acknowledged every queued host.
    queue.join()

# Run only when executed as a script, so that importing this module does not
# start threads or open network connections.
if __name__ == "__main__":
    consumer()

我的目标是提取 URL 的内容,从队列中读取,最后解析它。但是当我执行时,代码不打印任何东西。我应该把解析器放在哪里?

+0

parser.feed(host) 没有任何意义,你需要用 url.read(4096) 返回的 HTML 来调用 feed 方法。 – lcastillov

+0

@lcastillov 我现在明白了,但是我应该新建一个类还是怎么做? – MishaVacic

+0

在 run 方法内部使用解析器,并将 URL 插入到队列中。在 ThreadUrl.run 方法内创建一个 MyHTMLParser 实例,并用它处理传入的 host。 – lcastillov

回答

1

下面是一个例子:

from queue import Queue 
from threading import Thread 
from html.parser import HTMLParser 
import urllib.request 


# How many ThreadUrl workers consumer() starts.
NUMBER_OF_THREADS = 3


# URLs that consumer() places on the work queue.
HOSTS = [
    "http://yahoo.com",
    "http://google.com",
    "http://ibm.com",
]


class MyHTMLParser(HTMLParser):
    """HTML parser that prints each opening tag followed by its attributes."""

    def handle_starttag(self, tag, attrs):
        """Print the tag name, then one tab-indented line per attribute pair."""
        rows = [("Start tag:", tag), *(("\tattr:", pair) for pair in attrs)]
        for label, value in rows:
            print(label, value)


class ThreadUrl(Thread):
    """Worker thread that downloads URLs from a queue and parses them as HTML."""

    def __init__(self, queue):
        """Store the queue this worker pulls URLs from.

        Args:
            queue: object providing ``get()`` and ``task_done()``
                (a ``queue.Queue``) whose items are passed to ``urlopen``.
        """
        super().__init__()
        self.queue = queue

    def run(self):
        """Fetch URLs forever, feeding the first 4096 bytes to MyHTMLParser.

        Every item taken from the queue is acknowledged with ``task_done()``
        even when the download or decoding fails.
        """
        while True:
            host = self.queue.get()
            try:
                # The ``with`` block closes the response even if read() fails.
                with urllib.request.urlopen(host) as response:
                    raw = response.read(4096)
                    charset = response.headers.get_content_charset() or "utf-8"
                # Decode the bytes: str(raw) would yield the "b'...'" repr with
                # escaped newlines. errors="replace" tolerates a multi-byte
                # character cut in half by the 4096-byte limit.
                content = raw.decode(charset, errors="replace")
                parser = MyHTMLParser()
                parser.feed(content)
            except (OSError, ValueError, LookupError) as exc:
                # URLError/HTTPError derive from OSError; ValueError covers a
                # malformed URL; LookupError an unknown declared charset.
                print("Failed to process", host, "-", exc)
            finally:
                # Without this, one failed download would leave the item
                # unacknowledged and queue.join() would block forever.
                self.queue.task_done()


def consumer():
    """Download and parse every URL in HOSTS using a pool of worker threads.

    Creates a fresh queue, starts NUMBER_OF_THREADS daemon ThreadUrl workers
    on it, enqueues every host and returns once each one has been
    acknowledged with ``task_done()``.
    """
    queue = Queue()
    for _ in range(NUMBER_OF_THREADS):
        worker = ThreadUrl(queue)
        # Thread.setDaemon() is deprecated; assign the attribute instead.
        worker.daemon = True
        worker.start()
    for host in HOSTS:
        queue.put(host)
    # Block until the workers have acknowledged every queued host.
    queue.join()


# Script entry point: start the download only when executed directly.
if __name__ == "__main__":
    consumer()