2017-10-20 58 views
0

这是我的 scrapy 网页爬虫的简单结构。如何把创建 start_urls 的过程封装到 scrapy 中?

import scrapy,urllib.request  
class TestSpider(scrapy.Spider):
    """Scrape the Yahoo Finance "financials" page of every NASDAQ-listed symbol.

    The symbol directory is downloaded from the NASDAQ trader FTP server at
    class-definition time (i.e. on import), matching the original script's
    structure.  `parse` receives one response per symbol.
    """

    name = "quotes"
    allowed_domains = ["finance.yahoo.com"]

    # Pipe-delimited symbol directory published by NASDAQ.
    url_nasdaq = "ftp://ftp.nasdaqtrader.com/SymbolDirectory/nasdaqlisted.txt"
    s = urllib.request.urlopen(url_nasdaq).read().decode('ascii')
    # Drop the header row and the trailing "File Creation Time" footer line.
    s1 = s.split('\r\n')[1:-2]
    # Skip NASDAQ test issues; keep only the symbol (first pipe-separated field).
    namelist = [item for item in s1 if "NASDAQ TEST STOCK" not in item]
    s2 = [line.split('|')[0] for line in namelist]
    # Yahoo Finance quote URLs do not accept symbols containing a dot.
    s3 = [symbol for symbol in s2 if "." not in symbol]

    # BUG FIX: the original iterated over s2 here, silently discarding the
    # dot-filter just computed into s3.
    start_urls = ["https://finance.yahoo.com/quote/" + s + "/financials?p=" + s for s in s3]

    def __init__(self, *args, **kw):
        # BUG FIX: run scrapy's base initializer so the spider is wired up
        # (name, crawler-supplied kwargs) before setting our own state.
        super().__init__(*args, **kw)
        self.timeout = 10

    def parse(self, response):
        content = response.body
        target = response.url
        # doing something, omitted code

将其保存为test.py并与scrapy runspider test.py运行它。

现在我想把所有创建 start_urls 的代码封装起来。
我在这里试试。

class TestSpider(scrapy.Spider):
    # NOTE(review): this attempt "doesn't work" for two visible reasons:
    #   1. __init__ never calls super().__init__(*args, **kw), so scrapy's
    #      Spider base class never runs its own initialization.
    #   2. compared with the first version, the class-level `name` and
    #      `allowed_domains` attributes were dropped; scrapy cannot run a
    #      spider without a `name`.
    def __init__(self, *args, **kw):
     self.timeout = 10
     # Fetch the pipe-delimited NASDAQ symbol directory over FTP.
     url_nasdaq = "ftp://ftp.nasdaqtrader.com/SymbolDirectory/nasdaqlisted.txt"
     s = urllib.request.urlopen(url_nasdaq).read().decode('ascii')
     # Drop the header row and the trailing "File Creation Time" footer.
     s1 = s.split('\r\n')[1:-2]
     namelist = []
     # Skip NASDAQ test issues.
     for item in s1:
      if "NASDAQ TEST STOCK" not in item : namelist.append(item)
     # Keep only the symbol (first pipe-separated field) of each row.
     s2 = [s.split('|')[0] for s in namelist]
     s3=[]
     # Yahoo Finance quote URLs reject symbols containing a dot.
     for symbol in s2:
      if "." not in symbol : s3.append(symbol)
     self.start_urls = ["https://finance.yahoo.com/quote/"+s+"/financials?p="+s for s in s3]

它不能工作。

回答

1

这正是蜘蛛的 start_requests 方法的用途 —— 它专门用于创建初始请求。以您的例子来说,建议如下:

class TestSpider(scrapy.Spider):
    """Spider that builds its initial requests lazily in start_requests().

    Moving the symbol download out of the class body into start_requests()
    is the idiomatic scrapy way: the (slow) FTP fetch only happens when the
    crawl actually starts, not when the module is imported.
    """

    # BUG FIX: scrapy refuses to run a spider without a `name`; the original
    # answer dropped it when trimming the example.
    name = "quotes"
    allowed_domains = ["finance.yahoo.com"]

    def __init__(self, *args, **kw):
        # BUG FIX: call the base initializer so scrapy sets up the spider
        # (crawler-supplied kwargs, logging, etc.) before our own state.
        super().__init__(*args, **kw)
        self.timeout = 10

    def start_requests(self):
        """Download the NASDAQ symbol directory and yield one request per symbol."""
        url_nasdaq = "ftp://ftp.nasdaqtrader.com/SymbolDirectory/nasdaqlisted.txt"
        raw = urllib.request.urlopen(url_nasdaq).read().decode('ascii')
        # Drop the header row and the trailing "File Creation Time" footer.
        rows = raw.split('\r\n')[1:-2]
        # Skip NASDAQ test issues; keep the symbol (first pipe-separated field).
        rows = [row for row in rows if "NASDAQ TEST STOCK" not in row]
        symbols = [row.split('|')[0] for row in rows]
        # Yahoo Finance quote URLs reject symbols containing a dot.
        symbols = [sym for sym in symbols if "." not in sym]
        for sym in symbols:
            yield scrapy.Request(
                "https://finance.yahoo.com/quote/" + sym + "/financials?p=" + sym,
                callback=self.parse)