2017-09-26 104 views
0

我想要抓取 township directory of China。该网站分为4个层次,分别为省页面,城市页面,县页面和乡镇页面。例如,在省份页面上列出了所有省份。如果我们点击一个省份的链接,那么它会将我们带到城市页面,并显示该省的城市列表。Scrapy - 每个项目抓取4级页面,不能先深入

我希望我的每件物品都是乡镇。它包括town_name,town_id(gbcode)和相应的县名,city_name,prov_name。所以当蜘蛛进入乡镇页面时,蜘蛛应该收集信息。但是,我目前使用for循环的方法似乎不起作用。 prov_name没有问题。但市县名称大多不正确,它们始终是其对应页面列表中的最后一个城市/县。我认为问题在于蜘蛛不够深,只能在循环结束时进入parse_county请求。但是,改变设置中的深度优先级并不能解决问题。

---------- Sample Result -------- 
town_name, year, gbcode, city, province, county 
建国门街道办事处,2016,110101008000,市辖区,北京市,延庆区 
东直门街道办事处,2016,110101009000,市辖区,北京市,延庆区 
和平里街道办事处,2016,110101010000,市辖区,北京市,延庆区 
前门街道办事处,2016,110101011000,市辖区,北京市,延庆区 
崇文门外街道办事处,2016,110101012000,市辖区,北京市,延庆区 



import re
from copy import deepcopy

import scrapy
from scrapy.spiders import Spider

from admincode.items import AdmincodeItem

class StatsSpider(Spider):
    """Crawl the stats.gov.cn administrative-division directory.

    The site is four levels deep (province -> city -> county -> town).
    One item is yielded per town; the names collected on the way down
    are carried from page to page via request ``meta``.
    """

    name = 'stats'
    allowed_domains = ['stats.gov.cn']
    start_urls = [
        'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/{}/index.html'.format(year)
        for year in range(2009, 2010)]

    def parse(self, response):
        """Entry point: the yearly index page lists all provinces."""
        for item in self.parse_provincetr(response, response.selector.css(".provincetr")):
            yield item

    def get_text_href(self, td):
        """Return ``(text, href)`` of a ``<td>``; ``href`` is None when it has no link."""
        if not td.xpath('a'):
            return td.xpath('text()').extract()[0], None
        return td.xpath('a/text()').extract()[0], td.xpath('a/@href').extract()[0]

    def parse_provincetr(self, response, trs):
        """Yield one request per province cell, seeding the meta dict."""
        # The crawl year is embedded in the URL, e.g. .../tjyqhdmhcxhfdm/2009/...
        year_pattern = re.compile(r'(tjyqhdmhcxhfdm/)([0-9]{4})')
        year = year_pattern.search(response.url).group(2)
        for td in trs.xpath('td'):
            # A fresh dict per province, so each request owns its data.
            scraped = {'year': year}
            scraped['prov_name'], href = self.get_text_href(td)
            url = response.urljoin(href)
            yield scrapy.Request(url, callback=self.parse_citytr,
                                 meta={'scraped': scraped})

    def parse_2td(self, response, trs, var_name, nextparse):
        """Parse a two-column listing table (code | name).

        For intermediate levels (``nextparse`` given) follow the link in the
        second column; at the leaf level build the item.

        BUG FIX: each scheduled request must carry its OWN copy of the
        accumulated names.  The original code mutated the single dict stored
        in ``response.meta['scraped']`` on every loop iteration, and since
        Scrapy runs the requests only after the loop has finished, every
        child page saw the values of the LAST row (the symptom described in
        the question).  A shallow ``dict(...)`` copy is sufficient because
        the dict holds only strings.
        """
        for tr in trs:
            scraped = dict(response.meta['scraped'])  # per-row copy, not shared
            scraped[var_name], href = self.get_text_href(tr.xpath('td')[1])
            if nextparse:
                url = response.urljoin(href)
                yield scrapy.Request(url, callback=nextparse, meta={'scraped': scraped})
            else:
                # Leaf (town) level: everything needed is now in `scraped`.
                item = AdmincodeItem()
                item['year'] = scraped['year']
                item['prov_name'] = scraped['prov_name']
                item['city_name'] = scraped['city_name']
                item['county_name'] = scraped['county_name']
                item['town_name'] = scraped['town_name']
                item['gbcode'], href = self.get_text_href(tr.xpath('td')[0])
                yield item

    def parse_citytr(self, response):
        """City listing of one province."""
        for city in self.parse_2td(response, response.selector.css(".citytr"), 'city_name', self.parse_countytr):
            yield city

    def parse_countytr(self, response):
        """County listing of one city."""
        for county in self.parse_2td(response, response.selector.css(".countytr"), 'county_name', self.parse_towntr):
            yield county

    def parse_towntr(self, response):
        """Town listing of one county — the leaf level that yields items."""
        for town in self.parse_2td(response, response.selector.css(".towntr"), 'town_name', None):
            yield town
+0

是的,就像这样。 –

回答

0

我想你只是把事情弄得有点复杂了。这其实是一个简单的爬虫,你需要做的就是使用 meta 将信息从一个页面传递到下一个页面。由于 meta 是内存中的同一个字典对象,我们需要确保为每个 item 创建信息的副本。为此,我们使用 copy.deepcopy。这样可以确保数据在生成 item 之前不会被覆盖。

下面是实现上述抓取逻辑的爬虫:

class StatsSpider(Spider):
    """province -> city -> county -> town crawler for stats.gov.cn.

    Each level copies the accumulated names (``deepcopy``) before attaching
    them to the next request's ``meta``, so pending requests never share a
    mutable dict.

    FIXES vs. the posted answer:
    * ``deepcopy`` was used but never imported — requires
      ``from copy import deepcopy`` at module level, otherwise every
      callback raises ``NameError``.
    * ``extract_first()`` returns ``None`` for table rows whose cell has no
      ``<a>`` tag, and ``response.follow(None)`` raises ``TypeError``; such
      rows are now skipped.
    """

    name = 'stats'
    allowed_domains = ['stats.gov.cn']
    start_urls = [
        'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/{}/index.html'.format(year)
        for year in range(2009, 2010)]

    def parse(self, response):
        """Index page: one request per province link."""
        for link in response.css(".provincetr a"):
            prov_name = link.xpath("./text()").extract_first().strip()
            href = link.xpath("./@href").extract_first().strip()
            yield response.follow(href, callback=self.parse_province,
                                  meta={'item': {'province': prov_name}})

    def parse_province(self, response):
        """Province page: rows of (code | city name)."""
        meta = response.meta['item']
        for cityrow in response.css(".citytr"):
            city_link = cityrow.xpath("./td[2]/a/@href").extract_first()
            city_name = cityrow.xpath("./td[2]/a/text()").extract_first()
            city_code = cityrow.xpath("./td[1]/a/text()").extract_first()
            if city_link is None:
                # Row without a link (e.g. plain-text cell) — nothing to follow.
                continue
            # Copy before mutating: meta is shared by all rows of this page.
            meta_new = deepcopy(meta)
            meta_new['city_name'] = city_name
            meta_new['city_code'] = city_code
            yield response.follow(city_link, callback=self.parse_city,
                                  meta={'item': meta_new})

    def parse_city(self, response):
        """City page: rows of (code | county name)."""
        meta = response.meta['item']
        for countyrow in response.css(".countytr"):
            county_link = countyrow.xpath("./td[2]/a/@href").extract_first()
            county_name = countyrow.xpath("./td[2]/a/text()").extract_first()
            county_code = countyrow.xpath("./td[1]/a/text()").extract_first()
            if county_link is None:
                continue
            meta_new = deepcopy(meta)
            meta_new['county_name'] = county_name
            meta_new['county_code'] = county_code
            yield response.follow(county_link, callback=self.parse_county,
                                  meta={'item': meta_new})

    def parse_county(self, response):
        """County page: rows of (code | town name) — the leaf level."""
        meta = response.meta['item']
        for townrow in response.css(".towntr"):
            town_name = townrow.xpath("./td[2]/a/text()").extract_first()
            town_code = townrow.xpath("./td[1]/a/text()").extract_first()
            meta_new = deepcopy(meta)
            meta_new['town_name'] = town_name
            meta_new['town_code'] = town_code
            yield meta_new