
I am scraping a paginated list of Catalogs, which works fine. How do I get Scrapy to also crawl the second-level (nested) pagination?

Each Catalog has its own paginated list of DataSets, and only the first page of that list shows up in my results. I am trying to get output that looks like the sample below, but all 24 nodes should be present, corresponding to the 24 DataSets spread across pages of 6 items each.

[{'data_sets_count': 24, 
    'description': 'The catalog contains data regarding various indicators of ' 
       'HMIS like Health, Abortions, Immunisation, AEFI, Adolescent, ' 
       'Bite, Sting, Disease, Diarrhoeal, Hypertension, HIV, AIDS, ' 
       'Malaria, Neurological, Stroke, Fever, Respiratory, ' 
       'Infection, suicide, Trauma, Accident, Burn, Tuberculosis, ' 
       'VHND, ASHA, JSY, CHC, PHC, SDH, DH, Hospital.', 
    'last_updated': '11/08/17', 
    'ministry_department': 'Ministry of Health and Family Welfare, Department of ' 
         'Health and Family Welfare', 
    'nodes': [{'node': '3183861', 
      'title': 'Item-wise report for North Goa of Goa upto ' 
         'April-2014-15'}, 
      {'node': '3183881', 
      'title': 'Item-wise report for North Goa of Goa upto May-2014-15'}, 
      {'node': '3183981', 
      'title': 'Item-wise report for North Goa of Goa upto ' 
         'October-2014-15'}, 
      {'node': '3184021', 
      'title': 'Item-wise report for North Goa of Goa upto ' 
         'December-2014-15'}, 
      {'node': '3184061', 
      'title': 'Item-wise report for North Goa of Goa upto ' 
         'February-2014-15'}, 
      {'node': '3183961', 
      'title': 'Item-wise report for North Goa of Goa upto ' 
         'September-2014-15'}], 
    'state_department': None, 
    'title': 'HMIS sub district level item-wise monthly report of Goa', 
    'url': '/catalog/hmis-sub-district-level-item-wise-monthly-report-goa'}] 

import scrapy

class Category(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()
    ministry_department = scrapy.Field()
    description = scrapy.Field()
    state_department = scrapy.Field()
    last_updated = scrapy.Field()
    data_sets_count = scrapy.Field()
    data_sets = scrapy.Field()
    item = scrapy.Field()
    nodes = scrapy.Field()

class CatalogSpider(scrapy.Spider):
    name = 'catalogspider'
    start_urls = ['https://data.gov.in/catalogs#sort_by=created&sort_order=DESC&items_per_page=9&page=1']

    def parse(self, response):
        for catalog in response.css('.view-catalogs > div > .views-row-6'):
            category = Category()
            category['title'] = catalog.css('.views-field-title .field-content a::text').extract_first()
            category['url'] = catalog.css('.views-field-title .field-content a::attr(href)').extract_first()
            category['ministry_department'] = catalog.css('.views-field-field-ministry-department .field-content ::text').extract_first()
            category['description'] = catalog.css('.views-field-body .field-content ::text').extract_first()
            category['state_department'] = catalog.css('.views-field-field-state-department .field-content ::text').extract_first()
            category['last_updated'] = catalog.css('.views-field-changed .field-content ::text').extract_first()
            category['data_sets_count'] = int(catalog.css('.views-field-resource-count-last .count-resource::text').re(r'\((.*?)\)')[0])
            category['nodes'] = []
            # follow the catalog detail page, carrying the item via meta
            request = scrapy.Request(response.urljoin(category['url']), callback=self.parseDataSets)
            request.meta['item'] = category
            yield request

        # first-level pagination (this part works)
        for next_page in response.css('li.pager-next > a'):
            yield response.follow(next_page, self.parse)

    def parseDataSets(self, response):
        item = response.meta['item']

        for dataset in response.css('.view-resource-detail-popup > div > .views-row'):
            item['nodes'].append({
                'node': dataset.css('.data-extension.csv::attr(class)').extract_first().split()[0],
                'title': dataset.css('.views-field-title .field-content .title-content::text').extract_first()
            })

        # second-level pagination: note that this request is built but
        # never yielded, so dataset pages after the first are never fetched
        for next_page in response.css('li.pager-next'):
            print('here')
            request = scrapy.Request(response.urljoin(next_page.css('a::attr(href)').extract_first()), callback=self.parseDataSets)
            request.meta['item'] = item

        yield item

Can you post the crawl log? You can get one with 'scrapy crawl spider --logfile output.log' or 'scrapy crawl spider 2>&1 | tee output.log' (the latter sends the output both to the screen and to a file). – Granitosaurus


@Granitosaurus I just made some changes to the code; I will post the working code now, though I don't know whether it is the right approach. – sabithpocker


I append the items to a meta variable on each sub-page and yield None, then finally yield the meta variable on the last page. It sounds a bit hacky, but it works for now. – sabithpocker

Answer


I got it working with the code below, though I don't know whether this is the right way to do it. I append each DataSet to the meta variable category and yield None, then finally yield category when we reach the last page. It sounds a bit hacky, but it works for now.

import scrapy

class Category(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()
    ministry_department = scrapy.Field()
    description = scrapy.Field()
    state_department = scrapy.Field()
    last_updated = scrapy.Field()
    data_sets_count = scrapy.Field()
    data_sets_actual_count = scrapy.Field()
    data_sets = scrapy.Field()
    item = scrapy.Field()
    nodes = scrapy.Field()

class CatalogSpider(scrapy.Spider):
    name = 'catalogspider'
    start_urls = ['https://data.gov.in/catalogs#sort_by=created&sort_order=DESC&items_per_page=9&page=1']

    def parse(self, response):
        for catalog in response.css('.view-catalogs > div > .views-row-6'):
            category = Category()
            category['title'] = catalog.css('.views-field-title .field-content a::text').extract_first()
            category['url'] = catalog.css('.views-field-title .field-content a::attr(href)').extract_first()
            category['ministry_department'] = catalog.css('.views-field-field-ministry-department .field-content ::text').extract_first()
            category['description'] = catalog.css('.views-field-body .field-content ::text').extract_first()
            category['state_department'] = catalog.css('.views-field-field-state-department .field-content ::text').extract_first()
            category['last_updated'] = catalog.css('.views-field-changed .field-content ::text').extract_first()
            category['data_sets_count'] = int(catalog.css('.views-field-resource-count-last .count-resource::text').re(r'\((.*?)\)')[0])
            category['nodes'] = []
            # carry the partially built item to the dataset pages via meta
            request = scrapy.Request(response.urljoin(category['url']), callback=self.parse_data_sets)
            request.meta['category'] = category
            yield request

        # first-level pagination, disabled while testing
        # for next_page in response.css('li.pager-next > a'):
        #     yield response.follow(next_page, self.parse)

    def parse_data_sets(self, response):
        category = response.meta['category']
        datasets = response.css('.view-resource-detail-popup > div > .views-row')
        if datasets:
            for dataset in datasets:
                node = dataset.css('.data-extension.csv::attr(class)').extract_first().split()[0]
                title = dataset.css('.views-field-title .field-content .title-content::text').extract_first()
                url = 'https://data.gov.in/node/' + node + '/download'
                category['nodes'].append({
                    'node': node,
                    'title': title,
                    'url': url
                })
                yield None  # Scrapy ignores None, so nothing is emitted yet
        else:
            yield category

        # emit the accumulated item only once there is no next page
        if len(response.css('li.pager-next').extract()) == 0:
            category['data_sets_actual_count'] = len(category['nodes'])
            yield category

        # pagination: follow the next dataset page with the same item in meta
        for next_page in response.css('li.pager-next'):
            request = scrapy.Request(response.urljoin(next_page.css('a::attr(href)').extract_first()), callback=self.parse_data_sets)
            request.meta['category'] = category
            yield request
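
For comparison, a more conventional way to write the same callback would be to yield the next-page request while a pager exists and yield the item only when it is exhausted, without the 'yield None' placeholders. A minimal sketch, assuming the same selectors and meta key as above:

    def parse_data_sets(self, response):
        category = response.meta['category']

        # accumulate this page's datasets onto the carried item
        for dataset in response.css('.view-resource-detail-popup > div > .views-row'):
            node = dataset.css('.data-extension.csv::attr(class)').extract_first().split()[0]
            category['nodes'].append({
                'node': node,
                'title': dataset.css('.views-field-title .field-content .title-content::text').extract_first(),
                'url': 'https://data.gov.in/node/' + node + '/download'
            })

        next_href = response.css('li.pager-next a::attr(href)').extract_first()
        if next_href:
            # more pages: pass the same item along and emit nothing yet
            request = scrapy.Request(response.urljoin(next_href), callback=self.parse_data_sets)
            request.meta['category'] = category
            yield request
        else:
            # last page: the item is complete
            category['data_sets_actual_count'] = len(category['nodes'])
            yield category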

One problem I ran into was a wrongly set depth in my command; the random issues went away after I changed it to a larger number:

scrapy parse --spider=catalogspider -d 60 'https://data.gov.in/catalogs#sort_by=created&sort_order=DESC&items_per_page=9&page=1'
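
'scrapy parse' only follows links up to the depth passed with '-d', so with nested pagination the chain of next-page requests can easily exceed a small value and the deeper requests are silently skipped. In a normal 'scrapy crawl', the equivalent knob is the DEPTH_LIMIT setting; a minimal sketch, assuming it is set on the spider itself:

class CatalogSpider(scrapy.Spider):
    name = 'catalogspider'
    # DEPTH_LIMIT = 0 (the default) means unlimited depth, so long
    # pagination chains are followed to the end
    custom_settings = {'DEPTH_LIMIT': 0}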