我需要使用Scrapy抓取每个项目的数据(http://example.com/itemview)。我有一个itemID列表,需要通过example.com中的表单逐个提交。每个项目的网址都不会变化,因此我的蜘蛛中每个请求的url始终相同,但页面内容会不同。如何在item_scraped信号被触发之后发起新的请求?
我不想用一个for循环来依次发出每个请求。所以我按照下面的步骤来做。
- 开始蜘蛛与上述URL
- 加入item_scraped和spider_closed信号
- 响应经过几个回调函数的处理
- 把抓取到的数据传给管道(pipeline)
- item_scraped信号被触发
之后它会自动调用spider_closed信号。但是我想要继续上述步骤直到完成itemID。
class ExampleSpider(scrapy.Spider):
    """Scrape per-item data from a single, never-changing URL.

    The page at http://example.com/itemview shows different content after
    each form submission, but its URL never changes — so every Request
    uses ``dont_filter=True``. Rather than looping over all itemIDs up
    front, the spider processes one ID per pass and schedules the request
    for the next ID from the ``item_scraped`` signal handler, which keeps
    the engine busy and prevents a premature ``spider_closed``.
    """

    name = "example"
    allowed_domains = ["example.com"]
    itemIDs = [11111, 22222, 33333]  # IDs to submit through the form, one per pass
    current_item_num = 0             # index into itemIDs for the pass in progress

    def __init__(self, itemids=None, *args, **kwargs):
        super(ExampleSpider, self).__init__(*args, **kwargs)
        # Listen for "item finished the pipeline" to kick off the next
        # pass, and for spider close to shut the browser down.
        dispatcher.connect(self.item_scraped, signals.item_scraped)
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        # Release the PhantomJS process created in parse().
        self.driver.quit()

    def start_requests(self):
        request = self.make_requests_from_url('http://example.com/itemview')
        yield request

    def parse(self, response):
        # Runs once at startup: open the page in the headless browser and
        # grab the initial content before any form submission.
        self.driver = webdriver.PhantomJS()
        self.driver.get(response.url)
        first_data = self.driver.find_element_by_xpath('//div[@id="itemview"]').text.strip()
        yield Request(response.url, meta={'first_data': first_data},
                      callback=self.processDetails, dont_filter=True)

    def processDetails(self, response):
        itemID = self.itemIDs[self.current_item_num]
        # ..form submission with the current itemID goes here...
        # ...the content of the page is updated with the given itemID...
        yield Request(response.url, meta={'first_data': response.meta['first_data']},
                      callback=self.processData, dont_filter=True)

    def processData(self, response):
        # ...some more scraping goes here...
        item = ExamplecrawlerItem()
        item['first_data'] = response.meta['first_data']
        yield item

    def item_scraped(self, item, response, spider):
        self.current_item_num += 1
        # BUG FIX: requests returned from signal handlers are ignored by
        # Scrapy, and calling self.parse(response) here only created an
        # unused generator. Hand the next request to the engine directly,
        # and only while itemIDs remain — otherwise let the spider close.
        if self.current_item_num < len(self.itemIDs):
            self.crawler.engine.crawl(
                Request(response.url,
                        meta={'first_data': response.meta['first_data']},
                        callback=self.processDetails, dont_filter=True),
                spider)
我的pipeline:
class ExampleDBPipeline(object):
    """Persist every scraped item into the MYCOLLECTION database collection."""

    def process_item(self, item, spider):
        MYCOLLECTION.insert(dict(item))
        # BUG FIX: process_item must return the item (it returned None),
        # otherwise any pipeline running after this one receives nothing.
        return item