Use CsvItemExporter to easily export your data to CSV.
In items.py:
import scrapy


class YourItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    field1 = scrapy.Field()
    field2 = scrapy.Field()
    extradata = scrapy.Field()
In pipelines.py:
from scrapy import signals
from scrapy.exporters import CsvItemExporter
from scrapy.exceptions import DropItem


class YourPipeline(object):

    def __init__(self):
        # initialize pipeline state
        self.files = {}
        self.ids_seen = set()

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        crawler.signals.connect(pipeline.spider_error, signals.spider_error)
        crawler.signals.connect(pipeline.item_dropped, signals.item_dropped)
        return pipeline

    def item_dropped(self, item, response, exception, spider):
        # called when an item is dropped from the pipeline
        pass

    def spider_error(self, failure, response, spider):
        # called when the spider encounters an error
        pass

    def spider_opened(self, spider):
        # open the output file and start exporting when the spider opens
        file = open('filename.csv', 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        # finish exporting and close the file when the spider closes
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        # drop duplicates based on a unique field of your choice, then export
        if item['UNIQUEIDOFYOURCHOICE'] in self.ids_seen:
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.ids_seen.add(item['UNIQUEIDOFYOURCHOICE'])
            self.exporter.export_item(item)
            return item
Enable the pipeline in settings.py:
ITEM_PIPELINES = {'PROJECTNAME.pipelines.YourPipeline': 300,}
The pipeline above will export items to the specified CSV file, using the item field names as the header row, and will drop duplicate items. Use whatever unique key suits your data for the duplicate check. See the exporter documentation for details: http://doc.scrapy.org/en/latest/topics/exporters.html?highlight=csvitemexporter
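If you only want certain fields in the CSV, or a fixed column order, CsvItemExporter also accepts a fields_to_export argument. A minimal sketch of spider_opened with that option, reusing the placeholder field names from items.py above:

    def spider_opened(self, spider):
        file = open('filename.csv', 'w+b')
        self.files[spider] = file
        # export only these fields, in this order; the header row is still written automatically
        self.exporter = CsvItemExporter(file, fields_to_export=['field1', 'field2', 'extradata'])
        self.exporter.start_exporting()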
Send data from the spider to the pipeline:
from PROJECTNAME.items import YourItem

for VALUE in SETOFVALUES:
    item = YourItem()
    item['field1'] = 'SOME VALUE'
    item['field2'] = 'SOME VALUE2'
    item['extradata'] = 'SOMEEXTRADATA'
    # yield sends the item, with the data currently assigned to it, to the pipeline
    yield item
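For context, here is a rough sketch of where that loop typically lives, inside a spider's parse callback. The spider name, start URL, and CSS selectors below are placeholders, not part of the original answer:

    import scrapy
    from PROJECTNAME.items import YourItem

    class ExampleSpider(scrapy.Spider):
        name = 'example'                      # placeholder spider name
        start_urls = ['http://example.com/']  # placeholder start URL

        def parse(self, response):
            # one item per block on the page; the selectors are placeholders
            for row in response.css('div.row'):
                item = YourItem()
                item['field1'] = row.css('span.first::text').get()
                item['field2'] = row.css('span.second::text').get()
                item['extradata'] = response.url
                yield item  # goes through YourPipeline and into filename.csv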
It depends on the situation. Do all the websites have the same structure? Will you cram all the work into one spider, or use one spider per website? Please be specific about your task. – Vasim
I want to scrape a single website; all the links come from the same site. –
OK, then why not use an item pipeline? Under your Scrapy project you must have written the spider definition, right? Just use an item pipeline to write your scraped results to a CSV or JSON file. – Vasim