我从python script
运行scrapy
。Scrapy - 用管道加工物品
有人告诉我,在scrapy
,responses
建在parse()
和进一步处理在pipeline.py
。
这是我的framework
怎么是迄今为止设置:
python脚本
def script(self):
process = CrawlerProcess(get_project_settings())
response = process.crawl('pitchfork_albums', domain='pitchfork.com')
process.start() # the script will block here until the crawling is finished
蜘蛛
class PitchforkAlbums(scrapy.Spider):
name = "pitchfork_albums"
allowed_domains = ["pitchfork.com"]
#creates objects for each URL listed here
start_urls = [
"http://pitchfork.com/reviews/best/albums/?page=1",
"http://pitchfork.com/reviews/best/albums/?page=2",
"http://pitchfork.com/reviews/best/albums/?page=3"
]
def parse(self, response):
for sel in response.xpath('//div[@class="album-artist"]'):
item = PitchforkItem()
item['artist'] = sel.xpath('//ul[@class="artist-list"]/li/text()').extract()
item['album'] = sel.xpath('//h2[@class="title"]/text()').extract()
yield item
items.py
class PitchforkItem(scrapy.Item):
artist = scrapy.Field()
album = scrapy.Field()
settings.py
ITEM_PIPELINES = {
'blogs.pipelines.PitchforkPipeline': 300,
}
pipelines.py
class PitchforkPipeline(object):
def __init__(self):
self.file = open('tracks.jl', 'wb')
def process_item(self, item, spider):
line = json.dumps(dict(item)) + "\n"
self.file.write(line)
for i in item:
return i['album'][0]
,如果我只是在return item
pipelines.py
,我得到像这样的数据(一个response
每个html
页) :
{'album': [u'Sirens',
u'I Had a Dream That You Were Mine',
u'Sunergy',
u'Skeleton Tree',
u'My Woman',
u'JEFFERY',
u'Blonde/Endless',
u' A Mulher do Fim do Mundo (The Woman at the End of the World) ',
u'HEAVN',
u'Blank Face LP',
u'blackSUMMERS\u2019night',
u'Wildflower',
u'Freetown Sound',
u'Trans Day of Revenge',
u'Puberty 2',
u'Light Upon the Lake',
u'iiiDrops',
u'Teens of Denial',
u'Coloring Book',
u'A Moon Shaped Pool',
u'The Colour in Anything',
u'Paradise',
u'HOPELESSNESS',
u'Lemonade'],
'artist': [u'Nicolas Jaar',
u'Hamilton Leithauser',
u'Rostam',
u'Kaitlyn Aurelia Smith',
u'Suzanne Ciani',
u'Nick Cave & the Bad Seeds',
u'Angel Olsen',
u'Young Thug',
u'Frank Ocean',
u'Elza Soares',
u'Jamila Woods',
u'Schoolboy Q',
u'Maxwell',
u'The Avalanches',
u'Blood Orange',
u'G.L.O.S.S.',
u'Mitski',
u'Whitney',
u'Joey Purp',
u'Car Seat Headrest',
u'Chance the Rapper',
u'Radiohead',
u'James Blake',
u'White Lung',
u'ANOHNI',
u'Beyonc\xe9']}
什么,我想在pipelines.py
做的是能够为每个item
获取个人songs
,就像这样:
[u'Sirens']
请帮助?
您能否提供更清晰的输出部分? –