2
我使用Scrapy Framework创建了一个webscraper,以从website获取演唱会门票数据。我已经能够成功地为少数几个选择器刮取数据,这些选择器基本上只是HTML文本,但其他一些选择器正在收集任何东西。当我尝试从每张票据中刮取音乐会日期时,尽管事实上我使用的xpath在开发者控制台中运行时返回了所有正确的日期,但在响应中会返回一个空数组。在类定义中定义项目的方式有什么问题吗?任何帮助将不胜感激:Xpath在控制台中正确定位html元素,但在scrapy响应中使用时返回空数组
from scrapy.contrib.spiders import CrawlSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.selector import Selector
from scrapy.contrib.loader import XPathItemLoader
from scrapy.contrib.loader.processor import Join, MapCompose
from concert_comparator.items import ComparatorItem
bandname = raw_input("Enter a bandname \n")
vs_url = "http://www.vividseats.com/concerts/" + bandname + "-tickets.html"
class MySpider(CrawlSpider):
handle_httpstatus_list = [416]
name = 'comparator'
allowed_domains = ["www.vividseats.com"]
start_urls = [vs_url]
#rules = (Rule(LinkExtractor(allow=('-tickets/.*',)), callback='parse_item'))
# item = ComparatorItem()
tickets_list_xpath = './/*[@itemtype="http://schema.org/Event"]'
item_fields = {
'eventName' : './/*[@class="productionsEvent"]/text()',
#'ticketPrice' : '//*[@class="eventTickets lastChild"]/div/div/@data-origin-price',
'eventLocation' : './/*[@class = "productionsVenue"]/span[@itemprop = "name"]/text()',
'ticketsLink' : './/a/@href',
#returns empty set
'eventDate' : './/*[@class = "productionsDateCol productionsDateCol sorting_3"]/div[@class = "productionsDate"]/text()',
'eventCity' : './/*[@class = "productionsVenue"]/span[@itemprop = "address"]/span[@itemprop = "addressLocality"]/text()',
'eventState' : './/*[@class = "productionsVenue"]/span[@itemprop = "address"]/span[@itemprop = "addressRegion"]/text()',
#returns empty set
'eventTime' : './/*[@class = "productionsDateCol productionsDateCol sorting_3"]/div[@class = "productionsTime"]/text()'
}
def parse(self, response):
selector = HtmlXPathSelector(response)
# iterate over tickets
for ticket in selector.select(self.tickets_list_xpath):
loader = XPathItemLoader(ComparatorItem(), selector=ticket)
# define loader
loader.default_input_processor = MapCompose(unicode.strip)
loader.default_output_processor = Join()
# iterate over fields and add xpaths to the loader
for field, xpath in self.item_fields.iteritems():
loader.add_xpath(field, xpath)
yield loader.load_item()