Trying to scrape the next pages with Scrapy (Python 3.5), using Python's urllib library to build the pagination URLs
import datetime
import urllib.request
import urllib.error
import urllib.parse
import socket
import scrapy
from scrapy.loader.processors import MapCompose, Join
from scrapy.loader import ItemLoader
from properties.items import PropertiesItem
class BasicSpider(scrapy.Spider):
    name = "manual"
    allowed_domains = ["web"]

    # Start on the first index page
    start_urls = (
        'http://scrapybook.s3.amazonaws.com/properties/index_00000.html',
    )

    def parse(self, response):
        # Get the next index URLs and yield Requests
        next_selector = response.xpath('//*[contains(@class,"next")]//@href')
        for url in next_selector.extract():
            yield Request(urllib.parse.urljoin(response.url, url))

        # Get item URLs and yield Requests
        item_selector = response.xpath('//*[@itemprop="url"]/@href')
        for url in item_selector.extract():
            yield Request(urllib.parse.urljoin(response.url, url),
                          callback=self.parse_item)

    def parse(self, response):
        l = ItemLoader(item=PropertiesItem(), response=response)
        l.add_xpath('title', '//*[@itemprop="name"]/text()')
        return l.load_item()
Everything runs with no errors, but Scrapy only reads the first index page, even though according to the code it should fetch all of the following pages.
Here is the output:
[{
    "title": [
        "bermondsey ec kennington drive acton seven rm",
        .......
        "mary conversion borders eastham with gas"
    ]
}]
// Only Page 0 Titles :(
What is wrong with the Request or urllib call syntax?
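For what it's worth, urljoin on its own seems to do the right thing when I test it outside the spider (the relative href below is just a made-up example, not one taken from the page):

from urllib.parse import urljoin

# quick sanity check outside the spider; 'index_00001.html' is an illustrative relative href
print(urljoin('http://scrapybook.s3.amazonaws.com/properties/index_00000.html',
              'index_00001.html'))
# prints: http://scrapybook.s3.amazonaws.com/properties/index_00001.html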
PS: The XPath expressions work fine when I test them with scrapy shell 'URL'.
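This is roughly the kind of shell check I mean (a sketch, using the first index page from start_urls); both selectors return lists of hrefs there:

$ scrapy shell 'http://scrapybook.s3.amazonaws.com/properties/index_00000.html'
>>> # next-page links
>>> response.xpath('//*[contains(@class,"next")]//@href').extract()
>>> # item detail links
>>> response.xpath('//*[@itemprop="url"]/@href').extract()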
Wow, got it completely from the example! Man, you're great, thank you. –