1
抓取图像失败:我正尝试使用下面的 Scrapy 代码从网站上抓取图片:
import urlparse
from PIL import Image
from scrapy.exceptions import DropItem, NotConfigured, IgnoreRequest
from scrapy.spider import BaseSpider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.loader import XPathItemLoader
from scrapy.selector import HtmlXPathSelector
from scrapy.http.request import Request
from scrapy.contrib.pipeline.images import ImagesPipeline
from mobile.items import Website
class MobileSpider(CrawlSpider):
    """Crawl mobile-store.ro and emit one ``Website`` item per product block.

    The item callback is deliberately named ``parse_item`` rather than
    ``parse``: ``CrawlSpider`` implements its rule handling inside ``parse``,
    so overriding that method stops the rules below from being applied.
    """

    name = "mobile"
    allowed_domains = ["mobile-store.ro"]
    start_urls = ["http://www.mobile-store.ro/produse/"]

    # ``\d+`` matches a run of digits; a bare ``d+`` would only match the
    # literal letter "d".
    rules = (
        Rule(SgmlLinkExtractor(allow=r"/produs/\d+"), follow=True),
        Rule(SgmlLinkExtractor(allow=r"/produse/\d+"), callback='parse_item'),
    )

    def parse_item(self, response):
        """Extract product data from ``response``.

        Yields a follow-up ``Request`` for the first product link found on
        the page (if any), then one ``Website`` item per content container.
        Scrapy invokes callbacks with the response as the only argument, so
        no further positional parameters are accepted.
        """
        hxs = HtmlXPathSelector(response)

        product_links = hxs.select(
            "//ul[@class='products']/li/a/@href").extract()
        if product_links:
            # Resolve against the current page so a relative href still
            # produces a valid absolute request URL.
            yield Request(urlparse.urljoin(response.url, product_links[0]),
                          callback=self.parse_item)

        # NOTE(review): the XPaths below start with ``//`` and are therefore
        # evaluated against the whole document, not relative to ``site``.
        # That is harmless while the page has a single content container;
        # switch to ``.//`` if several containers can occur.
        for site in hxs.select('//div[@id="wrapper"]/div[@id="content"]'):
            item = Website()
            item['nume'] = site.select('//div[@class="summary"]/h1[@class="product_title entry-title"]/text()').extract()
            item['categorie'] = site.select('//div[@class="summary"]/div[@class="product_meta"]/span[@class="posted_in"]/a/text()').extract()
            item['brand'] = site.select('//div[@class="summary"]/div[@class="product_meta"]/span[@class="tagged_as"]/a/text()').extract()
            item['descriere'] = site.select('//div[@class="woocommerce_tabs"]/div[@id="tab-description"]/p/text()').extract()

            # ``extract()`` returns a list of strings. ``urljoin`` needs a
            # single string, so join every extracted ``src`` individually;
            # an empty result simply yields an empty ``image_urls`` list.
            image_sources = site.select('//div[@class="ad-image-wrapper"]/div[@class="ad-image"]/img[@class="lightbox"]/@src').extract()
            item['image_urls'] = [urlparse.urljoin(response.url, src)
                                  for src in image_sources]

            item['pret'] = site.select('//div[@class="summary"]/div[1]/p[@class="price"]/span[@class="amount"]/text()').extract()
            item['url'] = response.url
            yield item
settings.py:
# Scrapy settings for the "mobile" project.

# Where Scrapy looks for spiders, and where newly generated ones are created.
SPIDER_MODULES = ['mobile.spiders']
NEWSPIDER_MODULE = 'mobile.spiders'
DEFAULT_ITEM_CLASS = 'mobile.items.Website'

# Download every URL listed in an item's ``image_urls`` field.
ITEM_PIPELINES = ['scrapy.contrib.pipeline.images.ImagesPipeline']

# The images pipeline stays disabled unless IMAGES_STORE names a storage
# directory, even when it is listed in ITEM_PIPELINES. A relative path is
# resolved against the directory the crawl is started from; adjust as needed.
IMAGES_STORE = 'images'
items.py:
from scrapy.item import Item, Field
class Website(Item):
    """Container for the data scraped from one product page."""

    # Address of the page the item was scraped from.
    url = Field()

    # Product details taken from the page markup.
    nume = Field()        # product title
    categorie = Field()   # text of the "posted_in" links
    brand = Field()       # text of the "tagged_as" links
    descriere = Field()   # paragraphs of the description tab
    pret = Field()        # price amount text

    # Image handling: ``image_urls`` is filled by the spider and
    # ``image_paths`` by the custom pipeline; ``images`` is presumably
    # populated by Scrapy's images pipeline -- confirm against its docs.
    image_urls = Field()
    images = Field()
    image_paths = Field()
pipelines.py:
from mobile.contrib.pipeline.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy.http import Request
class MyImagesPipeline(ImagesPipeline):
    """Images pipeline that records the stored path of each download."""

    def get_media_requests(self, item, info):
        """Yield one download ``Request`` per URL in ``item['image_urls']``."""
        for url in item['image_urls']:
            yield Request(url)

    def item_completed(self, results, item, info):
        """Store the paths of successful downloads on the item.

        ``results`` is iterated as ``(ok, detail)`` pairs; the ``'path'``
        entry of every pair whose flag is truthy is collected. Raises
        ``DropItem`` when nothing was collected, otherwise returns the item
        with ``image_paths`` set.
        """
        stored_paths = []
        for succeeded, detail in results:
            if succeeded:
                stored_paths.append(detail['path'])

        if len(stored_paths) == 0:
            raise DropItem("Item contains no images")

        item['image_paths'] = stored_paths
        return item
问题出现在我尝试用下面的代码获取图片 URL 的时候:
# Excerpt of the spider's item-building loop, repeated to show the failing
# image-URL lines. The original indentation was lost in the paste.
for site in sites:
item = Website()
item['nume'] = site.select('//div[@class="summary"]/h1[@class="product_title entry-title"]/text()').extract()
item['categorie'] = site.select('//div[@class="summary"]/div[@class="product_meta"]/span[@class="posted_in"]/a/text()').extract()
item['brand'] = site.select('//div[@class="summary"]/div[@class="product_meta"]/span[@class="tagged_as"]/a/text()').extract()
item['descriere'] = site.select('//div[@class="woocommerce_tabs"]/div[@id="tab-description"]/p/text()').extract()
# extract() returns a list, so image_relative_url is a list here, not a
# single string.
image_relative_url = site.select('//div[@class="ad-image-wrapper"]/div[@class="ad-image"]/img[@class="lightbox"]/@src').extract()
# urljoin() is handed that list as its second argument; the traceback quoted
# further down reports a TypeError raised from this call.
# NOTE(review): `response.url2` differs from the `response.url` used
# elsewhere in this code -- confirm which attribute was intended.
item['image_urls'] = [urlparse.urljoin(response.url2,image_relative_url)]
#item['image_urls'] = site.select('//div[@class="ad-image-wrapper"]/div[@class="ad-image"]/img[@class="lightbox"]/@src').extract()
item['pret'] = site.select('//div[@class="summary"]/div[1]/p[@class="price"]/span[@class="amount"]/text()').extract()
item['url'] = response.url
items.append(item)
# Emit the collected items one at a time.
for item in items:
yield item
这段代码返回的是页面的 URL,而不是图片的 URL。其他所有字段都能被正确抓取。请问该如何解决这个问题,才能正确获取图片的 URL?
按照您的代码修改后,得到了以下错误: `File "/opt/mobile/mobile/spiders/mobilestore.py", line 39, in parse item['image_urls'] = [urlparse.urljoin(response.url, image_relative_url)] File "/usr/lib/python2.7/urlparse.py", line 253, in urljoin urlparse(url, bscheme, allow_fragments) File "/usr/lib/python2.7/urlparse.py", line …, in urlparse … (url, scheme, allow_fragments) File "/usr/lib/python2.7/urlparse.py", line 168, in urlsplit cached = _parse_cache.get(key, None) exceptions.TypeError: unhashable type: 'list'` – laguna
是的,`extract()` 返回的是一个列表;如果列表不为空,只需取它的第一个元素即可。这样应该就能用了。 – alecxe
抱歉,它仍然没有收集到图片的 URL,实际上根本没有抓取到任何 item,只是抛出了上面的异常。有什么线索吗?谢谢 – laguna