2013-05-26 46 views
1

我试图用以下 Scrapy 代码从网站抓取图片,但图片抓取失败:

import urlparse 
from PIL import Image 
from scrapy.exceptions import DropItem, NotConfigured, IgnoreRequest 
from scrapy.spider import BaseSpider 
from scrapy.contrib.spiders import CrawlSpider, Rule 
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor 
from scrapy.contrib.loader import XPathItemLoader 
from scrapy.selector import HtmlXPathSelector 
from scrapy.http.request import Request 

from scrapy.contrib.pipeline.images import ImagesPipeline 
from mobile.items import Website 

class MobileSpider(CrawlSpider):
    """Spider for mobile-store.ro that emits one ``Website`` item per content block.

    Starts from the product listing page, follows the first product link
    found in the ``products`` list, and scrapes name, category, brand,
    description, price and image URL from each page.
    """

    name = "mobile"
    allowed_domains = ["mobile-store.ro"]
    start_urls = ["http://www.mobile-store.ro/produse/"]
    rules = (
        # ``\d+`` matches a numeric id; the previous ``d+`` only matched
        # one or more literal "d" characters.
        Rule(SgmlLinkExtractor(allow=r"/produs/\d+"), follow=True),
        Rule(SgmlLinkExtractor(allow=r"/produse/\d+"), callback='parse_item'),
    )
    # NOTE(review): this class overrides ``parse``, which CrawlSpider uses
    # internally to apply ``rules`` (see the Scrapy CrawlSpider docs), so the
    # rules above are presumably never applied. The 'parse_item' callback is
    # also not defined in this class. Confirm the intended crawl strategy
    # before renaming ``parse``.

    def parse(self, response, response2=None):
        """Scrape product data from ``response``.

        Args:
            response: the downloaded page.
            response2: unused. Scrapy invokes callbacks with the response
                only, so it now defaults to ``None`` instead of being
                required.

        Yields:
            A ``Request`` for the first product link (if any), then one
            ``Website`` item per ``div#wrapper/div#content`` element.
        """
        hxs = HtmlXPathSelector(response)
        next_page = hxs.select("//ul[@class='products']/li/a/@href").extract()
        if next_page:
            yield Request(next_page[0], self.parse)

        for site in hxs.select('//div[@id="wrapper"]/div[@id="content"]'):
            item = Website()
            item['nume'] = site.select('//div[@class="summary"]/h1[@class="product_title entry-title"]/text()').extract()
            item['categorie'] = site.select('//div[@class="summary"]/div[@class="product_meta"]/span[@class="posted_in"]/a/text()').extract()
            item['brand'] = site.select('//div[@class="summary"]/div[@class="product_meta"]/span[@class="tagged_as"]/a/text()').extract()
            item['descriere'] = site.select('//div[@class="woocommerce_tabs"]/div[@id="tab-description"]/p/text()').extract()
            # The ``ad-image-wrapper`` div is filled in by JavaScript, so it
            # is empty in the raw HTML; the image link is available in the
            # ``ad-thumb-list`` markup instead.
            image_hrefs = site.select('//ul[@class="ad-thumb-list"]/li[@class="first_item"]/a/@href').extract()
            # ``extract()`` returns a list. Passing the list itself to
            # ``urljoin`` returns the page URL when it is empty and raises
            # ``TypeError: unhashable type: 'list'`` otherwise, so join each
            # extracted href individually.
            item['image_urls'] = [urlparse.urljoin(response.url, href) for href in image_hrefs]
            item['pret'] = site.select('//div[@class="summary"]/div[1]/p[@class="price"]/span[@class="amount"]/text()').extract()
            item['url'] = response.url
            yield item

settings.py: 
# Project wiring: spiders live in the ``mobile.spiders`` package and the
# default item class is the ``Website`` item from ``mobile.items``.
SPIDER_MODULES = ['mobile.spiders'] 
NEWSPIDER_MODULE = 'mobile.spiders' 
DEFAULT_ITEM_CLASS = 'mobile.items.Website' 

# NOTE(review): this enables the stock ImagesPipeline, not the
# MyImagesPipeline subclass shown in pipelines.py, so that subclass appears
# to be unused with this configuration. No IMAGES_STORE setting is visible
# in this snippet either; the images pipeline presumably needs one to know
# where to save downloads -- confirm against the Scrapy version in use.
ITEM_PIPELINES = ['scrapy.contrib.pipeline.images.ImagesPipeline'] 

items.py: 
from scrapy.item import Item, Field 

class Website(Item): 
    """Item holding the data scraped from one product page of the site."""
    # The text fields below are assigned the list returned by ``extract()``
    # in MobileSpider.parse.
    nume = Field()  # product name, taken from the h1 "product_title" text
    descriere = Field()  # description, taken from the "tab-description" paragraphs
    categorie = Field()  # category, taken from the "posted_in" links
    brand = Field()  # brand, taken from the "tagged_as" links
    pret = Field()  # price, taken from the "amount" span inside p.price
    url = Field()  # URL of the scraped page (response.url)
    image_urls = Field()  # list of image URLs; read by the images pipeline to build requests
    images = Field()  # not set by the visible code; presumably filled by ImagesPipeline -- confirm
    image_paths = Field()  # paths of successful downloads, set in MyImagesPipeline.item_completed

pipelines.py: 
from mobile.contrib.pipeline.images import ImagesPipeline 
from scrapy.exceptions import DropItem 
from scrapy.http import Request 

class MyImagesPipeline(ImagesPipeline):
    """Images pipeline that records the paths of downloaded files on the item."""

    def get_media_requests(self, item, info):
        """Yield one download ``Request`` for every URL in ``item['image_urls']``."""
        for url in item['image_urls']:
            yield Request(url)

    def item_completed(self, results, item, info):
        """Store the paths of successful downloads in ``item['image_paths']``.

        ``results`` is iterated as ``(success, payload)`` pairs; the ``'path'``
        entry of every successful payload is collected.

        Raises:
            DropItem: if no entry in ``results`` succeeded.
        """
        downloaded = []
        for success, payload in results:
            if success:
                downloaded.append(payload['path'])
        if len(downloaded) == 0:
            raise DropItem("Item contains no images")
        item['image_paths'] = downloaded
        return item

问题出在我尝试用下面的代码获取图片 URL 时:

# Excerpt of the item-building loop from MobileSpider.parse (``sites``,
# ``items`` and ``response`` are defined in the enclosing method).
for site in sites: 
     item = Website() 
     item['nume'] = site.select('//div[@class="summary"]/h1[@class="product_title entry-title"]/text()').extract() 
     item['categorie'] = site.select('//div[@class="summary"]/div[@class="product_meta"]/span[@class="posted_in"]/a/text()').extract() 
     item['brand'] = site.select('//div[@class="summary"]/div[@class="product_meta"]/span[@class="tagged_as"]/a/text()').extract() 
     item['descriere'] = site.select('//div[@class="woocommerce_tabs"]/div[@id="tab-description"]/p/text()').extract() 
     # NOTE(review): extract() returns a list, and that list (not a string)
     # is what gets passed to urljoin on the next line.
     image_relative_url = site.select('//div[@class="ad-image-wrapper"]/div[@class="ad-image"]/img[@class="lightbox"]/@src').extract() 
     # NOTE(review): the full listing uses ``response.url`` here; ``url2`` is
     # possibly a typo -- confirm.
     item['image_urls'] = [urlparse.urljoin(response.url2,image_relative_url)] 
     #item['image_urls'] = site.select('//div[@class="ad-image-wrapper"]/div[@class="ad-image"]/img[@class="lightbox"]/@src').extract() 
     item['pret'] = site.select('//div[@class="summary"]/div[1]/p[@class="price"]/span[@class="amount"]/text()').extract() 
     item['url'] = response.url 
     items.append(item) 
    for item in items: 
     yield item 

这段代码返回的是页面 URL,而不是图片 URL。其他所有字段都能被正确抓取。有没有人知道如何解决这个问题并正确获取图片 URL?

回答

0

这是因为图片(以及整个 ad-image-wrapper DIV 的内容)是通过 JavaScript 动态填充的。

在 parse 方法中转储(dump)response.body 之后,我发现实际的图片链接最初保存在 ad-thumb-list 列表中。所以,请尝试用下面的代码获取图片网址:

# extract() returns a list of matches; collapse it to the first href when
# at least one was found (an empty list is left as-is).
image_relative_url = site.select(
    '//ul[@class="ad-thumb-list"]/li[@class="first_item"]/a/@href'
).extract()
if len(image_relative_url) > 0:
    image_relative_url = image_relative_url[0]

希望这就是您所需要的。

+0

按您的代码修改后,得到以下错误:
  File "/opt/mobile/mobile/spiders/mobilestore.py", line 39, in parse
    item['image_urls'] = [urlparse.urljoin(response.url, image_relative_url)]
  File "/usr/lib/python2.7/urlparse.py", line 253, in urljoin
    urlparse(url, bscheme, allow_fragments)
  File "/usr/lib/python2.7/urlparse.py", line …, in urlparse
    … urlsplit(url, scheme, allow_fragments)
  File "/usr/lib/python2.7/urlparse.py", line 168, in urlsplit
    cached = _parse_cache.get(key, None)
exceptions.TypeError: unhashable type: 'list' – laguna

+0

是的,`extract()` 返回的是一个列表;如果列表不为空,只取它的第一个元素即可。这样应该就可以了。 – alecxe

+0

抱歉,它还是没有抓取到图片网址,实际上根本没有抓取到任何 item,只是抛出了上面的异常。有什么线索吗?谢谢 – laguna