2017-04-21 361 views
0

嗨,我尝试使用下面的代码抓取 digg.com 首页上的图像。问题是:0.jpg 到 6.jpg 是正常的,而从 7.jpg 到 47.jpg 全部是损坏的,不知道为什么。(附图:损坏的图像)

这是代码。 Github上的位置:https://github.com/kenpeter/py_mm

# os 
import os 
# http request 
import requests 
# 
import pprint 

import time 

# import html from lxml 
from lxml import html 

# Module-level state.
# Page-number placeholder, initialised to 0; no active code in the script
# shown here reads it.
global_page_num = 0
# Pretty-printer (4-space indent) available for ad-hoc debug output.
pp = pprint.PrettyPrinter(indent=4)

# write to file 
def download_image(img_urls):
    """Fetch each URL in ``img_urls`` and store it as ``img/<index>.jpg``.

    Files are numbered by the URL's position in ``img_urls``, starting at 0,
    and always receive a ``.jpg`` extension. The response body is written to
    disk exactly as received.
    """
    total = len(img_urls)

    for position, image_url in enumerate(img_urls):
        target_path = 'img/%s.jpg' % position
        # Make sure the destination directory exists before opening the file.
        target_dir = os.path.dirname(target_path)
        os.makedirs(target_dir, exist_ok=True)

        print('--- start ---')
        print('filename: %s' % target_path)
        print('Downloading: %s out of %s' % (position, total))

        # The file is opened first; the request is issued while it is open.
        with open(target_path, 'wb') as out_file:
            payload = requests.get(image_url).content
            out_file.write(payload)


def get_page_number(num):
    """Scrape thumbnail ``src`` URLs from the digg.com front page and download them.

    ``num`` is accepted but never read; the front page is always requested.
    Returns the list of ``src`` attribute values that was handed to
    ``download_image``.
    """
    page_bytes = requests.get('http://digg.com').content
    tree = html.fromstring(page_bytes)

    # Only the ``src`` attribute of each thumbnail <img> is collected here.
    thumb_srcs = tree.xpath("//div[@class='digg-story__image--thumb']/a/img/@src")

    # Story descriptions are extracted as well, but nothing consumes them.
    descriptions = tree.xpath("//div[@itemprop='description']/text()")

    download_image(thumb_srcs)
    return thumb_srcs


if __name__ == '__main__':
    # The page number is hard-coded for now; prompting the user for it is
    # not enabled.
    page_number = 4
    get_page_number(num=page_number)

回答

0

图像之所以“损坏”,是因为页面内部的结构发生了变化:从某一张图开始,图像的真实地址被“隐藏”在 data-src 属性中,而你的代码抓取的是 src 属性的内容(此时 src 里只是一张占位图)。下面是所抓取网页源代码中的一个例子,其中同时包含这两个属性:

<img 
class="digg-story__image-img js--digg-story__image-img lazy-image-img need-offset" 
data-src="http://static.digg.com/images/f0b92c2d8a2c4b7f829abbc0e58a408c_2oijd0Z_1_www_large_thumb.jpeg" 
src="http://static.digg.com/static/fe/944294/images/x_455x248.png" 
width="312" 
height="170" 
alt="" 
/> 

因此,在创建图像 URL 列表时,您需要同时检查 src 和 data-src 这两个属性,并让 data-src 优先于 src。

下面这段代码可以解决这个问题,并下载到正确的图像:

# os 
import os 
# http request 
import requests 
# 
import pprint 

import time 

# import html from lxml 
from lxml import html 

# Module-level state.
# Page-number placeholder, initialised to 0; no active code in the script
# shown here reads it.
global_page_num = 0
# Pretty-printer (4-space indent) available for ad-hoc debug output.
pp = pprint.PrettyPrinter(indent=4)

# write to file 
def download_image(img_urls):
    """Download every URL in ``img_urls`` into ``img/<index>.jpg``.

    Args:
        img_urls: Sequence of image URLs. Each file is named after the
            URL's position in the sequence, starting at 0, and always gets
            a ``.jpg`` extension.

    Returns:
        None.

    A URL whose response carries an HTTP error status is reported and
    skipped, so an error page is never saved under an image file name.
    Connection-level failures raised by ``requests`` still propagate.
    """
    if not img_urls:
        return

    amount = len(img_urls)
    # The target directory is the same for every file: create it once.
    os.makedirs('img', exist_ok=True)

    for index, url in enumerate(img_urls):
        filename = 'img/%s.jpg' % index

        print('--- start ---')
        print('filename: %s' % filename)
        print('Downloading: %s out of %s' % (index, amount))

        # Fetch before opening the file so that a failed request does not
        # leave an empty or truncated file behind. The timeout keeps one
        # stalled server from blocking the whole run.
        response = requests.get(url, timeout=30)
        if not response.ok:
            print('Skipped %s: HTTP status %s' % (url, response.status_code))
            continue

        with open(filename, 'wb') as f:
            f.write(response.content)


def get_page_number(num):
    """Download the thumbnail images shown on the digg.com front page.

    Thumbnails that are loaded lazily keep their real address in the
    ``data-src`` attribute while ``src`` holds a placeholder image, so for
    each ``<img>`` element ``data-src`` is preferred and ``src`` is used only
    as a fallback. Choosing per element keeps the URLs in page order and
    yields at most one URL per thumbnail.

    Args:
        num: Page number. Currently unused (the front page is always
            fetched); kept so existing callers keep working.

    Returns:
        The list of image URLs, in page order, that was passed to
        ``download_image``.
    """
    url = 'http://digg.com'
    response = requests.get(url, timeout=30).content
    selector = html.fromstring(response)

    img_urls = []
    for img in selector.xpath("//div[@class='digg-story__image--thumb']/a/img"):
        img_url = img.get('data-src') or img.get('src')
        # Skip elements without a usable address, and the lazy-loading
        # placeholder when no real address is available.
        if not img_url or 'x_455x248.png' in img_url:
            continue
        img_urls.append(img_url)

    download_image(img_urls)

    return img_urls


if __name__ == '__main__':
    # The page number is hard-coded for now; prompting the user for it is
    # not enabled.
    page_number = 4
    get_page_number(num=page_number)