我知道我回答得有点晚了,但以防其他人需要,还是贴出来。 Glassdoor 是动态生成这些属性的,所以我使用 Splash 请求来处理它们。 下面是代码:
# -*- coding: utf-8 -*-
import scrapy
from scrapy_splash import SplashRequest
class GlassdoorData(scrapy.Spider):
    """Scrape Glassdoor job postings rendered through Splash.

    Listing pages are fetched with ``SplashRequest``; every job link found
    on a listing page is followed to ``parse_details``, which yields one
    dict per job. Listing pages are walked in increasing page order until
    a page contains no job links.
    """

    name = 'glassdoordata'
    start_urls = ['https://www.glassdoor.ca/Job/canada-data-jobs-SRCH_IL.0,6_IN3_KE7,11.htm']

    # Prefix prepended to the relative job links found on a listing page.
    base_url = "https://www.glassdoor.ca"
    # URL of listing page N (N >= 2); ``{}`` receives the page number.
    page_url_template = "https://www.glassdoor.ca/Job/canada-data-jobs-SRCH_IL.0,6_IN3_KE7,11_IP{}.htm"
    # Value sent as the Splash ``wait`` argument with every request.
    splash_wait = 10
    # ``request.meta`` key that carries the listing page number.
    page_meta_key = 'listing_page'
    # XPath of the text next to a labelled company-info entry; ``{}`` is the label.
    info_xpath_template = (
        "//div[@class = 'infoEntity']/label[contains(text(),'{}')]"
        "/following-sibling::span/text()"
    )

    def _splash_request(self, url, callback, page=None):
        """Build a SplashRequest for *url*, optionally tagged with a listing page number."""
        meta = {}
        if page is not None:
            meta[self.page_meta_key] = page
        # A fresh args dict per request avoids sharing mutable state between requests.
        return SplashRequest(
            url=url,
            callback=callback,
            args={'wait': self.splash_wait},
            meta=meta,
        )

    def start_requests(self):
        """Yield a Splash-rendered request for every URL in ``start_urls``."""
        for url in self.start_urls:
            yield self._splash_request(url, self.parse, page=1)

    def parse(self, response):
        """Follow every job link on a listing page, then request the next page.

        The page number travels in ``response.meta`` instead of a module
        global, so it cannot be corrupted by other callbacks and no longer
        shadows the builtin ``id``.
        """
        hrefs = response.css('li.jl > div > div.flexbox > div > a::attr(href)').extract()
        if not hrefs:
            # No job links on this page: stop paginating. The previous code
            # tested the always-truthy formatted URL and therefore requested
            # ever-higher page numbers without end.
            return

        for href in hrefs:
            yield self._splash_request(self.base_url + href, self.parse_details)

        next_page = response.meta.get(self.page_meta_key, 1) + 1
        yield self._splash_request(
            self.page_url_template.format(next_page),
            self.parse,
            page=next_page,
        )

    def _info_text(self, response, label):
        """Return the left-stripped text of the info entry *label*, or None if absent."""
        value = response.xpath(self.info_xpath_template.format(label)).extract_first()
        # extract_first() returns None when nothing matches; calling
        # .lstrip() on it unguarded used to raise AttributeError and drop
        # the whole item.
        if value is None:
            return None
        return value.lstrip()

    def parse_details(self, response):
        """Yield one dict describing the job shown on a job detail page.

        ``Website`` and ``Size`` are lists of every matching text node; the
        remaining fields are a single string, or None when the page does
        not contain that field.
        """
        yield {
            'Job_Title': response.css('div.header.cell.info > h2::text').extract_first(),
            'Company': response.css('div.header.cell.info > span.ib::text').extract_first(),
            'Location': response.css('div.header.cell.info > span.subtle.ib::text').extract_first(),
            'Website': response.xpath("//div[@class = 'infoEntity']/span/a/text()").extract(),
            'Size': response.xpath(self.info_xpath_template.format('Size')).extract(),
            'Industry': self._info_text(response, 'Industry'),
            'Type': self._info_text(response, 'Type'),
            'Revenue': self._info_text(response, 'Revenue'),
            'Competitors': self._info_text(response, 'Competitors'),
        }
然后像这样编辑 settings.py:
# --- Project identity -------------------------------------------------------
BOT_NAME = "glassdoordata"
SPIDER_MODULES = ["glassdoordata.spiders"]
NEWSPIDER_MODULE = "glassdoordata.spiders"

# robots.txt rules are NOT obeyed by this crawler.
ROBOTSTXT_OBEY = False

# --- Splash integration (scrapy_splash) -------------------------------------
# Address of the Splash instance that renders the pages.
# NOTE(review): 192.168.99.100 looks like a local VM/container address --
# confirm and adjust for your environment.
SPLASH_URL = "http://192.168.99.100:8050"

# Downloader middlewares, keyed by import path with their priority.
DOWNLOADER_MIDDLEWARES = {
    "scrapy_splash.SplashCookiesMiddleware": 723,
    "scrapy_splash.SplashMiddleware": 725,
    "scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware": 810,
}

# Spider middlewares, keyed by import path with their priority.
SPIDER_MIDDLEWARES = {
    "scrapy_splash.SplashDeduplicateArgsMiddleware": 100,
}

# Duplicate filter and HTTP cache storage classes supplied by scrapy_splash.
DUPEFILTER_CLASS = "scrapy_splash.SplashAwareDupeFilter"
HTTPCACHE_STORAGE = "scrapy_splash.SplashAwareFSCacheStorage"
运行这个程序之前,你需要先安装 Splash。
谢谢
您正在抓取的网页是动态的(需要由JavaScript引擎呈现)。 Scrapy只能看到简单的源代码。 – kev
@kev 说得对,该网页使用 XHR 调用 'http://www.glassdoor.com/Overview/companyOverviewBasicInfoAjax.htm?&employerId=20496&title=Company+Info&linkCompetitors=true' 来加载公司的其他信息。 编号 '20496' 可以在页面的 HTML 源代码中找到。 –