2017-09-27 118 views
0

我想从 iframe 中获取内容,因此我将 Splash 请求的端点(endpoint)从 execute 更改为 render.json。然而,改动之后 Lua 脚本中的 splash:wait 根本不起作用——当 endpoint='render.json' 时,Splash 不会等待。下面是爬虫代码。

import scrapy 
from scrapy_splash import SplashRequest 
from scrapy.http import HtmlResponse 
# Lua script handed to Splash through the 'lua_source' request argument.
# It loads args.url, waits 10 seconds, and returns a table whose 'html'
# field holds the rendered page HTML.
# NOTE(review): the spider below posts this to the 'render.json' endpoint,
# where the wait inside this script was observed to have no effect --
# presumably only the scripted endpoints (e.g. 'execute') run it; confirm
# against the Splash HTTP API documentation.
src=""" 
function main(splash, args) 
    assert(splash:go(args.url)) 
    assert(splash:wait(10)) 
    return { 
    html = splash:html() 
    } 
end 

""" 

class Lafarge(scrapy.Spider):
    """Spider that renders the Lafarge iCIMS job search page through Splash.

    The job listing lives inside an iframe, so the page is requested from
    Splash's ``render.json`` endpoint with ``iframes=1`` and the HTML of the
    first child frame is parsed for the "next page" link.
    """

    name = "lafargespider"

    def __init__(self, *args, **kwargs):
        """Initialise the base spider and remember the search page URL."""
        # Run scrapy.Spider's own initialisation so the arguments passed to
        # the spider are processed by the base class instead of being dropped.
        super(Lafarge, self).__init__(*args, **kwargs)
        self.root_url = "https://cacareers-lafarge-na.icims.com/jobs/search?pr=0&searchRelation=keyword_all&schemaId=&o="

    def start_requests(self):
        """Yield the single Splash request for the job search page."""
        yield SplashRequest(
            self.root_url,
            self.parse_detail,
            endpoint='render.json',
            args={
                # The delay has to be requested with the 'wait' argument:
                # with the render.json endpoint the splash:wait(10) call in
                # the Lua script was observed to have no effect.
                'wait': 10,
                'iframes': 1,
                'html': 1,
                # Kept for backward compatibility with the original request
                # (presumably ignored by render.json -- see the note above).
                'lua_source': src,
                'timeout': 90,
            },
        )

    def parse_detail(self, response):
        """Print the "next page" link found in the first rendered iframe.

        ``response.data`` is the decoded JSON returned by Splash; with
        ``iframes=1`` it carries a ``childFrames`` list whose entries hold
        the HTML of each frame.
        """
        child_frames = response.data.get('childFrames') or []
        if not child_frames:
            # Nothing was captured (e.g. the iframe had not loaded yet);
            # report it instead of failing with an IndexError/KeyError.
            self.logger.warning("No child frames returned for %s", response.url)
            return

        frame_html = child_frames[0]['html']
        # Wrap the iframe markup in its own response object so selectors can
        # be used on it; a separate name avoids rebinding the 'response'
        # parameter.
        frame_response = HtmlResponse(url="my HTML string", body=frame_html, encoding='utf-8')
        print("next page ===>", frame_response.xpath('//a[@class="glyph "]/@href').extract_first())

回答

1

在 SplashRequest 的 args 参数中传递等待时间(wait)解决了我的问题。

def start_requests(self):
    """Yield one render.json request for the root URL with an explicit wait."""
    # Rendering options sent to Splash; 'wait' asks Splash to pause before
    # it captures the page and its iframes.
    splash_args = {
        'wait': 5,
        'iframes': 1,
        'html': 1,
        'lua_source': src,
    }
    yield SplashRequest(
        self.root_url,
        self.parse_detail,
        endpoint='render.json',
        args=splash_args,
    )
def parse_detail(self, response):
    """Read the HTML of the first child frame from the Splash JSON payload."""
    first_frame = response.data['childFrames'][0]
    rs = first_frame['html']
1

在args中传递等待参数。它应该是 -

args = { 'wait': 5, 'iframes': 1, 'html': 1, 'lua_source': src, 'timeout': 90 }