2012-07-05 99 views
124

标题:PhantomJS不等待页面“完全”加载

我使用PhantomJS v1.4.1加载一些网页。我无法访问它们的服务器端,手上只有指向这些网页的链接。我使用的是旧版本的Phantom,因为我需要在这些网页上支持Adobe Flash。

问题是许多网站会异步加载它们的次要内容,因此Phantom的onLoadFinished回调(类似于HTML中的onLoad)触发得太早,此时内容还没有全部加载完。有谁能建议我如何等待网页完全加载,例如在截图前等待所有动态内容(如广告)都加载完成?

回答

12

也许你可以使用onResourceRequested和onResourceReceived回调来检测异步加载。下面是官方文档中使用这些回调的例子:

// Minimal network sniffer: dump every outgoing request and every incoming
// response of the page as pretty-printed JSON.
var page = require('webpage').create();

// Fired for each resource the page asks for.
page.onResourceRequested = function (requestData) {
    var dump = JSON.stringify(requestData, null, 4);
    console.log('Request ' + dump);
};

// Fired for each response (chunk) the page receives.
page.onResourceReceived = function (responseData) {
    var dump = JSON.stringify(responseData, null, 4);
    console.log('Receive ' + dump);
};

// NOTE(review): `url` is not defined in this snippet; the caller must supply it.
page.open(url);

此外,你可以看看examples/netsniff.js的工作示例。

+0

但在这种情况下,我不能使用PhantomJS的一个实例一次加载多个页面,对吗? – nilfalse

+0

onResourceRequested是否适用于AJAX /跨域请求?还是它只适用于像CSS,图像..等? – CMCDragonkai

+0

@CMCDragonkai我从来没有使用过它,但基于[this](https://github.com/ariya/phantomjs/wiki/Network-Monitoring),它似乎包含所有请求。 Quote:'所有的资源请求和响应可以使用onResourceRequested和onResourceReceived嗅探' – Supr

18

你可以尝试把waitfor示例和rasterize示例结合起来:

/**
 * See https://github.com/ariya/phantomjs/blob/master/examples/waitfor.js
 *
 * Poll a condition every 250 ms until it becomes truthy or a timeout elapses.
 * Useful for waiting on a server response or for a UI change (fadeIn, etc.).
 *
 * The condition is evaluated once immediately and then on every tick. The
 * outcome (ready or timed out) is dispatched from inside a tick, so `onReady`
 * never runs synchronously within the call to `waitFor` itself.
 *
 * @param testFx condition to poll: either a function returning a truthy/falsy
 * value, or a string of JavaScript that is passed to eval()
 * (e.g.: "1 == 1" or "$('#bar').is(':visible')").
 * @param onReady action to run once after testFx is fulfilled: either a
 * callback function, or a string of JavaScript that is passed to eval()
 * (e.g.: "phantom.exit()").
 * @param timeOutMillis the max amount of time to wait, in milliseconds.
 * Any falsy value (omitted, 0, ...) selects the default of 3000 ms.
 * On timeout the message "'waitFor()' timeout" is logged and
 * phantom.exit(1) is called.
 */
function waitFor(testFx, onReady, timeOutMillis) {
    var maxtimeOutMillis = timeOutMillis ? timeOutMillis : 3000, //< Default max timeout is 3s
        start = new Date().getTime(),
        // NOTE(review): string arguments are run through eval(); only pass trusted strings.
        condition = (typeof(testFx) === "string" ? eval(testFx) : testFx()),
        interval = setInterval(function() {
            if ((new Date().getTime() - start < maxtimeOutMillis) && !condition) {
                // Not timed out yet and condition not yet fulfilled: test again.
                condition = (typeof(testFx) === "string" ? eval(testFx) : testFx());
                return;
            }

            // Stop polling BEFORE dispatching the outcome. If onReady throws
            // (or phantom.exit is slow to take effect) the interval must not
            // fire again and re-run the outcome every 250 ms.
            clearInterval(interval);

            if (!condition) {
                // Timed out while the condition is still false.
                console.log("'waitFor()' timeout");
                phantom.exit(1);
                return;
            }

            // Condition fulfilled.
            console.log("'waitFor()' finished in " + (new Date().getTime() - start) + "ms.");
            if (typeof(onReady) === "string") {
                eval(onReady);
            } else {
                onReady();
            }
        }, 250); //< repeat check every 250ms
};

// Rasterize a URL to a file, but only after every resource whose request or
// response was observed has reported stage 'end' (or a 10 s timeout is hit).
var page = require('webpage').create();
var system = require('system');
var address, output, size;

if (system.args.length >= 3 && system.args.length <= 5) {
    address = system.args[1];
    output = system.args[2];

    // Paper size is only configured for ".pdf" output with a third argument.
    if (system.args.length > 3 && system.args[2].substr(-4) === ".pdf") {
        size = system.args[3].split('*');
        if (size.length === 2) {
            // "W*H" form, e.g. "5in*7.5in"
            page.paperSize = {
                width : size[0],
                height : size[1],
                margin : '0px'
            };
        } else {
            // Named format, e.g. "A4" or "Letter"
            page.paperSize = {
                format : system.args[3],
                orientation : 'portrait',
                margin : {
                    left : "5mm",
                    top : "8mm",
                    right : "5mm",
                    bottom : "9mm"
                }
            };
        }
    }

    if (system.args.length > 4) {
        page.zoomFactor = system.args[4];
    }

    // resources[id] holds the last stage seen for the resource with that id.
    // NOTE(review): request objects may not carry a `stage` field; if so the
    // slot stays undefined until a response arrives — confirm against the API.
    var resources = [];
    page.onResourceRequested = function(req) {
        resources[req.id] = req.stage;
    };
    page.onResourceReceived = function(res) {
        resources[res.id] = res.stage;
    };

    page.open(address, function(status) {
        if (status === 'success') {
            waitFor(function() {
                // Ready only when every slot from index 1 upward is 'end'.
                var idx = 1;
                while (idx < resources.length) {
                    if (resources[idx] != 'end') {
                        return false;
                    }
                    idx += 1;
                }
                return true;
            }, function() {
                page.render(output);
                phantom.exit();
            }, 10000);
        } else {
            console.log('Unable to load the address!');
            phantom.exit();
        }
    });
} else {
    console.log('Usage: rasterize.js URL filename [paperwidth*paperheight|paperformat] [zoom]');
    console.log(' paper (pdf output) examples: "5in*7.5in", "10cm*20cm", "A4", "Letter"');
    phantom.exit(1);
}
+3

它似乎不适用于使用了服务器推送技术的网页,因为在onLoad触发之后,仍然会有资源处于传输之中。 – nilfalse

+0

有没有哪个驱动程序(例如[poltergeist](https://github.com/jonleighton/poltergeist))具备这样的功能? –

+0

是否可以使用waitFor轮询整个HTML文本并搜索定义的关键字?我试图实现这一点,但似乎轮询不会刷新到最新下载的HTML源代码。 – fpdragon

66

另一种方法是像常规的rasterize.js示例那样,让PhantomJS在页面加载完成之后、渲染之前再等待一会儿,只是把超时时间设得更长一些,让JavaScript有足够的时间加载完额外的资源:

// Render after a fixed grace period following the load callback, giving
// scripts on the page time to pull in additional resources.
page.open(address, function (status) {
    if (status === 'success') {
        window.setTimeout(function () {
            page.render(output);
            phantom.exit();
        }, 1000); // Change timeout as required to allow sufficient time
        return;
    }

    console.log('Unable to load the address!');
    phantom.exit();
});
+1

是的,目前我坚持这种方法。 – nilfalse

+7

您应该接受回复 – alex88

+89

这是一个可怕的解决方案,对不起(这是PhantomJS的错!)。如果等待一秒钟,但需要20ms加载,这完全浪费时间(想想批处理作业),或者如果花费时间超过一秒钟,它仍然会失败。专业工作无法忍受这种无效率和不可靠性。 – CoDEmanX

13

在我的程序中,我用一些逻辑来判断页面是否已经加载完成:监视它的网络请求,如果在过去的200毫秒内没有新的请求,我就认为它已经加载完成。

在onLoadFinish()之后使用这个。

/**
 * Call `callback(null, page)` once the page's network traffic has gone quiet.
 *
 * "Quiet" means: a 200 ms timer expired without being re-armed by a finished
 * response, and either no tracked request is still pending or the pending
 * ones have already been given `max_retry` extra 200 ms periods.
 *
 * Installs its own handlers on page.onResourceRequested,
 * page.onResourceReceived and page.onResourceError, replacing any handlers
 * that were set before.
 *
 * @param page     page object whose resource callbacks are taken over
 * @param callback invoked exactly once as callback(null, page); an exception
 *                 thrown by it is logged and not propagated
 */
function onLoadComplete(page, callback){
    var waiting = [];        // ids of requests that have not finished yet
    var interval = 200;      // ms of silence to wait for a new request
    var max_retry = 3;       // extra periods granted while requests are pending
    var counter_retry = 0;
    var finished = false;    // latch: callback has already been invoked
    var tlogger = {};        // for debug: response start time per request id
    var timer = setTimeout(timeout, interval);

    function timeout(){
        if(finished){
            return;
        }
        if(waiting.length && counter_retry < max_retry){
            // Still waiting for responses: grant another period.
            timer = setTimeout(timeout, interval);
            counter_retry++;
            return;
        }
        // Latch first so that late resource events cannot trigger the
        // callback a second time.
        finished = true;
        try{
            callback(null, page);
        }catch(e){
            // Best effort: report instead of silently discarding the error.
            console.log('[onLoadComplete] callback threw: ' + e);
        }
    }

    bindEvent(page, 'request', function(req){
        waiting.push(req.id);
    });

    bindEvent(page, 'receive', function (res) {
        var cT = res.contentType;
        if(!cT){
            console.log('[contentType] ', cT, ' [url] ', res.url);
            return remove(res.id);
        }
        // Only responses whose content type starts with "application" or
        // "text" are tracked; everything else is dropped from the wait list.
        var tracked = cT.indexOf('application') === 0 || cT.indexOf('text') === 0;
        if(!tracked) return remove(res.id);

        if (res.stage === 'start') {
            console.log('!!received start: ', res.id);
            tlogger[res.id] = new Date();
        }else if (res.stage === 'end') {
            console.log('!!received end: ', res.id, (new Date() - tlogger[res.id]));
            remove(res.id);

            // A finished response restarts the quiet period, but only while
            // completion has not been reported yet.
            if(!finished){
                clearTimeout(timer);
                timer = setTimeout(timeout, interval);
            }
        }
    });

    bindEvent(page, 'error', function(err){
        remove(err.id);
        if(waiting.length === 0){
            counter_retry = 0;
        }
    });

    // Drop a request id from the wait list; unknown ids are ignored.
    function remove(id){
        var i = waiting.indexOf(id);
        if(i >= 0){
            waiting.splice(i,1);
        }
    }

    // Map a short event name onto the corresponding page callback property.
    function bindEvent(page, evt, cb){
        switch(evt){
            case 'request':
                page.onResourceRequested = cb;
                break;
            case 'receive':
                page.onResourceReceived = cb;
                break;
            case 'error':
                page.onResourceError = cb;
                break;
            case 'timeout':
                page.onResourceTimeout = cb;
                break;
        }
    }
}
47

我更倾向于定期检查document.readyState的状态(https://developer.mozilla.org/en-US/docs/Web/API/document.readyState)。虽然这种方法有点笨拙,但可以确定onPageReady函数中使用的是已完全加载的文档。

// Page instance to drive, and the address it will load.
var page = require("webpage").create();
var url = "http://example.com/index.html";

/**
 * Print the page's current HTML (document.documentElement.outerHTML, read
 * inside the page via page.evaluate) and then exit PhantomJS.
 */
function onPageReady() {
    console.log(page.evaluate(function() {
        return document.documentElement.outerHTML;
    }));

    phantom.exit();
}

// After the load callback fires, keep polling document.readyState from inside
// the page until it reads "complete", then hand over to onPageReady().
page.open(url, function (status) {
    // Each poll schedules the next one itself (nested setTimeout rather than
    // setInterval), so two polls can never overlap.
    var pollReadyState = function () {
        setTimeout(function () {
            var state = page.evaluate(function () {
                return document.readyState;
            });

            if ("complete" !== state) {
                pollReadyState();
                return;
            }
            onPageReady();
        });
    };

    pollReadyState();
});

附加说明:

使用嵌套的setTimeout而不是setInterval,可以防止checkReadyState在其执行因某些偶然原因被拖长时出现“重叠”和竞态条件。setTimeout的默认延迟为4ms(https://stackoverflow.com/a/3580085/1011156),因此这种主动轮询不会对程序性能造成严重影响。

document.readyState === "complete"表示文档已完全加载所有资源(https://html.spec.whatwg.org/multipage/dom.html#current-document-readiness)。

+4

setTimeout的注释vs setInterval很棒。 –

+0

'readyState'将只在DOM触发已经满载,但是任何''