2016-07-29 32 views
1

我正在使用PhantomJS抓取从输入流生成的页面。 但resultjson和Headers总是{},甚至状态是'成功'(第一个url可以正常,但通常第二个会开始{})。谁能告诉我它有什么问题。phantomJS抓取多页得到错误执行

var page = require('webpage').create(); 
var system = require('system'); 
var args = system.args; 
var fs = require('fs'); 
var resultjson = {}; 
var Headers = {}; 
var urlType, url, path; 
var isInUse = false; 


function pageInit(page){ 
    page.settings.resourceTimeout = 500000; 

    page.onResourceTimeout = function (request) { 
     console.log("fail"); 
    } 

    page.onConsoleMessage = function (msg, lineNum, sourceID) { 
     //console.log(msg + "at line " + lineNum); 
    } 


    page.onResourceReceived = function (response) { 
     if (response.url === url && response.stage === "end") { 
      if (response.status === 301 && response.redirectURL !== null) { 
       url = response.redirectURL; 
      } 
      else { 
       resultjson.Id = response.id; 
       resultjson.Url = response.url; 
       resultjson.Time = response.time; 
       resultjson.BodySize = response.bodySize; 
       resultjson.ContentType = response.contentType; 
       resultjson.RedirectURL = response.redirectURL; 
       resultjson.Stage = response.stage; 
       resultjson.Status = response.status; 
       resultjson.StatusText = response.statusText; 
       response.headers.forEach(function (header) { 
        Headers[header.name] = header.value; 
       }); 
      } 
     } 
    }; 
} 
function GetOtherPage(url, path) { 
    page.open(url, function (status) { 
     console.log(JSON.stringify(resultjson)); 
     console.log(JSON.stringify(Headers)); 
     resultjson = {}; 
     Headers = {}; 
     if (status !== "success") { 
      //fs.write(path, page.content, 'w'); 
      console.log("fail"); 
     } 
     else { 
      fs.write(path, page.content, 'w'); 
      console.log("success"); 
     } 
     DoTask(); 
    }); 
} 
function DoTask() { 
    page.close(); 
    page = require('webpage').create(); 
    pageInit(page); 
    //page.content = ""; 
    urlType = parseInt(system.stdin.readLine()); 
    if (urlType === -1) 
     phantom.exit(); 
    url = system.stdin.readLine(); 
    path = system.stdin.readLine(); 
    if (urlType === 3) { 
     GetOtherPage(url, path); 
    } 
} 
DoTask(); 

结果:

3(input) 
https://www.google.com(input) 
output.html(input) 
{} 
{} 
success 
3(input) 
https://www.google.com(input) 
output.html(input) 
{} 
{} 
success 

更新:看来,当你试图通过一个phantomJS过程中得到相同的URL,phantomJS将让他们在你的磁盘高速缓存,所以它不要求第二次访问url,所以resultjson和Headers是{}。

回答

2

该属性response.urlnormalized,所以它添加了一个尾随'/'键入。

由于该比较response.url === url是错误的,因此它永远不会输入if

测试与https://www.google.com/(与后'/'),给了我下面的输出:

3 
https://www.google.com/ 
output.html 
{"Id":1,"Url":"https://www.google.com/","Time":"2016-07-29T03:56:24.294Z","ContentType":"text/html; charset=UTF-8","RedirectU 
RL":"https://www.google.com.br/?gfe_rd=cr&ei=BNSaV_KoL6KB8QfJrYPoDA","Stage":"end","Status":302,"StatusText":"Found"} 
{"Cache-Control":"private","Content-Type":"text/html; charset=UTF-8","Location":"https://www.google.com.br/?gfe_rd=cr&ei=BNSa 
V_KoL6KB8QfJrYPoDA","Content-Length":"263","Date":"Fri, 29 Jul 2016 03:56:52 GMT","Alternate-Protocol":"443:quic","Alt-Svc":" 
quic=\":443\"; ma=2592000; v=\"36,35,34,33,32,31,30,29,28,27,26,25\""} 
success 

与phantomjs 2.1.1

+0

是的测试,这是有原因的,但在我的包装代码,我已经完成了URL的规范化。我在更新中找到了原因。不管怎么说,还是要谢谢你。 – QShengyao

+0

我想你可以用参数['disk-cache']来控制缓存(http://stackoverflow.com/questions/22822617/how-do-i-make-phantom-js-cache-resources-like-a浏览器)或使用方法['page.clearMemoryCache'](http://stackoverflow.com/questions/23390974/phantomjs-keeping-cache)(但我没有测试它):-) – Gomiero