1
标题:PhantomJS 抓取多个页面时得到错误的结果
我正在使用 PhantomJS 抓取页面,要抓取的 URL 从标准输入流中读取。但即使状态是 "success",resultjson 和 Headers 也总是 {}(第一个 URL 通常可以正常工作,但从第二个开始就变成 {})。谁能告诉我哪里出了问题?
// PhantomJS modules and the page object currently used for fetching.
// `page` is reassigned by DoTask() before every task.
var page = require('webpage').create();
var system = require('system');
var args = system.args;
var fs = require('fs');
// Metadata of the main-document response for the task in flight; filled in by
// the onResourceReceived handler and reset after each page load.
var resultjson = {};
// Response headers (name -> value) of that same main-document response.
var Headers = {};
// Current task, read from stdin by DoTask(): task type, URL to fetch, and the
// file path the rendered page content is written to.
var urlType, url, path;
// NOTE(review): not referenced anywhere in the visible code.
var isInUse = false;
/**
 * Install the timeout, console and resource handlers on a page object.
 *
 * The resource handler records the metadata and headers of the response whose
 * URL equals the global `url` into the globals `resultjson` and `Headers`.
 *
 * Redirects: any 3xx response that carries a redirect target (not only 301)
 * moves the global `url` to that target, so the response that is finally
 * recorded is the redirect destination rather than the intermediate hop.
 *
 * NOTE(review): the match against `url` is an exact string comparison, so a
 * URL that the engine reports in a different form (for example with a
 * trailing slash added) will never match and both globals stay empty.
 *
 * @param {Object} page - page object to configure; its `settings`,
 *     `onResourceTimeout`, `onConsoleMessage` and `onResourceReceived`
 *     properties are overwritten.
 */
function pageInit(page) {
    page.settings.resourceTimeout = 500000;

    page.onResourceTimeout = function (request) {
        console.log("fail");
    };

    page.onConsoleMessage = function (msg, lineNum, sourceID) {
        // Intentionally silent: console output of the fetched page is dropped.
    };

    page.onResourceReceived = function (response) {
        // Only the completed response for the URL being tracked is relevant.
        if (response.url !== url || response.stage !== "end") {
            return;
        }

        var isRedirect = response.status >= 300 &&
            response.status < 400 &&
            !!response.redirectURL;
        if (isRedirect) {
            // Track the redirect target; its own response is recorded later.
            url = response.redirectURL;
            return;
        }

        resultjson.Id = response.id;
        resultjson.Url = response.url;
        resultjson.Time = response.time;
        resultjson.BodySize = response.bodySize;
        resultjson.ContentType = response.contentType;
        resultjson.RedirectURL = response.redirectURL;
        resultjson.Stage = response.stage;
        resultjson.Status = response.status;
        resultjson.StatusText = response.statusText;
        response.headers.forEach(function (header) {
            Headers[header.name] = header.value;
        });
    };
}
/**
 * Load `url` in the global `page` and, once loading finishes, print what was
 * captured for it, save the page content and move on to the next task.
 *
 * Output order per load: JSON of `resultjson`, JSON of `Headers`, then
 * "success" or "fail". Both globals are reset to empty objects right after
 * they are printed. The page content is written to `path` only on success.
 *
 * @param {string} url  - address to open.
 * @param {string} path - file that receives the page content.
 */
function GetOtherPage(url, path) {
    page.open(url, function (status) {
        // Emit the captured response data, then clear it for the next task.
        console.log(JSON.stringify(resultjson));
        console.log(JSON.stringify(Headers));
        resultjson = {};
        Headers = {};

        var loaded = (status === "success");
        if (loaded) {
            fs.write(path, page.content, 'w');
        }
        console.log(loaded ? "success" : "fail");

        // Chain to the next task read from stdin.
        DoTask();
    });
}
/**
 * Read the next task from stdin and start it.
 *
 * Input protocol, one value per line: the task type, then the URL, then the
 * output file path. A task type of -1 ends the program; in that case no
 * further lines are read.
 *
 * The global `page` is closed and replaced by a newly created, freshly
 * initialised page before each task.
 *
 * NOTE(review): only task type 3 is dispatched in the visible code. For any
 * other type the URL and path are still read, but nothing is started and no
 * further task is read.
 */
function DoTask() {
    page.close();
    page = require('webpage').create();
    pageInit(page);

    // Explicit radix so the task type is always parsed as decimal.
    urlType = parseInt(system.stdin.readLine(), 10);
    if (urlType === -1) {
        phantom.exit();
        // Stop here: without this return the function would go on to read
        // two more lines from stdin after the exit sentinel.
        return;
    }

    url = system.stdin.readLine();
    path = system.stdin.readLine();
    if (urlType === 3) {
        GetOtherPage(url, path);
    }
}
// Entry point: read the first task from stdin and start processing it.
DoTask();
结果:
3(input)
https://www.google.com(input)
output.html(input)
{}
{}
success
3(input)
https://www.google.com(input)
output.html(input)
{}
{}
success
更新:看来,当你在同一个 PhantomJS 进程中多次请求相同的 URL 时,PhantomJS 会把响应保存在磁盘缓存中,因此第二次访问该 URL 时不会再次发出请求,所以 resultjson 和 Headers 是 {}。
是的,我测试过了,这确实是一个原因,但在我的包装代码中,我已经对 URL 做了规范化。我已经找到了原因,见上面的更新。不管怎么说,还是要谢谢你。 – QShengyao
我想你可以用命令行参数 [`disk-cache`](http://stackoverflow.com/questions/22822617/how-do-i-make-phantom-js-cache-resources-like-a-browser) 来控制缓存,或者使用方法 [`page.clearMemoryCache`](http://stackoverflow.com/questions/23390974/phantomjs-keeping-cache)(但我没有测试过):-) – Gomiero