2017-02-12 149 views
1

问题

我想抓取图像并自动翻页。我根据页面上一个 span 中显示的"当前条目范围 / 总条目数"(例如 1 - 20 of 83,829 results)来判断是否还有下一页。我想在下面这个 while 循环中运行 Nightmare,但程序会挂起,并报出 JavaScript heap out of memory 错误。有没有办法让它在每次迭代时都真正执行,而不是只把调用压到栈上?我觉得它现在就是在这样做。如何在 while 循环中执行 NightmareJS?

待解决的代码

/**
 * Clicks through the result pages and hands each page's HTML to scrapeImages.
 *
 * Every pass of the loop queues, on the shared `nightmare` instance: wait 4 s,
 * click '#pageNext .results-next', wait 4 s, read document.body.innerHTML.
 * When that resolves, the HTML is passed to scrapeImages and the pagination
 * object is advanced and logged; failures are logged with console.error.
 *
 * NOTE(review): nothing inside the loop body changes `paginate.next`
 * synchronously; it is only touched from the `.then` callback via
 * `paginate.update()`. Promise callbacks cannot run while this synchronous
 * `while` is still spinning, so with a plain boolean `next` the loop
 * presumably never exits and only keeps queueing work. Kept as-is because
 * this is the snippet whose failure is being discussed.
 *
 * @param {object} paginate - Pagination state; `next`, `update()` and `state()` are used.
 * @returns {*} Whatever `nightmare.end()` returns.
 */
function scrapeEach(paginate) {
  // Runs inside the page, not in Node.
  const readPageHtml = () => document.body.innerHTML;

  const onPageHtml = (html) => {
    scrapeImages(html);
    paginate.update();
    paginate.state();
  };

  const onFailure = (error) => {
    console.error('Search failed:', error);
  };

  while (paginate.next) {
    const pendingHtml = nightmare
      .wait(4000)
      .realClick('#pageNext .results-next')
      .wait(4000)
      .evaluate(readPageHtml);

    pendingHtml.then(onPageHtml).catch(onFailure);
  }

  return nightmare.end();
}

下面是与 scrapeEach() 配合使用的其他代码。我创建了这个分页对象来跟踪翻页状态,如下所示:

/**
 * Pagination state parsed from a results summary such as
 * "1 - 20 of 83,829 results".
 *
 * The summary is split on single spaces; token 0 becomes `currentPage`,
 * token 2 becomes `itemsPerPage` and token 4 (thousands separators removed)
 * becomes `totalItems`.
 *
 * NOTE(review): tokens 0 and 2 look like the first and last item index of the
 * listed range, so they only equal "page number" and "page size" when the
 * summary comes from the first page - confirm against the caller.
 *
 * @param {string} url - Address of the results page.
 * @param {string} pgd - Text of the results summary.
 */
function Paginate(url, pgd) {

    this.url = url;
    this.array = pgd.split(" ");

    this.currentPage = Number(this.array[0]);
    // Strip thousands separators ("83,829" -> "83829") before converting.
    this.totalItems = Number((this.array[4] || "").replace(/,/g, ""));
    this.itemsPerPage = Number(this.array[2]);
    // Round up: a trailing, partially filled page is still a page.
    this.totalPages = Math.ceil(this.totalItems / this.itemsPerPage);
    // There is a following page only while we are before the last one
    // (also false when the summary could not be parsed and totalPages is NaN).
    this.next = this.currentPage < this.totalPages;

    /** Advances to the following page and clears `next` on the last page. */
    this.update = () => {
        this.currentPage += 1;
        if (this.currentPage >= this.totalPages) {
            this.next = false;
        }
    };

    /** Logs the current pagination state to the console. */
    this.state = () => {
        console.log("-------- Pagination ----------");
        console.log("current page: " + this.currentPage);
        console.log("total pages: " + this.totalPages);
        console.log("total items: " + this.totalItems);
        console.log("items per page: " + this.itemsPerPage);
        console.log("has next page: " + this.next);
        console.log("------------------------------\n");
    };
}

下面这个函数抓取单个页面上的图像

/**
 * Extracts one record per '#returns > li' element from a page of HTML and
 * passes the resulting list to Artwork.addArt.
 *
 * NOTE(review): the '[email protected]' fragments in the `img` and `title`
 * selectors look like email-obfuscation residue from the page this snippet
 * was copied from; the originals were presumably attribute selectors such as
 * 'img@src' - confirm against the real source before relying on them.
 *
 * @param {string} html - Markup of one results page.
 * @throws Rethrows whatever error the extractor reports to its callback.
 */
function scrapeImages(html) {
    // Field name -> selector, resolved relative to each '#returns > li'.
    const artworkFields = {
        img: 'dl.return-art > dd > a > [email protected]',
        title: 'dl.return-art > dt > [email protected]',
        created: 'dl.return-art > .created',
        medium: 'dl.return-art > .medium',
        dimensions: 'dl.return-art > .dimensions',
        credit: 'dl.return-art > .credit',
        accession: 'dl.return-art > .accession'
    };

    const extract = xr(html, '#returns > li', [artworkFields]);

    extract((failure, artworks) => {
        if (failure) {
            throw failure;
        }
        Artwork.addArt(artworks);
    });
}

下面这个函数启动整个流程

/**
 * The onview endpoint: opens the configured results page, reads its URL and
 * the text of the 'span.results-span' summary, then starts scrapeEach with a
 * Paginate built from those two values.
 *
 * The chain is returned so callers can wait for it. Failures anywhere in the
 * chain are logged rather than left as unhandled rejections, matching the
 * 'Search failed:' logging used by the page scraper.
 *
 * @returns {Promise<*>} Settles once the chain has finished or its error has been logged.
 */
function onView() {
    return nightmare
        .goto(config.NGA.online)
        .wait(3000)
        .evaluate(() => {
            // Runs inside the page, not in Node.
            return [document.location.href, document.querySelector('span.results-span').innerHTML]
        })
        .then(([url, pgd]) => scrapeEach(new Paginate(url, pgd)))
        .catch(error => {
            console.error('Search failed:', error);
        });
}

错误消息

FATAL ERROR: CALL_AND_RETRY_LAST Allocation failed - JavaScript heap out of memory 
1: node::Abort() [/usr/local/bin/node] 
2: node::FatalException(v8::Isolate*, v8::Local<v8::Value>, v8::Local<v8::Message>) [/usr/local/bin/node] 
3: v8::internal::V8::FatalProcessOutOfMemory(char const*, bool) [/usr/local/bin/node] 
4: v8::internal::Factory::NewRawOneByteString(int, v8::internal::PretenureFlag) [/usr/local/bin/node] 
5: v8::internal::Factory::NumberToString(v8::internal::Handle<v8::internal::Object>, bool) [/usr/local/bin/node] 
6: v8::internal::Runtime_NumberToStringSkipCache(int, v8::internal::Object**, v8::internal::Isolate*) [/usr/local/bin/node] 
7: 0x18caa0a079a7 
8: 0x18caa0f37cbc 
Abort trap: 6 

回答

0

我弄清楚了问题所在。这是一个异步问题:while 循环会同步地一直运行下去,不会等待异步的 Nightmare 操作,因此 Nightmare 实例永远没有机会真正执行。我把原先"自动点击进入下一页"的方案改为直接在网址的 &pageNumber= 参数中计算下一页,并使用为该网站自定义的分页对象来传递页码、网址和页面条目数等数据。我还添加了一些调试输出。

/**
 * Pagination state parsed from a results summary such as
 * "1 - 20 of 83,829 results", plus the URL of the page to load next.
 *
 * The summary is split on single spaces; token 0 becomes `currentPage`,
 * token 2 becomes `itemsPerPage` and token 4 (thousands separators removed)
 * becomes `totalItems`.
 *
 * NOTE(review): tokens 0 and 2 look like the first and last item index of the
 * listed range, so they only equal "page number" and "page size" when the
 * summary comes from the first page - confirm against the caller.
 *
 * @param {string} url - Address of the first results page.
 * @param {string} pgd - Text of the results summary.
 */
function Paginate(url, pgd) {

    this.url = url;
    this.array = pgd.split(" ");

    this.currentPage = Number(this.array[0]);
    // Strip thousands separators ("83,829" -> "83829") before converting.
    this.totalItems = Number((this.array[4] || "").replace(/,/g, ""));
    this.itemsPerPage = Number(this.array[2]);
    // Round up: a trailing, partially filled page is still a page.
    this.totalPages = Math.ceil(this.totalItems / this.itemsPerPage);
    this.next = true;

    /**
     * Advances to the following page and points `url` at it.
     *
     * `url` and `currentPage` describe the page that is loaded next, so
     * `next` is cleared only once `currentPage` has moved past the last page;
     * clearing it on the last page would leave that page unvisited
     * (assumes the caller loads `url` first and calls update() afterwards).
     */
    this.update = () => {
        this.currentPage += 1;
        if (this.currentPage > this.totalPages) {
            this.next = false;
        }

        // Rebuild the query from the original URL: keep every other parameter
        // and its '&' separators, and replace any existing pageNumber.
        const queryStart = url.indexOf("?");
        const base = queryStart === -1 ? url : url.slice(0, queryStart);
        const query = queryStart === -1 ? "" : url.slice(queryStart + 1);
        const params = query
            .split("&")
            .filter(param => param !== "" && !/^pageNumber=/i.test(param));
        params.push("pageNumber=" + this.currentPage);

        this.url = base + "?" + params.join("&");
    };

    /** Logs the current pagination state to the console. */
    this.state = () => {
        console.log("-------- Pagination ----------");
        console.log("current page: " + this.currentPage);
        console.log("total pages: " + this.totalPages);
        console.log("total items: " + this.totalItems);
        console.log("items per page: " + this.itemsPerPage);
        console.log("has next page: " + this.next);
        console.log("current url: " + this.url);
        console.log("------------------------------\n");
    };

}

我使用 async 库的 whilst 来代替同步的 while 循环,并在每次迭代中执行 Nightmare。

/**
 * Loads result pages one after another and hands each page's HTML to
 * scrapeImages, using async.whilst so the next page is only requested after
 * the previous one has been processed.
 *
 * A failure on any page is passed to `next`, which stops the loop and is
 * reported once by the completion callback. Without that, `next` is never
 * called after an error and the loop stalls forever.
 *
 * NOTE(review): `hasNext` is a synchronous test, which matches the
 * async v2-style `whilst` signature - confirm against the installed version.
 *
 * @param {object} paginate - Pagination state; `next`, `currentPage`, `url`,
 *     `update()` and `state()` are used.
 * @param {number} [pageLimit=10] - Scraping stops once `paginate.currentPage`
 *     reaches this value; the default keeps the previous hard-coded cap.
 * @returns {object} The shared `nightmare` instance, returned right after the
 *     loop has been started.
 */
function scrapeEach(paginate, pageLimit = 10) {
    const hasNext = () => paginate.next && paginate.currentPage < pageLimit;

    async.whilst(hasNext, next => {
        nightmare
            .goto(paginate.url)
            .wait(4000)
            .evaluate(() => {
                // Runs inside the page, not in Node.
                return document.body.innerHTML
            })
            .then(result => {
                scrapeImages(result);
                paginate.update();
                paginate.state();
            })
            // Two-argument form: `next` is called exactly once per iteration,
            // with the error when the page could not be loaded or processed.
            .then(() => next(), error => next(error));
    }, err => {
        if (err) {
            console.error('Search failed:', err);
            return;
        }
        console.log("finished!");
    });

    return nightmare;
}
0

限制并行运行的数量

// Upper bound for `running`; the loop below stops queueing once it is reached.
// This can be increased.
let limit = 10;
// Page loads that have been queued and have not yet reported back.
let running = 0;

/**
 * Queues page loads while a next page is reported and fewer than `limit`
 * loads are outstanding, then calls `nightmare.end()`.
 *
 * Each queued load waits 4 s, clicks '#pageNext .results-next', waits 4 s and
 * reads document.body.innerHTML. The HTML goes to scrapeImages; its callback
 * advances and logs the pagination state and releases one slot. A failure is
 * logged and also releases one slot.
 *
 * NOTE(review): nothing visible here re-enters the loop after a slot is
 * released, so a single call presumably queues at most `limit` loads, all on
 * the same `nightmare` instance - confirm this is the intended behaviour.
 *
 * @param {object} paginate - Pagination state; `next`, `update()` and `state()` are used.
 * @returns {*} Whatever `nightmare.end()` returns.
 */
function scrapeEach(paginate) {
  // Runs inside the page, not in Node.
  const grabHtml = () => document.body.innerHTML;

  while (paginate.next && running < limit) {
    running += 1;

    const page = nightmare
      .wait(4000)
      .realClick('#pageNext .results-next')
      .wait(4000)
      .evaluate(grabHtml);

    page
      .then((html) => {
        scrapeImages(html, () => {
          paginate.update();
          paginate.state();
          running -= 1;
        });
      })
      .catch((error) => {
        console.error('Search failed:', error);
        running -= 1;
      });
  }

  return nightmare.end();
}


    /**
     * Extracts one record per '#returns > li' element from a page of HTML,
     * passes the resulting list to Artwork.addArt and then invokes `cb`.
     *
     * NOTE(review): the '[email protected]' fragments in the `img` and
     * `title` selectors look like email-obfuscation residue from the page this
     * snippet was copied from; the originals were presumably attribute
     * selectors such as 'img@src' - confirm against the real source.
     *
     * @param {string} html - Markup of one results page.
     * @param {Function} cb - Called with no arguments after the records were stored.
     * @throws Rethrows whatever error the extractor reports; `cb` is not called then.
     */
    function scrapeImages(html ,cb) {
        // Field name -> selector, resolved relative to each '#returns > li'.
        const artworkFields = {
            img: 'dl.return-art > dd > a > [email protected]',
            title: 'dl.return-art > dt > [email protected]',
            created: 'dl.return-art > .created',
            medium: 'dl.return-art > .medium',
            dimensions: 'dl.return-art > .dimensions',
            credit: 'dl.return-art > .credit',
            accession: 'dl.return-art > .accession'
        };

        const extract = xr(html, '#returns > li', [artworkFields]);

        extract((failure, artworks) => {
            if (failure) {
                throw failure;
            }
            Artwork.addArt(artworks);
            cb();
        });
    }
+0

我不知道该如何使用这段代码。我需要在 while 循环中判断一个条件:如果存在下一页,就自动点击进入下一页。 –

+0

哪个条件? –

+0

条件是:如果存在下一页,页面上就会有"下一页"按钮。 –