1
我想创建一个Web刮板,用户在表单中输入一个URL,当它们提交时,刮板获取URL,然后返回关于我指定的URL的数据。如何组合两个需要侦听端口的Express模块?
我主要app.js文件是:
// Dependencies
var express = require('express');
var path = require('path');
var fs = require('fs');
// Custom Libraries - ./ signals to node not to look in the node_modules directory
var scraper = require('./scraper');
// App.js Variables
var app = express();
var viewsPath = path.join(__dirname, '/app/views');
app.use(express.static(__dirname + '/app/public'));
// set the port - 3000
app.set('port', process.env.PORT || 3000);
// Form handling
app.use(require('body-parser').urlencoded({
extended:true }));
app.get('/the_test');
// Writes the domain entered in the form to app/data/domain.txt
app.post('/process', function(request, response){
var domain = request.body.domain;
fs.writeFile('app/data/domain.txt', domain, function (err) {
if (err) return console.log(err);
console.log('Your domain has been saved!');;
});
response.redirect(303, '/results');
});
// Routes require
var routes = require('./routes');
app.use('/', routes);
app.use('/results', routes);
app.listen(app.get('port'), function(){
console.log('Express started on http://localhost:' + app.get('port') + '; press Ctrl-C to terminate.');
});
我刮板文件是:
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var scraper = express();
// Scrape the url that was posted
scraper.get('/scrape', function(req, res){
// Scrape this
var url = fs.readFileSync('./app/data/domain.txt', 'utf8');
request(url, function(error, response, html){
if(!error){
var $ = cheerio.load(html);
var header;
var json = { header : ""};
$('.hero-message').filter(function(){
var data = $(this);
header = data.children().first().text();
json.header = header;
});
} else {
console.log(error);
}
fs.writeFile('./app/data/results.json', JSON.stringify(json, null, 4), function(err){
console.log('File successfully written! - Check your project directory for the output.json file');
});
res.send('Check your console!')
});
});
scraper.listen(4000);
console.log('Magic happens on port 4000');
exports = module.exports = scraper;
当我去到本地主机:3000,用户可以输入网址并点击提交,它们被重定向到localhost:3000 /结果,并且URL被记录在data/domain.txt中。
当我到localhost:4000/scrape时,scraper激活,从domain.txt抓取域并将其擦除。
我的问题是我该如何制作这个流体程序和/或如何自动激活刮刀而不是每次都去localhost:4000 /刮?我对Node.js和Express很陌生,意识到这是很多难看的代码。
任何提示将不胜感激。