2012-10-26 75 views
9
// Question snippet: fetch http://www.nodejs.org/ and, for every chunk of the
// response body, log the result of matching a <title> pattern against it.
var http = require('http'); 
var urlOpts = {host: 'www.nodejs.org', path: '/', port: '80'}; 
http.get(urlOpts, function (response) { 
// The 'data' handler runs once per received chunk, so one value is logged per chunk.
response.on('data', function (chunk) { 
var str=chunk.toString(); 
// The pattern is built from a string literal, so "\s" and "\>" are string
// escapes, not regex escapes: the RegExp actually receives "s" and ">".
var re = new RegExp("(<\s*title[^>]*>(.+?)<\s*/\s*title)\>", "g") 
// With the 'g' flag, match() returns an array of whole matches (capture groups
// are not included), or null when the chunk contains no match.
console.log(str.match(re)); 
}); 

}); 

输出(从抓取的网页中获取页面标题):

[email protected] ~ $ node app.js [ 'node.js' ] null null

我只需要获取标题(title)。

回答

7

我建议使用 RegExp.exec 代替 String.match。您还可以使用字面量语法来定义正则表达式,并且只定义一次:

// Fetch http://www.nodejs.org/ and print the text of its <title> element.
var http = require('http');
var urlOpts = {host: 'www.nodejs.org', path: '/', port: '80'};
// Compiled once, outside the handler. Capture group 2 holds the title text.
// Deliberately no 'g' flag: a global regex keeps lastIndex between exec()
// calls, so reusing it on the next chunk would start the search part-way into
// a different string. Only the first match is needed here.
var re = /(<\s*title[^>]*>(.+?)<\s*\/\s*title)>/i;
http.get(urlOpts, function (response) {
    // Each chunk is searched on its own, so a title that is split across two
    // chunks is not found by this code.
    response.on('data', function (chunk) {
        var str = chunk.toString();
        var match = re.exec(str);
        if (match && match[2]) {
            console.log(match[2]);
        }
    });
}).on('error', function (err) {
    // Without a listener, a failed request (DNS, connection refused, ...)
    // emits an unhandled 'error' event, which terminates the process.
    console.error('Request failed: ' + err.message);
});

代码还假定 title 完整地位于同一个数据块(chunk)中,而不会被拆分到两个块之间。或许最好把各个数据块累积起来,以防 title 被拆分在块之间。一旦找到 title,您可能还想停止继续查找。

+0

@argonius在他的示例中有一个好处,那就是除了'g'之外,您应该也可以使用'i'标志来使正则表达式不区分大小写(因为''标签的外壳不是'保证是小写的,特别是如果文档不是XHTML)。当使用JavaScript设置标题时, – <span class="text-secondary"> <small> <a rel="noopener" target="_blank" href="https://stackoverflow.com/users/2688/">bdukes</a></span> <span></span> </small> </span> </p> </div> </div> </div> <div itemprop="comment" class="post-comment"> <div class="row"> <div class="col-lg-1"><span class="text-secondary">+0</span></div> <div class="col-lg-11"> <p class="commenttext">不适用于SPA网页。你应该使用谷歌浏览器之类的无头浏览器 – <span class="text-secondary"> <small> <span></span> </small> </span> </p> </div> </div> </div> </div> </div> </article> <div> <script async src="//pagead2.googlesyndication.com/pagead/js/adsbygoogle.js"></script> <ins class="adsbygoogle" style="display:block" data-ad-client="ca-pub-6208739752673518" data-ad-slot="4319274062" data-ad-format="auto" data-full-width-responsive="true"></ins> <script> (adsbygoogle = window.adsbygoogle || []).push({}); </script> </div> <article class="board-top-1 padding-top-10"> <div class="post-col vote-info"> <span class="count">2<i class="fa fa-thumbs-up"></i></span> </div> <div class="post-offset"> <div class="answer fmt"> <p>试试这个:</p> <pre><code class="prettyprint-override">var re = new RegExp("<title>(.*?)</title>", "i"); console.log(str.match(re)[1]); </code></pre> </div> <div class="post-info"> <div class="post-meta row"> <p class="text-secondary col-lg-6"> <span class="source"> <a rel="noopener" target="_blank" href="https://stackoverflow.com/q/13088064">来源</a> </span> </p> <p class="text-secondary col-lg-6"> <span class="float-right date"> <span>2012-10-26 13:40:43</span> <a rel="noopener" target="_blank" href="https://stackoverflow.com/users/915320/">gradosevic</a></span> </p> <p class="col-12"></p> <p class="col-12"></p></div> </div> <!-- comments --> <div class="comments"> <div itemprop="comment" class="post-comment"> <div class="row"> <div class="col-lg-1"><span class="text-secondary">+0</span></div> <div 
class="col-lg-11"> <p class="commenttext">'E:\╨а╨░╨▒╨╛╤З╨╕╨╣╤Б╤В╨╛╨╗\ dev的\ app.js:7 的console.log(str.match(RE)[1 ]); ^ 类型错误:无法在IncomingMessage看空 的特性“1”。 <anonymous>(E:\╨а╨░╨▒╨╛╤З╨╕╨╣╤Б╤В╨╛╨╗\ dev的\ app.js:7:26) 在IncomingMessage.EventEmitter.emit(events.js: (http.js:359:10) at HTTPParser.parserOnBody [as onBody](http.js:123:21) at Socket.socketOnData [as ondata](http.js: 1367:20) at TCP.onread(net.js:403:27)' – <span class="text-secondary"> <small> <a rel="noopener" target="_blank" href="https://stackoverflow.com/users/1777212/">user1777212</a></span> <span></span> </small> </span> </p> </div> </div> </div> <div itemprop="comment" class="post-comment"> <div class="row"> <div class="col-lg-1"><span class="text-secondary">+0</span></div> <div class="col-lg-11"> <p class="commenttext">为我工作,谢谢! – <span class="text-secondary"> <small> <a rel="noopener" target="_blank" href="https://stackoverflow.com/users/2162226/">gnB</a></span> <span></span> </small> </span> </p> </div> </div> </div> </div> </div> </article> </div> <div class="clearfix"> </div> <div class="relative-box"> <div class="relative">相关问题</div> <ul class="relative_list"> <li> 1. <a href="http://cn.voidcc.com/question/p-hnzhodoc-kd.html" target="_blank" title="从前一页获取页面标题"> 从前一页获取页面标题 </a> </li> <li> 2. <a href="http://cn.voidcc.com/question/p-ncsbfvsl-et.html" target="_blank" title="从网页上刮取网页数据"> 从网页上刮取网页数据 </a> </li> <li> 3. <a href="http://cn.voidcc.com/question/p-ylxlsfew-uh.html" target="_blank" title="rvest从网页的html页面刮"> rvest从网页的html页面刮 </a> </li> <li> 4. <a href="http://cn.voidcc.com/question/p-unvunrti-bb.html" target="_blank" title="从网页刮取HTML? - VB.NET"> 从网页刮取HTML? - VB.NET </a> </li> <li> 5. <a href="http://cn.voidcc.com/question/p-bmuchift-we.html" target="_blank" title="如何获取网页的网站名称和页面标题"> 如何获取网页的网站名称和页面标题 </a> </li> <li> 6. <a href="http://cn.voidcc.com/question/p-sqwuxxku-sq.html" target="_blank" title="获取页面标题"> 获取页面标题 </a> </li> <li> 7. 
<a href="http://cn.voidcc.com/question/p-bbcicuok-z.html" target="_blank" title="从页面表中获取标题"> 从页面表中获取标题 </a> </li> <li> 8. <a href="http://cn.voidcc.com/question/p-wjnucjfp-bap.html" target="_blank" title="从redirrected页面获取标题信息"> 从redirrected页面获取标题信息 </a> </li> <li> 9. <a href="http://cn.voidcc.com/question/p-gdkxybew-rh.html" target="_blank" title="如何从Google表格中的网址获取页面标题?"> 如何从Google表格中的网址获取页面标题? </a> </li> <li> 10. <a href="http://cn.voidcc.com/question/p-nnxfhvrs-kr.html" target="_blank" title="刮内容从网站页面"> 刮内容从网站页面 </a> </li> <li> 11. <a href="http://cn.voidcc.com/question/p-vadicrfl-cg.html" target="_blank" title="从网页从刮HTML"> 从网页从刮HTML </a> </li> <li> 12. <a href="http://cn.voidcc.com/question/p-ayfwqxei-sn.html" target="_blank" title="网页刮伤问题"> 网页刮伤问题 </a> </li> <li> 13. <a href="http://cn.voidcc.com/question/p-mpofpomx-uy.html" target="_blank" title="刮网页的问题"> 刮网页的问题 </a> </li> <li> 14. <a href="http://cn.voidcc.com/question/p-ouomgmfp-uv.html" target="_blank" title="获取Facebook页面ID(刮擦)"> 获取Facebook页面ID(刮擦) </a> </li> <li> 15. <a href="http://cn.voidcc.com/question/p-cotnvpgx-bbr.html" target="_blank" title="问题与CSV格式从网页刮"> 问题与CSV格式从网页刮 </a> </li> <li> 16. <a href="http://cn.voidcc.com/question/p-glkkrsxs-nv.html" target="_blank" title="PHP从表HTML标记网页刮"> PHP从表HTML标记网页刮 </a> </li> <li> 17. <a href="http://cn.voidcc.com/question/p-pfdbgiii-xb.html" target="_blank" title="php从simplehtmldom获取错误,当试图获取网页的下一页刮"> php从simplehtmldom获取错误,当试图获取网页的下一页刮 </a> </li> <li> 18. <a href="http://cn.voidcc.com/question/p-rfcnereo-md.html" target="_blank" title="从网页表中刮取值"> 从网页表中刮取值 </a> </li> <li> 19. <a href="http://cn.voidcc.com/question/p-zfxnqvvl-kg.html" target="_blank" title="从网页刮取数字值?"> 从网页刮取数字值? </a> </li> <li> 20. <a href="http://cn.voidcc.com/question/p-ttfdtdej-vd.html" target="_blank" title="使用C从网页中刮取JSON#"> 使用C从网页中刮取JSON# </a> </li> <li> 21. <a href="http://cn.voidcc.com/question/p-ubezputi-dw.html" target="_blank" title="刮屏|网页抓取"> 刮屏|网页抓取 </a> </li> <li> 22. 
<a href="http://cn.voidcc.com/question/p-sklczuek-eo.html" target="_blank" title="用jsoup从页面中刮取文本"> 用jsoup从页面中刮取文本 </a> </li> <li> 23. <a href="http://cn.voidcc.com/question/p-wsnplpxm-px.html" target="_blank" title="使用JavaScript从R刮取页面"> 使用JavaScript从R刮取页面 </a> </li> <li> 24. <a href="http://cn.voidcc.com/question/p-spzjyitk-pe.html" target="_blank" title="Android:从网址获取“标题”而无需获取整个页面"> Android:从网址获取“标题”而无需获取整个页面 </a> </li> <li> 25. <a href="http://cn.voidcc.com/question/p-nslzweeq-ky.html" target="_blank" title="从网页上刮 - python"> 从网页上刮 - python </a> </li> <li> 26. <a href="http://cn.voidcc.com/question/p-uukvtxlf-qh.html" target="_blank" title="从网页上刮信息"> 从网页上刮信息 </a> </li> <li> 27. <a href="http://cn.voidcc.com/question/p-fatsloyd-wu.html" target="_blank" title="从单独的[EXTERNAL]页面(使用Javascript?)刮取/获取IMG Src"> 从单独的[EXTERNAL]页面(使用Javascript?)刮取/获取IMG Src </a> </li> <li> 28. <a href="http://cn.voidcc.com/question/p-crsytqud-oc.html" target="_blank" title="刮网页"> 刮网页 </a> </li> <li> 29. <a href="http://cn.voidcc.com/question/p-eiehelcs-pc.html" target="_blank" title="PHP网页刮"> PHP网页刮 </a> </li> <li> 30. 
<a href="http://cn.voidcc.com/question/p-gkcgcflk-qh.html" target="_blank" title="刮网页"> 刮网页 </a> </li> </ul> </div> <div> <script async src="//pagead2.googlesyndication.com/pagead/js/adsbygoogle.js"></script> <ins class="adsbygoogle" style="display:block" data-ad-format="autorelaxed" data-ad-client="ca-pub-6208739752673518" data-ad-slot="3534119089"></ins> <script> (adsbygoogle = window.adsbygoogle || []).push({}); </script> </div> <div class="padding-top-10"></div> </div> </div> <script type="text/javascript" src="http://img2.voidcc.com/voidso/script/side.js?t=1652515421853"></script> <script type="text/javascript" src="http://img2.voidcc.com/voidso/plugin/highlight/highlight.pack.js"></script> <link href="http://img2.voidcc.com/voidso/plugin/highlight/styles/docco.css" media="screen" rel="stylesheet" type="text/css" /> <script type="text/javascript"> $('pre').each(function(i, e) { hljs.highlightBlock(e, "<span class='indent'> </span>", false) }); </script> <div class="col-lg-3 col-md-4 col-sm-5"> <div id="rightTop"> <div class="row sidebar panel panel-default"> <div class="panel-heading font-bold"> 每日一句 </div> <div class="panel-body m-b-sm m-t-sm clearfix"> 每一个你不满意的现在,都有一个你没有努力的曾经。 </div> </div> <div class="row"> <script async src="//pagead2.googlesyndication.com/pagead/js/adsbygoogle.js"></script> <!-- VOIDCC问答侧边栏广告 --> <ins class="adsbygoogle" style="display:block" data-ad-client="ca-pub-6208739752673518" data-ad-slot="3862022848" data-ad-format="auto" data-full-width-responsive="true"></ins> <script> (adsbygoogle = window.adsbygoogle || []).push({}); </script> </div> <div class="row sidebar panel panel-default"> <div class="panel-heading font-bold"> 最新问题 </div> <div class="m-b-sm m-t-sm clearfix"> <ul class="side_article_list"> <li class="side_article_list_item"> 1. <a href="http://cn.voidcc.com/question/p-czgszqdj-wt.html" target="_blank" title="在树状视图中构建远程机器(LAN)的目录结构"> 在树状视图中构建远程机器(LAN)的目录结构 </a> </li> <li class="side_article_list_item"> 2. 
<a href="http://cn.voidcc.com/question/p-njteqrks-bax.html" target="_blank" title="PHP类:写对象内的函数?"> PHP类:写对象内的函数? </a> </li> <li class="side_article_list_item"> 3. <a href="http://cn.voidcc.com/question/p-hcnaqvtb-xg.html" target="_blank" title="想法解决一个简单的任务与PHP和jQuery"> 想法解决一个简单的任务与PHP和jQuery </a> </li> <li class="side_article_list_item"> 4. <a href="http://cn.voidcc.com/question/p-ojmkotdv-ur.html" target="_blank" title="如何在使用Parcelable的活动之间传递具有其他对象列表的对象?"> 如何在使用Parcelable的活动之间传递具有其他对象列表的对象? </a> </li> <li class="side_article_list_item"> 5. <a href="http://cn.voidcc.com/question/p-cwchhupg-uq.html" target="_blank" title="Laravel究竟是如何处理交易死锁的?"> Laravel究竟是如何处理交易死锁的? </a> </li> <li class="side_article_list_item"> 6. <a href="http://cn.voidcc.com/question/p-enafqfpp-ug.html" target="_blank" title="测试与具有动态地产生的场的形式的控制器"> 测试与具有动态地产生的场的形式的控制器 </a> </li> <li class="side_article_list_item"> 7. <a href="http://cn.voidcc.com/question/p-byvaodju-uh.html" target="_blank" title="为什么我的单元测试试图插入一个记录,当我不问它?"> 为什么我的单元测试试图插入一个记录,当我不问它? </a> </li> <li class="side_article_list_item"> 8. <a href="http://cn.voidcc.com/question/p-afudgtda-uh.html" target="_blank" title="ArangoDB:Foxx服务中的交易限制"> ArangoDB:Foxx服务中的交易限制 </a> </li> <li class="side_article_list_item"> 9. <a href="http://cn.voidcc.com/question/p-bqscbchu-bar.html" target="_blank" title="org.w3c.dom.Document gwt转换器在GWT中的字符串"> org.w3c.dom.Document gwt转换器在GWT中的字符串 </a> </li> <li class="side_article_list_item"> 10. <a href="http://cn.voidcc.com/question/p-mfkpyrts-bbu.html" target="_blank" title="验证字符串consiting只有字母,数字和可选的空间"> 验证字符串consiting只有字母,数字和可选的空间 </a> </li> </ul> </div> </div> </div> <p class="article-nav-bar"></p> <div class="row sidebar article-nav"> <div class="row box_white visible-sm visible-md visible-lg margin-zero"> <div class="top"> <h3 class="title"><i class="glyphicon glyphicon-th-list"></i> 相关问题</h3> </div> <div class="article-relative-content"> <ul class="side_article_list"> <li class="side_article_list_item"> 1. 
<a href="http://cn.voidcc.com/question/p-hnzhodoc-kd.html" target="_blank" title="从前一页获取页面标题"> 从前一页获取页面标题 </a> </li> <li class="side_article_list_item"> 2. <a href="http://cn.voidcc.com/question/p-ncsbfvsl-et.html" target="_blank" title="从网页上刮取网页数据"> 从网页上刮取网页数据 </a> </li> <li class="side_article_list_item"> 3. <a href="http://cn.voidcc.com/question/p-ylxlsfew-uh.html" target="_blank" title="rvest从网页的html页面刮"> rvest从网页的html页面刮 </a> </li> <li class="side_article_list_item"> 4. <a href="http://cn.voidcc.com/question/p-unvunrti-bb.html" target="_blank" title="从网页刮取HTML? - VB.NET"> 从网页刮取HTML? - VB.NET </a> </li> <li class="side_article_list_item"> 5. <a href="http://cn.voidcc.com/question/p-bmuchift-we.html" target="_blank" title="如何获取网页的网站名称和页面标题"> 如何获取网页的网站名称和页面标题 </a> </li> <li class="side_article_list_item"> 6. <a href="http://cn.voidcc.com/question/p-sqwuxxku-sq.html" target="_blank" title="获取页面标题"> 获取页面标题 </a> </li> <li class="side_article_list_item"> 7. <a href="http://cn.voidcc.com/question/p-bbcicuok-z.html" target="_blank" title="从页面表中获取标题"> 从页面表中获取标题 </a> </li> <li class="side_article_list_item"> 8. <a href="http://cn.voidcc.com/question/p-wjnucjfp-bap.html" target="_blank" title="从redirrected页面获取标题信息"> 从redirrected页面获取标题信息 </a> </li> <li class="side_article_list_item"> 9. <a href="http://cn.voidcc.com/question/p-gdkxybew-rh.html" target="_blank" title="如何从Google表格中的网址获取页面标题?"> 如何从Google表格中的网址获取页面标题? </a> </li> <li class="side_article_list_item"> 10. 
<a href="http://cn.voidcc.com/question/p-nnxfhvrs-kr.html" target="_blank" title="刮内容从网站页面"> 刮内容从网站页面 </a> </li> </ul> </div> </div> </div> </div> </div> </div> </div><!-- wrap end--> <!-- footer --> <footer id="footer"> <div class="bg-simple lt"> <div class="container"> <div class="row padder-v m-t"> <div class="col-xs-8"> <ul class="list-inline"> <li><a href="http://cn.voidcc.com/contact">联系我们</a></li> <li>© 2020 CN.VOIDCC.COM</li> <li><a rel="nofollow" href="https://beian.miit.gov.cn/" target="_blank">沪ICP备13005482号-13</a></li> <li><script type="text/javascript" src="https://s9.cnzz.com/z_stat.php?id=1280098168&web_id=1280098168"></script></li> <li><a href="http://cn.voidcc.com/" target="_blank" title="程序问答园区">简体中文</a></li> <li><a href="http://hk.voidcc.com/" target="_blank" title="程序問答園區">繁體中文</a></li> <li><a href="http://ru.voidcc.com/" target="_blank" title="поле вопросов и ответов">Русский</a></li> <li><a href="http://de.voidcc.com/" target="_blank" title="Frage - und - antwort - Park">Deutsch</a></li> <li><a href="http://es.voidcc.com/" target="_blank" title="Preguntas y respuestas">Español</a></li> <li><a href="http://hi.voidcc.com/" target="_blank" title="कार्यक्रम प्रश्न और उत्तर पार्क">हिन्दी</a></li> <li><a href="http://it.voidcc.com/" target="_blank" title="IL Programma di chiedere Park">Italiano</a></li> <li><a href="http://ja.voidcc.com/" target="_blank" title="プログラム問答園区">日本語</a></li> <li><a href="http://ko.voidcc.com/" target="_blank" title="프로그램 문답 단지">한국어</a></li> <li><a href="http://pl.voidcc.com/" target="_blank" title="program o park">Polski</a></li> <li><a href="http://tr.voidcc.com/" target="_blank" title="Program soru ve cevap parkı">Türkçe</a></li> <li><a href="http://vi.voidcc.com/" target="_blank" title="Đáp ứng viên">Tiếng Việt</a></li> <li><a href="http://fr.voidcc.com/" target="_blank" title="Programme interrogation Park">Française</a></li> </ul> </div> </div> </div> </div> </div> </footer> <!-- / footer --> <script async 
src="https://www.googletagmanager.com/gtag/js?id=UA-77509369-5"></script> <script> window.dataLayer = window.dataLayer || []; function gtag() { dataLayer.push(arguments); } gtag('js', new Date()); gtag('config', 'UA-77509369-5'); </script> <script> var _hmt = _hmt || []; (function () { var hm = document.createElement("script"); hm.src = "https://hm.baidu.com/hm.js?67d4731349f0b00136755b80364ce381"; var s = document.getElementsByTagName("script")[0]; s.parentNode.insertBefore(hm, s); })(); </script> </body> </html>