2016-12-15 100 views
1

我正在使用webbrowser抓取网站。但是,它需要花费很多时间。我需要阅读并获取源代码页。正因为如此,我决定使用Httpwebrequest。第一次我可以得到我想要的元素,但下次我没有刮。我凑的网站有SSL保护,当我尝试刮我得到以下抓取时绕过incapsula

<html> 
<head> 
<META NAME="robots" CONTENT="noindex,nofollow"> 
<script> 
(function(){function getSessionCookies(){var cookieArray=new Array();var cName=/^\s?incap_ses_/;var c=document.cookie.split(";");for(var i=0;i<c.length;i++){var key=c[i].substr(0,c[i].indexOf("="));var value=c[i].substr(c[i].indexOf("=")+1,c[i].length);if(cName.test(key)){cookieArray[cookieArray.length]=value}}return cookieArray}function setIncapCookie(vArray){var res;try{var cookies=getSessionCookies();var digests=new Array(cookies.length);for(var i=0;i<cookies.length;i++){digests[i]=simpleDigest((vArray)+cookies[i])}res=vArray+",digest="+(digests.join())}catch(e){res=vArray+",digest="+(encodeURIComponent(e.toString()))}createCookie("___utmvc",res,20)}function simpleDigest(mystr){var res=0;for(var i=0;i<mystr.length;i++){res+=mystr.charCodeAt(i)}return res}function createCookie(name,value,seconds){var expires="";if(seconds){var date=new Date();date.setTime(date.getTime()+(seconds*1000));var expires="; expires="+date.toGMTString()}document.cookie=name+"="+value+expires+"; path=/"}function test(o){var res="";var vArray=new Array();for(var j=0;j<o.length;j++){var test=o[j][0];switch(o[j][1]){case"exists":try{if(typeof(eval(test))!="undefined"){vArray[vArray.length]=encodeURIComponent(test+"=true")}else{vArray[vArray.length]=encodeURIComponent(test+"=false")}}catch(e){vArray[vArray.length]=encodeURIComponent(test+"=false")}break;case"value":try{try{res=eval(test);if(typeof(res)==="undefined"){vArray[vArray.length]=encodeURIComponent(test+"=undefined")}else if(res===null){vArray[vArray.length]=encodeURIComponent(test+"=null")}else{vArray[vArray.length]=encodeURIComponent(test+"="+res.toString())}}catch(e){vArray[vArray.length]=encodeURIComponent(test+"=cannot evaluate");break}break}catch(e){vArray[vArray.length]=encodeURIComponent(test+"="+e)}case"plugin_extentions":try{var extentions=[];try{i=extentions.indexOf("i")}catch(e){vArray[vArray.length]=encodeURIComponent("plugin_ext=indexOf is not a function");break}try{var num=navigator.plugins.length if(num==0||num==null){vArray[vArray.length]=encodeURIComponent("plugin_ext=no plugins");break}}catch(e){vArray[vArray.length]=encodeURIComponent("plugin_ext=cannot evaluate");break}for(var i=0;i<navigator.plugins.length;i++){if(typeof(navigator.plugins[i])=="undefined"){vArray[vArray.length]=encodeURIComponent("plugin_ext=plugins[i] is undefined");break}var filename=navigator.plugins[i].filename var ext="no extention";if(typeof(filename)=="undefined"){ext="filename is undefined"}else if(filename.split(".").length>1){ext=filename.split('.').pop()}if(extentions.indexOf(ext)<0){extentions.push(ext)}}for(i=0;i<extentions.length;i++){vArray[vArray.length]=encodeURIComponent("plugin_ext="+extentions[i])}}catch(e){vArray[vArray.length]=encodeURIComponent("plugin_ext="+e)}break}}vArray=vArray.join();return vArray}var o=[["navigator","exists"],["navigator.vendor","value"],["navigator.appName","value"],["navigator.plugins.length==0","value"],["navigator.platform","value"],["navigator.webdriver","value"],["platform","plugin_extentions"],["ActiveXObject","exists"],["webkitURL","exists"],["_phantom","exists"],["callPhantom","exists"],["chrome","exists"],["yandex","exists"],["opera","exists"],["opr","exists"],["safari","exists"],["awesomium","exists"],["puffinDevice","exists"],["navigator.cpuClass","exists"],["navigator.oscpu","exists"],["navigator.connection","exists"],["window.outerWidth==0","value"],["window.outerHeight==0","value"],["window.WebGLRenderingContext","exists"],["document.documentMode","value"],["eval.toString().length","value"]];try{setIncapCookie(test(o));document.createElement("img").src="/_Incapsula_Resource?SWKMTFSR=1&e="+Math.random()}catch(e){img=document.createElement("img");img.src="/_Incapsula_Resource?SWKMTFSR=1&e="+e}})(); 
</script> 
<script> 
(function() { 
var z="";var b="7472797B766172207868723B76617220743D6E6577204461746528292E67657454696D6528293B766172207374617475733D227374617274223B7661722074696D696E673D6E65772041727261792833293B77696E646F772E6F6E756E6C6F61643D66756E6374696F6E28297B74696D696E675B325D3D22723A222B286E6577204461746528292E67657454696D6528292D74293B646F63756D656E742E637265617465456C656D656E742822696D6722292E7372633D222F5F496E63617073756C615F5265736F757263653F4553324C555243543D363726743D373826643D222B656E636F6465555249436F6D706F6E656E74287374617475732B222028222B74696D696E672E6A6F696E28292B222922297D3B69662877696E646F772E584D4C4874747052657175657374297B7868723D6E657720584D4C48747470526571756573747D656C73657B7868723D6E657720416374697665584F626A65637428224D6963726F736F66742E584D4C4854545022297D7868722E6F6E726561647973746174656368616E67653D66756E6374696F6E28297B737769746368287868722E72656164795374617465297B6361736520303A7374617475733D6E6577204461746528292E67657454696D6528292D742B223A2072657175657374206E6F7420696E697469616C697A656420223B627265616B3B6361736520313A7374617475733D6E6577204461746528292E67657454696D6528292D742B223A2073657276657220636F6E6E656374696F6E2065737461626C6973686564223B627265616B3B6361736520323A7374617475733D6E6577204461746528292E67657454696D6528292D742B223A2072657175657374207265636569766564223B627265616B3B6361736520333A7374617475733D6E6577204461746528292E67657454696D6528292D742B223A2070726F63657373696E672072657175657374223B627265616B3B6361736520343A7374617475733D22636F6D706C657465223B74696D696E675B315D3D22633A222B286E6577204461746528292E67657454696D6528292D74293B6966287868722E7374617475733D3D323030297B706172656E742E6C6F636174696F6E2E72656C6F616428297D627265616B7D7D3B74696D696E675B305D3D22733A222B286E6577204461746528292E67657454696D6528292D74293B7868722E6F70656E2822474554222C222F5F496E63617073756C615F5265736F757263653F535748414E45444C3D3234303839313831303338343234373432322C31343334313032313035373436353238383130322C31343539333639393136373936303235393635352C313935393639222C66616C7365293B7868722E73656E64286E756C6C297D63617463682863297B7374617475732B3D6E6577204461746528292E67657454696D6528292D742B2220696E6361705F6578633A20222B633B646F63756D656E742E637265617465456C656D656E742822696D6722292E7372633D222F5F496E63617073756C615F5265736F757263653F4553324C555243543D363726743D373826643D222B656E636F6465555249436F6D706F6E656E74287374617475732B222028222B74696D696E672E6A6F696E28292B222922297D3B";for (var i=0;i<b.length;i+=2){z=z+parseInt(b.substring(i, i+2), 16)+",";}z = z.substring(0,z.length-1); eval(eval('String.fromCharCode('+z+')'));})(); 
</script></head> 
<body> 
<iframe style="display:none;visibility:hidden;" src="//content.incapsula.com/jsTest.html" id="gaIframe"></iframe> 
</body></html> 

这里我的代码

  ServicePointManager.ServerCertificateValidationCallback += (sender, certificate, chain, sslPolicyErrors) => { return true; }; 

      string address = string.Format(@"https://www.example.com"); 

      HttpWebRequest request = (HttpWebRequest)WebRequest.Create(address); 
      //request.Proxy = WebProxy.GetDefaultProxy(); 

       request.UserAgent = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.2.12) Gecko/20101026 Firefox/3"; 
       request.Accept = "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*;q=0.8"; 
       request.Accept = "Accept-Language: tr-TR,tr;q=0.8,en-US;q=0.5,en;q=0.3"; 
       request.Accept = "Accept-Encoding: gzip, deflate, br"; 

       //request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate; 
       request.Headers.Add("Upgrade-Insecure-Requests", "1"); 
       request.Referer = "https://www.example.com/page.html"; 
       string strData = ""; 

      HttpWebResponse response = (HttpWebResponse)request.GetResponse(); 
      System.IO.Stream stream = response.GetResponseStream(); 
       System.Text.Encoding ec = System.Text.Encoding.GetEncoding("utf-8"); 
       System.IO.StreamReader reader = new System.IO.StreamReader(stream, ec); 
       strData = reader.ReadToEnd(); 

这里请求头错误

Host: www.example.com 
User-Agent: Mozilla/5.0 (Windows NT 6.3; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0 
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8 
Accept-Language: tr-TR,tr;q=0.8,en-US;q=0.5,en;q=0.3 
Accept-Encoding: gzip, deflate, br 
Referer: https://www.example.com/mgrp115.html 
Cookie: visid_incap_969915=n/UA1sPWSRqcLHS8izlZl/vJOlgAAAAAQkIPAAAAAACAbNh4AS7Fy71tyrvY4hm5/8klCVy0ZPw6; last_domain_id=26; __utma=185813676.385095112.1480247807.1481740765.1481816400.14; __utmz=185813676.1480247807.1.1.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); incap_ses_473_969915=lZEeZH8zLDzHexwQH2+QBka5UlgAAAAAiNkwWJCksFcH1rQFP4yccA==; GAMBLINGSESS=mii1g4hdedjjimpatgd1p93gld3b5h8l; nlbi_969915=3CsSHBl0mTjavKlP18U7bQAAAADGlJZO8Hu2ocuraCIlqUwK; __utmb=185813676.16.10.1481816400; __utmc=185813676; docscrollltop=0; live_box_sport_status1=true; __utmt=1 
Connection: keep-alive 
Upgrade-Insecure-Requests: 1 

HERE响应头

Cache-Control: no-store, no-cache, must-revalidate, post-check=0, pre-check=0 
Content-Encoding: gzip 
Content-Length: 12889 
Content-Type: text/html; charset=UTF-8 
Date: Thu, 15 Dec 2016 17:09:44 GMT 
Expires: Thu, 19 Nov 1981 08:52:00 GMT 
Last-Modified: Thu, 15 Dec 2016 17:09:44 GMT 
Pragma: no-cache 
Server: Apache/2.2.22 (Linux/SUSE) 
Set-Cookie: last_domain_id=26; expires=Fri, 15-Dec-2017 17:09:44 GMT; path=/; domain=.example.com 
Vary: Accept-Encoding 
X-Cdn: Incapsula 
X-Firefox-Spdy: h2 
x-iinfo: 8-44586852-44579876 PNNN RT(1481821783171 0) q(0 0 0 -1) r(1 1) U2 

你能帮助解决这个问题吗?

在此先感谢。

+0

@AlfieGoodacre当然。 – Quicksilver

+0

这是一个笑话,指的是这个问题不一定是巨大的事实,请看一下[this](http://stackoverflow.com/help/mcve) –

+0

您的代码在第一个代码片段中可以达到超过6500字符供我们查看,其中3700个是可能真正导致问题的JavaScript函数。 –

回答

1

你被Incapsula封锁,它检查你可以在你用来发送请求的工具上运行javascript。我看到三个选项:

  1. 使用第三方工具:在github上使用htmlagilitypack与方法HtmlWeb.LoadFromBrowser或此行吟项目在GitHub上incapsula,饼干-PY 3
  2. 构建使用在您的网站工作的变通办法自己的工具。 (这不太可能作为本论坛的回复)
  3. 使用浏览器引擎刮取数据。在浏览器中发送您的请求,保存页面,并使用您的工作刮.net
+0

虽然这个链接可能回答这个问题,但最好在这里包含答案的重要部分并提供参考链接。如果链接页面更改,则仅链接答案可能会失效。 - [来自评论](/ review/low-quality-posts/17642403) – TryingToImprove

+0

谢谢。我试图变得更有建设性 – sofsntp

+0

@sofsntp我可以在私人信息中问你一个问题,或者我可以怎样与你联系,先生?顺便谢谢你的回答 – Quicksilver