
I am trying to scrape an ASPX site: https://www.aae.org/patients/find.aspx (for testing purposes, use 33133 as the zip code and 100 as the radius), but I cannot get past the second results page.

Initially I iterated over the search result pages to collect the profile links. I successfully got the 20 links on the first page, but could not get beyond page 1; the returned source says: "We're sorry, the page or file you are looking for could not be found."
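For context: ASP.NET WebForms drives pagination through a postback, so every "next page" request has to echo back the hidden state fields from the previous response, together with an __EVENTTARGET/__EVENTARGUMENT pair naming the results grid and the page to render. A minimal sketch of the payload one pagination POST must carry (field names taken from my code below):

# One WebForms pagination postback. viewstate and eventvalidation
# must be scraped from the page you are paginating FROM.
postback = {
    '__EVENTTARGET': 'ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$grdResults',
    '__EVENTARGUMENT': 'Page$2',           # which page of the grid to render
    '__VIEWSTATE': viewstate,              # hidden field from the previous response
    '__EVENTVALIDATION': eventvalidation,  # ditto
}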

Please see my code below:

#!/usr/bin/env python 
# -*- coding: utf-8 -*- 
import sys, re 
import urllib.request, urllib.parse, time, csv 
from bs4 import BeautifulSoup 
from lxml import html 
from sys import argv 

profile_links = [] 

def result_checker(tree):
    # Bail out when the results table reports "No results".
    No_results = tree.xpath('//td[@colspan="3"]//p//text()')
    if "No results" in str(No_results):
        print(str(No_results).replace("['", "").replace(".']", "") + " for other zipcodes")
        time.sleep(10)
        sys.exit()

def Get_data(zipcode, radius):
    # NOTE: headers is built but never actually sent with the requests below.
    headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
               'Accept-Encoding': 'gzip, deflate',
               'Accept-Language': 'en-US,en;q=0.8,pt;q=0.6',
               'Connection': 'keep-alive',
               'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
               'Host': 'www.tcms.com',
               'Origin': 'https://www.aae.org',
               'Referer': 'https://www.aae.org/patients/find.aspx'}

    class MyOpener(urllib.request.FancyURLopener):
        version = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17'

    myopener = MyOpener()
    url = 'https://www.aae.org/patients/find.aspx'
    f = myopener.open(url)
    soup = BeautifulSoup(f, 'lxml')
    # Hidden WebForms state fields from the landing page.
    viewstate = soup.select("#__VIEWSTATE")[0]['value']
    eventvalidation = soup.select("#__EVENTVALIDATION")[0]['value']
    EktronClientManager = soup.select("#EktronClientManager")[0]['value']

    formData = (('__EVENTVALIDATION', eventvalidation),
                ('__VIEWSTATE', viewstate),
                ('ctl00$ctl00$aaeUtilitySiteSearchWidget$tbxSiteSearch', 'Search'),
                ('ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$ddlRadius', radius),
                ('ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$txtZipCode', zipcode),
                ('EktronClientManager', EktronClientManager),
                ('ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$btnFind', 'SEARCH'))

    encodedFields = urllib.parse.urlencode(formData)
    f1 = myopener.open(url, encodedFields)  # POSTs the search form
    source = f1.read()
    with open('sample.txt', 'w') as target:
        target.write(str(source))
    source1 = html.fromstring(source)
    result_checker(source1)
    links = source1.xpath("//table[@class='Results']//tr//a//@href")
    for each in links:
        # Keep only profile links. (The original test `"MemberID" and "AddressID" in each`
        # only checked for "AddressID".)
        if "MemberID" in each and "AddressID" in each:
            print(each)
            profile_links.append("https://www.aae.org/patients/" + str(each))

    j = 2
    # Re-scrape the state fields from the results page before paginating.
    soup2 = BeautifulSoup(source, 'lxml')
    viewstate = soup2.select("#__VIEWSTATE")[0]['value']
    eventvalidation = soup2.select("#__EVENTVALIDATION")[0]['value']

    while j < 5:
        pages = 'Page$' + str(j)
        print(pages, '\n---------------')
        formData1 = (('__EVENTTARGET', 'ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$grdResults'),
                     ('__EVENTARGUMENT', pages),
                     ('__VIEWSTATE', viewstate),
                     ('__EVENTVALIDATION', eventvalidation),
                     ('ctl00$ctl00$aaeUtilitySiteSearchWidget$tbxSiteSearch', 'Search'))

        encodedFields1 = urllib.parse.urlencode(formData1)
        f2 = myopener.open(url, encodedFields1)
        source2 = f2.read()
        with open('sample.txt', 'w') as target:
            target.write(str(source2))
        source3 = html.fromstring(source2)
        links2 = source3.xpath("//table[@class='Results']//tr//a//@href")
        for each1 in links2:
            if "MemberID" in each1 and "AddressID" in each1:
                print(each1)
                profile_links.append("https://www.aae.org/patients/" + str(each1))
        soup3 = BeautifulSoup(source2, 'lxml')
        viewstate = soup3.select("#__VIEWSTATE")[0]['value']
        eventvalidation = soup3.select("#__EVENTVALIDATION")[0]['value']
        j += 1

if __name__ == "__main__": 
    #Get_data('38132', 5) 
    Get_data('33133', 100) 

Any suggestions?


@Greg, thanks for the edit suggestion. Could you please take a look at my code and point out what I am doing wrong?


The code is a bit long, so it is hard to spot the problem quickly. Scraping ASP.NET websites is a pain in the ass... are you sure you are saving your cookies and passing them along between requests?
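For reference, the urllib counterpart of that advice is an opener built around http.cookiejar; FancyURLopener, as used in the question, does not store cookies at all. A minimal sketch:

import urllib.request, http.cookiejar

# A cookie-aware opener: the jar captures Set-Cookie headers and
# replays them on every later request made through this opener.
jar = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(jar))
# use opener.open(url) / opener.open(url, data) in place of MyOpener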

Answer


Yes, Greg Sadetsky, you were absolutely right about the cookies: the fix was to create a session, carry its cookies across requests, and send the required data parameters with every POST.

With the help of the Requests lib I was able to create a session that stores cookies and reuses them across requests.
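The crucial difference is that a requests.Session keeps a cookie jar and automatically re-sends those cookies on every subsequent request, which the FancyURLopener in my original code was not doing. A minimal sketch of that behavior (httpbin.org used purely as a demo endpoint):

import requests

s = requests.Session()
# The first response sets a cookie; the session stores it...
s.get('https://httpbin.org/cookies/set/session_id/abc123')
# ...and sends it back automatically with the next request.
r = s.get('https://httpbin.org/cookies')
print(r.json())   # {'cookies': {'session_id': 'abc123'}}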

import requests
from bs4 import BeautifulSoup
from lxml import html

def Get_data(zipcode, radius):
    All_links = []
    url = 'https://www.aae.org/patients/find.aspx'
    # The Session keeps cookies across requests, which is what the
    # FancyURLopener version was missing.
    s = requests.Session()
    r = s.get(url)
    soup = BeautifulSoup(r.content, 'lxml')
    # Hidden WebForms state fields from the landing page.
    viewstate = soup.select("#__VIEWSTATE")[0]['value']
    eventvalidation = soup.select("#__EVENTVALIDATION")[0]['value']
    EktronClientManager = soup.select("#EktronClientManager")[0]['value']
    params = {'EktronClientManager': EktronClientManager,
              '__VIEWSTATE': viewstate,
              '__EVENTVALIDATION': eventvalidation,
              'ctl00$ctl00$aaeUtilitySiteSearchWidget$tbxSiteSearch': 'Search',
              'ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$ddlRadius': radius,
              'ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$txtZipCode': zipcode,
              'ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$btnFind': 'SEARCH'}
    r2 = s.post(url, data=params)  # submit the search form
    source = html.fromstring(r2.content)
    links = source.xpath("//table[@class='Results']//tr//a//@href")
    for each in links:
        if "MemberID" in each and "AddressID" in each:
            print(each)
            All_links.append("https://www.aae.org/patients/" + str(each))
    # Re-scrape the state fields; they change with every response.
    soup1 = BeautifulSoup(r2.content, 'lxml')
    viewstate = soup1.select("#__VIEWSTATE")[0]['value']
    eventvalidation = soup1.select("#__EVENTVALIDATION")[0]['value']
    EktronClientManager = soup1.select("#EktronClientManager")[0]['value']
    j = 2
    while j < 7:
        page = 'Page$' + str(j)
        print(page)
        # Each pagination request is a postback against the results grid.
        params1 = {'__EVENTTARGET': 'ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$grdResults',
                   '__EVENTARGUMENT': page,
                   'EktronClientManager': EktronClientManager,
                   '__VIEWSTATE': viewstate,
                   '__EVENTVALIDATION': eventvalidation,
                   'ctl00$ctl00$aaeUtilitySiteSearchWidget$tbxSiteSearch': 'Search'}
        r3 = s.post(url, data=params1)
        source1 = html.fromstring(r3.content)
        links1 = source1.xpath("//table[@class='Results']//tr//a//@href")
        for each1 in links1:
            if "MemberID" in each1 and "AddressID" in each1:
                print(each1)
                All_links.append("https://www.aae.org/patients/" + str(each1))
        soup2 = BeautifulSoup(r3.content, 'lxml')
        viewstate = soup2.select("#__VIEWSTATE")[0]['value']
        eventvalidation = soup2.select("#__EVENTVALIDATION")[0]['value']
        EktronClientManager = soup2.select("#EktronClientManager")[0]['value']
        j += 1

Get_data(33133, 100)
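Since the same hidden fields have to be re-scraped after every response, the loop could be tidied up with a small helper (get_hidden_fields is a hypothetical name; it assumes the page structure stays the same):

def get_hidden_fields(content):
    # Pull the WebForms state fields that must be echoed back on the next POST.
    soup = BeautifulSoup(content, 'lxml')
    return {'__VIEWSTATE': soup.select_one('#__VIEWSTATE')['value'],
            '__EVENTVALIDATION': soup.select_one('#__EVENTVALIDATION')['value'],
            'EktronClientManager': soup.select_one('#EktronClientManager')['value']}

Each pagination request then becomes a dict merge, e.g. params1 = {**get_hidden_fields(r3.content), '__EVENTTARGET': grid, '__EVENTARGUMENT': page}, with grid standing for the long control name above.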