2017-09-02 68 views
0

我的代码应该从特定的json像url(输出网页提供的不是JSON是必需的)。当我连接一个得到它返回我以下错误:Urllib数据获取错误Python

Traceback (most recent call last): 
     File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\threading.py", line 914, in _bootstrap_inner 
     self.run() 
     File "G:/Internship/quantsol-text/web-crawler/mynet_new/date_gaining.py", line 20, in run 
     main_func(self.counter) 
     File "G:/Internship/quantsol-text/web-crawler/mynet_new/date_gaining.py", line 166, in main_func 
     total=url_to_dict(url) 
     File "G:/Internship/quantsol-text/web-crawler/mynet_new/date_gaining.py", line 79, in url_to_dict 
     data = urllib.request.urlopen(url).read().decode('utf-8') 
     File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\urllib\request.py", line 163, in urlopen 
     return opener.open(url, data, timeout) 
     File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\urllib\request.py", line 472, in open 
     response = meth(req, response) 
     File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\urllib\request.py", line 582, in http_response 
     'http', request, response, code, msg, hdrs) 
     File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\urllib\request.py", line 510, in error 
     return self._call_chain(*args) 
     File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\urllib\request.py", line 444, in _call_chain 
     result = func(*args) 
     File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\urllib\request.py", line 590, in http_error_default 
     raise HTTPError(req.full_url, code, msg, hdrs, fp) 
    urllib.error.HTTPError: HTTP Error 404: Not Found 

有趣的是,当我试图让与连接信息B检查后10000-20000迭代工作正常,但我得到以下错误:

Exception in thread Thread-9: 
Traceback (most recent call last): 
    File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\urllib\request.py", line 1254, in do_open 
    h.request(req.get_method(), req.selector, req.data, headers) 
    File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\http\client.py", line 1106, in request 
    self._send_request(method, url, body, headers) 
    File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\http\client.py", line 1151, in _send_request 
    self.endheaders(body) 
    File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\http\client.py", line 1102, in endheaders 
    self._send_output(message_body) 
    File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\http\client.py", line 934, in _send_output 
    self.send(msg) 
    File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\http\client.py", line 877, in send 
    self.connect() 
    File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\http\client.py", line 849, in connect 
    (self.host,self.port), self.timeout, self.source_address) 
    File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\socket.py", line 711, in create_connection 
    raise err 
    File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\socket.py", line 702, in create_connection 
    sock.connect(sa) 
TimeoutError: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond 

我搜索了几个小时互联网连接错误B连接B错误主要是由于连接问题或代理。我试着用几个不同的代理这个解决方案也没有工作,要么经过一番万次给了同样的错误:

proxy_support = urllib.request.ProxyHandler({"http": "http://208.83.106.105:9999"}) 
opener = urllib.request.build_opener(proxy_support) 
urllib.request.install_opener(opener) 

有问题的部分如下:

class myThread (threading.Thread): 

    def __init__(self, threadID, name, counter): 
     threading.Thread.__init__(self) 
     self.threadID = threadID 
     self.name = name 
     self.counter = counter 
    def run(self): 
     main_func(self.counter) 

def url_to_dict(url): 
    hdr = { 
     'User-Agent': 'Chrome/60.0.3112.101 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Safari/537.11 Mozilla/55.0.2', 
     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 
     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3', 
     'Accept-Encoding': 'none', 
     'Accept-Language': 'en-US,en;q=0.8', 
     'Connection': 'keep-alive'} 


    data2= urllib.request.Request(url,headers= {'User-Agent': 'Mozilla/5.0'}) 
    # proxy_support = urllib.request.ProxyHandler({"http": "http://61.233.25.166:80"}) 
    # opener = urllib2.build_opener(proxy_support) 
    # urllib2.install_opener(opener) 



    data = urllib.request.urlopen(url).read().decode('utf-8') 
    json_type_string = re.findall('({.*})', data)[0] 
    json_data = json.loads(json_type_string) 
    total_page = json_data['data']['totalPage'] 
    return json_data,total_page 


def main_func(counter): 
    proxy_support = urllib.request.ProxyHandler({"http": "http://208.83.106.105:9999"}) 
    opener = urllib.request.build_opener(proxy_support) 
    urllib.request.install_opener(opener) 
    for x in range(len(url_list)): 
     url=url_list[x] 
     company_name=company_list[x] 
     total=url_to_dict(url) 
     total_page=total[1] 
     for y in range(int(total_page/10)): 
      index = url.find('config[page]=') 
      index2 = url.find('&config[reply') 
      k = y*10 

      url = url[:index+13] + str(counter+k) + url[index2:] 
      print(url) 
      data = url_to_dict(url) 
      parsed_data = get_data(data) 
      add_to_mongo(parsed_data,company_name) 

我能做些什么来解决这个问题?此外,什么原因得到错误404没有找到? 在此先感谢

回答

0

这不是答案(仍然不能评论),但你尝试'请求'库吗?我想这是更强大和最新的,所以..

+1

感谢它帮助! :) – azcoder