2017-09-27 50 views
0

这是抓取餐馆评论的代码。我正在收集评论。如何在不写死固定数字的情况下指定翻页范围?

我目前是先指定一个固定的范围再进行抓取。但有一个问题:每家店的评论数量都不一样,评论很少的店应该尽快跳到下一家店。

我设置的范围太大了,但又不能把它缩小,因为有些店在这个范围内确实有评论。

我该如何有效地工作?

我看到了所有?(元素)搜索此代码。但我不知道我是否错误地应用了我的代码。

#python3 
import sys 
from bs4 import BeautifulSoup 
import urllib.request 
import requests 
from urllib.parse import quote 
import time 
import os 
import xlwt 
import random 

import re 

# Yelp search URL (Korean food, Seattle) ending with an empty ``start=``
# pagination parameter.  Written as adjacent string literals inside
# parentheses so the long URL can span two source lines.
FISRT_URL = (
    "https://www.yelp.com/search?"
    "find_desc=Korean+Food&find_loc=Seattle,+WA&start="
)
# Trailing query parameter; presumably appended after the start offset --
# confirm against the caller.
LAST_URL = "&cflt=korean"
def get_link(URL, doc_name):
    """Fetch one search-results page and scrape each business linked from it.

    The start offset is inserted right after the first ``'t='`` found in
    ``URL`` and that page is downloaded.  For every ``<h3>`` heading in the
    slice ``[page + 2:21]`` of the result, the first ``<a>`` link is turned
    into an absolute ``https://www.yelp.com`` URL, and ``get_text`` is called
    once per review-page offset (0, 20, 40, ... for 99 pages).

    Args:
        URL: Search URL containing ``'t='`` (e.g. ending a ``start=``
            parameter); a ``ValueError`` is raised if it does not.
        doc_name: Passed through unchanged to ``get_text``.

    Module-level globals used: ``page`` is read for the slice and
    incremented per heading; ``jisu_i`` and ``article_URL`` are assigned;
    ``num`` is declared global but not touched here.
    """
    global jisu_i
    global num
    global page
    global article_URL

    for jisu_i in range(1):
        current_page_num = 20 + jisu_i * 10
        position = URL.index('t=')
        # Parenthesised expression instead of a backslash continuation, which
        # breaks as soon as any whitespace follows the backslash.
        URL_with_page_num = (
            URL[:position + 2] + str(current_page_num) + URL[position + 2:]
        )
    print(URL_with_page_num)
    r = requests.get(URL_with_page_num)
    # NOTE(review): the body is decoded as EUC-KR with replacement characters;
    # confirm this matches the encoding the site actually serves.
    soup = BeautifulSoup(r.content.decode('euc-kr', 'replace'), "lxml")
    time.sleep(random.randint(10, 15))

    # The slice is evaluated once, so incrementing ``page`` inside the loop
    # does not change which headings are visited in this call.
    for title in soup.find_all('h3')[page + 2:21]:
        page = page + 1
        title_link = title.select('a')
        # Headings without a usable link cannot be followed; skip them rather
        # than crash on title_link[0]['href'].
        if not title_link or not title_link[0].get('href'):
            continue

        # NOTE(review): this repeats the full scrape of the same business 130
        # times; kept as-is, but confirm that the repetition is intended.
        for _ in range(130):
            print(page)
            last_URL = title_link[0]['href']
            print(last_URL)

            first_URL = "https://www.yelp.com"
            article_URL = first_URL + last_URL
            time.sleep(random.randint(15, 30))

            # Drop any existing query string.  partition() also copes with
            # links that contain no '?', where str.index would raise.
            base_url = article_URL.partition('?')[0]

            # NOTE(review): all 99 offsets are always requested.  Moving on
            # early for businesses with few reviews would need get_text to
            # report whether a page held any reviews; its return value is not
            # visible from here.
            for k in range(99):
                jisu_page_num = 0 + k * 20
                jisu_importurl = base_url + '?' + "start=" + str(jisu_page_num)
                print(jisu_importurl)

                get_text(URL, jisu_importurl, doc_name)
                time.sleep(random.randint(40, 180))

回答