1
我正在使用 Python 的 Selenium WebDriver(Chrome)。Python 中 Selenium WebDriver 的多进程处理
我可以使用多个驱动程序(driver),并让每个驱动程序分别抓取图像吗?
我想用多进程(multiprocessing)来完成以下工作
源代码
def crawl(searchText):
    """Collect JPEG image URLs from a Google Images search.

    Starts a Chrome session, loads the image-search page for ``searchText``,
    then clicks the first 20 result thumbnails one at a time. After each
    click the page is parsed through ``create_soup()`` and every ``<img>``
    whose ``src`` starts with ``http`` and ends with ``jpg`` is recorded.

    Args:
        searchText: Query text substituted into the search URL as-is
            (no URL-encoding is applied here).

    Returns:
        list: Matching image URLs in discovery order. Because the page is
        re-scanned after every click, the same URL can appear repeatedly.

    Raises:
        Any Selenium exception raised while locating or clicking a
        thumbnail propagates to the caller; the browser is still shut down.
    """
    driver = webdriver.Chrome('C:\\Users\\HYOWON\\Desktop\\Desktop\\Graduation\\Code\\Crawling\\chromedriver.exe')
    try:
        searchUrl = "https://www.google.com/search?q={}&site=webhp&tbm=isch".format(searchText)
        driver.get(searchUrl)

        imgs_urls = []  # collected image URLs
        for j in range(20):
            # Thumbnails are addressed by their zero-based data-ri attribute.
            element = driver.find_element_by_css_selector("div[data-ri = '{}'] img".format(j))
            element.click()
            sleep(1)  # give the clicked preview time to load

            soup = create_soup()
            for img in soup.find_all('img'):
                try:
                    src = img['src']
                except KeyError:
                    # <img> without a src attribute: nothing to record.
                    continue
                if src.startswith('http') and src.endswith('jpg'):
                    imgs_urls.append(src)
        return imgs_urls
    finally:
        # Always end the session, even when a lookup/click fails, so no
        # browser or chromedriver process is left behind.
        driver.quit()
修改后的代码
def _jpg_urls_from(soup):
    """Return the ``src`` of each ``<img>`` in ``soup`` that starts with ``http`` and ends with ``jpg``."""
    urls = []
    for img in soup.find_all('img'):
        try:
            src = img['src']
        except KeyError:
            # <img> without a src attribute: nothing to record.
            continue
        if src.startswith('http') and src.endswith('jpg'):
            urls.append(src)
    return urls


def crawl():
    """Collect JPEG image URLs using the three module-level drivers.

    For 50 rounds, locates the thumbnail with the current ``data-ri`` index
    in ``driver1``, ``driver2`` and ``driver3``, then for each driver in turn
    clicks its thumbnail and records every ``<img>`` ``src`` on that driver's
    page (parsed via ``create_soup(driver)``) that starts with ``http`` and
    ends with ``jpg``. The index advances by 3 after each round.

    Returns:
        list: Matching image URLs in discovery order (duplicates possible).

    Raises:
        Any Selenium exception raised while locating or clicking a
        thumbnail propagates to the caller.
    """
    drivers = (driver1, driver2, driver3)
    imgs_urls = []
    # Must be initialised here: the ``cnt += 3`` below makes ``cnt`` a local
    # name, so reading it without this assignment raises UnboundLocalError.
    cnt = 0
    for _ in range(50):
        # NOTE(review): all three drivers click the SAME index while the index
        # steps by 3 — possibly each driver was meant to take cnt, cnt + 1,
        # cnt + 2. Left as written; confirm the intent.
        selector = "div[data-ri = '" + str(cnt) + "'] img"
        # Locate in every driver first, then click/scrape one driver at a time.
        elements = [driver.find_element_by_css_selector(selector) for driver in drivers]
        for driver, element in zip(drivers, elements):
            element.click()
            # NOTE(review): a WebDriverWait that never calls .until()/.until_not()
            # does not pause; confirm which condition should be awaited.
            WebDriverWait(driver, 1)
            imgs_urls.extend(_jpg_urls_from(create_soup(driver)))
        cnt += 3
    return imgs_urls
def download_img(url, filename, directory='C:/Python/'):
    """Download ``url`` and save it as ``<filename>.jpg`` inside ``directory``.

    Args:
        url: Address of the resource to fetch (anything ``urlretrieve``
            accepts).
        filename: Base name of the output file; converted with ``str()`` and
            suffixed with ``.jpg``.
        directory: Destination folder. Defaults to ``'C:/Python/'``, the
            location that used to be hard-coded. A trailing path separator
            is added when missing; an empty string saves relative to the
            current working directory.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises, e.g. when the URL
        cannot be fetched or the destination is not writable.
    """
    full_name = str(filename) + ".jpg"
    if directory and not directory.endswith(('/', '\\')):
        directory += '/'
    urllib.request.urlretrieve(url, directory + full_name)
# Download every crawled image. Each URL gets its own index suffix; passing
# the bare ``filename`` for every URL made each download overwrite the last.
for index, url in enumerate(crawl()):
    download_img(url, "{}_{}".format(filename, index))
您需要实现一个真正的多进程队列。Selenium 是阻塞式的,这意味着它会阻止您的 Python 程序同时做其他事情:驱动程序 1 请求页面时,驱动程序 2 在驱动程序 1 完成之前无法执行任何操作。这个问题可以通过 multiprocessing 库来解决。 – eusid