2017-12-27 409 views
2

我有下面这段代码(用 pandas 导出为 CSV):

# Script from the question: walks the paginated listing for one politician,
# follows links to individual fact-check pages, and writes the scraped
# fields to "outing.csv" with pandas.
from xlsxwriter import Workbook 
import os,shutil 
import requests 
import pandas 
from bs4 import BeautifulSoup 
MAX_RETRIES = 20 
base_url='https://pagellapolitica.it/politici/sfoggio/9/matteo-renzi?page=' 

# Fetch each listing page (1..31) through a session whose adapter is
# configured with max_retries=MAX_RETRIES.
for page in range(1,32,1): 
    l=[] 
    session = requests.Session() 
    adapter = requests.adapters.HTTPAdapter(max_retries=MAX_RETRIES) 
    session.mount('https://', adapter) 
    session.mount('http://', adapter) 

    site=(base_url+str(page)+".html") 
    # print(site) 
    c=session.get(site) 
    r=c.content 
    soup=BeautifulSoup(r,'html.parser') 
    # NOTE(review): `all` shadows the builtin and is overwritten on every
    # page, so after this loop it holds only the last page's matches.
    all=soup.find_all("div",{"class":"clearfix"}) 


# NOTE(review): this loop is at top level, not nested inside the page loop
# above, so it only processes the `all` left over from the final page.
for d in all: 
    links=d.find_all("a") 
    len(links) 
    # NOTE(review): `l` is reset for every container, discarding the records
    # collected for the previous one.
    l=[] 
    # workbook = Workbook('bbb.xlsx') 
    # worksheet = workbook.add_worksheet() 
    # row +=0 
    # worksheet.write(row,0,'Link') 
    # worksheet.write(row,1,'Name ') 
    # row+=1 
    # Only links 5..16 of the container are followed.
    for a in links[5:17]: 

     # NOTE(review): rebinds `d`, the outer loop variable, to a fresh dict
     # that becomes one output record.
     d={} 
     href=(a["href"]) 
     basic_url=('https://pagellapolitica.it/') 
     site =basic_url + href 
     #print(site) 

     # Fetch the linked page with a plain requests.get (not the retrying
     # session) and parse it.
     c=requests.get(site) 
     r=c.content 
     soup=BeautifulSoup(r,'html.parser') 
     Name=soup.find("h3",{"class":"pull-left"}).text 
     Fact_checking=soup.find("label",{"class":"verdict-analisi"}).text 
     quote=soup.find("div",{"class":"col-xs-12 col-sm-6 col-smm-6 col-md-5 col-lg-5"}).text 
     # NOTE(review): rebinds `all` again, now to the "item" spans of the
     # linked page; the outer `for d in all` keeps iterating its original list.
     all=soup.find_all("span",{"class":"item"}) 
     Topic=all[0].text 
     Date=all[2].text 
     a=all[3].find("a",{"class":"" ""}) 
     Link=a["href"] 
     Text=soup.find("div",{"class":"col-xs-12 col-md-12 col-lg-12"}).text 
     d["Name"]=Name 
     d["Fact_checking"]=Fact_checking 
     d["Quote"]=quote 
     d["Economic_topic"]=Topic 
     d["Date"]=Date 
     d["Link"]=Link 
     d["Text"]=Text 
     l.append(d) 

     # NOTE(review): the DataFrame is rebuilt and "outing.csv" is rewritten
     # on every inner iteration, so the file only ever reflects the current
     # contents of `l`.
     df=pandas.DataFrame(l) 
     df.to_csv("outing.csv") 

问题是:当我把数据导出到 CSV 时,只得到 6 行结果。当我执行 print(df) 和 print(l) 时,会打印出列表中的所有数据,但当我检查 len(l) 时,得到的长度只有 6。有谁知道为什么会这样吗?提前谢谢!

+0

您可以在问题的底部添加'df.info()'输出。请务必在添加之前将其格式化为可读。 – Nitred

+5

你在内循环的每次迭代中覆盖你的CSV文件... – MaxU

+0

把 'df = pandas.DataFrame(l)' 和 'df.to_csv("outing.csv")' 移到循环的外部 –

回答

2

可以考虑用 pandas.DataFrame() 把每一组字典列表分别构建成单独的数据框,并把这些数据框收集到一个列表中,最后用 pandas.concat() 将它们连接成最终的数据框。

# Build one DataFrame per container, collect them in a list, and combine
# them once at the end so that earlier results are never overwritten.
df_list = []

for d in all:
    links = d.find_all("a")
    rows = []  # records scraped from this container only

    for a in links[5:17]:
        # Use a separate name for the record so the outer loop variable `d`
        # is not rebound inside its own loop.
        row = {}
        ...
        row["Name"] = Name
        row["Fact_checking"] = Fact_checking
        row["Quote"] = quote
        row["Economic_topic"] = Topic
        row["Date"] = Date
        row["Link"] = Link
        row["Text"] = Text
        rows.append(row)

    df_list.append(pandas.DataFrame(rows))

# ignore_index=True renumbers the combined rows 0..n-1; without it every
# per-container frame keeps its own 0-based index and the index column in
# the CSV repeats.
final_df = pandas.concat(df_list, ignore_index=True)
# Written exactly once, after all loops have finished.
final_df.to_csv("outing.csv")