2017-07-26 56 views
0

抓取并合并数据框:我有以下使用 rvest 进行网页抓取的代码

library('rvest') 
library('tidyverse') 
# URLs of the legislator profile pages to scrape.
# Each URL must be a single-line literal: a string that is broken across
# source lines embeds a space and a newline in the address.
test_url <- c(
    "http://www.citact.org/senator-brent-waltz-r-greenwood-district-36",
    "http://www.citact.org/senator-ron-grooms-r-new-albany-district-46",
    "http://www.citact.org/representative-mike-speedy-r-indianapolis-district-90"
)
# For every URL in test_url, download the page, take the text of all <td>
# cells and keep two fixed slices of them (cells 2-11 and cells 13-22) as the
# two columns of a data frame. `test` ends up as a list with one such data
# frame per URL.
# NOTE(review): the slices are hard-coded, so this presumably relies on every
# page laying out its table cells identically -- confirm against the pages.
test <- lapply(test_url, function(i){ 
    # Fetch and parse the page, then select every table cell.
    web <- read_html(i) 
    grades <- html_nodes(web, 'td') 
    # Cell text as a one-column data frame. Note that `two = 'idk'` sits
    # inside the as.data.frame() call, so it is passed to as.data.frame(),
    # not to data.frame(); it does not create a second column here.
    test_grades <- data.frame(one = (as.data.frame(html_text(grades), two = 
'idk'))) 

    # Two blocks of ten cells each (presumably the years and the grades --
    # TODO confirm which block is which).
    first <- as.data.frame(test_grades[2:11, ]) 
    second <- as.data.frame(test_grades[13:22, ]) 

    # Assigns the names to themselves; this line changes nothing.
    names(test_grades) <- names(test_grades) 
    # Put the two blocks side by side. The value of this final assignment is
    # what the anonymous function returns to lapply().
    testing <- data.frame(c(first, second)) 
}) 

# For every URL in test_url, read the text of the page's <h3> headings and
# split it into one column per field. `test_names` is a list with one tibble
# per URL (one row per <h3> heading found on that page).
#
# The separators used below imply a heading of the form
#   "<Useless>: <Position> <First> <Last> <Party>-<District Name>,<District>"
# NOTE(review): headings that do not follow this shape make separate() warn
# and fill or drop pieces -- verify against the actual pages.
test_names <- lapply(test_url, function(i) {
    web <- read_html(i)
    text_info <- html_text(html_nodes(web, 'h3'))

    # tibble() replaces the deprecated data_frame() constructor.
    tibble(member = text_info) %>%
        # Drop the label in front of the colon.
        separate(col = member, into = c('Useless', 'Info'), sep = ': ') %>%
        # Everything after the comma is the district.
        separate(col = Info, into = c('names', 'District'), sep = ',') %>%
        # Title, first name, last name and the party/home-town token.
        separate(
            col = names,
            into = c('Position', 'First', 'Last', 'Party'),
            sep = ' '
        ) %>%
        # Party letter vs. district name. The column name is written on one
        # line so that it does not contain an embedded newline.
        separate(col = Party, into = c('Party', 'District Name'), sep = '-')
})

# Stack the per-URL results and bind the grades next to the member info.
# c(list(<data frame>), <data frame>) splices the second data frame into its
# individual columns, so cbind.fill() receives the stacked grades plus one
# argument per column of the stacked member info.
# NOTE(review): cbind.fill is not defined anywhere in this script; presumably
# it comes from an add-on package that must be attached first -- confirm.
y <- local({
    grade_rows <- do.call(rbind, test)
    member_rows <- do.call(rbind, test_names)
    do.call(cbind.fill, c(list(grade_rows), member_rows))
})

这段代码可以运行:所有信息都被收集到了,也没有报错,但问题出在最后生成的数据框 y 上。创建 y 时,数据框 test 和数据框 test_names 对不上。例如,test 中的一些评分和年份与 test_names 中对应的候选人不匹配。有没有办法确保它们正确对应?我曾尝试在循环之前先合并数据框,但没有成功。也许有更好的办法,那只是我最初的计划。

回答

1

尽量避免反复重塑和绑定数据,也不要每隔一行就调用一次 do.call;这会让代码难以阅读,也就更难找出 bug。我直接把它改写得更简单了一些。

# XPath of the <h1> whose class attribute contains 'title gutter'; its text
# is parsed below as the legislator's title, name, party and district.
person_xpath <- "//h1[contains(@class, 'title gutter')]"

# Scrape every page in test_url once and build one character matrix per URL.
# Each matrix has one row per row of the transposed vote table and the
# columns Position, First, Last, Party, District, Year, Outcome.
url_tables <- lapply(test_url, function(x) {
    # as.character() guards against factor input (e.g. URLs taken from a
    # data frame column), for which read_html() has no method.
    page <- read_html(as.character(x))
    # First table on the page, transposed.
    vote_outcome <- t(html_table(page)[[1]])
    personal_info <- html_text(html_nodes(page, xpath = person_xpath))

    # Remove some useless characters and split the string
    personal_info <- gsub("\\(|\\) |,", "", personal_info)
    split_person <- strsplit(personal_info, " ")[[1]]

    # Repeat the personal info once per vote row. Building the matrix
    # explicitly keeps it two-dimensional even when the vote table has a
    # single row (sapply() would collapse that case to a plain vector).
    n_rows <- nrow(vote_outcome)
    dupe_info <- matrix(
        rep(split_person, each = n_rows),
        nrow = n_rows,
        ncol = length(split_person)
    )
    # Seven tokens means the fourth field was split in two by a space;
    # glue tokens 4 and 5 back together.
    if (ncol(dupe_info) == 7) {
        dupe_info[, 4] <- paste(dupe_info[, 4], dupe_info[, 5])
        dupe_info <- dupe_info[, -5, drop = FALSE]
    }
    # Column 5 of the combined matrix is dropped (presumably the literal
    # word "District" -- TODO confirm against the page heading).
    df <- cbind(dupe_info, vote_outcome)[, -5, drop = FALSE]
    colnames(df) <- c("Position", "First", "Last", "Party",
                      "District", "Year", "Outcome")
    df
})

# Print the matrix built for the first URL. The commented output below
# appears to be wrapped: the second chunk continues the same rows (X1, X2,
# ...) with the remaining column.
url_tables[[1]] 
#  Position First Last Party   District Year  
# X1 "Senator" "Brent" "Waltz" "R-Greenwood" "36"  " "  
# X2 "Senator" "Brent" "Waltz" "R-Greenwood" "36"  "2008" 
# X3 "Senator" "Brent" "Waltz" "R-Greenwood" "36"  "2009" 
# X4 "Senator" "Brent" "Waltz" "R-Greenwood" "36"  "2010" 
# ..  ...  ...  ...  ...  ...  ... 

# X1 "Pro-Consumer Voting Percentage" 
# X2 "0%"        
# X3 "57%"       
# X4 "66%"       
# .. ... 
+0

这样可以运行,谢谢。你知道我原来的错误出在哪里吗?很抱歉我的代码写得不好,我是 R 新手,只是为了这个目的才学习它。 – Jordan

+0

此外,当我对所有网址运行这段代码时,收到了这个错误:Error in UseMethod("read_xml"): no applicable method for 'read_xml' applied to an object of class "factor"(没有适用于 "factor" 类对象的 'read_xml' 方法)。我在上面的代码中唯一改动的就是 URL。 – Jordan

+0

我不确定错误出在哪里。你一定是在某个地方把维度弄混了,或者某个表里可能有空行。至于另一个错误,我不知道它是怎么来的。是不是这些网页的结构并不完全相同? –