2016-08-17 40 views
-2

我想从 538(FiveThirtyEight)抓取数据,但我需要的是点击“显示更多民意调查”之后才会显示的完整数据……有没有什么函数或方法可以访问表格中的其余行?(标题:用 R 抓取需要额外 JavaScript 才能显示的 HTML 数据)

http://projects.fivethirtyeight.com/2016-election-forecast/national-polls/

抓取页面顶部(默认显示的)数据的代码是:

# Load the XML package. library() stops with an error when the package is
# missing, whereas require() only warns and lets the script fail later with a
# less helpful "could not find function" message.
library(XML)

# Download the national-polls page and parse it into an internal document.
polls.html <- htmlTreeParse(
  "http://projects.fivethirtyeight.com/2016-election-forecast/national-polls/",
  useInternalNodes = TRUE
)

# Read the HTML tables found in the parsed page into a list.
parsedDoc <- readHTMLTable(polls.html, stringsAsFactors = FALSE)

# Keep the fourth table. Per the question, this holds only the rows present in
# the initially served page, not those revealed by "show more polls".
pollData <- data.frame(parsedDoc[4])
+0

在这种情况下,RSelenium 是最好的办法。 – hrbrmstr

回答

0

我有点不明白这个问题为什么会被踩(downvote)……在我看来,答案仍然并不显而易见!不过,对于想知道解决方案的人,我已经弄明白了(感谢 @duncantl 的帮助)。(另外,完整的分析见:https://github.com/hardin47/prediction2016)

# Packages attached by this script, in the original order (the order decides
# which package wins when two export the same name).
# library() is used instead of require() so that a missing package stops the
# script immediately instead of producing a warning and a later, unrelated
# "could not find function" error.
library(XML)
library(dplyr)
library(tidyr)
library(readr)
library(mosaic)
library(RCurl)
library(ggplot2)
library(lubridate)
library(RJSONIO)

# Page whose inline <script> carries the poll data as a JSON literal.
url <- "http://projects.fivethirtyeight.com/2016-election-forecast/national-polls/"
doc <- htmlParse(url, useInternalNodes = TRUE)

# Text of every <script> element that mentions 'race.model'. Only the element
# text is needed; <script> elements carry no "href" attribute, and indexing
# for one fails with "subscript out of bounds" when other attributes exist.
sc <- as.character(
  xpathSApply(doc, "//script[contains(., 'race.model')]", xmlValue)
)

# The JSON literal sits between "race.stateData = " and ";race.pathPrefix".
# The dots are escaped so they match literally instead of any character.
json_pattern <- ".*race\\.stateData = (.*);race\\.pathPrefix.*"
has_payload <- grepl(json_pattern, sc)
if (!any(has_payload)) {
  # Without this check the unmodified script text would be handed to
  # fromJSON(), which then fails with an unrelated parse error.
  stop("Could not find the 'race.stateData' JSON in the page at ", url,
       call. = FALSE)
}

# Use the first script that contains the payload.
jsobj <- sub(json_pattern, "\\1", sc[has_payload][1])

data <- fromJSON(jsobj)
allpolls <- data$polls
if (is.null(allpolls)) {
  stop("The parsed JSON has no 'polls' element.", call. = FALSE)
}

# Flatten the list of polls into a data frame with one row per poll.
# Every poll is padded to the length of the longest one so that the entries
# can be row-bound into a rectangular structure.
# NOTE(review): rows are bound by position, which assumes every poll lists
# its fields in the same order — confirm against the JSON.
stopifnot(length(allpolls) > 0)  # max() below is meaningless for zero polls
indx <- lengths(allpolls)
pollsdf <- as.data.frame(do.call(rbind, lapply(allpolls, `length<-`, max(indx))))

# Expand the per-poll `weight` entries into three columns (wtpolls, wtplus,
# wtnow) and append them to pollsdf.
# Each weight entry is wrapped in its own data frame, the pieces are bound
# side by side, and the result is transposed before being converted back to a
# data frame.
# NOTE(review): the three names assigned below assume each weight entry holds
# exactly three values, presumably in the order polls / plus / now — confirm.
pollswt <- as.data.frame(t(as.data.frame(do.call(cbind, lapply(pollsdf$weight, data.frame, 
               stringsAsFactors=FALSE))))) 
names(pollswt) <- c("wtpolls", "wtplus", "wtnow") 
# Discard the row names carried over from the transpose.
row.names(pollswt) <- NULL 

# Append the weight columns to the poll table.
pollsdf <- cbind(pollsdf, pollswt) 

# Expand the per-poll `votingAnswers` entries.
# Each poll's answers are padded to the length of the longest answer list and
# row-bound, giving one column (V1, V2, ...) per answer position.
indxv <- lengths(pollsdf$votingAnswers)
pollsvot <- as.data.frame(do.call(rbind, lapply(pollsdf$votingAnswers,
               `length<-`, max(indxv))))

# Turn the first and second answer of every poll into data frames and stack
# them.
# NOTE(review): only V1 and V2 are used, so any further answers are dropped —
# confirm that two answers per poll is all that is wanted.
# NOTE(review): the outer rbind() has a single argument and looks redundant —
# confirm before removing.
pollsvot1 <- rbind(as.data.frame(do.call(rbind, lapply(pollsvot$V1, data.frame,
               stringsAsFactors=FALSE))))
pollsvot2 <- rbind(as.data.frame(do.call(rbind, lapply(pollsvot$V2, data.frame,
               stringsAsFactors=FALSE))))


# Add three label columns derived from the row names of a stacked answers
# data frame:
#   polltype  - the row name itself
#   polltypeA - the row name with all digits removed
#   polltype1 - the number embedded in the row name plus one, or 1 when the
#               row name contains no number
# The number is extracted with the base-R expression that the deprecated
# tidyr::extract_numeric() used internally, so the result is unchanged while
# the deprecation message and the dependency on that function go away.
#
# votes: data frame whose row names are a label optionally followed by digits.
# Returns `votes` with the three columns added.
add_poll_index <- function(votes) {
  row_ids <- rownames(votes)
  labelled <- cbind(polltype = row_ids, votes,
                    polltypeA = gsub('[0-9]+', '', row_ids),
                    polltype1 = as.numeric(gsub("[^0-9.-]+", "", row_ids)))
  # Row names without a number yield NA above; they become 1 and all other
  # values are shifted up by one.
  labelled$polltype1 <- ifelse(is.na(labelled$polltype1), 1,
                               labelled$polltype1 + 1)
  labelled
}

pollsvot1 <- add_poll_index(pollsvot1)
pollsvot2 <- add_poll_index(pollsvot2)


# Step 1: unlist() each field that is kept into a plain vector, converting
# the sample size to numeric and parsing both dates with ymd().
pollsdf <- mutate(
  pollsdf,
  population = unlist(population),
  sampleSize = as.numeric(unlist(sampleSize)),
  pollster = unlist(pollster),
  startDate = ymd(unlist(startDate)),
  endDate = ymd(unlist(endDate)),
  pollsterRating = unlist(pollsterRating)
)

# Step 2: keep only the flattened fields and the three weight columns.
pollsdf <- select(
  pollsdf,
  population, sampleSize, pollster, startDate, endDate, pollsterRating,
  wtpolls, wtplus, wtnow
)



# Repeat every poll row three times so the poll-level columns line up with
# the answer rows.
# NOTE(review): the factor 3 presumably matches the number of rows each
# answer contributes (one per weight type) — confirm.
poll_rows <- pollsdf[rep(seq_len(nrow(pollsdf)), each = 3), ]

# The expanded polls are stacked twice to sit beside the first answers
# (pollsvot1) followed by the second answers (pollsvot2).
allpolldata <- cbind(
  rbind(poll_rows, poll_rows),
  rbind(pollsvot1, pollsvot2)
)

# Sort by the polltype1 index and then by the answer's `choice` column.
allpolldata <- arrange(allpolldata, polltype1, choice)