2015-11-02 94 views
0

我正在尝试使用 R 的 rvest 包抓取纳斯达克过去 3 个月的收盘数据,以便拿这些数据做分析。(原标题:用 R 的 rvest 包抓取雅虎财经数据)

问题是我似乎无法找到正确的xpath来返回表。我已经尝试了很多使用chrome的“检查元素”来查找xpath以及用于chrome的“SelectorGadget”插件。

似乎大多数人都是用 Python 做这件事的,但我更熟悉 R,尤其是用 rvest 包做网页抓取,所以希望我不是唯一这样做的人!

我已在下面发布我的代码。我相信问题在于识别xpath。 这里是网页的一个示例... http://finance.yahoo.com/q/hp?s=CSV

后,我得到一个工作,我希望把它放在一个循环是我下面的问题代码....

谢谢!

library("rvest") 
library("data.table") 
library("xlsx") 


#Problem Code

# Scrape the historical-prices table for a single ticker from Yahoo Finance.
company <- "CSV"
url <- paste0("http://finance.yahoo.com/q/hp?s=", company)
# html() is deprecated in rvest; read_html() is the current parser.
# Keep the parsed document in its own variable instead of clobbering `url`.
page <- read_html(url)
select_table <- '//table' # '//table' selects every table; the price table is the first match
fnames <- html_nodes(page, xpath = select_table) %>% html_table(fill = TRUE)
STOCK <- fnames[[1]]
# BUG in original: STOCKS was used in rbind() before it was ever defined,
# which raises "object 'STOCKS' not found". Initialise an empty accumulator first.
if (!exists("STOCKS")) STOCKS <- STOCK[0, ]
STOCKS <- rbind(STOCK, STOCKS)



#--------------------------------------------------------------------- 
#Loop for use later

# Loop over every NASDAQ ticker and accumulate each one's price-history table.
companylist <- read.csv("companylist.csv") #this is a list of all company tickers in the NASDAQ

# Preallocate a list and bind once at the end: growing STOCKS with rbind()
# inside the loop is O(n^2) in the number of rows.
stock_list <- vector("list", nrow(companylist))
# seq_len(nrow(...)) adapts to the file's actual length; the original
# hard-coded 1:3095 and breaks if the CSV changes size.
for (i in seq_len(nrow(companylist))) {
    company <- companylist[i, 1]
    url <- paste0("http://finance.yahoo.com/q/hp?s=", company)
    # html() is deprecated in rvest; read_html() is the replacement
    page <- read_html(url)
    select_table <- '//*[@id="yfncsumtab"]/tbody/tr[2]/td[1]/table[4]'
    fnames <- html_nodes(page, xpath = select_table) %>% html_table(fill = TRUE)
    STOCK <- fnames[[1]]
    stock_list[[i]] <- STOCK
}
STOCKS <- do.call(rbind, stock_list)
View(STOCKS)
+4

如果您的目标只是让价格看一下'quantmod'包,它允许您请求大量数据。 – etienne

+0

@etienne 这正是我要找的。希望我早点知道那个包!谢谢。 – bpheazye

回答

1

你想抢股价吗?

https://gist.github.com/jaehyeon-kim/356cf62b61248193db25#file-downloadstockdata

# assumes codes are known beforehand
codes <- c("ABT", "ABBV", "ACE", "ACN", "ACT", "ADBE", "ADT", "AES", "AET", "AFL", "AMG", "A", "GAS", "APD", "ARG", "AKAM", "AA")
urls <- paste0("http://www.google.com/finance/historical?q=NASDAQ:",
codes,"&output=csv")
# BUG in original: paste0(codes, "csv") produced names like "ABTcsv" (no dot,
# no real extension); write proper "<CODE>.csv" file names.
paths <- paste0(codes, ".csv")
# BUG in original: dir(".", full.name = TRUE) returns "./file" style paths,
# which never match the bare names in `paths`, so every file always looked
# missing. Compare against bare file names instead.
missing <- !(paths %in% dir("."))
missing

# Download one file, removing any partial file on failure.
#   url  - source URL
#   path - destination file path
#   ...  - forwarded to download.file()
downloadFile <- function(url, path, ...) {
# remove file if exists already
if(file.exists(path)) file.remove(path)
# download file
tryCatch(
download.file(url, path, ...), error = function(c) {
# remove file if error
if(file.exists(path)) file.remove(path)
# create error message; basename() names the file, unlike the original
# substr(path, 1, 4) which truncated to four characters
c$message <- paste(basename(path), "failed")
message(c$message)
}
)
}
# Map() is a wrapper of mapply(); download only the files not already present
Map(downloadFile, urls[missing], paths[missing])

你可以试试这一点。 。 。

{% highlight r %}
library(knitr)
library(lubridate) 
library(stringr) 
library(plyr) 
library(dplyr) 
{% endhighlight %} 

The script begins with creating a folder to save data files. 


{% highlight r %} 
# create a clean data folder, wiping any previous run's output first
dataDir <- paste0("data","_","2014-11-20-Download-Stock-Data-1")
# unlink() is a no-op when the path does not exist, so no if/else is needed;
# the original duplicated dir.create() in both branches
if (file.exists(dataDir)) {
     unlink(dataDir, recursive = TRUE)
}
dir.create(dataDir)
{% endhighlight %} 

After creating urls and file paths, files are downloaded using `Map` function - it is a wrapper of `mapply`. Note that, in case the function breaks by an error (eg when a file doesn't exist), `download.file` is wrapped by another function that includes an error handler (`tryCatch`). 


{% highlight r %} 
# assumes codes are known beforehand
codes <- c("MSFT", "TCHC") # codes <- c("MSFT", "1234") for testing
urls <- paste0("http://www.google.com/finance/historical?q=NASDAQ:",
       codes,"&output=csv")
paths <- paste0(dataDir,"/",codes,".csv") # back slash on windows (\\)

# Download one file, removing any partial file on failure.
#   url  - source URL
#   path - destination file path
#   ...  - forwarded to download.file()
downloadFile <- function(url, path, ...) {
     # remove file if exists already
     if(file.exists(path)) file.remove(path)
     # download file
     tryCatch(
      download.file(url, path, ...), error = function(c) {
        # remove file if error
        if(file.exists(path)) file.remove(path)
        # BUG in original: substr(path, 1, 4) printed "data" (the folder
        # prefix), not the ticker; name the failed file by its base name
        c$message <- paste(basename(path), "failed")
        message(c$message)
      }
    )
}
# wrapper of mapply
Map(downloadFile, urls, paths)
{% endhighlight %} 


Finally files are read back using `llply` and they are combined using `rbind_all`. Note that, as the merged data has multiple stocks' records, `Code` column is created. 



{% highlight r %} 
# read all csv files back and merge into one data frame,
# tagging each row with its ticker code
files <- dir(dataDir, full.names = TRUE) # spell out full.names (avoid partial matching)
dataList <- llply(files, function(file){
     data <- read.csv(file, stringsAsFactors = FALSE)
     # BUG in original: the regex "/[A-Z][A-Z][A-Z][A-Z]" only matched tickers
     # of exactly four letters; derive the code from the file name instead,
     # which works for any ticker length
     code <- sub("\\.csv$", "", basename(file))
     # first column's name is garbled in the source CSV; rename all columns
     names(data) <- c("Date","Open","High","Low","Close","Volume")
     data$Date <- dmy(data$Date)
     data$Open <- as.numeric(data$Open)
     data$High <- as.numeric(data$High)
     data$Low <- as.numeric(data$Low)
     data$Close <- as.numeric(data$Close)
     data$Volume <- as.integer(data$Volume)
     data$Code <- code
     data
}, .progress = "text")

# rbind_all() is deprecated in dplyr; bind_rows() is its replacement
data <- bind_rows(dataList)
{% endhighlight %} 
+0

任何想法如何添加到此代码来选择特定范围的日期?该网站有能力选择日期,但我不知道如何通过代码改变。谢谢你的帮助! – bpheazye