2017-02-21 105 views
0

我想从网页上抓取某只特定基金的总资产,本例中是 http://www.morningstar.com/funds/xnas/adafx/quote.html 上的 ADAFX。但结果始终为空;我究竟做错了什么?(标题:使用 rvest 和 R 进行网页抓取)

我之前用过 rvest,结果时好时坏,所以我想是时候向专家求助了。

library(rvest)  
# Ticker of the fund whose quote page is requested.
Symbol.i <- "ADAFX"

# R is case-sensitive: the base function is paste()/paste0(), not Paste().
url <- paste0("http://www.morningstar.com/funds/xnas/", Symbol.i, "/quote.html")

# Assign the value returned by tryCatch() itself. An assignment made inside
# the error handler only creates a variable local to that handler function,
# so NetAssets.i would otherwise remain undefined after a failure.
NetAssets.i <- tryCatch(
  url %>%
    read_html() %>%
    html_nodes(xpath = '//*[@id="gr_total_asset_wrap"]/span/span') %>%
    html_text(),
  error = function(e) NA_character_
)

提前致谢,祝好,

亚伦瑟德斯特伦

+0

你知道有一个morningstar API,对不对?请参阅[这里](https://gist.github.com/anonymous/c7d9c19cc67e03641966064d1518ed41)为例 – Jean

+0

谢谢,我知道这个API,但我试图构建一个API不包含的自定义基金筛选器。 –

回答

1

这是一个动态页面,各个部分(sections)的数据是通过 XHR 请求加载的,所以你必须查看开发者工具的网络(Network)标签页,找到目标内容的 URL。

library(httr) 
library(rvest) 

# Query-string parameters of the header fragment that the quote page
# requests via XHR.
header.query <- list(
  t = "XNAS:ADAFX",
  region = "usa",
  culture = "en-US",
  version = "RET",
  test = "QuoteiFrame"
)

res <- GET(url = "http://quotes.morningstar.com/fundq/c-header", query = header.query)

# Parse the response, select the total-assets span and print its trimmed text.
header.doc <- content(res)
asset.nodes <- html_nodes(header.doc, "span[vkey='TotalAssets']")
trimws(html_text(asset.nodes))
## [1] "20.6 mil" 
+0

看来我还有不少工作要做,谢谢你帮我开了个头!

+0

后续问题:在循环中运行代码时,会抛出以下错误:'Error in UseMethod("content", x) : no applicable method for 'content' applied to an object of class "response"'。知道这是怎么回事吗?这个错误一旦出现,即使运行您的原始代码也会报同样的错,我必须重启 R 会话才能修复。

+0

我们需要看到循环的代码。这适合作为一个新的后续 SO 问题来提。 – hrbrmstr

0

这里是它读取的 csv 文件。

library(httr) 
library(rvest) 
library(tm) 
library(plyr) 
require("dplyr") 

# Scrape the Morningstar quote header for every fund in one category and
# collect the results into MS.scrape (one row per fund, one column per field).

MF.List <- read.csv("C:/Users/Aaron/Documents/Investment Committee/Screener/Filtered Funds.csv")
Category.list <- read.csv("C:/Users/Aaron/Documents/Investment Committee/Screener/Category.csv")
Category.list <- na.omit(Category.list)

Category.name <- "Financial"
# Namespace-qualified so that another attached package cannot mask filter().
MF.Category.List <- dplyr::filter(MF.List, Category == Category.name)

# CSS selector for each field stored per fund, in output-column order.
# (FeeLevel and MinInvestment used to be scraped as well but were never
# stored in the result, so they are not requested here.)
field.selectors <- c(
  TTM.Yield        = "span[vkey='ttmYield']",
  Load             = "span[vkey='Load']",
  Total.Assets     = "span[vkey='TotalAssets']",
  Expense.Ratio    = "span[vkey='ExpenseRatio']",
  Turnover         = "span[vkey='Turnover']",
  Status           = "span[vkey='Status']",
  Yield.30day      = "span[vkey='Yield']",
  Investment.Style = "span[vkey='InvestmentStyle']",
  Bond.Style       = "span[vkey='BondStyle']"
)

# Extract one field from a parsed header page.
# Always returns a length-one character value: NA when the page is missing,
# the selector matches nothing, or extraction fails; the first match
# otherwise. A fixed length keeps every per-fund vector the same shape so
# rbind() yields a rectangular result.
scrape.field <- function(page, selector) {
  tryCatch(
    {
      value <- page %>%
        html_nodes(selector) %>%
        html_text() %>%
        trimws()
      if (length(value) == 0) NA_character_ else as.character(value[1])
    },
    error = function(e) NA_character_
  )
}

# Preallocate; seq_len() also copes with an empty category (1:0 would not).
morningstar.scrape <- vector("list", nrow(MF.Category.List))

for (i in seq_len(nrow(MF.Category.List))) {
  Symbol.i <- as.character(MF.Category.List[i, "Symbol"])

  # Fetch and parse the header once per fund. httr::content() is called by
  # its qualified name because attaching tm after httr masks content() and
  # produces "no applicable method for 'content' applied to an object of
  # class "response"".
  page <- tryCatch(
    {
      res <- httr::GET(
        url = "http://quotes.morningstar.com/fundq/c-header",
        query = list(
          t = paste0("XNAS:", Symbol.i),
          region = "usa",
          culture = "en-US",
          version = "RET",
          test = "QuoteiFrame"
        )
      )
      httr::content(res)
    },
    error = function(e) NULL
  )

  field.values <- vapply(
    field.selectors,
    function(selector) scrape.field(page, selector),
    character(1)
  )

  morningstar.scrape[[i]] <- c(Symbol = Symbol.i, field.values)
}

MS.scrape <- do.call(rbind, morningstar.scrape)
+0

目前代码中并未用到 Category 这个 csv 文件。 –

+0

看起来是加载 `library(tm)` 引起了问题。我从代码中删除了 tm 包,并用 grepl 代替了 filter。现在循环工作正常。所以问题在于这两个软件包互相冲突;至于原因,我不清楚。 –

0

可运行的代码:

我添加了一个用于网页抓取的函数,并移除了 library(tm)。

library(httr) 
library(rvest) 


    # Fetch one field from a fund's Morningstar quote header.
    #
    # Symbol.i: ticker symbol; "XNAS:" is prepended to build the query.
    # htmlnode: CSS selector handed to html_nodes(),
    #           e.g. "span[vkey='TotalAssets']".
    # Returns the whitespace-trimmed text of every node matching the selector.
    get.morningstar <- function(Symbol.i, htmlnode) {
      res <- httr::GET(
        url = "http://quotes.morningstar.com/fundq/c-header",
        query = list(
          t = paste0("XNAS:", Symbol.i),
          region = "usa",
          culture = "en-US",
          version = "RET",
          test = "QuoteiFrame"
        )
      )

      # Qualified call: a package attached after httr (tm, for instance) can
      # mask content() and break dispatch on the response object.
      httr::content(res) %>%
        html_nodes(htmlnode) %>%
        html_text() %>%
        trimws()
    }



    # Load the fund universe and the category list, keep the funds whose
    # Category matches Category.name, and print each fund's total assets.
    MF.List <- read.csv("C:/Users/Aaron/Documents/Bitrix24/Investment Committee/Screener/Filtered Funds.csv")
    Category.list <- read.csv("C:/Users/Aaron/Documents/Bitrix24/Investment Committee/Screener/Category.csv")
    Category.list <- na.omit(Category.list)

    Category.name <- "Small Growth"
    MF.Category.List <- MF.List[grepl(Category.name, MF.List$Category), ]
    morningstar.scrape <- list()

    # seq_len() yields an empty sequence when no fund matches; 1:nrow() would
    # iterate over c(1, 0) instead.
    for (i in seq_len(nrow(MF.Category.List))) {
      Symbol.i <- as.character(MF.Category.List[i, "Symbol"])
      # Take the value of tryCatch() so a failed request yields NA for this
      # fund rather than leaving the previous fund's value to be printed.
      Total.Assets <- tryCatch(
        get.morningstar(Symbol.i, "span[vkey='TotalAssets']"),
        error = function(e) {
          warning("Scrape failed for ", Symbol.i, ": ", conditionMessage(e),
                  call. = FALSE)
          NA_character_
        }
      )
      print(Total.Assets)
    }