2017-02-25 138 views
0

我想从下表的“Name”列中同时提取链接文本和超链接:European Medicines Agency。我的目标是创建一个数据框,其中一列存放名称,另一列存放链接。使用下面的代码,我能够收集到超链接,但不知道如何把这些链接与对应的名称匹配起来。(标题:使用 rvest 从表中的列中提取超文本和超链接)

library(rvest) 
library(dplyr) 

# Address of the first results page (pageNo=1) of the EMA search.
search_url <- 'http://www.ema.europa.eu/ema/index.jsp?curl=pages/medicines/landing/smop_search.jsp&mid=WC0b01ac058001d127&startLetter=View%20all&applicationType=Initial%20authorisation&applicationType=Post%20authorisation&keyword=Enter%20keywords&keyword=Enter%20keywords&searchkwByEnter=false&searchType=Name&alreadyLoaded=true&status=Positive&status=Negative&jsenabled=false&orderBy=opinionDate&pageNo=1'

# Parse the page, select every <a> inside a table body and keep only its
# href attribute; the link text is not captured by this step.
page_doc <- read_html(search_url)
body_anchors <- html_nodes(page_doc, 'tbody a')
page <- html_attr(body_anchors, 'href')

# One-column data frame (column name `page`) holding the collected links.
dfpage <- data.frame(page)

回答

1
library(rvest) 
library(tidyverse) 

# sprintf() template for one page of search results: the trailing `%s` is
# replaced by the page number. Literal percent signs are doubled (`%%2F`) so
# that sprintf() emits the URL-encoded slash `%2F` instead of treating it as a
# format specifier.
url_template <- "http://www.ema.europa.eu/ema/index.jsp?searchType=Name&applicationType=Initial+authorisation&applicationType=Post+authorisation&searchkwByEnter=false&mid=WC0b01ac058001d127&status=Positive&status=Negative&keyword=Enter+keywords&keyword=Enter+keywords&alreadyLoaded=true&curl=pages%%2Fmedicines%%2Flanding%%2Fsmop_search.jsp&startLetter=View+all&pageNo=%s" 

获取总页数:

# URL of the first results page: the page number fills the `%s` slot of
# `url_template`.
first <- sprintf(url_template, 1)

pg <- read_html(first)

# Text of the last pagination <li> that has no class attribute; this is taken
# to be the highest page number.
# NOTE(review): relies on the site's pagination markup -- confirm if the
# layout changes.
last_page_label <- html_nodes(pg, "div.pagination > ul > li:not([class])") %>%
  tail(1) %>%
  html_text(trim = TRUE)

# Guard against a missing or non-numeric label: without it `total_pages` would
# be numeric(0) or NA and the later page sequence would fail.
if (length(last_page_label) == 1 && grepl("^[0-9]+$", last_page_label)) {
  total_pages <- as.numeric(last_page_label)
} else {
  warning(
    "Could not read the page count from the pagination; assuming 1 page.",
    call. = FALSE
  )
  total_pages <- 1
}

目前只有 3 页,但将来可能会多得多,所以先建立一个进度条以便查看进度,然后抓取每一页的表格,再提取链接并把它们作为新的一列添加到表中:

# Progress bar with one tick per results page.
# NOTE(review): progress_estimated() is deprecated in recent dplyr releases;
# kept here so the snippet needs no additional package.
pb <- progress_estimated(total_pages)

# Download every results page, read its first HTML table, attach the link
# target of each row's name cell, and row-bind all pages into one tibble.
# seq_len() is used instead of 1:total_pages so that zero pages yields an
# empty sequence rather than c(1, 0).
pending_df <- sprintf(url_template, seq_len(total_pages)) %>%
  map_df(function(page_url) {
    pb$tick()$print()

    page_doc <- read_html(page_url)

    # href of the anchor inside each row-header cell, in document order.
    link_targets <- html_nodes(page_doc, "th[scope='row'] > a") %>%
      html_attr("href")

    html_table(page_doc, trim = TRUE) %>%
      .[[1]] %>%
      set_names(c("name", "active_substance", "inn", "adopted", "outcome")) %>%
      as_tibble() %>%
      # mutate() errors if the number of links does not match the number of
      # table rows (unless exactly one link is found, which is recycled).
      mutate(url = link_targets)
  })

# Print a compact, one-line-per-column overview of the combined table.
# The lines starting with `##` are the output recorded when the answer was
# written.
glimpse(pending_df) 
## Observations: 67 
## Variables: 6 
## $ name    <chr> "Lifmior", "Tamiflu", "Jylamvo", "Terrosa", "... 
## $ active_substance <chr> "etanercept", "oseltamivir", "methotrexate", ... 
## $ inn    <chr> "etanercept", "oseltamivir", "methotrexate", ... 
## $ adopted   <chr> "2016-12-15", "2015-03-26", "2017-01-26", "20... 
## $ outcome   <chr> "Positive", "Positive", "Positive", "Positive... 
## $ url    <chr> "index.jsp?curl=pages/medicines/human/medicin... 
0

我会用下面的代码:

library(rvest) 
library(tidyverse) 

# Address of the first results page (pageNo=1) of the EMA search.
search_url <- 'http://www.ema.europa.eu/ema/index.jsp?curl=pages/medicines/landing/smop_search.jsp&mid=WC0b01ac058001d127&startLetter=View%20all&applicationType=Initial%20authorisation&applicationType=Post%20authorisation&keyword=Enter%20keywords&keyword=Enter%20keywords&searchkwByEnter=false&searchType=Name&alreadyLoaded=true&status=Positive&status=Negative&jsenabled=false&orderBy=opinionDate&pageNo=1'

# Download and parse the page once; both the table and the links are read
# from this single document instead of requesting the same URL twice.
search_doc <- read_html(search_url)

# All HTML tables on the page, as a list of data frames.
page <- search_doc %>%
  html_nodes("table") %>%
  html_table()

# The first table holds the search results.
data <- as_tibble(page[[1]])

# href attribute of every node matched by the SelectorGadget-derived selector.
page_link <- search_doc %>%
  html_nodes(".key-detail a , .alt~ .alt th") %>%
  html_attr("href")

# tibble() replaces the deprecated as_data_frame(); the column is named
# `value`, exactly as as_data_frame() named it for a bare vector.
link <- tibble(value = page_link)

# NOTE(review): the first match is dropped, as in the original answer --
# presumably it is not a row link; confirm against the live page.
links <- tibble(value = link$value[-1])

# Put the links next to the table rows and keep only name and link.
result <- cbind(data, links)

final <- result[, c("Name", "value")]

`final` 的第一行输出如下:

# Show the first row of `final` transposed, so each column prints on its own
# line.
first_row <- final[1, ]
print(t(first_row))

...

 1                             
Name "Natpar"                           
value "index.jsp?curl=pages/medicines/human/medicines/003861/smops/Positive/human_smop_001096.jsp&mid=WC0b01ac058001d127" 

希望这对你有帮助。顺便说一下,我是借助 Chrome 的 SelectorGadget 扩展找到正确的 CSS 选择器的。