R中

2016-08-11 41 views
0

在 R 中高效、有选择地合并列。我有以下数据:

# Keep only the three candidate country-code columns, listed in the order
# in which they are later preferred.
countrycols <- alljson[
  ,
  c("country_gc_str", "country_ipapi_str", "country_tm_str")
]

head(countrycols) 
country_gc_str country_ipapi_str country_tm_str 
1   <NA>    RU    RU 
2   <NA>    CN    CN 
3    US    US    US 
4   <NA>    CD    CG 
5   <NA>    DE    DE 
6   <NA>    <NA>    NG 

我想创建一个新列 country_final_str,并按以下优先顺序用第一个非缺失的值来填充它:

country_gc_str 
country_ipapi_str 
country_tm_str 

我还使用以下代码获取了各国的收入水平:

# Build a lookup table of ISO-2 country code -> income level from the
# World Bank countries endpoint.
# NOTE(review): xmlParse/xmlToDataFrame presumably come from the XML package
# and as.data.table from data.table -- confirm both are loaded earlier.
wbURL <- "http://api.worldbank.org/countries?per_page=304" 
# Parse the XML response straight from the URL.
xmlAPI <- xmlParse(wbURL) 
# Flatten the parsed document into a data frame.
xmlDF <- xmlToDataFrame(xmlAPI) 
# Store plain character copies of the two columns used for the lookup.
xmlDF$iso2CodeChar <- as.character(xmlDF$iso2Code) 
xmlDF$incomeLevelChar <- as.character(xmlDF$incomeLevel) 
# Keep only the code / income-level pair and convert it to a data.table.
incomexml <- xmlDF[,c("iso2CodeChar","incomeLevelChar")] 
incomexmltable <- as.data.table(incomexml) 

我有以下循环,但由于我有一百万条记录,它运行起来非常慢:

# Fill country_final_str with the first non-missing code, in priority order:
# country_gc_str, then country_ipapi_str, then country_tm_str.
# Whole-column assignments replace the original per-row loop, which ran one
# data.table subset per record and was the cause of the slowdown.
alljson$country_final_str <- alljson$country_gc_str

# Rows still missing after the first source fall back to the ipapi code.
needs_ipapi <- is.na(alljson$country_final_str)
alljson$country_final_str[needs_ipapi] <-
  alljson$country_ipapi_str[needs_ipapi]

# Rows still missing after that fall back to the tm code.
needs_tm <- is.na(alljson$country_final_str)
alljson$country_final_str[needs_tm] <- alljson$country_tm_str[needs_tm]

# Look up every income level in a single vectorised pass.
# match() returns NA for codes absent from the lookup table, so those rows
# get NA, as before. incomparables = NA keeps a missing country code from
# ever matching a missing lookup key (the original `==` filter never
# matched NA either). When a code occurs more than once in the lookup
# table, the first entry is used.
income_row <- match(
  alljson$country_final_str,
  incomexmltable$iso2CodeChar,
  incomparables = NA
)
alljson$income_level <- incomexmltable$incomeLevelChar[income_row]

有什么提高效率或去掉 for 循环的想法吗?我想不出如何用 apply/lapply/tapply 来实现;而且我在 Windows 上,尝试用 doParallel/doSNOW 并行化代码也失败了。

关于合并列的问题,请参阅下面 @thelatemail 的正确答案。对于国家收入水平,我执行了以下代码:

# Distinct country codes present in the data (kept for the diagnostic below).
allcountries <- unique(alljson$country_final_str)

# Diagnostic: how many of the distinct codes countrycode() can translate
# from "iso2c" to a country name. Printed at top level; not stored.
sum(!is.na(countrycode(allcountries, "iso2c", "country.name")))

# Attach the income level to every row with one vectorised lookup instead
# of looping over the distinct countries and re-scanning the column with
# which() each time. match() gives NA for codes missing from the lookup
# table, so those rows stay NA. incomparables = NA stops a missing country
# code from matching a missing lookup key. When a code occurs more than
# once in the lookup table, the first entry is used.
alljson$country_income_str <- incomexmltable$incomeLevelChar[
  match(
    alljson$country_final_str,
    incomexmltable$iso2CodeChar,
    incomparables = NA
  )
]
+1

你看过 'WDI' 包吗?它很不错。 – shayaa

回答

3

下面是利用矩阵索引的一种尝试,它在三个变量中选取第一个非缺失值:

# For every row, return the value held in the first non-missing column.
# -col() scores the columns -1, -2, -3, ... so the leftmost column has the
# highest score; missing cells are replaced by -Inf so they can never win.
# max.col() then yields, per row, the index of the winning column, and the
# (row, column) index matrix pulls out one value per row.
# ties.method = "first" makes the choice deterministic: the default
# ("random") draws from the RNG when a row is entirely missing, because all
# of its scores tie at -Inf. The extracted value is NA for such rows
# either way.
countrycols[
  cbind(
    seq_len(nrow(countrycols)),
    max.col(
      replace(-col(countrycols), is.na(countrycols), -Inf),
      ties.method = "first"
    )
  )
]
#[1] "RU" "CN" "US" "CD" "DE" "NG" 

为了解释其中的逻辑,下面逐行拆解:

# Step 1: negate the column numbers so that the leftmost column carries the
# largest value in each row.
-col(countrycols) 
#  [,1] [,2] [,3] 
#[1,] -1 -2 -3 
#[2,] -1 -2 -3 
#[3,] -1 -2 -3 
#[4,] -1 -2 -3 
#[5,] -1 -2 -3 
#[6,] -1 -2 -3 

# Step 2: overwrite the score of every missing cell with -Inf so that it
# cannot be the row maximum.
replace(-col(countrycols), is.na(countrycols), -Inf) 
#  [,1] [,2] [,3] 
#[1,] -Inf -2 -3 
#[2,] -Inf -2 -3 
#[3,] -1 -2 -3 
#[4,] -Inf -2 -3 
#[5,] -Inf -2 -3 
#[6,] -Inf -Inf -3 

# Step 3: per row, take the index of the column with the largest score,
# i.e. the first non-missing column. The outer parentheses print the value
# as well as assigning it.
(colindex <- max.col(replace(-col(countrycols), is.na(countrycols), -Inf))) 
#[1] 2 2 1 2 2 3 

# Step 4: pair each row number with its chosen column to form a two-column
# index matrix.
cbind(rowindex=seq_len(nrow(countrycols)), colindex) 
#  rowindex colindex 
#[1,]  1  2 
#[2,]  2  2 
#[3,]  3  1 
#[4,]  4  2 
#[5,]  5  2 
#[6,]  6  3 

这个最终矩阵用于从原始数据中提取每个行/列组合对应的值。

其中countrycols是:

# Reproducible copy of the six-row example data: three character columns
# of country codes with NA marking missing values. The expression is not
# assigned here; assign its value to `countrycols` to run the code above.
structure(list(country_gc_str = c(NA, NA, "US", NA, NA, NA), 
    country_ipapi_str = c("RU", "CN", "US", "CD", "DE", NA), 
    country_tm_str = c("RU", "CN", "US", "CG", "DE", "NG")), .Names = c("country_gc_str", 
"country_ipapi_str", "country_tm_str"), row.names = c("1", "2", 
"3", "4", "5", "6"), class = "data.frame")