2017-08-09 40 views
1

我有一个相当大的数据框,约1000万行,在我的例子中,这由向量x1和y1表示。R或向量化中的快速应变

set.seed(100) 
x1<-round(runif(10000,min=1,max=5),0) #random values [1;2;3;4;5] 
x2<-runif(10000,min=0,max=1) #random num (0,1] 

我要计算与下表“rvps”的帮助下,新的向量xx

rvps<-data.frame(Q_cat=c(1,2,2,2,3,3,3,4,4,5),prov_calc=c(0,1,10,20,21,30,50,51,60,100), 
     s3_from=c(0.00,0.00,0.90,0.99,0.00,0.60,0.65,0.00,0.99,0.00), 
     s3_to=c(1.00,0.90,0.99,1.00,0.60,0.65,1.00,0.99,1.00,1.00)) 

我做了几个解决方案:

#sol№1 
library(doParallel) 
xx1<-foreach(i=1:length(x1)) %do% {rvps$prov_calc[x1[i]==rvps$Q_cat & x2[i]>rvps$s3_from & x2[i]<=rvps$s3_to]} 
#system.time=2.87 

太慢

#sol№2 
xx2<-ifelse(x1==1,0, 
    ifelse(x1==2, 
      ifelse(x2>0 & x2<=0.9,1, 
      ifelse(x2>0.9 & x2<=0.99,10, 
      ifelse(x2>0.99 & x2<=1,20,20))), 
    ifelse(x1==3, 
      ifelse(x2>0 & x2<=0.6,21, 
      ifelse(x2>0.6 & x2<=0.65,30, 
      ifelse(x2>0.65 & x2<=1,50,50))), 
    ifelse(x1==4, 
      ifelse(x2>0 & x2<=0.99,51, 
      ifelse(x2>0.99 & x2<=1,60,60)), 
    ifelse(x1==5,100,100))))) 
#system.time=0.02 

没有我的表(所有边界manualy进入),但快速

#sol№3 
rvps.prob<-function(X,Y) {rvps$prov_calc[X==rvps$Q_cat & Y>rvps$s3_from & Y<=rvps$s3_to]} 
xx3<-mapply(rvps.prob,x1,x2) 
#system.time=0.59 

mapply解决方案。比我第一次尝试的速度更快,但速度并不像我需要的那么快。我如何矢量化我的任务? The same question in russian

更新:从我的同事几个更多的解决方案。全部失去矢量功能

#4 вариант #system.time=1.03 
system.time(for(i in 1:length(x1)) 
{ 
    if (rvps$prov_calc[x1[i]==rvps$Q_cat & x2[i]>rvps$s3_from & x2[i]<=rvps$s3_to]) 
    xx4[i] <- rvps$prov_calc[x1[i]==rvps$Q_cat & x2[i]>rvps$s3_from & x2[i]<=rvps$s3_to] 
    else xx4[i] <- 0 
}) 

#5 вариант #system.time=3.57 
system.time({ 
    xx5<-unlist(foreach(i=1:length(x1)) %do% {rvps$prov_calc[x1[i]==rvps$Q_cat & x2[i]>rvps$s3_from & x2[i]<=rvps$s3_to]}) 
    }) 

#6 вариант #system.time=2.24 
system.time(for(i in 1:length(x1)) 
{ 
    for(j in 1:length(rvps$prov_calc)) 
    if (x1[i]==rvps$Q_cat[j] & x2[i]>rvps$s3_from[j] & x2[i]<=rvps$s3_to[j]) {xx6[i] <- rvps$prov_calc[j];break} 
}) 

回答

0

我的工作精髓如下。

初始数据:

mm1<-round(runif(200000,min=1,max=5),0) #random values [1;2;3;4;5] 
mm2<-runif(200000,min=0,max=1) #random num (0,1] 

矢量与{dplur}№1:

system.time({ 
mm3<-if_else(mm1==1,0, 
    if_else(mm1==2 & mm2>0 & mm2<= 0.9,1, 
    if_else(mm1==2 & mm2>0.9 & mm2<= 0.99,10, 
    if_else(mm1==2 & mm2>0.99 & mm2<= 1,20, 
    if_else(mm1==3 & mm2>0.0 & mm2<= 0.6,21, 
    if_else(mm1==3 & mm2>0.6 & mm2<= 0.65,30, 
    if_else(mm1==3 & mm2>0.65 & mm2<= 1,50, 
    if_else(mm1==4 & mm2>0 & mm2<= 0.99,51, 
    if_else(mm1==4 & mm2>0.99 & mm2<= 1,60, 
    if_else(mm1==5,100,100)))))))))) 
}) #system.time=0.14 

矢量与{dplur}№2:

system.time({ 
mm3<-case_when(
    mm1==1 ~ 0, 
    mm1==2 & mm2>0 & mm2<= 0.9 ~ 1, 
    mm1==2 & mm2>0.9 & mm2<= 0.99 ~ 10, 
    mm1==2 & mm2>0.99 & mm2<= 1 ~ 20, 
    mm1==3 & mm2>0.0 & mm2<= 0.6 ~ 21, 
    mm1==3 & mm2>0.6 & mm2<= 0.65 ~ 30, 
    mm1==3 & mm2>0.65 & mm2<= 1 ~ 50, 
    mm1==4 & mm2>0 & mm2<= 0.99 ~ 51, 
    mm1==4 & mm2>0.99 & mm2<= 1 ~ 60, 
    mm1==5 ~ 100) #system.time=0.14 
}) #system.time=0.08