您可以使用which
找到drivenum
适用于给定值在drivedata$pid
行:
drivenum <- data.frame(fpid = c(2, 12, 19, 23, 36),
lpid = c(9, 17, 22, 34, 39),
drvn = c(1, 2, 3, 4, 5))
drivedata <- data.frame(pid = 1:20)
drvn.list <- sapply(drivedata$pid,
function(x){ drivenum$drvn[which((drivenum$fpid <= x) & (x <= drivenum$lpid))]})
。
> drvn.list
[[1]]
numeric(0)
[[2]]
[1] 1
[[3]]
[1] 1
[[4]]
[1] 1
[[5]]
[1] 1
[[6]]
[1] 1
[[7]]
[1] 1
[[8]]
[1] 1
[[9]]
[1] 1
[[10]]
numeric(0)
[[11]]
numeric(0)
[[12]]
[1] 2
[[13]]
[1] 2
[[14]]
[1] 2
[[15]]
[1] 2
[[16]]
[1] 2
[[17]]
[1] 2
[[18]]
numeric(0)
[[19]]
[1] 3
[[20]]
[1] 3
>
这里是一个替代的解决方案,存在drivenum$drvn
最多一个值,
列drivenum$fpid
和drivenum$lpid
越来越有序,即divenum$fpid[i]<drivenum$fpid[j]
如果为每个值工作,如果
- 在
drivedata$pid
i<j
和drivenum$lpid
类似。
尽管它包含一个循环,但速度更快。所以循环是并不总是那坏。使用具有尺寸8000的drivenum
和大小60000的drivedata
示例
drvn.list.2 <- lapply(as.list(as.integer(rep(0,nrow(drivedata)))),head,0)
pos <- rep(NA,max(drivenum$lpid))
pos[drivedata$pid] <- 1:nrow(drivedata)
for (i in 1:nrow(drivenum))
{
if (max(drivedata$pid)<drivenum$fpid[i]) { break() }
drvn.list.2[pos[drivenum$fpid[i]:drivenum$lpid[i]]] <-
drivenum$drvn[i]
}
速度比较:
#---------------------------------------------------------
# Generate example data:
set.seed(1)
n <- 8000
d1 <- sample(1:3,n,replace=TRUE)
d2 <- sample(1:10,n,replace=TRUE)
drivenum <- data.frame(fpid = cumsum(d1+(c(0,d2)[-n])),
lpid = cumsum(d1+d2),
drvn = sample(1:n))
drivedata <- data.frame(pid = sample(1:60000))
#----------------------------------------------------------
# Speed comparison:
system.time(
for (k in 1:10)
{
drvn.list.1 <- sapply(drivedata$pid,
function(x){ drivenum$drvn[which((drivenum$fpid <= x) & (x <= drivenum$lpid))] })
}
)
system.time(
for (k in 1:10)
{
drvn.list.2 <- lapply(as.list(as.integer(rep(0,nrow(drivedata)))),head,0)
pos <- rep(NA,max(drivenum$lpid))
pos[drivedata$pid] <- 1:nrow(drivedata)
for (i in 1:nrow(drivenum))
{
if (max(drivedata$pid)<drivenum$fpid[i]) { break() }
drvn.list.2[pos[drivenum$fpid[i]:drivenum$lpid[i]]] <-
drivenum$drvn[i]
}
}
)
。
> system.time(
+ for (k in 1:10)
+ {
+ drvn.list.1 <- .... [TRUNCATED]
user system elapsed
432.12 0.46 436.73
> system.time(
+ for (k in 1:10)
+ {
+ drvn.list.2 <- lapply(as.list(as.integer(rep(0,nrow(drivedata)))),head,0)
+ pos <- rep(NA,max(dr .... [TRUNCATED]
user system elapsed
51.07 0.03 51.41
>
结果一致:
> identical(drvn.list.1,drvn.list.2)
[1] TRUE
>
您可以检查'foverlaps'从'库(data.table)' – akrun