2017-02-27 54 views
9

假设我有一个数据帧,如下所示:在计算配对差异高效实现

> foo = data.frame(x = 1:9, id = c(1, 1, 2, 2, 2, 3, 3, 3, 3)) 
> foo 
    x id 
1 1 1 
2 2 1 
3 3 2 
4 4 2 
5 5 2 
6 6 3 
7 7 3 
8 8 3 
9 9 3 

我想要一个非常有效的实现的H(A,B),计算总和所有(一个 - XI)*(B - xj)代表属于同一个id类的xi,xj。例如,我的当前的实现是

h(a, b, foo){ 
    a.diff = a - foo$x 
    b.diff = b - foo$x 
    prod = a.diff%*%t(b.diff) 
    id.indicator = as.matrix(ifelse(dist(foo$id, diag = T, upper = T),0,1)) + diag(nrow(foo)) 
    return(sum(prod*id.indicator)) 
} 

例如,对于(A,B)=(0,1),这里是从每个步骤在功能

> a.diff 
[1] -1 -2 -3 -4 -5 -6 -7 -8 -9 
> b.diff 
[1] 0 -1 -2 -3 -4 -5 -6 -7 -8 
> prod 
     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] 
[1,] 0 1 2 3 4 5 6 7 8 
[2,] 0 2 4 6 8 10 12 14 16 
[3,] 0 3 6 9 12 15 18 21 24 
[4,] 0 4 8 12 16 20 24 28 32 
[5,] 0 5 10 15 20 25 30 35 40 
[6,] 0 6 12 18 24 30 36 42 48 
[7,] 0 7 14 21 28 35 42 49 56 
[8,] 0 8 16 24 32 40 48 56 64 
[9,] 0 9 18 27 36 45 54 63 72 
> id.indicator 
    1 2 3 4 5 6 7 8 9 
1 1 1 0 0 0 0 0 0 0 
2 1 1 0 0 0 0 0 0 0 
3 0 0 1 1 1 0 0 0 0 
4 0 0 1 1 1 0 0 0 0 
5 0 0 1 1 1 0 0 0 0 
6 0 0 0 0 0 1 1 1 1 
7 0 0 0 0 0 1 1 1 1 
8 0 0 0 0 0 1 1 1 1 
9 0 0 0 0 0 1 1 1 1 

在现实输出,可以有多达1000个id的簇,每个簇至少有40个,这使得这个方法效率太低,因为id.indicator中的稀疏条目以及在不使用的块外对角线上的额外计算。

+2

简单,并且仍非常活泼:字中,h < - 功能(A,B,富){总和(tapply(FOO $ X,FOO $ ID,函数(X){总和(tcrossprod(一 - x,b - x))}))}' – alistaire

回答

3

tapply允许您跨矢量组应用函数,并将结果简化为矩阵或向量(如果可以)。使用tcrossprod乘以每个组的所有组合,并在一些合适的大数据也表现良好:

# setup 
set.seed(47) 
foo = data.frame(x = 1:9, id = c(1, 1, 2, 2, 2, 3, 3, 3, 3)) 
foo2 <- data.frame(id = sample(1000, 40000, TRUE), x = rnorm(40000)) 

h_OP <- function(a, b, foo){ 
    a.diff = a - foo$x 
    b.diff = b - foo$x 
    prod = a.diff %*% t(b.diff) 
    id.indicator = as.matrix(ifelse(dist(foo$id, diag = T, upper = T),0,1)) + diag(nrow(foo)) 
    return(sum(prod * id.indicator)) 
} 

h3_AEBilgrau <- function(a, b, foo) { 
    a.diff <- a - foo$x 
    b.diff <- b - foo$x 
    ids <- unique(foo$id) 
    res <- 0 
    for (i in seq_along(ids)) { 
    indx <- which(foo$id == ids[i]) 
    res <- res + sum(tcrossprod(a.diff[indx], b.diff[indx])) 
    } 
    return(res) 
} 

h_d.b <- function(a, b, foo){ 
    sum(sapply(split(foo, foo$id), function(d) sum(outer(a-d$x, b-d$x)))) 
} 

h_alistaire <- function(a, b, foo){ 
    sum(tapply(foo$x, foo$id, function(x){sum(tcrossprod(a - x, b - x))})) 
} 

都返回同样的事情,而不是在小数据不同:

h_OP(0, 1, foo) 
#> [1] 891 
h3_AEBilgrau(0, 1, foo) 
#> [1] 891 
h_d.b(0, 1, foo) 
#> [1] 891 
h_alistaire(0, 1, foo) 
#> [1] 891 

# small data test 
microbenchmark::microbenchmark(
    h_OP(0, 1, foo), 
    h3_AEBilgrau(0, 1, foo), 
    h_d.b(0, 1, foo), 
    h_alistaire(0, 1, foo) 
) 
#> Unit: microseconds 
#>      expr  min  lq  mean median  uq  max neval cld 
#>   h_OP(0, 1, foo) 143.749 157.8895 189.5092 189.7235 214.3115 262.258 100 b 
#> h3_AEBilgrau(0, 1, foo) 80.970 93.8195 112.0045 106.9285 125.9835 225.855 100 a 
#>   h_d.b(0, 1, foo) 355.084 381.0385 467.3812 437.5135 516.8630 2056.972 100 c 
#> h_alistaire(0, 1, foo) 148.735 165.1360 194.7361 189.9140 216.7810 287.990 100 b 

尽管数据量较大,但差异变得更加严峻。最初扬言要崩溃我的笔记本电脑,但这里是最快的两个标杆:

# on 1k groups, 40k rows 
microbenchmark::microbenchmark(
    h3_AEBilgrau(0, 1, foo2), 
    h_alistaire(0, 1, foo2) 
) 
#> Unit: milliseconds 
#>      expr  min  lq  mean median  uq  max neval cld 
#> h3_AEBilgrau(0, 1, foo2) 336.98199 403.04104 412.06778 410.52391 423.33008 443.8286 100 b 
#> h_alistaire(0, 1, foo2) 14.00472 16.25852 18.07865 17.22296 18.09425 96.9157 100 a 

另一种可能性是使用data.frame按组进行总结,再总结相应的列。在R基础上,你可以用aggregate来做到这一点,但dplyr和data.table是流行的,因为使用更复杂的聚合可以使这种方法更简单。

aggregatetapply慢。 dplyr比aggregate快,但仍然比较慢。 data.table,专为速度而设计,几乎与tapply一样快。

library(dplyr) 
library(data.table) 

h_aggregate <- function(a, b, foo){sum(aggregate(x ~ id, foo, function(x){sum(tcrossprod(a - x, b - x))})$x)} 
tidy_h <- function(a, b, foo){foo %>% group_by(id) %>% summarise(x = sum(tcrossprod(a - x, b - x))) %>% select(x) %>% sum()} 
h_dt <- function(a, b, foo){setDT(foo)[, .(x = sum(tcrossprod(a - x, b - x))), by = id][, sum(x)]} 

microbenchmark::microbenchmark(
    h_alistaire(1, 0, foo2), 
    h_aggregate(1, 0, foo2), 
    tidy_h(1, 0, foo2), 
    h_dt(1, 0, foo2) 
) 
#> Unit: milliseconds 
#>      expr  min  lq  mean median  uq  max neval cld 
#> h_alistaire(1, 0, foo2) 13.30518 15.52003 18.64940 16.48818 18.13686 62.35675 100 a 
#> h_aggregate(1, 0, foo2) 93.08401 96.61465 107.14391 99.16724 107.51852 143.16473 100 c 
#>  tidy_h(1, 0, foo2) 39.47244 42.22901 45.05550 43.94508 45.90303 90.91765 100 b 
#>  h_dt(1, 0, foo2) 13.31817 15.09805 17.27085 16.46967 17.51346 56.34200 100 a 
3
sum(sapply(split(foo, foo$id), function(d) sum(outer(a-d$x, b-d$x)))) 
#[1] 891 

#TESTING 
foo = data.frame(x = sample(1:9,10000,replace = TRUE), 
         id = sample(1:3, 10000, replace = TRUE)) 
system.time(sum(sapply(split(foo, foo$id), function(d) sum(outer(a-d$x, b-d$x))))) 
# user system elapsed 
# 0.15 0.01 0.17 
6

我玩了一下。首先,您的实现:

foo = data.frame(x = 1:9, id = c(1, 1, 2, 2, 2, 3, 3, 3, 3)) 

h <- function(a, b, foo){ 
    a.diff = a - foo$x 
    b.diff = b - foo$x 
    prod = a.diff%*%t(b.diff) 
    id.indicator = as.matrix(ifelse(dist(foo$id, diag = T, upper = T),0,1)) + 
    diag(nrow(foo)) 
    return(sum(prod*id.indicator)) 
} 

h(a = 1, b = 0, foo = foo) 
#[1] 891 

接下来,我试图用一个适当稀疏矩阵实现(通过Matrix包),该指数基体的变形和功能。我也使用tcrossprod,我经常发现它比a %*% t(b)快一点。

library("Matrix") 

h2 <- function(a, b, foo) { 
    a.diff <- a - foo$x 
    b.diff <- b - foo$x 
    prod <- tcrossprod(a.diff, b.diff) # the same as a.diff%*%t(b.diff) 
    id.indicator <- do.call(bdiag, lapply(table(foo$id), function(n) matrix(1,n,n))) 
    return(sum(prod*id.indicator)) 
} 

h2(a = 1, b = 0, foo = foo) 
#[1] 891 

请注意,此功能依赖foo$id被排序。

最后,我试着避免通过n矩阵创建完整的n。

h3 <- function(a, b, foo) { 
    a.diff <- a - foo$x 
    b.diff <- b - foo$x 
    ids <- unique(foo$id) 
    res <- 0 
    for (i in seq_along(ids)) { 
    indx <- which(foo$id == ids[i]) 
    res <- res + sum(tcrossprod(a.diff[indx], b.diff[indx])) 
    } 
    return(res) 
} 

h3(a = 1, b = 0, foo = foo) 
#[1] 891 
您例如

标杆:

library("microbenchmark") 
microbenchmark(h(a = 1, b = 0, foo = foo), 
       h2(a = 1, b = 0, foo = foo), 
       h3(a = 1, b = 0, foo = foo)) 
# Unit: microseconds 
#      expr  min  lq  mean median  uq  max neval 
# h(a = 1, b = 0, foo = foo) 248.569 261.9530 493.2326 279.3530 298.2825 21267.890 100 
# h2(a = 1, b = 0, foo = foo) 4793.546 4893.3550 5244.7925 5051.2915 5386.2855 8375.607 100 
# h3(a = 1, b = 0, foo = foo) 213.386 227.1535 243.1576 234.6105 248.3775 334.612 100 

现在,在这个例子中,h3是最快h2实在是太慢了。但我想这对于更大的例子来说都会更快。可能h3仍然会赢得更大的例子。虽然有更多的优化空间,h3应该更快,更高效的内存。所以,我认为你应该去找一个h3的变体,它不会产生不必要的大矩阵。

+0

'tab'从哪里来? – Raad

+0

@NBATrends糟糕,应该是'ids';来自原型的剩余物。 –