2017-02-10 89 views
1

在 R 中为根据列表计算出的值添加标签(示例数据如下):

# Reference set: the animals recorded in each reference city.
Newyork <- c(
  "ant", "bat", "cat", "dog", "unicorn", "camel", "snake", "monkey", "donkey"
)
Tokyo <- c("unicorn")
Yokohama <- c("ant", "bat", "cat", "dog")
# "donkey" used to be spelled " donkey" (stray leading space) here, which made
# it a different string from the "donkey" listed for Newyork.
Chicago <- c(
  "bird", "ant", "bat", "cat", "bear", "dog", "snake", "monkey", "mouse",
  "donkey", "octopus", "camel"
)
Nashville <- c("ant", "bat", "octopus")
DC <- c("ant", "dog", "cat", "bird")
Boston <- c(
  "ant", "bird", "cat", "bear", "camel", "snake", "mouse", "octopus"
)

# Query set: the city that is compared against every reference city.
Elendel <- c("wolf", "dog", "ant")

# Combine the references into one list (alphabetical by city name).
Refcities <- list(Boston, Chicago, DC, Nashville, Newyork, Tokyo, Yokohama)

代码:

#' Outersect (symmetric difference) of two vectors
#'
#' @param x,y Vectors of the same mode, e.g. character vectors of item names.
#' @return The distinct values that occur in exactly one of `x` and `y`, in
#'   order of first appearance in `c(x, y)`.
outersect <- function(x, y) {
  # Work on set membership instead of looking for repeats in c(x, y): a value
  # repeated inside a single input is not thereby shared with the other input,
  # so it must still be reported.
  setdiff(union(x, y), intersect(x, y))
}

# Unweighted similarity of two sets: every shared item counts +1 and every
# item found in only one of the sets counts -1.
unweighted <- function(x, y, ...) {
  # Anything passed through `...` is accepted but not used.
  n_shared <- length(intersect(x, y))
  n_unshared <- length(outersect(x, y))
  n_shared - n_unshared
}

#' Print the unweighted similarity of a query set to each reference set
#'
#' Walks `refs` from position `y` to the end and prints one score per
#' reference set, as before. The scores are additionally handed back to the
#' caller so that they can be labelled or sorted instead of being lost.
#'
#' @param x Vector of items for the query set.
#' @param y Position in `refs` to start from (a single number >= 1).
#' @param refs List of reference sets; defaults to the global `Refcities`.
#' @return Invisibly, a numeric vector with one score per visited reference
#'   set, named by `names(refs)` when the list is named. Empty when `y` lies
#'   beyond the end of `refs`.
UWshort <- function(x, y, refs = Refcities) {
  stopifnot(is.numeric(y), length(y) == 1L, !is.na(y), y >= 1)

  n_refs <- length(refs)
  if (y > n_refs) {
    # Nothing left to visit; seq() would otherwise count backwards.
    return(invisible(numeric(0)))
  }

  positions <- seq(y, n_refs)
  scores <- vapply(
    positions,
    function(i) unweighted(x, refs[[i]]),
    numeric(1)
  )
  names(scores) <- names(refs)[positions]

  # Keep the original console output: one bare score per line.
  for (score in unname(scores)) {
    print(score)
  }
  invisible(scores)
}

# Score Elendel against every reference city, starting at the first one.
UWshort(Elendel, 1) 

上面的代码把一个城市的动物与 7 个参考城市的动物逐一比较,并为每个参考城市给出一个分数:分数 =(共有的动物数)-(不共有的动物数)。

测试结果如下:

[1] -8 
[1] -9 
[1] -1 
[1] -3 
[1] -6 
[1] -4 
[1] -1 

如何把参考城市的名称加回到结果中,并按分数的数值大小排序?

理想的输出如下所示:

City  Score 
[1] DC  -1 
[2] Yokohama -1 
[3] Nashville -3 
[4] Tokyo  -4 
[5] Newyork -6 
[6] Boston -8 
[7] Chicago -9 
+0

为什么不使用矩阵或数据框,并通过向量化函数(如 `table`、`tapply`、`ave`、`aggregate`)来统计动物?另外,你能解释一下这些结果吗?例如,DC 的所有动物都至少与另一个城市共有,它的得分怎么会是 -1? – Parfait

回答

0

这样得到的结果以城市名称作为行名(rownames);如果你希望城市名称是单独的一列,只需把行名复制成一列即可。

# Name each reference set so that the city labels survive the scoring step.
Refcities <- list(
  Boston = Boston, Chicago = Chicago, DC = DC, Nashville = Nashville,
  Newyork = Newyork, Tokyo = Tokyo, Yokohama = Yokohama
)

# Score every city against Elendel and order from best to worst. vapply() is
# used instead of sapply() so the result is always a named integer vector,
# whatever Refcities contains.
score <- sort(
  vapply(Refcities, unweighted, integer(1), Elendel),
  decreasing = TRUE
)

# The city names become the row names; the single column is called `score`.
df1 <- data.frame(score)

> df1 
      score 
DC   -1 
Yokohama  -1 
Nashville -3 
Tokyo  -4 
Newyork  -6 
Boston  -8 
Chicago  -9 
0

保留你的 outersect 函数,其余部分改用 tidyverse 来完成:

#' Outersect (symmetric difference) of two vectors
#'
#' @param x,y Vectors of the same mode, e.g. character vectors of item names.
#' @return The distinct values that occur in exactly one of `x` and `y`, in
#'   order of first appearance in `c(x, y)`.
outersect <- function(x, y) {
  # Everything seen in either input, minus what both inputs have in common.
  # Unlike a duplicate scan over c(x, y), this does not mistake a value that
  # is repeated within one input for a shared value.
  in_either <- union(x, y)
  in_both <- intersect(x, y)
  setdiff(in_either, in_both)
}

# Reference set: the animals recorded in each reference city.
Newyork <- c(
  "ant", "bat", "cat", "dog", "unicorn", "camel", "snake", "monkey", "donkey"
)
Tokyo <- c("unicorn")
Yokohama <- c("ant", "bat", "cat", "dog")
# "donkey" used to be spelled " donkey" (stray leading space) here, which made
# it a different string from the "donkey" listed for Newyork.
Chicago <- c(
  "bird", "ant", "bat", "cat", "bear", "dog", "snake", "monkey", "mouse",
  "donkey", "octopus", "camel"
)
Nashville <- c("ant", "bat", "octopus")
DC <- c("ant", "dog", "cat", "bird")
Boston <- c(
  "ant", "bird", "cat", "bear", "camel", "snake", "mouse", "octopus"
)

# Query set: the city that is compared against every reference city.
Elendel <- c("wolf", "dog", "ant")

library(tidyverse) 
# Combine the references into a named list, so the city names travel with the
# data instead of living in a separately typed vector that could drift.
Refcities <- list(
  Boston = Boston,
  Chicago = Chicago,
  DC = DC,
  Nashville = Nashville,
  Newyork = Newyork,
  Tokyo = Tokyo,
  Yokohama = Yokohama
)
cities <- names(Refcities)

# One row per city: count the shared and unshared animals, score them, and
# list the best-matching cities first.
# tibble() replaces the deprecated data_frame(); vapply() pins both count
# columns to integer regardless of what Refcities contains.
df <- tibble(
  City = cities,
  inter = vapply(
    Refcities,
    function(x) length(intersect(x, Elendel)),
    integer(1)
  ),
  outer = vapply(
    Refcities,
    function(x) length(outersect(x, Elendel)),
    integer(1)
  )
) %>%
  mutate(Score = inter - outer) %>%
  arrange(desc(Score)) %>%
  select(City, Score)