2017-03-17 106 views
0

当我了解到 merge() 有一个默认值为 TRUE 的附加参数 `sort` 时,我真以为问题已经解决了。但是,把它设为 FALSE 并没有帮助。下面是我的代码示例,以及我得到的结果和我想要的结果。问题:如何在 R 中合并两个数据框,同时保持第一个数据框的行顺序?

# Player lookup table: one row per player, with the player's id and team name.
# The non-sequential integer row names of the original dput() output are kept.
df2 <- data.frame(
  player = c(
    "Marvin Williams", "Spencer Hawes", "Jeremy Lin", "Kemba Walker",
    "P.J. Hairston", "Rudy Gay", "Rajon Rondo", "DeMarcus Cousins",
    "Ben McLemore", "Willie Cauley-Stein"
  ),
  global.player.id = c(
    263884L, 329824L, 340730L, 462980L, 609567L,
    266358L, 262882L, 509450L, 604898L, 699950L
  ),
  team.name = c(rep("Hornets", 4L), "Grizzlies", rep("Kings", 5L)),
  row.names = c(47L, 48L, 52L, 53L, 225L, 389L, 390L, 395L, 398L, 401L),
  stringsAsFactors = FALSE
)

# Tracking data: 25 position samples keyed by player id (-1 has no match in
# df2). `order` records the original row sequence 1..25 so that any shuffling
# caused by a merge is easy to spot.
df1 <- data.frame(
  # The ids repeat in a fixed cycle of 11 values; 25 rows = two full cycles
  # plus the first three ids.
  global.player.id = rep(
    c(
      -1L, 262882L, 266358L, 509450L, 604898L, 699950L,
      263884L, 329824L, 340730L, 462980L, 609567L
    ),
    length.out = 25L
  ),
  x_loc = c(
    47.17753, 13.57165, 46.45843, 26.68803, 52.16717,
    47.20201, 60.097, 47.20201, 52.16717, 65.1302,
    46.45843, 47.19141, 13.61702, 46.5355, 26.71856,
    52.25433, 47.27324, 60.08215, 47.27324, 52.25433,
    65.11267, 46.5355, 46.82163, 13.66478, 46.57545
  ),
  y_loc = c(
    26.44326, 25.18298, 18.46573, 25.48557, 33.09177,
    31.09372, 22.79717, 31.09372, 33.09177, 26.39671,
    18.46573, 26.5187, 25.17431, 18.42014, 25.53807,
    33.11185, 31.01197, 22.76307, 31.01197, 33.11185,
    26.40227, 18.42014, 26.72834, 25.17784, 18.35961
  ),
  # Stored as double (not integer), matching the original literal c(1, 2, ...).
  order = as.numeric(1:25),
  # Character row names "1".."25", as in the original dput() output.
  row.names = as.character(1:25),
  stringsAsFactors = FALSE
)

以上是我正在使用的两个数据框。当我将 df2 合并到 df1 时,我想保留 df1 的行顺序。我处理的是时间序列数据,所以数据框的行顺序很重要。df1 中的 order 列仅用于检验 df1 是否被打乱(我不想在合并之后再用额外的代码重新排序)。

这里是我试过:

# Left join on player id; merge() sorts the result on the key column by default.
merge(x = df1, y = df2, by = "global.player.id", all.x = TRUE)

    global.player.id x_loc y_loc order    player team.name 
1    -1 47.17753 26.44326  1    <NA>  <NA> 
2    -1 46.82163 26.72834 23    <NA>  <NA> 
3    -1 47.19141 26.51870 12    <NA>  <NA> 
4   262882 13.57165 25.18298  2   Rajon Rondo  Kings 
5   262882 13.61702 25.17431 13   Rajon Rondo  Kings 
6   262882 13.66478 25.17784 24   Rajon Rondo  Kings 
7   263884 60.08215 22.76307 18  Marvin Williams Hornets 
8   263884 60.09700 22.79717  7  Marvin Williams Hornets 
9   266358 46.53550 18.42014 14   Rudy Gay  Kings 
10   266358 46.45843 18.46573  3   Rudy Gay  Kings 
11   266358 46.57545 18.35961 25   Rudy Gay  Kings 
12   329824 47.27324 31.01197 19  Spencer Hawes Hornets 
13   329824 47.20201 31.09372  8  Spencer Hawes Hornets 
14   340730 52.16717 33.09177  9   Jeremy Lin Hornets 
15   340730 52.25433 33.11185 20   Jeremy Lin Hornets 
16   462980 65.13020 26.39671 10  Kemba Walker Hornets 
17   462980 65.11267 26.40227 21  Kemba Walker Hornets 
18   509450 26.71856 25.53807 15 DeMarcus Cousins  Kings 
19   509450 26.68803 25.48557  4 DeMarcus Cousins  Kings 
20   604898 52.16717 33.09177  5  Ben McLemore  Kings 
21   604898 52.25433 33.11185 16  Ben McLemore  Kings 
22   609567 46.53550 18.42014 22  P.J. Hairston Grizzlies 
23   609567 46.45843 18.46573 11  P.J. Hairston Grizzlies 
24   699950 47.20201 31.09372  6 Willie Cauley-Stein  Kings 
25   699950 47.27324 31.01197 17 Willie Cauley-Stein  Kings 

df1 中的 order 列原本是按 1-25 排列的,现在全乱套了。显然,df1 以一种我不想要的方式被打乱了。下面是我向 merge 函数传入 sort = FALSE 时的输出:

# Same left join with sorting disabled; per ?merge the rows then come back in
# an unspecified order, not necessarily the order of df1.
merge(x = df1, y = df2, by = "global.player.id", all.x = TRUE, sort = FALSE)

    global.player.id x_loc y_loc order    player team.name

1   262882 13.57165 25.18298  2   Rajon Rondo  Kings 
2   262882 13.61702 25.17431 13   Rajon Rondo  Kings 
3   262882 13.66478 25.17784 24   Rajon Rondo  Kings 
4   266358 46.53550 18.42014 14   Rudy Gay  Kings 
5   266358 46.45843 18.46573  3   Rudy Gay  Kings 
6   266358 46.57545 18.35961 25   Rudy Gay  Kings 
7   509450 26.71856 25.53807 15 DeMarcus Cousins  Kings 
8   509450 26.68803 25.48557  4 DeMarcus Cousins  Kings 
9   604898 52.16717 33.09177  5  Ben McLemore  Kings 
10   604898 52.25433 33.11185 16  Ben McLemore  Kings 
11   699950 47.20201 31.09372  6 Willie Cauley-Stein  Kings 
12   699950 47.27324 31.01197 17 Willie Cauley-Stein  Kings 
13   263884 60.08215 22.76307 18  Marvin Williams Hornets 
14   263884 60.09700 22.79717  7  Marvin Williams Hornets 
15   329824 47.27324 31.01197 19  Spencer Hawes Hornets 
16   329824 47.20201 31.09372  8  Spencer Hawes Hornets 
17   340730 52.16717 33.09177  9   Jeremy Lin Hornets 
18   340730 52.25433 33.11185 20   Jeremy Lin Hornets 
19   462980 65.13020 26.39671 10  Kemba Walker Hornets 
20   462980 65.11267 26.40227 21  Kemba Walker Hornets 
21   609567 46.53550 18.42014 22  P.J. Hairston Grizzlies 
22   609567 46.45843 18.46573 11  P.J. Hairston Grizzlies 
23    -1 47.17753 26.44326  1    <NA>  <NA> 
24    -1 46.82163 26.72834 23    <NA>  <NA> 
25    -1 47.19141 26.51870 12    <NA>  <NA> 

这同样不是我想要的,因为行顺序又一次被完全打乱了。

有没有办法在调用 merge 函数时,不打乱传入的第一个数据框的行顺序?还是说我完全没有办法?如果是后者,这似乎是 merge() 函数的一个主要缺点。谢谢!

回答

1

你可以使用 plyr 包中的 join 函数:

library(plyr)
# Left join of df2 onto df1 by player id, with join()'s defaults spelled out.
# Unlike merge(), plyr::join() keeps the row order of its first argument.
plyr::join(
  x = df1,
  y = df2,
  by = "global.player.id",
  type = "left",
  match = "all"
)

结果会按 df1 原有的行顺序排列。