2017-04-12 70 views
0

我有一系列来自一系列曲棍球比赛的得分数据,并且我处于分析阶段。我试图在每场比赛中每10分钟画出主队的领先优势。根据得分数据定期计算球队领先优势

这里是哪里我到目前为止已经得到我的数据集的例子:

library(tidyverse) 

# Generate example data ordered by gameid and event_ts 
game <- tibble(event_type = "goal", event_ts = runif(n = 1000, min = 0, max = 60), 
     team = sample(c("home", "away"), size = 1000, replace = TRUE, prob = c(0.55,0.45)), 
     gameid = sample(100:300, size = 1000, replace = TRUE)) %>% 
    arrange(gameid, event_ts) 

我知道,我可以用summarise每场比赛的最终比分。下面是一个假设两队得分至少一个目标在每场比赛一个简单的例子:

game %>% 
    group_by(gameid, team) %>% 
    summarise(goals = n()) %>% 
    spread(key = team, value = goals) %>% 
    mutate(away = ifelse(is.null(away), 0, away)) 

我想在整个游戏10分钟间隔计算出主队的领先优势(正或负)。这需要总结那时发生的所有得分。这里有一个我想要得到的结构的例子:

finished_demo <- tibble(
    gameid = sort(rep_len(seq(100, 300, 1), 1206)), 
    timestamp = rep(seq(10, 60, 10), 201), 
    home_lead = round(runif(
    n = 1206, min = -5, max = 7 
)) 
) %>% arrange(gameid, timestamp) 
+1

'库(tidyverse);游戏%>%mutate(event_ts = ceiling(event_ts/10)* 10)%>%complete(event_ts,gameid,team)%>%group_by(gameid,team,event_ts)%>%summarize(score = coalesce(sum %>总结(ts = list(event_ts),score = list(cumsum(得分)))%>%unnest()%>%spread(团队,分数)%> %mutate(home_lead = home - away)' – alistaire

回答

4

下面是使用data.table完成它的一种方式,IIUC:

require(data.table) 
setDT(game) # generated with op's code but with a seed(1L) 

key <- CJ(gameid=unique(game$gameid), start=1L, end=(1:6)*10L) 
ans <- game[key, on=.(gameid, event_ts >= start, event_ts <= end), # (1) 
       .(home_lead=sum(team == "home")-sum(team == "away")), # (2) 
       by=.EACHI]           # (3) 

head(ans) 
# gameid event_ts event_ts home_lead 
# 1: 100  1  10  NA 
# 2: 100  1  20   1 
# 3: 100  1  30   0 
# 4: 100  1  40   0 
# 5: 100  1  50  -1 
# 6: 100  1  60  -2 

可以重命名重复的列名(我会当我得到的时间进行这项工作解决这个问题)。


(1)搜索在game行索引为每一行中key而上下on参数所提供的条件匹配的匹配。 (2)计算home组的领先优势。

(3).EACHI通知主队领先应该为key每一行匹配的game行来计算。

NA意味着没有匹配的事件。如果有必要,他们可以通过做代替,以0

ans[is.na(home_lead), home_lead := 0L] 
1

这个怎么样?

game %>% 
mutate(ten_min = event_ts %/% 10, 
     homegoal = if_else(team == 'home', 1, -1)) %>% 
group_by(ten_min, gameid) %>% 
summarize(home_lead_interval = sum(homegoal)) %>% 
ungroup() %>% 
group_by(gameid) %>% 
mutate(home_lead = cumsum(home_lead_interval)) %>% 
arrange(gameid, ten_min) 
# Source: local data frame [683 x 4] 
# Groups: gameid [198] 
# 
# ten_min gameid home_lead_interval home_lead 
#  <dbl> <int>    <dbl>  <dbl> 
# 1  0 100     0   0 
# 2  1 100     -1  -1 
# 3  2 100     -3  -4 
# 4  3 100     -1  -5 
# 5  4 100     2  -3 
# 6  5 100     -1  -4 
# 7  1 101     1   1 
# 8  2 101     1   2 
# 9  4 101     -2   0 
# 10  0 102     1   1 
# # ... with 673 more rows 
1

我很喜欢99%,肯定会有人可以与一些嵌入在purrr发现/嵌套(?)结构重写这个。不同的nrow()从上面的结果(使用相同的数据),所以不能保证解决方案是正确的。

game %>% 
     group_by(gameid) %>% 
     do(data.frame(time = 10 * (1:(max(.$event_ts) %/% 10)))) %>% 
     apply(1, function(x) { 
          g = x[1] %>% unlist 
          t = x[2] %>% unlist 
          game %>% 
           filter(gameid == g, event_ts < t) %>% 
           group_by(gameid, team) %>% 
           summarise(goals = n()) %>% 
           mutate(time = t) 
          }) %>% 
     bind_rows %>% 
     spread(key = team, value = goals) %>% 
     mutate_all(as.numeric) %>% 
     mutate(away = ifelse(is.na(away), 0, away), 
      home = ifelse(is.na(home), 0, home)) 


    gameid time away home 
    <int> <dbl> <dbl> <dbl> 
1  100 10  0  1 
2  100 20  1  3 
3  100 30  1  3 
4  101 20  0  1 
5  101 30  1  1 
6  101 40  1  2 
7  101 50  1  2 
1

我的想法是每10分钟获得主客场比分。然后你可以将基于gameid的data.frame分组,并创建你想要的结果。

set.seed(123) 
# Generate example data ordered by gameid and event_ts 
game <- tibble(event_type = "goal", event_ts = runif(n = 1000, min = 0, max = 60), 
       team = sample(c("home", "away"), size = 1000, replace = TRUE, prob = c(0.55,0.45)), 
       gameid = sample(100:300, size = 1000, replace = TRUE)) %>% 
    arrange(gameid, event_ts) 

# Change the event_ts and get all 10 minutes intervals 
hl <- game %>% 
    mutate(event_ts=ceiling(event_ts/10) * 10) %>% 
    dcast(gameid + event_ts ~ team, length) %>% 
    right_join(expand.grid(gameid=unique(game$gameid), event_ts=seq(10, 60, 10))) 
hl$away[is.na(hl$away)] <- 0 
hl$home[is.na(hl$home)] <- 0 
# Get the home lead 
hl <- hl %>% 
    arrange(gameid, event_ts) %>% 
    group_by(gameid) %>% 
    mutate(away=cumsum(away), 
     home=cumsum(home), 
     home_lead=home - away) 

# Check the game 100 and 101 
game %>% filter(gameid %in% 100:101) 
# A tibble: 7 × 4 
    event_type event_ts team gameid 
     <chr>  <dbl> <chr> <int> 
1  goal 30.460972 home 100 
2  goal 57.270219 home 100 
3  goal 1.126093 home 101 
4  goal 27.879957 home 101 
5  goal 33.086101 home 101 
6  goal 42.497419 away 101 
7  goal 45.649418 home 101 

hl %>% filter(gameid %in% 100:101) 
Source: local data frame [12 x 5] 
Groups: gameid [2] 

    gameid event_ts away home home_lead 
    <int> <dbl> <dbl> <dbl>  <dbl> 
1  100  10  0  0   0 
2  100  20  0  0   0 
3  100  30  0  0   0 
4  100  40  0  1   1 
5  100  50  0  1   1 
6  100  60  0  2   2 
7  101  10  0  1   1 
8  101  20  0  1   1 
9  101  30  0  2   2 
10 101  40  0  3   3 
11 101  50  1  4   3 
12 101  60  1  4   3