2016-04-21 81 views
2

考虑以下数据集(问题:TraMineR::seqecmpgroup 报错“variable lengths differ (found for 'group')”):

# Example data: a data.frame with 300 rows and four integer columns.
#   CustumerId     - customer identifier (values 1 to 34; the spelling
#                    "Custumer" is kept because later code uses this name)
#   ProductId      - product identifier (values 1 to 10)
#   DaysSinceEpoch - used as the event timestamp in seqecreate() below
#   BoughtPAD      - 0/1 flag; it can differ between rows of the same customer
SimulatedDated <- structure(list(CustumerId = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 
5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 
8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L, 
10L, 10L, 10L, 10L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 
11L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 13L, 13L, 
13L, 13L, 13L, 13L, 13L, 13L, 13L, 14L, 14L, 14L, 14L, 14L, 14L, 
14L, 14L, 14L, 14L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 
15L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 17L, 17L, 17L, 
17L, 17L, 17L, 17L, 17L, 17L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 
18L, 18L, 18L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 20L, 20L, 
20L, 20L, 20L, 20L, 20L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 21L, 
22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 22L, 23L, 23L, 23L, 23L, 
23L, 23L, 23L, 23L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 
25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 26L, 26L, 26L, 
26L, 26L, 26L, 27L, 27L, 27L, 27L, 27L, 27L, 27L, 27L, 27L, 28L, 
28L, 28L, 28L, 28L, 28L, 28L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 
29L, 29L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 31L, 31L, 
31L, 31L, 31L, 31L, 31L, 31L, 31L, 32L, 32L, 32L, 32L, 32L, 32L, 
32L, 32L, 32L, 32L, 33L, 33L, 33L, 33L, 33L, 33L, 33L, 33L, 33L, 
34L, 34L, 34L, 34L, 34L), ProductId = c(6L, 3L, 4L, 9L, 8L, 10L, 
1L, 5L, 7L, 1L, 5L, 3L, 4L, 2L, 7L, 6L, 10L, 8L, 7L, 4L, 10L, 
5L, 1L, 3L, 8L, 6L, 2L, 9L, 6L, 1L, 2L, 4L, 7L, 8L, 5L, 9L, 10L, 
3L, 2L, 5L, 9L, 4L, 10L, 3L, 6L, 1L, 8L, 8L, 10L, 2L, 4L, 3L, 
9L, 5L, 6L, 5L, 6L, 4L, 9L, 10L, 8L, 2L, 7L, 1L, 3L, 10L, 3L, 
2L, 8L, 9L, 7L, 5L, 4L, 1L, 7L, 1L, 3L, 2L, 4L, 8L, 9L, 6L, 5L, 
10L, 1L, 9L, 2L, 4L, 7L, 3L, 8L, 7L, 9L, 8L, 4L, 10L, 3L, 5L, 
1L, 6L, 2L, 6L, 4L, 9L, 3L, 10L, 1L, 8L, 7L, 5L, 2L, 9L, 5L, 
7L, 4L, 10L, 1L, 3L, 2L, 6L, 5L, 9L, 2L, 4L, 3L, 8L, 1L, 10L, 
6L, 7L, 10L, 9L, 2L, 1L, 5L, 8L, 6L, 4L, 7L, 3L, 9L, 8L, 3L, 
5L, 6L, 10L, 1L, 7L, 4L, 1L, 6L, 9L, 10L, 3L, 4L, 2L, 8L, 7L, 
10L, 8L, 1L, 6L, 4L, 5L, 9L, 3L, 7L, 2L, 4L, 8L, 3L, 7L, 10L, 
1L, 6L, 5L, 5L, 6L, 4L, 7L, 1L, 10L, 3L, 10L, 8L, 3L, 1L, 4L, 
5L, 6L, 2L, 9L, 5L, 6L, 4L, 8L, 2L, 10L, 3L, 1L, 8L, 4L, 10L, 
6L, 9L, 7L, 2L, 3L, 8L, 3L, 6L, 7L, 9L, 4L, 5L, 2L, 10L, 1L, 
5L, 9L, 3L, 7L, 6L, 10L, 8L, 2L, 4L, 8L, 7L, 1L, 4L, 2L, 10L, 
10L, 3L, 8L, 1L, 7L, 5L, 4L, 6L, 2L, 10L, 6L, 1L, 2L, 5L, 4L, 
8L, 1L, 10L, 8L, 3L, 2L, 9L, 5L, 6L, 4L, 9L, 10L, 6L, 2L, 1L, 
7L, 4L, 8L, 5L, 1L, 5L, 9L, 10L, 3L, 8L, 7L, 2L, 4L, 10L, 1L, 
5L, 7L, 6L, 2L, 3L, 4L, 9L, 8L, 1L, 5L, 2L, 7L, 3L, 6L, 10L, 
4L, 9L, 9L, 5L, 10L, 8L, 2L), DaysSinceEpoch = c(7L, 20L, 31L, 
40L, 105L, 146L, 162L, 169L, 212L, 10L, 18L, 31L, 65L, 84L, 122L, 
156L, 202L, 206L, 1L, 4L, 7L, 11L, 14L, 24L, 25L, 100L, 148L, 
149L, 3L, 10L, 12L, 14L, 18L, 26L, 35L, 41L, 96L, 147L, 9L, 22L, 
66L, 80L, 102L, 104L, 170L, 199L, 234L, 10L, 24L, 36L, 38L, 75L, 
122L, 163L, 169L, 9L, 16L, 35L, 39L, 54L, 58L, 79L, 116L, 133L, 
224L, 27L, 35L, 37L, 49L, 73L, 91L, 105L, 141L, 252L, 16L, 28L, 
51L, 73L, 76L, 83L, 126L, 202L, 97L, 105L, 150L, 172L, 203L, 
207L, 223L, 256L, 259L, 25L, 28L, 38L, 40L, 63L, 100L, 120L, 
176L, 186L, 191L, 7L, 22L, 36L, 37L, 40L, 41L, 53L, 67L, 114L, 
233L, 1L, 16L, 17L, 23L, 40L, 52L, 125L, 184L, 186L, 12L, 42L, 
53L, 65L, 67L, 69L, 83L, 149L, 154L, 265L, 10L, 14L, 33L, 47L, 
67L, 106L, 133L, 181L, 247L, 258L, 6L, 21L, 26L, 41L, 49L, 68L, 
89L, 112L, 119L, 9L, 34L, 88L, 91L, 102L, 110L, 132L, 171L, 200L, 
6L, 14L, 21L, 36L, 40L, 60L, 64L, 88L, 109L, 208L, 8L, 17L, 21L, 
55L, 77L, 85L, 97L, 168L, 18L, 28L, 42L, 44L, 70L, 77L, 101L, 
14L, 23L, 33L, 84L, 107L, 123L, 124L, 125L, 25L, 29L, 33L, 57L, 
79L, 83L, 98L, 112L, 119L, 5L, 31L, 64L, 91L, 102L, 131L, 222L, 
234L, 27L, 46L, 48L, 60L, 61L, 64L, 72L, 103L, 161L, 8L, 24L, 
27L, 50L, 60L, 62L, 92L, 99L, 147L, 159L, 16L, 19L, 20L, 84L, 
175L, 202L, 17L, 21L, 25L, 46L, 69L, 121L, 161L, 175L, 267L, 
10L, 14L, 20L, 39L, 58L, 90L, 229L, 32L, 35L, 39L, 40L, 60L, 
66L, 98L, 153L, 173L, 2L, 3L, 25L, 46L, 51L, 80L, 96L, 166L, 
202L, 43L, 70L, 76L, 77L, 115L, 160L, 183L, 202L, 223L, 25L, 
33L, 61L, 72L, 74L, 77L, 85L, 91L, 152L, 265L, 16L, 62L, 63L, 
64L, 66L, 82L, 104L, 126L, 181L, 47L, 49L, 55L, 58L, 67L), BoughtPAD = c(0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 
1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L)), .Names = c("CustumerId", 
"ProductId", "DaysSinceEpoch", "BoughtPAD"), row.names = c(NA, 
300L), class = "data.frame") 

然后执行:

# Load TraMineR, which provides seqecreate(), seqefsub() and seqecmpgroup().
library(TraMineR)

# Build the event sequence object from the customer id, the timestamp and
# the product id (used as the event).
SimSeq <- seqecreate(
  id = SimulatedDated[["CustumerId"]],
  timestamp = SimulatedDated[["DaysSinceEpoch"]],
  event = SimulatedDated[["ProductId"]]
)

# Grouping factor taken straight from the BoughtPAD column, so it carries
# one value per row of SimulatedDated.
Cohort <- factor(SimulatedDated[["BoughtPAD"]], labels = c("PAD", "NPAD"))

# Extract the frequent subsequences (pMinSupport = 0.01).
Fsubseq <- seqefsub(seq = SimSeq, pMinSupport = 0.01)

# Compare the subsequences across the groups defined by Cohort.
DiscrCohort <- seqecmpgroup(subseq = Fsubseq, group = Cohort)

结果产生如下错误:

Error in model.frame.default(formula = ww ~ group + seqmatrix[, index]) : 
    variable lengths differ (found for 'group') 

我想知道,是什么导致了这个问题?

回答

1

group 变量的长度应等于序列的数量,即您的例子中的客户数量。此外,同一客户的分组取值应当在其所有事件中保持不变(在您的示例中并非如此)。

作为 group 参数使用的 Cohort 变量的长度是事件总数(300),而您只有 34 个客户。所以您需要按 CustumerId 对其进行聚合。

下面是具体做法(这里对每个客户取分组变量的最大值):

# Collapse the per-event BoughtPAD flag to one value per customer by taking
# the maximum within each CustumerId.
bylist <- list(id = SimulatedDated[["CustumerId"]])
agg.PAD <- aggregate(
  SimulatedDated[, c("CustumerId", "BoughtPAD")],
  by = bylist,
  FUN = "max"
)
Cohort <- agg.PAD[["BoughtPAD"]]

现在就可以寻找最能区分各组的子序列:

# Run the group comparison on the frequent subsequences and print the
# first ten entries of the result.
DiscrCohort <- seqecmpgroup(
  subseq = Fsubseq,
  group = Cohort
)
print(DiscrCohort[seq_len(10L)])

希望这对你有帮助。

+0

万分感谢,Ritschard 教授! – user189035