2016-11-07 62 views
1

我对R编程相对较新,所以如果这个问题太基础,我很抱歉。我有交易可以显示来自六种不同类型产品的收入。有三年的交易。我的目标是找出所有不同产品组合的销售产品总和,每年产品的总和为2^6 - 1 = 64 - 1 = 63。意思是,我会有63*3 = 189的组合。使用d中的dplyr对所有变量组合使用dplyr总和值

为了简单起见,我仅使用三个变量创建了测试数据,因为我使用while循环编写了一年的程序,这太臭了。我的目标是展示我正在努力完成的事情。尽管如此,我从下面的原始文件中发布了随机样本。

这里的测试数据只有三个变量CarTireServiceswhile环告诉你什么是我要找:

dput(Sample_File) 
structure(list(Order.ID = c(171, 173, 132, 174, 132, 174, 132, 
174, 174), Fiscal.Year = c(2017, 2016, 2016, 2016, 2016, 2016, 
2016, 2016, 2018), Car = c(2, 2, 3, 1, 0, 0, 0, 0, 1), Tire = c(0, 
0, 0, 1, 0, 1, 0, 1, 1), Services = c(3, 1, 4, 0, 4, 1, 4, 0, 
0)), .Names = c("Order.ID", "Fiscal.Year", "Car", "Tire", "Services" 
), row.names = c(NA, 9L), class = "data.frame") 

这里是我的代码:

i<-1 
    Csum <- matrix(rep(0,21),nrow = 7,ncol = 3) 
    # Row 1 is used when C is ON; T is ON ; S is ON 
    # Row 2 is used when C is ON; T is ON ; S is OFF 
    # Row 3 is used when C is ON; T is OFF ; S is ON 
    # Row 4 is used when C is OFF; T is ON ; S is ON 
    # Row 5 is used when C is ON; T is OFF ; S is OFF 
    # Row 6 is used when C is OFF; T is ON ; S is OFF 
    # Row 7 is used when C is OFF; T is OFF ; S is ON 

    while (i <= length(Sample_File$Order.ID)) 
    { 
     if (Sample_File$Fiscal.Year[i]!=2016) 
     { 
     i<-i+1 
     next 
     } 
     if (Sample_File$Car[i]!=0 & Sample_File$Tire[i]!=0 & Sample_File$Services[i]!=0)#1 
     { 
     Csum[1,1] <- Csum[1,1] + Sample_File$Car[i] 
     Csum[1,2] <- Csum[1,2] + Sample_File$Tire[i] 
     Csum[1,3] <- Csum[1,3] + Sample_File$Services[i] 

     } 
     else if (Sample_File$Car[i]!=0 & Sample_File$Tire[i]!=0 & Sample_File$Services[i]==0) #2 
     { 
     Csum[2,1] <- Csum[2,1] + Sample_File$Car[i] 
     Csum[2,2] <- Csum[2,2] + Sample_File$Tire[i] 
     Csum[2,3] <- Csum[2,3] + 0 
     } 
     else if(Sample_File$Car[i]!=0 & Sample_File$Tire[i]==0 & Sample_File$Services[i]!=0) #3 
     { 

     Csum[3,1] <- Csum[3,1] + Sample_File$Car[i] 
     Csum[3,2] <- Csum[3,2] + 0 
     Csum[3,3] <- Csum[3,3] + Sample_File$Services[i] 
     } 
     else if(Sample_File$Car[i]==0 & Sample_File$Tire[i]!=0 & Sample_File$Services[i]!=0) #4 
     { 
     Csum[4,1] <- Csum[4,1] + 0 
     Csum[4,2] <- Csum[4,2] + Sample_File$Tire[i] 
     Csum[4,3] <- Csum[4,3] + Sample_File$Services[i] 
     } 
     else if(Sample_File$Car[i]!=0 & Sample_File$Tire[i]==0 & Sample_File$Services[i]==0) #5 
     { 
     Csum[5,1] <- Csum[5,1] + Sample_File$Car[i] 
     Csum[5,2] <- Csum[5,2] + 0 
     Csum[5,3] <- Csum[5,3] + 0 
     } 
     else if(Sample_File$Car[i]==0 & Sample_File$Tire[i]!=0 & Sample_File$Services[i]==0)#6 
     { 
     Csum[6,1] <- Csum[6,1] + 0 
     Csum[6,2] <- Csum[6,2] + Sample_File$Tire[i] 
     Csum[6,3] <- Csum[6,3] + 0 
     } 
     else #7 
     { 
      Csum[7,1] <- Csum[7,1] + 0 
      Csum[7,2] <- Csum[7,2] + 0 
      Csum[7,3] <- Csum[7,3] + Sample_File$Services[i] 
     } 
     i<-i+1 
    } 

我有编写代码只处理一年,因为复制此代码三年非常痛苦。我正在寻找一种解决方案,可以创建3个数据帧的列表,每个数据帧为三年。

下面是一个随机样本,大小为10,包含原始文件中的六个变量。

dput(Sample_File_Random) 
structure(list(Order.ID = c(171, 173, 132, 174, 169, 175, 163, 
186, 178, 121), Fiscal.Year = c(2016, 2016, 2017, 2016, 2015, 
2016, 2015, 2015, 2015, 2017), Car = c(2, 0, 3, 0, 0, 0, 0, 5346.25, 
0, 0), Tire = c(0, 0, 0, 8691.55800460666, 3198, 5, 2, 0, 2, 
3282.18), Services = c(3, 0, 4, 0, 0, 0, 0, 0, 0, 0), Insurance = c(4, 
0, 0, 4, 0, 4, 0, 0, 0, 0), Accessories = c(94.3, 3749.8, 9308.65, 
0, 2, 0, 1, 633.75, 51.44, 0), Finance = c(0, 0, 0, 4, 0, 14800, 
0, 0, 0, 0)), .Names = c("Order.ID", "Fiscal.Year", "Car", "Tire", 
"Services", "Insurance", "Accessories", "Finance"), row.names = c(NA, 
10L), class = "data.frame") 

我真的很坚持,所以我将真诚地感谢与矢量化这个任何帮助..


@ Ronak Shah的要求:下面是Sample_File_Random

Output_File 
    Fiscal.Year  Car  Tire Services Insurance Accessories Finance 
1  2015 0.00 3202.000  0   0  54.44  0 
2  2015 5346.25 0.000  0   0  633.75  0 
3  2016 2.00 0.000  3   4  94.30  0 
4  2016 0.00 0.000  0   0  3749.80  0 
5  2016 0.00 8696.558  0   8  0.00 14804 
6  2017 3.00 0.000  4   0  9308.65  0 
7  2017 0.00 3282.180  0   0  0.00  0 
+0

@Ronak沙阿 - 我将它。如果您有更多问题,请让我知道。 – watchtower

+0

@Ronak Shah - 感谢您的提问。您将添加这些行,因为它们表示给定年份的相同产品组合的收入。这是写在我的问题(标题),即总结他们。另外,我在'while'循环代码中完成了这项工作。 – watchtower

+0

@Ronak Shah - 是的,你确实是对的。这是我手边的一个错误。对此我很抱歉。我已纠正它。行178应该被添加。示例输出手动完成,因此错误进入。 – watchtower

回答

1

这里是一个紧凑&表现dplyr溶液,将其进行在三个步骤:

  1. 创建指标每个服务是否是在篮子或不
  2. 组由年,并且的指标
  3. 总和组合由分组变量

在这里,服务是值执行此代码:

df_foo %>% 
    # 1. create the combinations of whether each of the 
    # products is in the basket or not 
    mutate_each(
    funs(In_Basket = . > 0), Car:Services 
) %>% 
    # 2. group by the year and the basket service indicators 
    group_by_(.dots = c("Fiscal.Year", grep("_In_Basket", names(.), value = TRUE))) %>% 
    # 3. sum the service values 
    summarise_each(
    funs(sum(., na.rm = TRUE)), Car:Services 
) 

这使输出:

Source: local data frame [7 x 7] 
Groups: Fiscal.Year, Car_In_Basket, Tire_In_Basket [?] 

    Fiscal.Year Car_In_Basket Tire_In_Basket Services_In_Basket Car Tire Services 
     <dbl>   <lgl>   <lgl>    <lgl> <dbl> <dbl> <dbl> 
1  2016   FALSE   FALSE    TRUE  0  0  8 
2  2016   FALSE   TRUE    FALSE  0  1  0 
3  2016   FALSE   TRUE    TRUE  0  1  1 
4  2016   TRUE   FALSE    TRUE  5  0  5 
5  2016   TRUE   TRUE    FALSE  1  1  0 
6  2017   TRUE   FALSE    TRUE  2  0  3 
7  2018   TRUE   TRUE    FALSE  1  1  0 
+0

您的回复非常好。我很好奇 - 你为什么使用'In_Basket'来做'group_by_'?我认为'group_by'只能用于分组行和不分栏。不是吗?我是初学者,所以我很感谢你解释这个概念。再次感谢你的帮助。 – watchtower

+1

@watchtower'In_Basket'变量不是行,它们是列。你可以看看'mutate_each'后面的数据是什么样的,看看这些列是什么样的。 – tchakravarty

0

什么预期的输出这里不错的挑战....

Using your dataset that I called test. I chose to approach this with matrices. 
names<-colnames(test[3:8]) 
library(combinat) 
one<-t(combn(names,1)) 
two<-t(combn(names,2)) 
three<-t(combn(names,3)) 
four<-t(combn(names,4)) 
five<-t(combn(names,5)) 
six<-t(combn(names,6)) 
library(plyr) 
myset<-unname(rbind.fill.matrix(one,two,three,four,five,six)) 
head(myset,3); tail(myset,3) 

提供了以下:

 [,1]  [,2] [,3] [,4] [,5] [,6] 
[1,] "Car"  NA NA NA NA NA 
[2,] "Tire"  NA NA NA NA NA 
[3,] "Services" NA NA NA NA NA 
     [,1] [,2]  [,3]  [,4]   [,5]   [,6]  
[61,] "Car" "Services" "Insurance" "Accessories" "Finance"  NA  
[62,] "Tire" "Services" "Insurance" "Accessories" "Finance"  NA  
[63,] "Car" "Tire"  "Services" "Insurance" "Accessories" "Finance" 

使用dplyr逐年得到您的款项:

library(dplyr) 
testsums<- test %>% select(-Order.ID) %>% group_by(Fiscal.Year) %>% summarise_each(funs(mean)) 
testsums 
A tibble: 3 × 7 
    Fiscal.Year  Car Tire Services Insurance Accessories Finance 
     <dbl> <dbl> <dbl> <dbl>  <dbl>  <dbl> <dbl> 
1  2015 1336.562 800.50  0.00   0 172.0475  0 
2  2016 0.500 2174.14  0.75   3 961.0250 3701 
3  2017 1.500 1641.09  2.00   0 4654.3250  0 

创建1的的MATIX和0被你每年的款项矢量相同的六个变量乘以。

mult.matrix<-myset 
mult.matrix[!is.na(mult.matrix)]<-1 
mult.matrix[is.na(mult.matrix)]<-0 
class(mult.matrix) <- "numeric" 
head(mult.matrix,3);tail(mult.matrix,3) 
    [,1] [,2] [,3] [,4] [,5] [,6] 
[1,] 1 0 0 0 0 0 
[2,] 1 0 0 0 0 0 
[3,] 1 0 0 0 0 0 
     [,1] [,2] [,3] [,4] [,5] [,6] 
[61,] 1 1 1 1 1 0 
[62,] 1 1 1 1 1 0 
[63,] 1 1 1 1 1 1 

将年总和转换为矩阵表示法。乘以它乘以mult.matrix。将3个新列绑定到原始组合数据集。

year_sums<-unname(as.matrix(testsums[1:3,2:7])) 
all_sums<-mult.matrix %*% t(year_sums) 
myset<-unname(rbind.fill.matrix(one,two,three,four,five,six)) 
myset<-cbind(myset,all_sums) 
head(myset,5); tail(myset,5) 
    [,1]   [,2] [,3] [,4] [,5] [,6] [,7]  [,8] [,9] 
[1,] "Car"   NA NA NA NA NA "1336.5625" "0.5" "1.5" 
[2,] "Tire"  NA NA NA NA NA "1336.5625" "0.5" "1.5" 
[3,] "Services" NA NA NA NA NA "1336.5625" "0.5" "1.5" 
[4,] "Insurance" NA NA NA NA NA "1336.5625" "0.5" "1.5" 
[5,] "Accessories" NA NA NA NA NA "1336.5625" "0.5" "1.5" 
     [,1] [,2]  [,3]  [,4]   [,5]   [,6]  [,7]  [,8]    [,9]  
[59,] "Car" "Tire"  "Services" "Accessories" "Finance"  NA  "2309.11" "3139.41450115167" "6298.915" 
[60,] "Car" "Tire"  "Insurance" "Accessories" "Finance"  NA  "2309.11" "3139.41450115167" "6298.915" 
[61,] "Car" "Services" "Insurance" "Accessories" "Finance"  NA  "2309.11" "3139.41450115167" "6298.915" 
[62,] "Tire" "Services" "Insurance" "Accessories" "Finance"  NA  "2309.11" "3139.41450115167" "6298.915" 
[63,] "Car" "Tire"  "Services" "Insurance" "Accessories" "Finance" "2309.11" "6840.41450115166" "6298.915" 

这可以清理很多。我选择了穿过我的思考过程。你现在可以把最后一个矩阵,转换成数据框,重命名头等等...