2015-12-02 124 views
1

我在 R 中的数据集包含多个年份虚拟变量(year1、year2、year3 等)。如何把这些虚拟变量转换成一个取值为 1995、1996、1997 等的“year”变量?R:将多个年份虚拟变量转换为单个因子变量

在Stata我会做这样的事情:

// Create the new variable, starting every observation at a placeholder of 0.
gen year=0 
// Overwrite the placeholder with the calendar year wherever the matching
// year dummy equals 1 (one replace statement per dummy).
replace year=1995 if year1==1 
replace year=1996 if year2==1 

数据(dput 输出):

structure(list(wkd_ind = c(123L, 140L, 177L, 127L, 285L, 227L, 
333L, 135L, 124L, 395L, 104L, 362L, 204L, 309L, 510L, 154L, 276L, 
409L, 262L, 168L), assaults = c(2661L, 2845L, 3361L, 2490L, 5493L, 
4213L, 6579L, 2653L, 2849L, 6944L, 1650L, 5312L, 2917L, 4414L, 
7593L, 2041L, 5470L, 5531L, 4651L, 3159L), attend_v = c(0.74936, 
2.2334, 0.075539, 5.4919, 5.1195, 0.29706, 0.43023, 6.7021, 0.82108, 
0.49968, 3.0424, 0.15407, 2.0871, 0.081484, 0.7144, 9.9863, 3.7653, 
1.2931, 0.64987, 0.1372), attend_m = c(7.523, 6.4573, 14.575, 
5.2794, 7.5652, 10.649, 8.5319, 6.5313, 6.1471, 5.7738, 3.3895, 
3.42, 7.5825, 6.0173, 2.7251, 2.8784, 1.7649, 9.5522, 10.834, 
12.922), attend_n = c(5.5719, 2.5885, 8.3358, 4.2664, 6.3695, 
6.4263, 9.0384, 9.6412, 4.7777, 19.82, 20.971, 11.688, 18.561, 
10.305, 13.957, 4.942, 9.9064, 9.3939, 7.1644, 5.7901), h_chris = c(0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L), h_newyr = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), h_easter = c(0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L), h_july4 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), h_mem = c(0L, 0L, 
1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L), h_labor = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), w_maxa = c(0.16587, 
0.81338, 0.11745, 0.03471, 0.58038, 0.50356, 0.45934, 0.82159, 
0.52968, 0.21778, 0, 0, 0.094779, 0, 0.13667, 0, 0.1637, 0, 0, 
0), w_maxb = c(0, 0.00823, 0.31271, 0, 0.24928, 0, 0.12819, 0.12525, 
0.0092631, 0.67078, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), w_maxc = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 0.0041149, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0), w_mina = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.15232, 0.0014491, 
0, 0.00030794, 0, 8.93e-05, 0, 0.00062132, 0.00078076, 0), w_minb = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.038394, 0.036855, 0.22352, 0, 
0.32117, 0.0020882, 0.13658, 0.13159, 0.056588), w_minc = c(0.093716, 
0, 0, 0, 0, 0.041004, 0.018065, 0, 0.059047, 0, 0.12112, 0.13575, 
0.033517, 0.59676, 0, 0.3957, 0.073306, 0.46488, 0.56685, 0.31562 
), w_rain = c(0.2167, 0.17555, 0.29204, 0.38594, 0.66403, 0.24707, 
0.36952, 0.33298, 0.25875, 0.28135, 0.58494, 0.71564, 0.033189, 
0.24098, 0.14998, 0.19021, 0.52752, 0.18456, 0.4079, 0.17756), 
    w_snow = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0056599, 0, 
    0.034913, 0, 0.43373, 0, 0.048099, 0.02458, 0.044347), year1 = c(0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L), year2 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), year3 = c(1L, 
    1L, 0L, 1L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 
    0L, 0L, 0L, 0L), year4 = c(0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L), year5 = c(0L, 
    0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L), year6 = c(0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L), year7 = c(0L, 
    0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L), year8 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L), year9 = c(0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L), year10 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L), month1 = c(0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 1L, 0L), month2 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), month3 = c(0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 1L), month4 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L), month5 = c(1L, 
    0L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L), month6 = c(0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), month7 = c(0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L), month8 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), month9 = c(0L, 
    1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L), month10 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L), month11 = c(0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 
    0L, 1L, 0L, 0L), month12 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L), year = c(1995L, 
    1997L, 1996L, 1999L, 1998L, 2005L, 1999L, 1995L, 2005L, 1996L, 
    1996L, 2001L, 2002L, 1998L, 2002L, 2005L, 2004L, 2004L, 1996L, 
    2002L)), .Names = c("wkd_ind", "assaults", "attend_v", "attend_m", 
"attend_n", "h_chris", "h_newyr", "h_easter", "h_july4", "h_mem", 
"h_labor", "w_maxa", "w_maxb", "w_maxc", "w_mina", "w_minb", 
"w_minc", "w_rain", "w_snow", "year1", "year2", "year3", "year4", 
"year5", "year6", "year7", "year8", "year9", "year10", "month1", 
"month2", "month3", "month4", "month5", "month6", "month7", "month8", 
"month9", "month10", "month11", "month12", "year"), datalabel = "", time.stamp = "13 Nov 2015 17:05", formats = c("%8.0g", 
"%8.0g", "%12.0g", "%12.0g", "%12.0g", "%8.0g", "%8.0g", "%8.0g", 
"%8.0g", "%8.0g", "%8.0g", "%12.0g", "%12.0g", "%12.0g", "%12.0g", 
"%12.0g", "%12.0g", "%12.0g", "%12.0g", "%8.0g", "%8.0g", "%8.0g", 
"%8.0g", "%8.0g", "%8.0g", "%8.0g", "%8.0g", "%8.0g", "%8.0g", 
"%8.0g", "%8.0g", "%8.0g", "%8.0g", "%8.0g", "%8.0g", "%8.0g", 
"%8.0g", "%8.0g", "%8.0g", "%8.0g", "%8.0g"), types = c(65529L, 
65529L, 65526L, 65526L, 65526L, 65530L, 65530L, 65530L, 65530L, 
65530L, 65530L, 65526L, 65526L, 65526L, 65526L, 65526L, 65526L, 
65526L, 65526L, 65530L, 65530L, 65530L, 65530L, 65530L, 65530L, 
65530L, 65530L, 65530L, 65530L, 65530L, 65530L, 65530L, 65530L, 
65530L, 65530L, 65530L, 65530L, 65530L, 65530L, 65530L, 65530L 
), val.labels = c("", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", ""), var.labels = c("", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", ""), version = 117L, label.table = list(), expansion.fields = list(), strl = structure(character(0), .Names = character(0)), byteorder = "LSF", row.names = c(122L, 
139L, 176L, 126L, 284L, 226L, 332L, 134L, 123L, 394L, 103L, 361L, 
203L, 308L, 506L, 153L, 275L, 408L, 261L, 167L), class = "data.frame") 
+0

您可以发布您的输入数据的样本? –

+0

所以没有ID列,只有年N列? –

+0

我有一个周末指标列,对应第一个周末、第二个周末,依此类推。 – Parseltongue

回答

1
# Example data: y is a payload column, d1..d3 are mutually exclusive dummies.
(df <- data.frame(y = 1:4, d1 = c(1, 0, 0, 1), d2 = c(0, 1, 0, 0), d3 = c(0, 0, 1, 0)))
# y d1 d2 d3
# 1 1 1 0 0
# 2 2 0 1 0
# 3 3 0 0 1
# 4 4 1 0 0
cols <- 2:4 # or c("d1", "d2", "d3") - Dummy variable columns
nval <- 1999:2001 # New corresponding values, one per dummy column
stopifnot(length(nval) == length(cols))

dummies <- df[, cols, drop = FALSE]
# sweep() multiplies column j of the dummies by nval[j], so each non-zero
# cell now carries the value that its column stands for.
scaled <- sweep(dummies, 2, nval, "*")
# Collapse to exactly one value per row. Picking the non-zero cells directly
# would return one value per non-zero cell, which misaligns (or errors) as
# soon as a row has no dummy set or more than one.
year <- rowSums(scaled)
# Rows without exactly one dummy set are ambiguous: mark them as missing.
year[rowSums(dummies != 0) != 1] <- NA
df$year <- unname(year)
df
# y d1 d2 d3 year
# 1 1 1 0 0 1999
# 2 2 0 1 0 2000
# 3 3 0 0 1 2001
# 4 4 1 0 0 1999
+0

这非常有趣——你能解释一下 sweep 是如何工作的吗?你是在遍历所有的列吗?我怎样才能只遍历与年份对应的列?*编辑——我现在明白要用的是哪些列了 – Parseltongue

+0

@Parseltongue,`cols` 是一个变量,保存虚拟变量列的索引或名称(也就是你想要遍历的列)。`nval` 则保存对应的新值,例如 `d1` 列对应 `1999`,`d2` 列对应 `2000`,依此类推。请查看 `?sweep`:它执行某种运算,在这里是把 `df[, cols]` 的每一行乘以 `nval`;可以运行 `sweep(df[, cols], 2, nval, "*")` 看看结果。 – Julius

+0

这给了我一些奇怪的结果:http://i.imgur.com/OjkfTrK.png – Parseltongue

1

下面是完成该任务的一些基本代码。代码中的 df 就是你的数据框。

# Build a toy data frame holding two mutually exclusive year dummies
set.seed(10)
year1 <- round(runif(10, 0, 1))
year2 <- 1 - year1
df <- data.frame(year1 = year1, year2 = year2)

# Translate each dummy into its calendar year; rows matching neither stay NA
df$year <- NA
df$year[df$year1 %in% 1] <- 1995
df$year[df$year2 %in% 1] <- 1996 # etc

如果你有很多虚拟变量,而且这些虚拟变量与年份变量之间没有简单的对应关系(例如顺序被打乱,或者编号中跳过了某些年份),那么可以使用下面这样的循环。只要把变量 years 和 year_names 定义为你要引用的值(年份和虚拟变量名),它就非常灵活。当虚拟变量与年份之间存在明确的关系时,使用不需要循环的公式可能会更高效。

# names of the year dummy variables
year_names <- c("year1", "year2")

# calendar years corresponding to the year1, year2, ... columns
years <- seq(1995, 1996, 1)

# each dummy column must be paired with exactly one calendar year
stopifnot(length(years) == length(year_names))

# initialise the column; rows where no dummy is set remain NA
df$year <- NA

# loop over the year dummy variables; seq_along() yields an empty sequence
# for an empty vector, whereas 1:length(x) would iterate over c(1, 0)
for (i in seq_along(year_names)) {
  # [[ ]] extracts the column as a plain vector, and %in% treats a missing
  # dummy as "not set" instead of propagating NA into the index
  df$year[df[[year_names[i]]] %in% 1] <- years[i]
}
+0

如果你展示这样一个循环的外观可能会更有帮助。 – Frank

0

这个怎么样?

year_cols <- paste0("year", 1:10)
# For every row, find the position of the single year dummy that equals 1.
# A row with no dummy set (or with several) yields NA; without this guard
# which() would return a vector of length 0 or > 1, apply() would hand back
# a list, and the addition below would fail.
year_index <- apply(my_data[, year_cols], 1, function(x) {
  hit <- which(x == 1)
  if (length(hit) == 1L) hit else NA_integer_
})
# Position 1 corresponds to 1995, position 2 to 1996, and so on.
my_data$year <- 1994 + year_index

我们先取出包含年份虚拟变量的那些列,然后用 apply 函数表示:“对每一行,告诉我哪一列等于 1”。这样会得到一个索引,把它加上 1994,就得到索引 1 对应 1995、索引 2 对应 1996,依此类推。

(我把 my_data 设为你上面 dput 输出的内容。)

2

下面是使用 dplyr 和 tidyr 的一种方法:

library(dplyr) 
library(tidyr) 

d %>%
  # turn the year dummy columns into rows; names_prefix strips the leading
  # "year" so the new `year` column holds just the index ("1", "2", ...).
  # pivot_longer() is the current tidyr replacement for gather().
  pivot_longer(
    starts_with("year"),
    names_to = "year",
    names_prefix = "year",
    values_to = "dummy"
  ) %>%
  # keep only the row of the dummy that is actually set
  filter(dummy == 1) %>%
  # convert the index to a number and add it to the base year
  # (replaces the deprecated tidyr::extract_numeric())
  mutate(year = as.numeric(year) + 1994) %>%
  # remove the now-constant dummy indicator
  select(-dummy)