1
我有一个数据框,其中某一列的每一行都包含多个因子,以逗号分隔。每行中的因子数量以及不同因子的总数都是未知的。我需要对这一列进行独热编码(one-hot encoding),使每个不同的因子都各占一列。我在下面给出了一个解决方案,但我相信还有更好、更优雅的解决方案。下面是一个例子:对每行包含多个因子的数据框进行独热编码
#one hot encode multiple factors in each row
library(stringr)
library(caret)
library(splitstackshape)
# Create a toy data frame: 100 rows, each holding a comma-separated subset of
# the available factor levels.
set.seed(123)
factor.num <- sample(3:6, 1) # total number of distinct factor levels
# Draw exactly factor.num distinct letters. A hard-coded 4 here made
# sample(factors, k) fail for k > 4 and left the final_df column names out of
# step with its width whenever factor.num was not 4.
factors <- letters[sample(1:26, factor.num)]
df1 <- data.frame(
  fact = replicate(
    100,
    paste(sample(factors, sample(1:factor.num, 1)), collapse = ", ")
  )
)
df1

# Split "fact" into an unknown number of columns, one per list position.
df1_split <- cSplit(df1, "fact", ",")

# Convert every split column into dummy columns.
dmy <- dummyVars(" ~ .", data = df1_split)
trsf <- data.frame(predict(dmy, newdata = df1_split))

# Collapse the positional dummy columns into one indicator column per distinct
# factor level: 1 when the level occurs anywhere in the row, otherwise 0.
trsf_names <- colnames(trsf)
indicators <- lapply(factors, function(fac) {
  # Dummy columns for a level are matched by their ".<level>" suffix, the same
  # rule the regex "(?<=\\.)<level>$" expressed, without regex escaping issues.
  fac_cols <- trsf_names[endsWith(trsf_names, paste0(".", fac))]
  # drop = FALSE keeps a data frame even when only one column matches, so the
  # row-wise reduction below always receives a two-dimensional object.
  hits <- as.matrix(trsf[, fac_cols, drop = FALSE]) == 1
  # NA cells count as "not present", as any(x == 1, na.rm = TRUE) would.
  as.numeric(rowSums(hits, na.rm = TRUE) > 0)
})
names(indicators) <- paste0("all_", factors)
final_df <- as.data.frame(indicators)
final_df