2017-04-21 71 views
0

我写了一个函数,它接受任何DataFrame并评估每个列以返回汇总表。现在,对于分类在Answer Label列下的任何Variable Name,我想将Variable TypeAnswer Code向下移一行。R:DataFrame格式化操作

示例代码:

CreateCodebook <- function(dF){ 
    numbercols <- length(colnames(dF)) 

    table <- data.frame() 

    for (i in 1:length(colnames(dF))){ 
    AnswerCode <- if (sapply(dF, is.factor)[i]) 1:nrow(unique(dF[i])) else NA 
    AnswerLabel <- if (sapply(dF, is.factor)[i]) as.vector(unique(dF[order(dF[i]),][i])) else "Open ended" 
    VariableName <- if (length(AnswerCode) > 1) c(colnames(dF)[i], 
                rep(NA,length(AnswerCode) - 1)) else colnames(dF)[i] 
    VariableLabel <- if (length(AnswerCode) > 1) c(colnames(dF)[i], 
                rep(NA,length(AnswerCode) - 1)) else colnames(dF)[i] 
    VariableType <- if (length(AnswerCode) > 1) c(sapply(dF, class)[i], 
                rep(NA,length(AnswerCode) - 1)) else sapply(dF, class)[i] 

    df = data.frame(VariableName, VariableLabel, AnswerLabel, AnswerCode, VariableType, stringsAsFactors = FALSE) 
    names(df) <- c("Variable Name", "Variable Label", "Variable Type", "Answer Code", "Answer Label") 
    table <- rbind(table, df) 

    } 
    rownames(table) <- 1:nrow(table) 
    return(table) 
} 

使用该数据集MASS::anorexia,我得到这个输出从我的功能:

Variable Name Variable Label Variable Type Answer Code Answer Label 
1   Treat   Treat   CBT   1  factor 
2   <NA>   <NA>   Cont   2   <NA> 
3   <NA>   <NA>   FT   3   <NA> 
4   Prewt   Prewt Open ended   NA  numeric 
5  Postwt   Postwt Open ended   NA  numeric 

所需的输出:

Variable Name Variable Label Variable Type Answer Code Answer Label 
1   Treat   Treat   <NA>   NA  factor 
2   <NA>   <NA>   CBT   1   <NA> 
3   <NA>   <NA>   Cont   2   <NA> 
4   <NA>   <NA>   FT   3   <NA> 
5   Prewt   Prewt Open ended   NA  numeric 
6  Postwt   Postwt Open ended   NA  numeric 
+0

确保你提供[再现的示例](http://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example)与样品输入到测试函数。 – MrFlick

+0

谢谢。我现在在我的文章中提供了一个可重现的例子。 –

回答

2

希望这将工作:

CreateCodebook <- function(dF){ 
    numbercols <- length(colnames(dF)) 

    table <- data.frame() 

    for (i in 1:length(colnames(dF))){ 
     AnswerCode <- if (sapply(dF, is.factor)[i]) 1:nrow(unique(dF[i])) else NA 
     AnswerLabel <- if (sapply(dF, is.factor)[i]) as.vector(unique(dF[order(dF[i]),][i])) else "Open ended" 
     VariableName <- if (length(AnswerCode) > 1) c(colnames(dF)[i], 
                 rep(NA,length(AnswerCode) - 1)) else colnames(dF)[i] 
     VariableLabel <- if (length(AnswerCode) > 1) c(colnames(dF)[i], 
                 rep(NA,length(AnswerCode) - 1)) else colnames(dF)[i] 
     VariableType <- if (length(AnswerCode) > 1) c(sapply(dF, class)[i], 
                 rep(NA,length(AnswerCode) - 1)) else sapply(dF, class)[i] 

     df = data.frame(VariableName, VariableLabel, AnswerLabel, AnswerCode, VariableType, stringsAsFactors = FALSE) 
     names(df) <- c("Variable Name", "Variable Label", "Variable Type", "Answer Code", "Answer Label") 
     table <- rbind(table, df) 

    } 


    # add a new column of row id 
    table$row <- 1:nrow(table) 

    # created new rows to be added 
    x <- table[which(table$`Answer Label` == 'factor'), ] 
    x[, c(1, 2, 5)] <- NA 

    # change original factor rows 
    table[which(table$`Answer Label` == 'factor'), 3:4] <- NA 

    # combine the two data.frame and reorder rows 
    table <- rbind(table, x) 
    table <- table[order(table$row), -ncol(table)] 

    rownames(table) <- 1:nrow(table) 
    return(table) 
} 
+0

谢谢 - 但它需要集成到我的函数中,以便它可以应用于任何数据框。看起来你的方式是硬编码的? –

+0

@RileyHun,这个呢? – mt1022

+0

哇!这工作很好。非常感谢。真的很感激它。 –

1

以下解决方案需要dplyr,tidyrdata.table包中的函数。

# Load packages 
library(dplyr) 
library(tidyr) 
library(data.table) 

# A function to adjust the output of the CreateCodebook function 
Adjust_factor <- function(dF){ 

    dF2 <- dF %>% 
    # Create a new column called Indicator, which is a copy of Answer Label 
    mutate(Indicator = `Answer Label`) %>% 
    # Impute NA based on the previous and nearest non-NA value 
    fill(Indicator) %>% 
    # Create run length group number 
    mutate(Index = rleid(Indicator)) 

    # Split the data frame to list based on the Index 
    dF_list <- split(dF2, f = dF2$Index) 

    # Adjust each data frame subset 
    dF_list2 <- lapply(dF_list, function(x){ 

    if (x$Indicator[1] == "factor"){ # If Indicator is "factor" 

     # Copy and bind the first row 
     x <- bind_rows(x[1, ], x) 
     # Change the content of the first and second row. Replace the value with NA 
     x[1, c("Variable Type", "Answer Code")] <- NA 
     x[2, c("Variable Name", "Variable Label", "Answer Label")] <- NA 
    } 
    return(x) 
    }) 

    # Combine all data frame 
    dF3 <- bind_rows(dF_list2) %>% 
    # Remove the Indicator and Index column 
    select(-Indicator, -Index) 

    return(dF3) 
} 

# Test the function 
library(MASS) 
data(anorexia) 
dat1 <- anorexia 
dat2 <- CreateCodebook(dat1) 
dat3 <- Adjust_factor(dat2) 

test1 <- data.frame(a = c("a", "b", "c"), 
        b = c(1, 2, 3), 
        c = 10:12, 
        d = seq(as.Date("2001-01-01"), as.Date("2001-01-03"), 1), 
        e = c("o", "p", "q")) 

test2 <- CreateCodebook(test1) 
test3 <- Adjust_factor(test2) 
+0

谢谢ycw。这是一个很好的解决方案。我去了另一个,因为它不依赖任何外部软件包,并被集成到我的功能。 –