2017-03-15 25 views
0

我想使用年份信息来计算年龄。我有以下特征的数据集:根据年份的顺序推算年龄

dat <- data.table(id = c(rep(1, 8), rep(2, 8)), 
        year = c(2007:2014, 2007:2014), 
        age = c(1, NA, 3, NA, NA, 5, 7, NA, NA, NA, 30, NA, 32, 35, NA, NA), 
        age_imp= c(1, 2, 3, 4, 5, 5, 7, 8, 28, 29, 30, 31, 32, 35, 36, 37) 
) 


    id year age age_imp 
1: 1 2007 1  1 
2: 1 2008 NA  2 
3: 1 2009 3  3 
4: 1 2010 NA  4 
5: 1 2011 NA  5 
6: 1 2012 5  5 
7: 1 2013 7  7 
8: 1 2014 NA  8 
9: 2 2007 NA  28 
10: 2 2008 NA  29 
11: 2 2009 30  30 
12: 2 2010 NA  31 
13: 2 2011 32  32 
14: 2 2012 35  35 
15: 2 2013 NA  36 
16: 2 2014 NA  37 

原始变量age并不总是与一个每年持续时间(例如,一个采访比以前的采访中,测量误差等,一年少加)所以我想保持它的样子。对于NA行,我想逐年开始一个序列(例如,age_imp)。

有关如何做到这一点的任何建议?

+0

在新的例子,你要归咎于两列? – akrun

回答

1

您可以首先使用第一个非NA年龄来形成线性方程,并在每个ID内线性插值&而不先处理跳跃。

然后,确定每个身份的年龄跳跃/步数。

然后,再次考虑到跳跃,对每个组(即id和步骤对)进行插值和外推。

更多解释直列..

#ensure order is correct before using shift 
setorder(dat, id, year) 

#' Fill NA by interpolating and extrapolating using a known point 
#' 
#' @param dt - data.table 
#' @param years - the xout that are required 
#' 
#' @return a numeric vector of ages given the years 
#' 
extrapolate <- function(dt, years) { 
    #find the first non NA entry 
    firstnonNA <- head(dt[!is.na(age)], 1) 

    #using linear equation y - y_1 = 1 * (x - x_1) 
    as.numeric(sapply(years, function(x) (x - firstnonNA$year) + firstnonNA$age)) 
} 

#interp and extrap age for years that are missing age assuming linearity without jumps 
dat[, imp1 := extrapolate(.SD, year), by="id"] 

#identifying when the age jumps up/down 
dat[, jump:=cumsum(
     (!is.na(age) & imp1!=age) | 
     (!is.na(age) & !is.na(shift(age)) & (age+1)!=shift(age)) 
    ), by="id"] 

#interp and extrap age for years taking into account jumps 
dat[, age_imp1 := extrapolate(.SD, year), by=c("id","jump")] 

#print results 
dat[,c("imp1","jump"):=NULL][] 

#check if the results are identical as requested 
dat[, identical(age_imp, age_imp1)] 
0

我终于创造了这个功能:

impute.age <- function(age) { 
    if (any(is.na(age))) { 
    min.age <- min(age, na.rm = TRUE) 
    position <- which(age == min.age)[1] # ties 
    if (!is.na(position)) { 
    if (position > 1) { # initial values 
    for (i in 1:(position-1)) { 
     age[position - i] <- age[position] - i 
    } 
    } 
    missing <- which(is.na(age)) # missing data position 
    for (i in missing) { 
    age[i] = age[i-1] + 1 
    } 
    } else { age = as.numeric(NA) } 
} 
return(age) 
}