2016-04-15 100 views
1

我有一个包含若干 POSIX 日期的 data.frame d,我想通过 d$date$year <- 100 来修改年份。这看起来运行正常,但在选取 data.frame 中的某些行之后,除了第一个之外,所有修改过的日期都变成了 <NA>。我在这里做错了什么?请参阅下面的代码(R-Fiddle)。标题:data.frame 内修改过的日期在选择行之后变为 <NA>

# Minimal reproduction: a POSIXlt column whose `year` component is overwritten
# with a single value prints fine as a whole, but turns into <NA> for every
# row after the first once the data.frame is row-subset.
date <- c("2014-01-01","2015-01-02","2016-01-03") 
val <- c("a","b","c") 
d <- data.frame(date,val) 
# strptime() returns POSIXlt, so `d$date` is stored as a POSIXlt column.
d$date <- strptime(d$date,format="%Y-%m-%d") 
d 
#  date val 
#1 2014-01-01 a 
#2 2015-01-02 b 
#3 2016-01-03 c 
# correct date as expected 

# Row-subsetting with an all-TRUE logical index keeps every row; with the
# untouched POSIXlt column this works as expected.
d[c(TRUE,TRUE,TRUE),] 
#  date val 
#1 2014-01-01 a 
#2 2015-01-02 b 
#3 2016-01-03 c 
# correct dates as expected 

# Work on a copy of the date column so the original stays intact.
d$date2000 <- d$date 
# NOTE(review): only ONE value is assigned although the column holds three
# dates, so the `year` component ends up with length 1 while the other
# components keep length 3. This mismatch is what later produces <NA>.
d$date2000$year <- 100 # set year to 2000 

# Printing the whole data.frame still shows three valid dates: the single
# `year` value appears to be recycled when the full column is formatted.
d 
#  date val date2000 
#1 2014-01-01 a 2000-01-01 
#2 2015-01-02 b 2000-01-02 
#3 2016-01-03 c 2000-01-03 
# correct dates as expected 

# The same all-TRUE row selection now goes through element-wise indexing of
# the POSIXlt column; rows 2 and 3 have no matching `year` element and
# come back as <NA>.
d[c(TRUE,TRUE,TRUE),] 
#  date val date2000 
#1 2014-01-01 a 2000-01-01 
#2 2015-01-02 b  <NA> 
#3 2016-01-03 c  <NA> 
# first entry correct, second and third entry <NA> 
+0

奇怪,您运行的是哪个版本@KunalPuri?我在Mac OS上运行3.2.4。 – bnord

+0

我添加了一个显示相同行为的R-Fiddle。 – bnord

+0

确实可以重现;不过,纯粹出于我个人的好奇,你打算用 d[c(TRUE,TRUE,TRUE)] 这种写法来测试什么?你原本的假设或预期结果是什么? – Chris

回答

1

何时发生此问题?

问题似乎出现在对函数 `[.data.frame` 的调用中(例如 d[c(TRUE,TRUE,TRUE),],d[1:3,] 甚至 d[3,] 也同样如此)。下面是该函数的定义:

> `[.data.frame` 
function (x, i, j, drop = if (missing(i)) TRUE else length(cols) == 
    1) 
{ 
    mdrop <- missing(drop) 
    Narg <- nargs() - (!mdrop) 
    has.j <- !missing(j) 
    if (!all(names(sys.call()) %in% c("", "drop")) && !isS4(x)) 
     warning("named arguments other than 'drop' are discouraged") 
    if (Narg < 3L) { 
     if (!mdrop) 
      warning("'drop' argument will be ignored") 
     if (missing(i)) 
      return(x) 
     if (is.matrix(i)) 
      return(as.matrix(x)[i]) 
     nm <- names(x) 
     if (is.null(nm)) 
      nm <- character() 
     if (!is.character(i) && anyNA(nm)) { 
      names(nm) <- names(x) <- seq_along(x) 
      y <- NextMethod("[") 
      cols <- names(y) 
      if (anyNA(cols)) 
       stop("undefined columns selected") 
      cols <- names(y) <- nm[cols] 
     } 
     else { 
      y <- NextMethod("[") 
      cols <- names(y) 
      if (!is.null(cols) && anyNA(cols)) 
       stop("undefined columns selected") 
     } 
     if (anyDuplicated(cols)) 
      names(y) <- make.unique(cols) 
     attr(y, "row.names") <- .row_names_info(x, 0L) 
     attr(y, "class") <- oldClass(x) 
     return(y) 
    } 
    if (missing(i)) { 
     if (drop && !has.j && length(x) == 1L) 
      return(.subset2(x, 1L)) 
     nm <- names(x) 
     if (is.null(nm)) 
      nm <- character() 
     if (has.j && !is.character(j) && anyNA(nm)) { 
      names(nm) <- names(x) <- seq_along(x) 
      y <- .subset(x, j) 
      cols <- names(y) 
      if (anyNA(cols)) 
       stop("undefined columns selected") 
      cols <- names(y) <- nm[cols] 
     } 
     else { 
      y <- if (has.j) 
       .subset(x, j) 
      else x 
      cols <- names(y) 
      if (anyNA(cols)) 
       stop("undefined columns selected") 
     } 
     if (drop && length(y) == 1L) 
      return(.subset2(y, 1L)) 
     if (anyDuplicated(cols)) 
      names(y) <- make.unique(cols) 
     nrow <- .row_names_info(x, 2L) 
     if (drop && !mdrop && nrow == 1L) 
      return(structure(y, class = NULL, row.names = NULL)) 
     else { 
      attr(y, "class") <- oldClass(x) 
      attr(y, "row.names") <- .row_names_info(x, 0L) 
      return(y) 
     } 
    } 
    xx <- x 
    cols <- names(xx) 
    x <- vector("list", length(x)) 
    x <- .Internal(copyDFattr(xx, x)) 
    oldClass(x) <- attr(x, "row.names") <- NULL 
    if (has.j) { 
     nm <- names(x) 
     if (is.null(nm)) 
      nm <- character() 
     if (!is.character(j) && anyNA(nm)) 
      names(nm) <- names(x) <- seq_along(x) 
     x <- x[j] 
     cols <- names(x) 
     if (drop && length(x) == 1L) { 
      if (is.character(i)) { 
       rows <- attr(xx, "row.names") 
       i <- pmatch(i, rows, duplicates.ok = TRUE) 
      } 
      xj <- .subset2(.subset(xx, j), 1L) 
      return(if (length(dim(xj)) != 2L) xj[i] else xj[i, 
       , drop = FALSE]) 
     } 
     if (anyNA(cols)) 
      stop("undefined columns selected") 
     if (!is.null(names(nm))) 
      cols <- names(x) <- nm[cols] 
     nxx <- structure(seq_along(xx), names = names(xx)) 
     sxx <- match(nxx[j], seq_along(xx)) 
    } 
    else sxx <- seq_along(x) 
    rows <- NULL 
    if (is.character(i)) { 
     rows <- attr(xx, "row.names") 
     i <- pmatch(i, rows, duplicates.ok = TRUE) 
    } 
    for (j in seq_along(x)) { 
     xj <- xx[[sxx[j]]] 
     x[[j]] <- if (length(dim(xj)) != 2L) 
      xj[i] 
     else xj[i, , drop = FALSE] 
    } 
    if (drop) { 
     n <- length(x) 
     if (n == 1L) 
      return(x[[1L]]) 
     if (n > 1L) { 
      xj <- x[[1L]] 
      nrow <- if (length(dim(xj)) == 2L) 
       dim(xj)[1L] 
      else length(xj) 
      drop <- !mdrop && nrow == 1L 
     } 
     else drop <- FALSE 
    } 
    if (!drop) { 
     if (is.null(rows)) 
      rows <- attr(xx, "row.names") 
     rows <- rows[i] 
     if ((ina <- anyNA(rows)) | (dup <- anyDuplicated(rows))) { 
      if (!dup && is.character(rows)) 
       dup <- "NA" %in% rows 
      if (ina) 
       rows[is.na(rows)] <- "NA" 
      if (dup) 
       rows <- make.unique(as.character(rows)) 
     } 
     if (has.j && anyDuplicated(nm <- names(x))) 
      names(x) <- make.unique(nm) 
     if (is.null(rows)) 
      rows <- attr(xx, "row.names")[i] 
     attr(x, "row.names") <- rows 
     oldClass(x) <- oldClass(xx) 
    } 
    x 
} 
<bytecode: 0x7fe8cc3a5548> 
<environment: namespace:base> 

相关的部分在这里:

for (j in seq_along(x)) { 
      xj <- xx[[sxx[j]]] 
      x[[j]] <- if (length(dim(xj)) != 2L) 
       xj[i] 
      else xj[i, , drop = FALSE] 
     } 

此时(以 d[3,] 为例),各变量的值如下:

> str(xx) 
'data.frame': 3 obs. of 3 variables: 
$ date : POSIXlt, format: "2014-01-01" "2015-01-02" "2016-01-03" 
$ val  : Factor w/ 3 levels "a","b","c": 1 2 3 
$ date2000: POSIXlt, format: "2000-01-01" "2000-01-02" "2000-01-03" 
> str(x) 
List of 3 
$ date : NULL 
$ val  : NULL 
$ date2000: NULL 
> i 
[1] 3 
> str(sxx) 
int [1:3] 1 2 3 

对于j = 3我们有:

> str(xj) 
POSIXlt[1:3], format: "2000-01-01" "2000-01-02" "2000-01-03" 
> dim(xj) 
NULL 
> xj[3] 
[1] NA 

所以失败就发生在这里。我认为(正如你提到的),问题的根源在于你只用 1 个值而不是 3 个值替换了 d$date2000$year:

> xj$wday 
[1] 3 5 0 
> xj$year 
[1] 100 
> xj[3] 
[1] NA 
> xj$year <- c(100,100,100) 
> xj[3] 
[1] "2000-01-03 CET" 

看起来,在显示整个 xj(或 d)时,xj$year 的值会被循环补齐(recycling);但在只显示 xj[3] 时,它会尝试构造一个 POSIXlt 对象,并因为缺少对应的 year 元素而失败。实际上,如果我们改用两个元素(而不是一个或三个),就可以看到该向量确实被循环使用了:

> xj$year <- c(100,101) 
> xj 
[1] "2000-01-01 CET" "2001-01-02 CET" "2000-01-03 CET" 
> xj[2] 
[1] "2001-01-02 CET" 
> xj[3] 
[1] NA 
+0

感谢您深入挖掘。 – bnord

1

问题似乎出在 d$date2000$year <- 100 上。改用 d$date2000$year <- rep(100,length(d$date2000)) 后即可按预期工作;不过我不清楚为什么修改后的 data.frame 在选择行之前能给出预期的结果。

相关问题