2017-04-03 42 views
1

我有一个相当大的数据集(15.000行),并且由于数据结构,我需要对每行进行计算。我的数据集中有一列需要进一步拆分。下面是一个例子:对所有行执行操作并将结果添加回主数据框

# Sample input: three records. Each Route string packs three ";"-separated
# segments, and every segment holds eight ":"-separated fields.
date <- c("2015-07-10", "2013-05-06", "2017-08-10")
Number <- c(345, 231, 10)
Route <- c(
  "GCLP:10011:-8848:56:-4:270:260:12;LPC:1211:-828:56:-2:22:220:22;GCCC:13451:-85458:556:-45:45:76:67",
  "DPAP:10011:-8848:56:-4:270:260:12;LTTC:1211:-828:56:-2:22:220:22;ATCH:13451:-85458:556:-45:45:76:67",
  "AMN:10011:-8848:56:-4:270:260:12;RET:1211:-828:56:-2:22:220:22;LLOP:13451:-85458:556:-45:45:76:67"
)
Dep <- c("FGC", "HAM", "ICAO")

# Column names are taken from the variable names.
Plan <- data.frame(date, Number, Route, Dep)

对我来说,重要的信息是在“路线”列中。我需要从这一栏生成聚合功能。该列中每个单元格中的信息都需要被“;”分隔。

我试过到目前为止:

  1. 选择一行

  2. 创建一个新的数据帧正好与此一行。

  3. 对“Route”列使用 mutate 和 unnest,在“;”处将其拆分,并为拆分出的每个片段创建一个新行

    test <- Plan[1, ]
    test <- test %>% mutate(Route = strsplit(as.character(Route), ";")) %>% unnest(Route)

  4. 使用 cSplit 按“:”拆分“Route”列中的信息

    # Spread the ":"-separated fields of Route into columns Route_1, Route_2, ...
    test <- cSplit(indt = test, splitCols = "Route", sep = ":")
    
  5. 我则对数据的这个子集进行我的计算。

  6. 创建变量 x、y、z 来保存我的计算结果

    # Summaries over the split rows: mean of field 2, max of field 5,
    # min of field 8.
    x1 <- mean(test[["Route_2"]])
    y1 <- max(test[["Route_5"]])
    z1 <- min(test[["Route_8"]])
    

两个问题:

我怎么能在我的原始数据集自动执行此操作的所有行? 如何将保存的变量(x,y,z)中的数据合并回原始数据框?

期望的输出(下面 x2、y2、z2 和 x3、y3、z3 的值并不是由数据实际计算得到的,仅作为示例)

# Illustrative per-row results (placeholders, not computed from the data).
x1 <- 12
x2 <- 45
x3 <- 5648

y1 <- 86363
y2 <- 6754
y3 <- 64

z1 <- 7383
z2 <- 3553
z3 <- 6363

# Attach one value per row of Plan as new columns x, y and z.
Plan[["x"]] <- c(x1, x2, x3)
Plan[["y"]] <- c(y1, y2, y3)
Plan[["z"]] <- c(z1, z2, z3)

head(Plan)

完整的示例代码(可一次性运行):

library(splitstackshape) 
library(plyr) 
library(tidyr) 

# Sample input: three records. Each Route string packs three ";"-separated
# segments, and every segment holds eight ":"-separated fields.
date <- c("2015-07-10", "2013-05-06", "2017-08-10")
Number <- c(345, 231, 10)
Route <- c(
  "GCLP:10011:-8848:56:-4:270:260:12;LPC:1211:-828:56:-2:22:220:22;GCCC:13451:-85458:556:-45:45:76:67",
  "DPAP:10011:-8848:56:-4:270:260:12;LTTC:1211:-828:56:-2:22:220:22;ATCH:13451:-85458:556:-45:45:76:67",
  "AMN:10011:-8848:56:-4:270:260:12;RET:1211:-828:56:-2:22:220:22;LLOP:13451:-85458:556:-45:45:76:67"
)
Dep <- c("FGC", "HAM", "ICAO")

Plan <- data.frame(date, Number, Route, Dep)

# Process the first row only: one output row per ";"-separated segment,
# then one column per ":"-separated field (Route_1, Route_2, ...).
test <- Plan[1, ] %>%
  mutate(Route = strsplit(as.character(Route), ";")) %>%
  unnest(Route)
test <- cSplit(indt = test, splitCols = "Route", sep = ":")

# Summaries for row 1: mean of field 2, max of field 5, min of field 8.
x1 <- mean(test[["Route_2"]])
y1 <- max(test[["Route_5"]])
z1 <- min(test[["Route_8"]])

# Placeholder values standing in for rows 2 and 3 (not computed).
x2 <- 45
y2 <- 6754
z2 <- 3553
x3 <- 5648
y3 <- 64
z3 <- 6363

# Attach one value per row of Plan as new columns x, y and z.
Plan[["x"]] <- c(x1, x2, x3)
Plan[["y"]] <- c(y1, y2, y3)
Plan[["z"]] <- c(z1, z2, z3)

head(Plan)
+0

请包括您提供的示例data.frame所需的输出。我怀疑你想要'strsplit',但是我不完全确定最终的data.frame。 – lmo

+0

感谢您的信息! – Anna2803

+0

'tidyr'包中的'separate'函数在这里可能会有所帮助 – bouncyball

回答

2

创建第二个临时路由列,名为Route_tmp,并从中为其每个分量生成一个单独的行,以分号分隔,然后用冒号将结果变量Route_tmp分隔成单独的列。现在按原始变量进行分组,我们采用所需列的平均值。 (请注意,如果我们在输出中不需要Route,那么我们可以忽略顶部的mutate并使用Route代替Route_tmp。)

library(dplyr) 
library(tidyr) 

# For every row of Plan: split Route into its ";"-separated segments, split
# each segment into its eight ":"-separated fields, then average fields 2, 5
# and 8 over the segments belonging to the same original row.
out <- Plan %>%
    # Work on a copy so the untouched Route string can serve as a grouping key.
    mutate(Route_tmp = Route) %>%
    separate_rows(Route_tmp, sep = ";") %>%
    # sep must be explicit: the default separator matches any run of
    # non-alphanumeric characters, so ":-" would be consumed as one delimiter
    # and negative fields (e.g. -4) would silently lose their sign.
    separate(Route_tmp, as.character(1:8), sep = ":", convert = TRUE) %>%
    group_by(date, Number, Route, Dep) %>%
    summarize(x = mean(`2`), y = mean(`5`), z = mean(`8`)) %>%
    ungroup()

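给出如下结果(不显示 Route 列,使其更易于阅读)。注意:下面的输出是在 separate 未指定 sep 的情况下得到的——其默认分隔符会匹配任意连续的非字母数字字符,因此“:-”被整体当作分隔符,负号丢失,y 显示为 17;若指定 sep = ":",y 应为 -17: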

> out[-3] 
# A tibble: 3 × 6 
     date Number Dep  x  y  z 
     <fctr> <dbl> <fctr> <dbl> <dbl> <dbl> 
1 2013-05-06 231 HAM 8224.333 17 33.66667 
2 2015-07-10 345 FGC 8224.333 17 33.66667 
3 2017-08-10  10 ICAO 8224.333 17 33.66667 

注:由于问题中的 Plan 被多次覆盖,并不清楚究竟哪个版本的 Plan 才是输入,这里我假设输入为:

# Input assumed by this answer: three records; each Route string packs three
# ";"-separated segments of eight ":"-separated fields.
Plan <- data.frame(
  date = c("2015-07-10", "2013-05-06", "2017-08-10"),
  Number = c(345, 231, 10),
  Route = c(
    "GCLP:10011:-8848:56:-4:270:260:12;LPC:1211:-828:56:-2:22:220:22;GCCC:13451:-85458:556:-45:45:76:67",
    "DPAP:10011:-8848:56:-4:270:260:12;LTTC:1211:-828:56:-2:22:220:22;ATCH:13451:-85458:556:-45:45:76:67",
    "AMN:10011:-8848:56:-4:270:260:12;RET:1211:-828:56:-2:22:220:22;LLOP:13451:-85458:556:-45:45:76:67"
  ),
  Dep = c("FGC", "HAM", "ICAO")
)
2

下面是我使用 tidyverse 系列包的做法:

library(dplyr) 
library(tidyr) 
library(stringr) 
library(purrr) 
# Summarise the segments of a single Plan$Route entry.
#
# route: character vector with one ":"-separated segment per element.
# Returns a one-row tibble whose x, y and z are the means, taken across the
# segments, of fields 2, 5 and 8 respectively.
route_extract <- function(route) {
  fields <- str_split(route, fixed(":"), simplify = TRUE)
  # drop = FALSE keeps a matrix even when there is only one segment.
  wanted <- fields[, c(2, 5, 8), drop = FALSE]
  # Switch the character matrix to numeric in place, preserving its dimensions.
  storage.mode(wanted) <- "numeric"
  field_means <- colMeans(wanted)
  tibble(
    x = field_means[1],
    y = field_means[2],
    z = field_means[3]
  )
}
# Apply route_extract() to every Route entry.
#
# routes: vector of Route strings whose segments are separated by ";".
# Returns the row-bound results of route_extract(), one row per element of
# routes.
route_calc <- function(routes) {
  segments_per_route <- str_split(routes, fixed(";"))
  map_df(segments_per_route, route_extract)
}

# Append the per-row x, y and z summaries to Plan as new columns.
route_summary <- route_calc(Plan[["Route"]])
Plan <- bind_cols(Plan, route_summary)
相关问题