2017-10-14 184 views
2

基于另一列的值计算各列的平均值。我的数据如下:

0.5,4.96,0.724973,0.01481065 
0.5,5.11,0.726749,0.01140151 
0.5,4.99,0.893074,0.00910343 
0.5,4.14,0.734336,0.00835252 
0.5,1.69,0.755600,0.00422898 
0.6,4.43,0.733582,0.01796329 
0.6,4.47,0.740393,0.01399680 
0.6,4.49,0.885607,0.01095668 
0.6,3.69,0.720035,0.00992851 
0.6,1.60,0.748339,0.00456993 
0.7,4.03,0.756354,0.02086922 
0.7,3.99,0.771689,0.01705783 
0.7,4.02,0.854532,0.01319982 
0.7,3.33,0.725414,0.01170297 

我想根据第一列的值进行分组,分别计算第二、第三和第四列的平均值(mean)。

例如,对于第一列为 0.5 的行,期望的结果是:

0.5,4.18,0.766946,0.00957942

回答

4

GNU datamash最短的解决方案:

# Sort the comma-separated rows, group them by field 1 and print the mean of fields 2, 3 and 4 for each group.
datamash -st, -g1 mean 2 mean 3 mean 4 <file 
  • -s - 对记录进行排序

  • -t, - 设置逗号 , 作为字段分隔符

  • -g1 - 按第一个字段对记录分组


输出:

    0.5,4.178,0.7669464,0.009579418 
    0.6,3.736,0.7655912,0.011483042 
    0.7,3.8425,0.77699725,0.01570746 
    
  • +0

    令人惊叹……从来没有听说过这个工具。有没有办法让输出保持与输入相同的小数位数?我不介意末尾多出的零 – user2650277

    +0

    @ user2650277,不,它执行计算并打印全实数 – RomanPerekhrest

    +1

    或者写成 `datamash -st, -g1 mean 2-4` – randomir

    2

    用 awk 来解决(假设你的 Input_file 已经按第一列排好序;如果没有,可以在下面的代码之前先排序,即 sort -t, -k1 Input_file | awk ...):

    # Per-group means of columns 2-4, keyed on column 1; the input must be sorted on column 1.
    # Columns are walked by number (not "for (i in a)") so the output order is always 2,3,4,
    # and NR>1 is tested instead of the key itself so that a key of 0 is still flushed.
    awk -F, 'function emit_group(   col,out){if(!rows)return;for(col=2;col<=4;col++)out=out (col>2?FS:"") sprintf("%0.2f",sum[col]/rows);print key,out;rows=0;delete sum} NR>1 && $1!=key{emit_group()} {key=$1;rows++;for(col=2;col<=4;col++)sum[col]+=$col} END{emit_group()}' Input_file
    

    输出将如下。

    0.5 4.18,0.77,0.01 
    0.6 3.74,0.77,0.01 
    0.7 3.84,0.78,0.02 
    

    现在再补充一个非单行(多行)形式的写法。

    awk -F, '
    # Print "key mean2,mean3,mean4" for the group collected so far, then reset.
    # Columns are walked by number so the output order is always 2,3,4;
    # iterating with "for (i in array)" gives no ordering guarantee.
    function emit_group(   col, out) {
        if (!rows) return    # nothing collected, for example empty input
        for (col = 2; col <= 4; col++)
            out = out (col > 2 ? FS : "") sprintf("%0.2f", sum[col] / rows)
        print key, out
        rows = 0
        delete sum
    }
    # The first field changed, so the previous group is complete (input must be
    # sorted on the first field). NR > 1 is tested rather than the key itself
    # so that a key of 0 is handled like any other key.
    NR > 1 && $1 != key { emit_group() }
    {
        key = $1
        rows++
        for (col = 2; col <= 4; col++)
            sum[col] += $col
    }
    # The last group has no following line to trigger it.
    END { emit_group() }
    ' Input_file
    

    编辑:添加了对命令的逐行解释。

    awk -F, '
    ## -F, makes the comma the input field separator (FS).
    ##
    ## emit_group prints one output line for the group collected so far and
    ## then resets the accumulators. col and out are local variables.
    function emit_group(   col, out) {
        ## Nothing has been collected (for example empty input): print nothing.
        if (!rows) return
        ## Walk the columns by number so the means always come out in the
        ## order 2,3,4; "for (i in array)" gives no ordering guarantee.
        ## Each mean is formatted to two decimals and joined with FS (comma).
        for (col = 2; col <= 4; col++)
            out = out (col > 2 ? FS : "") sprintf("%0.2f", sum[col] / rows)
        ## print separates key and means with OFS, which is a space by default.
        print key, out
        ## Start the next group from zero.
        rows = 0
        delete sum
    }
    ## When the first field differs from the key of the current group, that
    ## group is complete, so emit it. This relies on the input being sorted on
    ## the first field. NR > 1 is tested rather than the key itself so that a
    ## key of 0 is handled like any other key.
    NR > 1 && $1 != key { emit_group() }
    {
        ## Remember the key of the current group.
        key = $1
        ## Count the rows of the current group.
        rows++
        ## Add fields 2, 3 and 4 of this row to the running sums of the group.
        for (col = 2; col <= 4; col++)
            sum[col] += $col
    }
    ## The last group has no following line to trigger it, so emit it here.
    END { emit_group() }
    ' file17
    ## file17 is the input file; it must already be sorted on the first field.
    
    2

    这里有一个简洁的小 Awk 脚本,可以用来实现这个目的:

    #!/usr/bin/awk -f
    
    # Group comma-separated rows by column 1 and print the mean of columns 2-4.
    #
    # The means are written with "print", so their format is governed by OFMT.
    # CONVFMT only affects number-to-string conversion inside expressions and
    # has no effect on this output. "%.6g" is the awk default; set OFMT to
    # "%.2f" (for example) to get a fixed two decimal places instead.
    
    BEGIN { FS = OFS = ","; OFMT = "%.6g" }
    
    NF == 4 {
        # Remember each key the first time it appears so that the output
        # follows input order; "for (k in array)" order is unspecified.
        if (!($1 in count))
            keys[++nkeys] = $1
    
        # Accumulate the per-key column sums and the number of rows seen.
        sumOfCol2[$1] += $2
        sumOfCol3[$1] += $3
        sumOfCol4[$1] += $4
        count[$1]++
    }
    
    END {
        # Print each key followed by (sum)/(count) for columns 2, 3 and 4.
        for (n = 1; n <= nkeys; n++) {
            k = keys[n]
            print k, (sumOfCol2[k]/count[k]), (sumOfCol3[k]/count[k]), (sumOfCol4[k]/count[k])
        }
    }
    

    然后按如下方式运行脚本:

    awk -f script.awk file 
    0.5,4.178,0.766946,0.00957942 
    0.6,3.736,0.765591,0.011483 
    0.7,3.8425,0.776997,0.0157075