2017-04-18 94 views
1

标题:使用 awk 忽略 CSV 文件字段中的逗号。我试图从 CSV 文件最后一行的第二个字段获取数字。到目前为止,我写了这条命令:

awk -F"," 'END {print $2}' /file/path/fileName.csv 

这条命令可以正常工作,除非最后一行的第一个字段中含有逗号。例如,对于如下这样的一行:

"Company Name, LLC", 12345, Type1, SubType3 

...其中 "Company Name, LLC" 实际上是第一个字段,但 awk 命令返回的却是 ` LLC"`。

我如何忽略第一个字段中的逗号,以便我可以在第二个字段中获取信息?

+0

如果最后三个字段不可能包含 `,`,并且假设共有 4 个字段,你可以使用 `$(NF-2)` – Sundeep

回答

2

我认为你的需求正是使用 GNU Awk 的 FPAT 的完美用例。

以下内容原样引用自 man page:

通常情况下,使用 FS 时,gawk 把字段定义为记录中位于各个字段分隔符之间的部分。换句话说,FS 定义的是字段“不是”什么,而不是字段“是”什么。但是,有时候你确实希望根据字段是什么来定义字段,而不是根据它们不是什么来定义。

最臭名昭著的这类情况就是所谓的逗号分隔值(CSV)数据。如果逗号只用于分隔数据,就不会有问题。当其中某个字段包含嵌入的逗号时,问题就来了。在这种情况下,大多数程序会把该字段放在双引号中。

对于这里给出的 CSV 数据,每个字段要么是“任何不是逗号的内容”,要么是“一个双引号,接着是任何不是双引号的内容,再接一个结束的双引号”。如果写成正则表达式常量(请参阅 Regexp),就是 /([^,]+)|("[^"]+")/。把它写成字符串时需要对双引号进行转义,于是得到:

FPAT = "([^,]+)|(\"[^\"]+\")" 

使用您的输入文件,

awk 'BEGIN{FPAT = "([^,]+)|(\"[^\"]+\")"}{print $1}' file 
"Company Name, LLC" 
1

这个问题没有通用的答案,因为正则表达式(通常情况下)不足以解析 CSV。我的解决方案是一个 C 程序,它用有限状态机对输入进行预处理,其输出可以再输入到 awk 中:

/* NAME 
* 
*  csv -- convert comma-separated values file to character-delimited 
* 
* 
* SYNOPSIS 
* 
*  csv [-Cc] [-Fc] [filename ...] 
* 
* 
* DESCRIPTION 
* 
*  Csv reads from standard input or from one or more files named on 
*  the command line a sequence of records in comma-separated values 
*  format and writes on standard output the same records in character- 
*  delimited format. Csv returns 0 on success, 1 for option errors, 
*  and 2 if any file couldn't be opened. 
* 
*  The comma-separated values format has developed over time as a 
*  set of conventions that has never been formally defined, and some 
*  implementations are in conflict about some of the details. In 
*  general, the comma-separated values format is used by databases, 
*  spreadsheets, and other programs that need to write data consisting 
*  of records containing fields. The data is written as ascii text, 
*  with records terminated by newlines and fields containing zero or 
*  more characters separated by commas. Leading and trailing space in 
*  unquoted fields is preserved. Fields may be surrounded by double- 
*  quote characters (ascii \042); such fields may contain newlines, 
*  literal commas (ascii \054), and double-quote characters 
*  represented as two successive double-quotes. The examples shown 
*  below clarify many irregular situations that may arise. 
* 
*  The field separator is normally a comma, but can be changed to an 
*  arbitrary character c with the command line option -Cc. This is 
*  useful in those european countries that use a comma instead of a 
*  decimal point, where the field separator is normally changed to a 
*  semicolon. 
* 
*  Character-delimited format has records terminated by newlines and 
*  fields separated by a single character, which is \034 by default 
*  but may be changed with the -Fc option on the command line. 
* 
* 
* EXAMPLE 
* 
*  Each record below has five fields. For readability, the three- 
*  character sequence TAB represents a single tab character (ascii 
*  \011). 
* 
*   $ cat testdata.csv 
*   1,abc,def ghi,jkl,unquoted character strings 
*   2,"abc","def ghi","jkl",quoted character strings 
*   3,123,456,789,numbers 
*   4, abc,def , ghi ,strings with whitespace 
*   5, "abc","def" , "ghi" ,quoted strings with whitespace 
*   6, 123,456 , 789 ,numbers with whitespace 
*   7,TAB123,456TAB,TAB789TAB,numbers with tabs for whitespace 
*   8, -123, +456, 1E3,more numbers with whitespace 
*   9,123 456,123"456, 123 456 ,strange numbers 
*   10,abc",de"f,g"hi,embedded quotes 
*   11,"abc""","de""f","g""hi",quoted embedded quotes 
*   12,"","" "",""x"",doubled quotes 
*   13,"abc"def,abc"def","abc" "def",strange quotes 
*   14,,"", ,empty fields 
*   15,abc,"def 
*   ghi",jkl,embedded newline 
*   16,abc,"def",789,multiple types of fields 
* 
*   $ csv -F'|' testdata.csv 
*   1|abc|def ghi|jkl|unquoted character strings 
*   2|abc|def ghi|jkl|quoted character strings 
*   3|123|456|789|numbers 
*   4| abc|def | ghi |strings with whitespace 
*   5| "abc"|def | "ghi" |quoted strings with whitespace 
*   6| 123|456 | 789 |numbers with whitespace 
*   7|TAB123|456TAB|TAB789TAB|numbers with tabs for whitespace 
*   8| -123| +456| 1E3|more numbers with whitespace 
*   9|123 456|123"456| 123 456 |strange numbers 
*   10|abc"|de"f|g"hi|embedded quotes 
*   11|abc"|de"f|g"hi|quoted embedded quotes 
*   12|| ""|x""|doubled quotes 
*   13|abcdef|abc"def"|abc "def"|strange quotes 
*   14||| |empty fields 
*   15|abc|def 
*   ghi|jkl|embedded newline 
*   16|abc|def|789|multiple types of fields 
* 
*  It is particularly easy to pipe the output from csv into any of 
*  the unix tools that accept character-delimited fielded text data 
*  files, such as sort, join, or cut. For example: 
* 
*   csv datafile.csv | awk -F'\034' -f program.awk 
* 
* 
* BUGS 
* 
*  On DOS, Windows, and OS/2 systems, processing of each file stops 
*  at the first appearance of the ascii \032 (control-Z) end of file 
*  character. 
* 
*  Because newlines embedded in quoted fields are treated literally, 
*  a missing closing quote can suck up all remaining input. 
* 
* 
* LICENSE 
* 
*  This program was written by Philip L. Bewig of Saint Louis, 
*  Missouri, United States of America on February 28, 2002 and 
*  placed in the public domain. 
*/ 

#include <stdio.h> 

/* dofile -- convert one file from comma-separated to delimited
 *
 *  Reads characters from f until end of file and writes the converted
 *  records on standard output.  fs is the input field separator and
 *  ofs is the output field separator.  The stream f is neither
 *  rewound nor closed here.
 *
 *  The conversion is a finite state machine; each label below is one
 *  state and every branch names the state that follows it.
 */
void dofile(char ofs, char fs, FILE *f) {
    /* fgetc() returns each byte as an unsigned char widened to int, so
     * the separators are widened the same way.  Comparing against a
     * plain char, which may be signed, would never match a separator
     * byte with the high bit set. */
    const int in_sep = (unsigned char) fs;
    const int out_sep = (unsigned char) ofs;
    int c; /* current input character */

    /* Start of input, or just after a CR LF / LF CR pair.  End of file
     * here returns without writing a newline. */
    START:
     c = fgetc(f);
     if (c == EOF)    { return; }
     if (c == '\r')   { goto CARRIAGE_RETURN; }
     if (c == '\n')   { goto LINE_FEED; }
     if (c == '\"')   { goto QUOTED_FIELD; }
     if (c == in_sep) { putchar(out_sep); goto NOT_FIELD; }
     /* default */    { putchar(c); goto UNQUOTED_FIELD; }

    /* Just after a field separator; the next field has not begun. */
    NOT_FIELD:
     c = fgetc(f);
     if (c == EOF)    { putchar('\n'); return; }
     if (c == '\r')   { goto CARRIAGE_RETURN; }
     if (c == '\n')   { goto LINE_FEED; }
     if (c == '\"')   { goto QUOTED_FIELD; }
     if (c == in_sep) { putchar(out_sep); goto NOT_FIELD; }
     /* default */    { putchar(c); goto UNQUOTED_FIELD; }

    /* Inside double quotes: everything except a double quote is copied
     * unchanged, including separators and line ends. */
    QUOTED_FIELD:
     c = fgetc(f);
     if (c == EOF)    { putchar('\n'); return; }
     if (c == '\"')   { goto MAY_BE_DOUBLED_QUOTES; }
     /* default */    { putchar(c); goto QUOTED_FIELD; }

    /* A double quote was seen inside a quoted field.  A second one is
     * a literal quote and the quoted field continues; anything else
     * means the quoted part has ended. */
    MAY_BE_DOUBLED_QUOTES:
     c = fgetc(f);
     if (c == EOF)    { putchar('\n'); return; }
     if (c == '\r')   { goto CARRIAGE_RETURN; }
     if (c == '\n')   { goto LINE_FEED; }
     if (c == '\"')   { putchar('\"'); goto QUOTED_FIELD; }
     if (c == in_sep) { putchar(out_sep); goto NOT_FIELD; }
     /* default */    { putchar(c); goto UNQUOTED_FIELD; }

    /* Inside an unquoted field: double quotes are ordinary characters
     * here and are copied like any other. */
    UNQUOTED_FIELD:
     c = fgetc(f);
     if (c == EOF)    { putchar('\n'); return; }
     if (c == '\r')   { goto CARRIAGE_RETURN; }
     if (c == '\n')   { goto LINE_FEED; }
     if (c == in_sep) { putchar(out_sep); goto NOT_FIELD; }
     /* default */    { putchar(c); goto UNQUOTED_FIELD; }

    /* A carriage return ended the record but its newline has not been
     * written yet, so that a following line feed can be folded into
     * the same line end.  Each further carriage return writes one
     * newline. */
    CARRIAGE_RETURN:
     c = fgetc(f);
     if (c == EOF)    { putchar('\n'); return; }
     if (c == '\r')   { putchar('\n'); goto CARRIAGE_RETURN; }
     if (c == '\n')   { putchar('\n'); goto START; }
     if (c == '\"')   { putchar('\n'); goto QUOTED_FIELD; }
     if (c == in_sep) { putchar('\n'); putchar(out_sep); goto NOT_FIELD; }
     /* default */    { putchar('\n'); putchar(c); goto UNQUOTED_FIELD; }

    /* Mirror image of CARRIAGE_RETURN for a record ended by a line
     * feed: a following carriage return is folded into the same line
     * end, and each further line feed writes one newline. */
    LINE_FEED:
     c = fgetc(f);
     if (c == EOF)    { putchar('\n'); return; }
     if (c == '\r')   { putchar('\n'); goto START; }
     if (c == '\n')   { putchar('\n'); goto LINE_FEED; }
     if (c == '\"')   { putchar('\n'); goto QUOTED_FIELD; }
     if (c == in_sep) { putchar('\n'); putchar(out_sep); goto NOT_FIELD; }
     /* default */    { putchar('\n'); putchar(c); goto UNQUOTED_FIELD; }
}

/* main -- process command line, call appropriate conversion
 *
 *  Leading arguments that start with '-' are options: -Cc (or -cc)
 *  sets the input field separator to the character c, and -Fc (or -fc)
 *  sets the output field separator.  The character is taken from the
 *  third position of the argument, so a bare -C or -F selects the NUL
 *  character.  Every remaining argument is a file to convert; with no
 *  file arguments standard input is converted.
 *
 *  Returns 0 on success, 1 for an unknown option, and 2 if any file
 *  could not be opened (the remaining files are still processed).
 */
int main(int argc, char *argv[]) {
    char ofs = '\034'; /* output field separator */
    char fs = ',';     /* input field separator */
    int status = 0;    /* error status for return to operating system */
    const char *progname; /* name of program for error messages */
    FILE *f;
    int i;

    /* Keep a pointer to the program name instead of a heap copy: the
     * string stays valid while argv is advanced below, and nothing has
     * to be allocated, checked, or freed.  argv[0] may be null, hence
     * the fallback. */
    progname = (argc > 0 && argv[0] != NULL) ? argv[0] : "csv";

    while (argc > 1 && argv[1][0] == '-') {
        switch (argv[1][1]) {
        case 'c':
        case 'C':
            fs = argv[1][2];
            break;
        case 'f':
        case 'F':
            ofs = argv[1][2];
            break;
        default:
            fprintf(stderr, "%s: unknown argument %s\n",
                progname, argv[1]);
            fprintf(stderr,
                "usage: %s [-Cc] [-Fc] [filename ...]\n",
                progname);
            return 1;
        }
        /* Drop the option just handled; argv[1] is the next argument. */
        argc--;
        argv++;
    }

    if (argc == 1) {
        dofile(ofs, fs, stdin);
    } else {
        for (i = 1; i < argc; i++) {
            f = fopen(argv[i], "r");
            if (f == NULL) {
                fprintf(stderr, "%s: can't open %s\n",
                    progname, argv[i]);
                status = 2;
                continue;
            }
            dofile(ofs, fs, f);
            fclose(f);
        }
    }

    /* Returning from main is equivalent to calling exit(status). */
    return status;
}