对于文件中每个可能的一对两个唯一字，打印出该对的出现次数

-5

这段代码可以统计单个单词的出现次数，并把带标点符号或大小写不同的单词视为不同的单词。有没有简单的方法让它统计单词对，而不是单个单词？也就是说，我需要打印文本文件中每一对单词的出现次数。非常感谢你的帮助。要求：对于文件中每一对可能的、由两个唯一单词组成的单词对，打印出该对的出现次数。

#include <stdio.h> 
#include <stdlib.h> 


/*
 * Print how often each word occurs in the file named by argv[1] by handing
 * a "tr | sort | uniq -c | sort -n" pipeline to the shell via system().
 *
 * Returns EXIT_FAILURE on a usage error, when the file cannot be opened, or
 * when the command does not fit in the buffer; otherwise returns whatever
 * system() returns.
 */
int main(int argc, char **argv)
{
    char buffer[10000];
    FILE *f;
    int len;

    /* Validate the argument count before argv[1] is touched. */
    if (argc != 2) {
        fprintf(stderr, "Usage: %s file\n", argv[0]);
        return EXIT_FAILURE;
    }

    /* The stream is opened only to confirm the file is readable; the shell
     * pipeline below re-opens it by name. */
    f = fopen(argv[1], "r");
    if (f == NULL) {
        perror(argv[1]);
        return EXIT_FAILURE;
    }
    fclose(f);

    /* NOTE(review): argv[1] is pasted unquoted into a shell command line, so
     * a name containing spaces or shell metacharacters will be interpreted
     * by the shell. */
    len = snprintf(buffer, sizeof(buffer), "tr -cs '[:punct:][a-z][A-Z]' '[\\n*]' < %s |"
                   " sort | uniq -c | sort -n", argv[1]);
    if (len < 0 || (size_t)len >= sizeof(buffer)) {
        /* Refuse to run a truncated command. */
        fprintf(stderr, "%s: file name too long\n", argv[0]);
        return EXIT_FAILURE;
    }

    return system(buffer);
}

实施例输入

The Cat Sat On The Mat

输出（The Cat、The Sat、The On、The The、The Mat、Cat The、Cat Sat、Cat On……共 30 对）

来源

2016-04-03 user3328381

唉！ “在C”？你为什么不直接输入该命令？ –

@Weather Vane 已编辑 – user3328381

这是一个任务吗？你确定你可以使用'system'来调用外部工具吗？分配通常要求您在不调用其他程序的情况下实现实际功能。 – kaylum

你的作业是统计文件中单词对的出现频率，其目的不太可能是让你把一串 shell 工具的管道命令包装起来，再用 system 去调用。那样能让你学到 C 的什么呢？学到存在一个可以访问 shell 的 system 函数？确实有这么个函数，而你也可以就这样完成课程，却什么都没学到。

更有可能的意图是让你学会用结构体把相关的数据集中保存在单个对象中，或者至少学会用数组或指针索引来检查文件中相邻的单词对。在这两种常规做法（使用结构体或使用索引运算）中，使用结构体更有优势。你只需要保存一对单词以及该对出现的频率即可。例如：

/* Size limits: MAXC is the byte size of each word buffer, MAXP the number
 * of pair records. */
enum {
    MAXC = 32,
    MAXP = 100
};

/* One pair of words together with a counter for that pair. */
typedef struct {
    char   w1[MAXC];   /* first word of the pair  */
    char   w2[MAXC];   /* second word of the pair */
    size_t freq;       /* occurrence count        */
} wordpair;

（注意：这个 enum 只是定义了常量 MAXC（32）和 MAXP（100），分别表示每个单词的最大字符数和最多记录的单词对数。用两条 #define 语句也可以达到同样的目的。）

你可以声明一个 wordpair 结构体数组，每个元素保存一对单词 w1 和 w2，并在 freq 中保存该对出现的次数。结构体数组可以像其他任何数组一样处理，比如排序等。

要分析文件，只需把前两个单词读入第一个结构体，保存一个指向第二个单词的指针，然后依次读取文件中其余的单词：检查由该指针所指单词和新读入的单词组成的对是否已经存在（如果存在，只需更新它出现的次数）；如果不存在，则添加一个新的对。接着把指针更新为指向刚读入的新单词，如此重复。

下面是一个简短示例，它会统计命令行参数给出的所有文件中单词对的出现次数（例如 ./progname file1 file2 ...）。如果没有给出文件，代码默认从 stdin 读取。

#include <stdio.h> 
#include <stdlib.h> 
#include <string.h> 

/* Size limits: MAXC is the byte size of each word buffer, MAXP the number
 * of pair records. */
enum {
    MAXC = 32,
    MAXP = 100
};

/* One pair of words together with a counter for that pair. */
typedef struct {
    char   w1[MAXC];   /* first word of the pair  */
    char   w2[MAXC];   /* second word of the pair */
    size_t freq;       /* occurrence count        */
} wordpair;

/* Tally adjacent word pairs read from 'fp' into the 'words' array;
 * returns the number of distinct pairs stored. */
size_t get_pair_freq (wordpair *words, FILE *fp); 
/* qsort comparison callback for wordpair elements (compares the w1 members). */
int compare (const void *a, const void *b); 

/*
 * Count adjacent word pairs in the file(s) named on the command line, or in
 * stdin when no file is given, then print each pair with its frequency in
 * sorted order.
 *
 * Returns 1 if the first file cannot be opened, 0 otherwise.
 */
int main (int argc, char **argv) {

    /* initialize variables & open file or stdin for reading */
    wordpair words[MAXP] = {{"", "", 0}};
    size_t i, idx = 0;
    FILE *fp = argc > 1 ? fopen (argv[1], "r") : stdin;

    if (!fp) {
        fprintf (stderr, "error: file open failed '%s'.\n", argv[1]);
        return 1;
    }

    /* read from the file given, or from stdin (default) */
    idx = get_pair_freq (words, fp);

    /* read each remaining file given on command line.
     * NOTE(review): get_pair_freq always fills the array starting at
     * element 0, so pairs from later files are not merged with those of
     * earlier files -- confirm whether multi-file totals are required. */
    for (i = 2; i < (size_t)argc; i++) {
        if (fp && fp != stdin) { fclose (fp); fp = NULL; }

        /* open file for reading */
        if (!(fp = fopen (argv[i], "r"))) {
            fprintf (stderr, "error: file open failed '%s'.\n",
                     argv[i]);
            continue;
        }

        /* the running total may step past MAXP, so clamp it to keep the
         * sort and the print loop inside the array */
        idx += get_pair_freq (words, fp);
        if (idx >= MAXP) {
            idx = MAXP;
            break;
        }
    }
    if (fp && fp != stdin) fclose (fp);

    /* sort words alphabetically */
    qsort (words, idx, sizeof *words, compare);

    /* output the frequency of word pairs */
    printf ("\nthe occurrence of words pairs are:\n\n");
    for (i = 0; i < idx; i++) {
        char pair[MAXC * 2] = "";
        /* bounded write: two words of up to MAXC-1 chars plus ':' and NUL */
        snprintf (pair, sizeof pair, "%s:%s", words[i].w1, words[i].w2);
        printf (" %-32s : %zu\n", pair, words[i].freq);
    }

    return 0;
}

/*
 * Read words from 'fp' (delimited by space, comma, period, tab or newline)
 * and tally every pair of adjacent words into 'pairs', starting at pairs[0].
 *
 * pairs  array with room for at least MAXP elements
 * fp     open input stream
 *
 * Returns the number of distinct pairs stored (at most MAXP). Returns 0 if
 * fewer than two words could be read.
 */
size_t get_pair_freq (wordpair *pairs, FILE *fp)
{
    char w1[MAXC] = "", w2[MAXC] = "";
    /* The field width must be MAXC - 1: the width given to %[ does not
     * count the terminating NUL that fscanf appends. Keep it in sync
     * with MAXC. */
    const char *fmt1 = " %31[^ ,.\t\n]%*c";
    const char *fmt2 = " %31[^ ,.\t\n]%*[^A-Za-z0-9]%31[^ ,.\t\n]%*c";
    const char *w1p;            /* first word of the next pair to form */
    int nw;
    size_t i, idx = 0;

    /* read 1st 2 words into the first pair */
    nw = fscanf (fp, fmt2, w1, w2);
    if (nw != 2) {
        if (!nw) fprintf (stderr, "error: file read error.\n");
        return idx;
    }
    strcpy (pairs[idx].w1, w1);
    strcpy (pairs[idx].w2, w2);
    pairs[idx].freq = 1;        /* do not rely on the caller pre-zeroing */
    w1p = pairs[idx].w2;        /* save pointer to w2 for next w1 */
    idx++;

    /* read each remaining word in file into w2 */
    while (idx < MAXP && fscanf (fp, fmt1, w2) == 1) {
        /* look for an existing record of the pair (w1p, w2) */
        for (i = 0; i < idx; i++) {
            if (strcmp (pairs[i].w1, w1p) == 0 &&
                strcmp (pairs[i].w2, w2) == 0)
                break;
        }

        if (i < idx) {
            /* known pair: bump its count and still advance to the word
             * just read, otherwise the next pair would be formed with a
             * stale first word */
            pairs[i].freq++;
            w1p = pairs[i].w2;
            continue;
        }

        /* add new pair */
        strcpy (pairs[idx].w1, w1p);
        strcpy (pairs[idx].w2, w2);
        pairs[idx].freq = 1;
        w1p = pairs[idx].w2;
        idx++;

        if (idx == MAXP) {      /* check 'idx' against MAXP */
            fprintf (stderr, "warning: MAXP words exceeded.\n");
            break;
        }
    }

    return idx;
}

/* qsort compare funciton */ 
int compare (const void *a, const void *b) 
{ 
    return (strcmp (((wordpair *)a)->w1, ((wordpair *)b)->w1)); 
}

使用/输出

以你的 "Hi how are you are you." 为例，程序会产生期望的结果（按排序后的顺序输出，具体顺序取决于你的 LOCALE）。

$ echo "Hi how are you are you." | ./bin/file_word_pairs 

the occurrence of words pairs are: 

    Hi:how       : 1 
    are:you       : 2 
    how:are       : 1 
    you:are       : 1

（并没有要求你对结果排序，但对较长的文件来说，排序会让查找/确认结果容易得多。）

去掉 qsort 排序后的输出

$ echo "Hi how are you are you." | ./bin/file_word_pairs 

the occurrence of words pairs are: 

    Hi:how       : 1 
    how:are       : 1 
    are:you       : 2 
    you:are       : 1

虽然你可以继续尝试你的 system 版本，但为什么不花点时间学习如何用 C 来解决这个问题呢？如果你想学的是如何通过调用 system 来完成任务，那应该去上 Linux 课程，因为那样做和 C 没什么关系。

请仔细通读代码，在手册页（man page）中查阅你不熟悉的函数，之后如果还有不明白的地方再来提问。

来源

2016-04-04 01:29:52

非常感谢大卫！我只是想编译程序gcc program.c然后./a.out words_file.txt但它不工作，有什么建议吗？ – user3328381

当然'gcc -Wall -Wextra filename.c'这会产生'a.out'。然后只需'./a.out filename'来读取'filename'中的对。你也可以在你的编译时加上'-o'并为你的可执行文件指定一个真实的名字，例如'gcc -Wall -Wextra -o myexename filename.c'，这将创建'myexename'作为你的可执行文件。 –

我使用了：'gcc -Wall -Wextra -Ofast -o bin/file_word_pairs file_word_pairs.c'，它只是加了'-Ofast'优化。 –

对于文件中每个可能的一对两个唯一字，打印出该对的出现次数

回答

相关问题