Eli Bendersky | 43375bf | 2015-04-20 07:19:09 -0700 | [diff] [blame] | 1 | #include <stdio.h> |
| 2 | #include <stdlib.h> |
| 3 | #include <string.h> |
| 4 | #include <getopt.h> |
| 5 | |
| 6 | #define PACKAGE "wgram" |
| 7 | #define VERSION "0.0.4" |
| 8 | #define MAXLINE 1024 |
| 9 | #define MAXGRAM 32 |
| 10 | |
| 11 | /* print usage text, then exit with status `exval' */ |
| 12 | void print_help(int exval); |
| 13 | |
| 14 | int main (int argc, char *argv[]) { |
| 15 | /* word delimeter for strtok() */ |
| 16 | char delim[] = ".,:;`/\"+-_(){}[]<>*&^%$#@!?~/|\\=1234567890 \t\n"; |
| 17 | char line[MAXLINE]; /* input buff, fgets() */ |
| 18 | char *stray = NULL; /* returned value by strtok() */ |
| 19 | char **strarray = NULL; /* array to hold all entrys */ |
| 20 | int i = 0; /* general counter */ |
| 21 | int strcount = 0; /* number of entrys in pointer array */ |
| 22 | int N = 3, pos = 0; /* ngram size, 3 in this case */ |
| 23 | int opt = 0; /* holds command line opt nr.. */ |
| 24 | int word_flag = 0; /* print only the `raw' words */ |
| 25 | FILE *fp = stdin; /* read input from `FILE', default is stdin */ |
| 26 | |
| 27 | while((opt = getopt(argc, argv, "hvn:wf:")) != -1) { |
| 28 | switch(opt) { |
| 29 | case 'h': |
| 30 | print_help(0); |
| 31 | break; |
| 32 | case 'v': |
| 33 | exit(0); |
| 34 | break; |
| 35 | case 'n': |
| 36 | N = atoi(optarg); |
| 37 | if(N > MAXGRAM || N < 2) { |
| 38 | fprintf(stderr, "%s: Error - Ngram length `%d' out of range `0-%d'\n", |
| 39 | PACKAGE, N, MAXGRAM); |
| 40 | return 1; |
| 41 | } |
| 42 | break; |
| 43 | case 'w': |
| 44 | word_flag = 1; |
| 45 | break; |
| 46 | case 'f': |
| 47 | if(freopen(optarg, "r", fp) == NULL) { |
| 48 | fprintf(stderr, "%s: Error - opening `%s'\n", PACKAGE, optarg); |
| 49 | return 1; |
| 50 | } |
| 51 | break; |
| 52 | case '?': |
| 53 | fprintf(stderr, "%s: Error - No such option: `%c'\n\n", PACKAGE, optopt); |
| 54 | print_help(1); |
| 55 | } /* switch */ |
| 56 | } /* while */ |
| 57 | |
| 58 | /* start reading lines from file pointer, add all entrys to **strarray */ |
| 59 | while((fgets(line, MAXLINE, fp)) != NULL) { |
| 60 | if(strlen(line) < 2) |
| 61 | continue; |
| 62 | |
| 63 | stray = strtok(line, delim); |
| 64 | while(stray != NULL) { |
| 65 | strarray = (char **)realloc(strarray, (strcount + 1) * sizeof(char *)); |
| 66 | strarray[strcount++] = strdup(stray); |
| 67 | stray = strtok(NULL, delim); |
| 68 | } |
| 69 | } |
| 70 | |
| 71 | if(word_flag == 0) { |
| 72 | /* |
| 73 | // print the array of strings, jumping back each time |
| 74 | // (N - 1) positions if a whole ngram of words has been printed |
| 75 | */ |
| 76 | for(i = 0, pos = N; i < strcount; i++, pos--) { |
| 77 | if(pos == 0) pos = N, i -= (N - 1), printf("\n"); |
| 78 | printf("%s ", strarray[i]); |
| 79 | } |
| 80 | printf("\n"); |
| 81 | } else { |
| 82 | /* print raw words */ |
| 83 | for(i = 0; i < strcount; i++) |
| 84 | printf("%s\n", strarray[i]); |
| 85 | } |
| 86 | |
| 87 | /* free the string array */ |
| 88 | for(i = 0; i < strcount; i++) |
| 89 | free(strarray[i]); |
| 90 | |
| 91 | free(strarray); |
| 92 | return 0; |
| 93 | } |
| 94 | |
| 95 | /* status epilepticus .. print help */ |
| 96 | void print_help(int exval) { |
| 97 | printf("%s,%s extract N-grams from text data\n", PACKAGE, VERSION); |
| 98 | printf("Usage: %s [-h] [-v] [-n INT] [-w] [-f FILE]\n\n", PACKAGE); |
| 99 | |
| 100 | printf(" -h print this help and exit\n"); |
| 101 | printf(" -v print version and exit\n\n"); |
| 102 | |
| 103 | printf(" -n INT set ngram length (default=3)\n"); |
| 104 | printf(" -w print only the extracted words\n"); |
| 105 | printf(" -f FILE read input from `FILE' (default=stdin)\n\n"); |
| 106 | exit(exval); |
| 107 | } |