Eli Bendersky | 3921e8e | 2010-05-21 09:05:39 +0300 | [diff] [blame^] | 1 | #include <stdio.h>
|
| 2 | #include <stdlib.h>
|
| 3 | #include <string.h>
|
| 4 | #include <getopt.h>
|
| 5 |
|
| 6 | #define PACKAGE "wgram"
|
| 7 | #define VERSION "0.0.4"
|
| 8 | #define MAXLINE 1024
|
| 9 | #define MAXGRAM 32
|
| 10 |
|
| 11 | /* status epilepticus .. print help */
|
| 12 | void print_help(int exval);
|
| 13 |
|
| 14 | int main (int argc, char *argv[]) {
|
| 15 | /* word delimeter for strtok() */
|
| 16 | char delim[] = ".,:;`/\"+-_(){}[]<>*&^%$#@!?~/|\\=1234567890 \t\n";
|
| 17 | char line[MAXLINE]; /* input buff, fgets() */
|
| 18 | char *stray = NULL; /* returned value by strtok() */
|
| 19 | char **strarray = NULL; /* array to hold all entrys */
|
| 20 | int i = 0; /* general counter */
|
| 21 | int strcount = 0; /* number of entrys in pointer array */
|
| 22 | int N = 3, pos = 0; /* ngram size, 3 in this case */
|
| 23 | int opt = 0; /* holds command line opt nr.. */
|
| 24 | int word_flag = 0; /* print only the `raw' words */
|
| 25 | FILE *fp = stdin; /* read input from `FILE', default is stdin */
|
| 26 |
|
| 27 | while((opt = getopt(argc, argv, "hvn:wf:")) != -1) {
|
| 28 | switch(opt) {
|
| 29 | case 'h':
|
| 30 | print_help(0);
|
| 31 | break;
|
| 32 | case 'v':
|
| 33 | exit(0);
|
| 34 | break;
|
| 35 | case 'n':
|
| 36 | N = atoi(optarg);
|
| 37 | if(N > MAXGRAM || N < 2) {
|
| 38 | fprintf(stderr, "%s: Error - Ngram length `%d' out of range `0-%d'\n",
|
| 39 | PACKAGE, N, MAXGRAM);
|
| 40 | return 1;
|
| 41 | }
|
| 42 | break;
|
| 43 | case 'w':
|
| 44 | word_flag = 1;
|
| 45 | break;
|
| 46 | case 'f':
|
| 47 | if(freopen(optarg, "r", fp) == NULL) {
|
| 48 | fprintf(stderr, "%s: Error - opening `%s'\n", PACKAGE, optarg);
|
| 49 | return 1;
|
| 50 | }
|
| 51 | break;
|
| 52 | case '?':
|
| 53 | fprintf(stderr, "%s: Error - No such option: `%c'\n\n", PACKAGE, optopt);
|
| 54 | print_help(1);
|
| 55 | } /* switch */
|
| 56 | } /* while */
|
| 57 |
|
| 58 | /* start reading lines from file pointer, add all entrys to **strarray */
|
| 59 | while((fgets(line, MAXLINE, fp)) != NULL) {
|
| 60 | if(strlen(line) < 2)
|
| 61 | continue;
|
| 62 |
|
| 63 | stray = strtok(line, delim);
|
| 64 | while(stray != NULL) {
|
| 65 | strarray = (char **)realloc(strarray, (strcount + 1) * sizeof(char *));
|
| 66 | strarray[strcount++] = strdup(stray);
|
| 67 | stray = strtok(NULL, delim);
|
| 68 | }
|
| 69 | }
|
| 70 |
|
| 71 | if(word_flag == 0) {
|
| 72 | /*
|
| 73 | // print the array of strings, jumping back each time
|
| 74 | // (N - 1) positions if a whole ngram of words has been printed
|
| 75 | */
|
| 76 | for(i = 0, pos = N; i < strcount; i++, pos--) {
|
| 77 | if(pos == 0) pos = N, i -= (N - 1), printf("\n");
|
| 78 | printf("%s ", strarray[i]);
|
| 79 | }
|
| 80 | printf("\n");
|
| 81 | } else {
|
| 82 | /* print raw words */
|
| 83 | for(i = 0; i < strcount; i++)
|
| 84 | printf("%s\n", strarray[i]);
|
| 85 | }
|
| 86 |
|
| 87 | /* free the string array */
|
| 88 | for(i = 0; i < strcount; i++)
|
| 89 | free(strarray[i]);
|
| 90 |
|
| 91 | free(strarray);
|
| 92 | return 0;
|
| 93 | }
|
| 94 |
|
| 95 | /* status epilepticus .. print help */
|
| 96 | void print_help(int exval) {
|
| 97 | printf("%s,%s extract N-grams from text data\n", PACKAGE, VERSION);
|
| 98 | printf("Usage: %s [-h] [-v] [-n INT] [-w] [-f FILE]\n\n", PACKAGE);
|
| 99 |
|
| 100 | printf(" -h print this help and exit\n");
|
| 101 | printf(" -v print version and exit\n\n");
|
| 102 |
|
| 103 | printf(" -n INT set ngram length (default=3)\n");
|
| 104 | printf(" -w print only the extracted words\n");
|
| 105 | printf(" -f FILE read input from `FILE' (default=stdin)\n\n");
|
| 106 | exit(exval);
|
| 107 | }
|