#include <errno.h>
#include <getopt.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define PACKAGE "wgram"
#define VERSION "0.0.4"
#define MAXLINE 1024 /* size of the fgets() input buffer */
#define MINGRAM 2    /* smallest accepted n-gram length */
#define MAXGRAM 32   /* largest accepted n-gram length */

/* Print usage information to stdout and terminate with status exval. */
void print_help(int exval);

/* Growable array of heap-allocated words; the list owns every string. */
struct word_list {
    char **items;  /* the words, in input order */
    size_t count;  /* number of words stored */
    size_t cap;    /* number of slots allocated in items */
};

/*
 * Parse the argument of -n as a decimal n-gram length.
 * Stores the value in *out and returns 0 when arg is a complete integer
 * within MINGRAM..MAXGRAM; returns -1 (leaving *out untouched) otherwise.
 */
static int parse_gram_len(const char *arg, int *out)
{
    char *end = NULL;
    long val;

    errno = 0;
    val = strtol(arg, &end, 10);
    /* reject empty input, trailing garbage ("3abc") and overflow */
    if (end == arg || *end != '\0' || errno == ERANGE)
        return -1;
    if (val < MINGRAM || val > MAXGRAM)
        return -1;
    *out = (int)val;
    return 0;
}

/*
 * Append a private copy of word to the list, growing the array as needed.
 * Returns 0 on success, -1 when memory is exhausted (the list stays valid
 * and keeps everything stored so far).
 */
static int word_list_push(struct word_list *wl, const char *word)
{
    size_t len;
    char *copy;

    if (wl->count == wl->cap) {
        size_t new_cap = wl->cap ? wl->cap * 2 : 64;
        char **grown;

        if (new_cap > (size_t)-1 / sizeof *grown)
            return -1;
        /* keep the old pointer until realloc() is known to have succeeded */
        grown = realloc(wl->items, new_cap * sizeof *grown);
        if (grown == NULL)
            return -1;
        wl->items = grown;
        wl->cap = new_cap;
    }

    /* malloc+memcpy keeps this plain ISO C; no POSIX feature macro needed */
    len = strlen(word) + 1;
    copy = malloc(len);
    if (copy == NULL)
        return -1;
    memcpy(copy, word, len);
    wl->items[wl->count++] = copy;
    return 0;
}

/* Release every word and the array itself, leaving the list empty. */
static void word_list_free(struct word_list *wl)
{
    size_t i;

    for (i = 0; i < wl->count; i++)
        free(wl->items[i]);
    free(wl->items);
    wl->items = NULL;
    wl->count = 0;
    wl->cap = 0;
}

/*
 * Print every window of n consecutive words, one window per line, each
 * word followed by a single space.  When the list holds fewer than n
 * words, the words present are printed as one short line (an empty list
 * therefore prints a lone newline).
 */
static void print_ngrams(const struct word_list *wl, size_t n)
{
    size_t start, k;

    if (wl->count < n) {
        for (k = 0; k < wl->count; k++)
            printf("%s ", wl->items[k]);
        printf("\n");
        return;
    }

    for (start = 0; start + n <= wl->count; start++) {
        for (k = 0; k < n; k++)
            printf("%s ", wl->items[start + k]);
        printf("\n");
    }
}

/*
 * Entry point: parse the options, split the input into words and print
 * either the n-grams or (with -w) the raw words, one per line.
 *
 * Options: -h help, -v version, -n INT n-gram length (default 3),
 *          -w print only the words, -f FILE read FILE instead of stdin.
 *
 * Returns 0 on success, 1 on a bad option value, an unreadable input
 * file, a read error or memory exhaustion.
 */
int main(int argc, char *argv[])
{
    /* word delimiters for strtok(): punctuation, digits and whitespace */
    static const char delim[] = ".,:;`/\"+-_(){}[]<>*&^%$#@!?~/|\\=1234567890 \t\n";
    char line[MAXLINE];                      /* input buffer for fgets() */
    struct word_list words = { NULL, 0, 0 }; /* every word read so far */
    FILE *fp = stdin;                        /* input stream, stdin unless -f is given */
    char *tok;                               /* current token from strtok() */
    size_t i;
    int gram_len = 3;                        /* n-gram length */
    int word_flag = 0;                       /* non-zero: print only the raw words */
    int status = 0;                          /* process exit status */
    int opt;

    /*
     * The leading ':' makes getopt() return ':' for a missing argument and
     * stay silent, so each problem is reported exactly once, by us.
     */
    while ((opt = getopt(argc, argv, ":hvn:wf:")) != -1) {
        switch (opt) {
        case 'h':
            print_help(0);
            break;
        case 'v':
            printf("%s %s\n", PACKAGE, VERSION);
            return 0;
        case 'n':
            if (parse_gram_len(optarg, &gram_len) != 0) {
                fprintf(stderr, "%s: Error - Ngram length `%s' out of range `%d-%d'\n",
                        PACKAGE, optarg, MINGRAM, MAXGRAM);
                return 1;
            }
            break;
        case 'w':
            word_flag = 1;
            break;
        case 'f':
            /* rebind the input stream to the named file */
            if (freopen(optarg, "r", fp) == NULL) {
                fprintf(stderr, "%s: Error - opening `%s'\n", PACKAGE, optarg);
                return 1;
            }
            break;
        case ':':
            fprintf(stderr, "%s: Error - option `-%c' requires an argument\n\n",
                    PACKAGE, optopt);
            print_help(1);
            break;
        case '?':
        default:
            fprintf(stderr, "%s: Error - No such option: `%c'\n\n", PACKAGE, optopt);
            print_help(1);
            break;
        }
    }

    /*
     * Collect every word.  strtok() yields nothing for empty or
     * delimiter-only lines, so no length pre-check is needed (and a final
     * one-letter word without trailing newline is no longer dropped).
     * Lines longer than MAXLINE - 1 bytes arrive in several chunks, so a
     * word straddling a chunk boundary is split in two.
     */
    while (fgets(line, sizeof line, fp) != NULL) {
        for (tok = strtok(line, delim); tok != NULL; tok = strtok(NULL, delim)) {
            if (word_list_push(&words, tok) != 0) {
                fprintf(stderr, "%s: Error - out of memory\n", PACKAGE);
                status = 1;
                goto done;
            }
        }
    }
    if (ferror(fp)) {
        fprintf(stderr, "%s: Error - reading input\n", PACKAGE);
        status = 1;
        goto done;
    }

    if (word_flag == 0) {
        print_ngrams(&words, (size_t)gram_len);
    } else {
        /* print raw words */
        for (i = 0; i < words.count; i++)
            printf("%s\n", words.items[i]);
    }

done:
    word_list_free(&words);
    return status;
}
/* status epilepticus .. print help */ | |
void print_help(int exval) { | |
printf("%s,%s extract N-grams from text data\n", PACKAGE, VERSION); | |
printf("Usage: %s [-h] [-v] [-n INT] [-w] [-f FILE]\n\n", PACKAGE); | |
printf(" -h print this help and exit\n"); | |
printf(" -v print version and exit\n\n"); | |
printf(" -n INT set ngram length (default=3)\n"); | |
printf(" -w print only the extracted words\n"); | |
printf(" -f FILE read input from `FILE' (default=stdin)\n\n"); | |
exit(exval); | |
} |