/** * fcomp - parse text for string tokens and search for them * Copyright (C) 2019 Anastasis Grammenos * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ #include "stdlib.h" #include "stdio.h" #include "string.h" #include "getopt.h" #include "unistd.h" #include "ctype.h" #include "linenoise.h" typedef struct slist { char **s; unsigned int n; } slist; typedef struct token { char *s; unsigned int count; } token; /** * Count result */ typedef struct result { token **tok; unsigned int n; } result; /** * search methods */ static int exact(const char *str, const char *c); static int starts(const char *str, const char *c); static int fuz(const char *str, const char *c); /** * stats for debugging */ typedef struct stats { unsigned int discarded; char *search_results; char *tok_results; char *unique_results; } stats; static stats st = { 0, NULL, NULL, NULL }; /** * config */ typedef struct config { int help; /** 0: off 1: on */ int debug; /** 0: off 1: on */ int stats; /** 0: off 1: on */ int stdin; /** 0: off 1: on */ int reverse_sort; /** 0: off 1: on */ int lisp_print; /** 0: off 1: on */ int print_count; /** 0: off 1: on */ int print_all; /** 0: off 1: on */ int all_uniq; /** 0: off 1: on */ int interactive; /** 0: off 1: on */ int prompt; /** 0: off 1: on */ int filestream; /** 0: off 1: on */ char *file; char *query; unsigned int min_word_size; int evc; /** 0: off 1: on */ char *extra_valid_chars; int eic; /** 0: off 1: on */ char *extra_invalid_chars; int (*search_method)(const char *, const char *); unsigned int max_token_size; } config; /** * Initial config vector */ static config cfg = { .help = 0, .debug = 0, .stats = 0, .stdin = 1, .reverse_sort = 0, .lisp_print = 0, .print_count = 0, .print_all = 0, .all_uniq = 0, .interactive = 0, .prompt = 1, .filestream = 0, .file = NULL, .query = NULL, .min_word_size = 3, .evc = 0, .extra_valid_chars = NULL, .eic = 0, .extra_invalid_chars = NULL, .search_method = starts, .max_token_size = 1000 }; static void print_help(char *argv0) { fprintf(stderr, "fcomp - parse text for string tokens and search for them\n"); fprintf(stderr, "Copyright (C) 2019 Anastasis Grammenos\n"); fprintf(stderr, "This program is licenced under GPLv2. See source tree for details.\n\n"); fprintf(stderr, "Usage:\n~~~~~~\n"); fprintf(stderr, "%s [query] [-f FILE] [...]\n\n", argv0); fprintf(stderr, "%8s %4s %15s %50s\n", "Options", "", "Mnemonic", "Description"); fprintf(stderr, "%8s %4s %15s %50s\n", "~~~~~~~", "", "~~~~~~~~", "~~~~~~~~~~~"); fprintf(stderr, "%8s %4s %15s %50s\n\n", "-h", "", "help", "Print this help message"); fprintf(stderr, "%8s %4s %15s %50s\n", "query", "[..]", "", "The term to be matched agains the tokens"); fprintf(stderr, "%8s %4s %15s %50s\n", "-f", "[..]", "file", "Select the input file to use"); fprintf(stderr, "%8s %4s %15s %50s\n", "stdin", "[..]", "", "Input can come from rediretion as well as files"); fprintf(stderr, "%8s %4s %15s %50s\n", "-z", "", "fuzzy", "Set the search method to fuzzy"); fprintf(stderr, "%8s %4s %15s %50s\n", "-x", "", "exact", "Set the search method to exact match"); fprintf(stderr, "%8s %4s %15s %50s\n\n", "", "", "", "(Default is starts_with)"); fprintf(stderr, "%8s %4s %15s %50s\n", "-r", "", "reverse", "Reverse the sorting of matches"); fprintf(stderr, "%8s %4s %15s %50s\n", "-c", "", "count", "Print the count of each matched token"); fprintf(stderr, "%8s %4s %15s %50s\n", "-l", "", "lisp", "Print matched tokens in a lisp list"); fprintf(stderr, "%8s %4s %15s %50s\n\n", "-u", "", "unique", "When -a is passed, don't print duplicates"); fprintf(stderr, "%8s %4s %15s %50s\n", "-a", "", "all", "Print all tokens of input"); fprintf(stderr, "%8s %4s %15s %50s\n\n", "", "", "", "(Performs no search)"); fprintf(stderr, "%8s %4s %15s %50s\n", "-v", "[..]", "valid chars", "Set extra valid chars for tokens"); fprintf(stderr, "%8s %4s %15s %50s\n", "-i", "[..]", "invalid chars", "Set extra invalid chars for tokens"); fprintf(stderr, "%8s %4s %15s %50s\n", "-w", "[..]", "word length", "Set min word length to count as token"); fprintf(stderr, "%8s %4s %15s %50s\n\n", "-t", "[..]", "token length", "Set max token length to parse"); fprintf(stderr, "%8s %4s %15s %50s\n", "-d", "", "debug", "Show debug information"); fprintf(stderr, "%8s %4s %15s %50s\n", "-s", "", "stats", "Print some stats"); fprintf(stderr, "%8s %4s %15s %50s\n", "-I", "", "interactive", "Run with interactive query input"); fprintf(stderr, "%8s %4s %15s %50s\n", "", "", "", "(Pass it twice to disable prompt)"); } static void print_cfg() { fprintf(stderr, "\nValues:\n~~~~~~~\n"); fprintf(stderr, "Input file: %68s\n", cfg.stdin ? "stdin" : cfg.file); fprintf(stderr, "Query: %73s\n", cfg.query ? cfg.query : "---"); fprintf(stderr, "Search method: %65s\n", cfg.search_method == starts ? "starts with" : (cfg.search_method == exact ? "exact match" : "fuzzy")); fprintf(stderr, "Minimun word size: %61d\n", cfg.min_word_size); fprintf(stderr, "Extra valid chars: %61s\n", cfg.evc ? cfg.extra_valid_chars : "---"); fprintf(stderr, "Extra invalid chars: %59s\n", cfg.eic ? cfg.extra_invalid_chars : "---"); fprintf(stderr, "Flags:\n"); fprintf(stderr, " [-d:%4s] ", cfg.debug ? "on" : "off"); fprintf(stderr, "[-s:%4s] ", cfg.stats ? "on" : "off"); fprintf(stderr, "[-f:%4s] ", cfg.stdin ? "off" : "on"); fprintf(stderr, "[-r:%4s] ", cfg.reverse_sort ? "on" : "off"); fprintf(stderr, "[-l:%4s]\n", cfg.lisp_print ? "on" : "off"); fprintf(stderr, " [-c:%4s] ", cfg.print_count ? "on" : "off"); fprintf(stderr, "[-a:%4s] ", cfg.print_all ? "on" : "off"); fprintf(stderr, "[-u:%4s] ", cfg.all_uniq ? "on" : "off"); fprintf(stderr, "[-v:%4s] ", cfg.evc ? "on" : "off"); fprintf(stderr, "[-i:%4s]\n", cfg.eic ? "on" : "off"); } static void print_stats() { fprintf(stderr, "\nStats:\n~~~~~~\n"); fprintf(stderr, "Discarded chars:\t%d\n", st.discarded); if (st.tok_results) { fprintf(stderr, "%s", st.tok_results); free(st.tok_results); } if (st.search_results) { fprintf(stderr, "%s", st.search_results); free(st.search_results); } if (st.unique_results) { fprintf(stderr, "%s", st.unique_results); free(st.unique_results); } } static int parse_cli(int argc, char *argv[]) { if (argc < 2) return -1; char c; while ((c = getopt(argc, argv, "-hIF:xzsaudclrf:i:v:w:t:")) != -1) { switch (c) { case 'I': if (cfg.interactive) { cfg.prompt = 0; } cfg.interactive = 1; break; case 'F': cfg.stdin = 0; cfg.filestream = 1; cfg.file = optarg; break; case 'h': cfg.help = 1; break; case 'd': cfg.debug = 1; break; case 'u': cfg.all_uniq = 1; break; case 's': cfg.stats = 1; break; case 'a': cfg.print_all = 1; break; case 'x': cfg.search_method = exact; break; case 'z': cfg.search_method = fuz; break; case 'c': cfg.print_count = 1; break; case 'l': cfg.lisp_print = 1; break; case 'f': cfg.stdin = 0; cfg.file = optarg; break; case 'v': cfg.evc = 1; cfg.extra_valid_chars = optarg; break; case 'i': cfg.eic = 1; cfg.extra_invalid_chars = optarg; break; case 'w': cfg.min_word_size = atoi(optarg); break; case 't': cfg.max_token_size = atoi(optarg); break; case 'r': cfg.reverse_sort = 1; break; case '\1': cfg.query = optarg; break; case '?': if (optopt == 'w') { fprintf(stderr, "Specify minimum word length after -%c\n", optopt); } else if (optopt == 'v') { fprintf(stderr, "Specify extra valid chars after -%c\n", optopt); } else if (optopt == 'i') { fprintf(stderr, "Specify extra invalid chars after -%c\n", optopt); } else if (optopt == 'f') { fprintf(stderr, "Specify file to read after -%c\n", optopt); } else if (optopt == 't') { fprintf(stderr, "Specify max token size after -%c\n", optopt); } default: return -1; } } return 0; } /** * Check if a character is valid according to the default * and extra valid characters */ static int is_valid(const char c) { if (cfg.eic) { for (size_t i = 0; i < strlen(cfg.extra_invalid_chars); i++) { if (cfg.extra_invalid_chars[i] == c) return 0; } } switch (c) { case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z': case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z': case '-': case '_': case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': return 1; default: if (cfg.evc) { for (size_t i = 0; i < strlen(cfg.extra_valid_chars); i++) { if (cfg.extra_valid_chars[i] == c) return 1; } } return 0; } } /** * Add by refference */ static void radd(slist * l, char *str) { if (l->n == 0) l->s = (char **) malloc(sizeof(char *)); else l->s = (char **) realloc(l->s, (l->n + 1) * sizeof(char *)); l->s[l->n] = str; l->n++; } /** * Add by value */ static void sadd(slist * l, const char *tmp) { if (l->n == 0) l->s = (char **) malloc(sizeof(char *)); else l->s = (char **) realloc(l->s, (l->n + 1) * sizeof(char *)); l->s[l->n] = strdup(tmp); l->n++; } /** * Add by refference to the result */ static void cadd(result * r, char *str) { if (r->n == 0) { r->tok = (token **) malloc(sizeof(token *)); } else { r->tok = (token **) realloc(r->tok, (r->n + 1) * sizeof(token *)); } struct token *tok; tok = (token *) malloc(sizeof(token)); if (!tok) fprintf(stderr, "Err\n"); tok->s = str; tok->count = 1; r->tok[r->n] = tok; r->n++; } /** * free result */ static void cfree(result * res) { for (unsigned int i = 0; i < res->n; i++) { free(res->tok[i]); } free(res->tok); res->n = 0; } /** * free slist */ static void sfree(slist * l) { for (unsigned int i = 0; i < l->n; i++) { if (l->s[i]) free(l->s[i]); } if (l->s) free(l->s); l->n = 0; } /** * print slist */ static void pp(slist * l) { for (unsigned int i = 0; i < l->n; i++) { printf("%s\n", l->s[i]); } } /** * Check if @str is the same as @c */ static int exact(const char *str, const char *c) { if (strlen(str) != strlen(c)) return 0; for (size_t i = 0; i < strlen(c); i++) if (str[i] != c[i]) return 0; return -1; } /** * Check if @str starts with @c */ static int starts(const char *str, const char *c) { for (size_t i = 0; i < strlen(c); i++) if (str[i] != c[i]) return 0; return -1; } /** * Check if @str contains @c in the correct order * (e.g str=example, c=xl => e_x_amp_l_e) */ static int fuz(const char *str, const char *c) { size_t i = 0; size_t j = 0; for (; i < strlen(str); i++) { if (str[i] == c[j]) { j++; if (j == strlen(c)) return -1; } } return 0; } /** * Search an slist for @query and place the matches on @res * The search method is in the config struct */ static int search(const slist * l, const char *query, slist * res) { int flag = 0; for (unsigned int i = 0; i < l->n; i++) { if ((*cfg.search_method) (l->s[i], query)) { radd(res, l->s[i]); flag = 1; } } return flag; } /** * Count the entries of a __sorted__ slist and fill * the @res result struct with unique entries along with the count */ static void count(slist * l, result * res) { cadd(res, l->s[0]); for (unsigned int i = 1; i < l->n; i++) { if (strcmp(l->s[i], res->tok[res->n - 1]->s) == 0) { res->tok[res->n - 1]->count++; } else cadd(res, l->s[i]); } } static void finalize_str(char *str, unsigned int n, slist * l) { if (n > cfg.min_word_size) { str[n] = '\0'; sadd(l, str); } } static int cmpstringp(const void *p1, const void *p2) { /* The actual arguments to this function are "pointers to pointers to char", but strcmp(3) arguments are "pointers to char", hence the following cast plus dereference */ return strcmp(*(char *const *) p1, *(char *const *) p2); } static int cmpint(const void *p1, const void *p2) { token *a = *(token * const *) p1; token *b = *(token * const *) p2; if (a->count < b->count) return 1; if (b->count > a->count) return -1; return 0; } static int cmpint_r(const void *p1, const void *p2) { token *a = *(token * const *) p1; token *b = *(token * const *) p2; if (a->count > b->count) return 1; if (b->count < a->count) return -1; return 0; } static void pc(const result * res) { if (cfg.lisp_print) { printf("( "); for (unsigned int i = 0; i < res->n - 1; i++) { printf("\"%s\", ", res->tok[i]->s); } printf("\"%s\" )\n", res->tok[res->n - 1]->s); } else { for (unsigned int i = 0; i < res->n; i++) { if (cfg.print_count) printf("[%d] ", res->tok[i]->count); printf("%s\n", res->tok[i]->s); } } } /** * Fill an slist with tokens from @f */ static int tokenize(FILE * f, slist * l) { unsigned int n = 0; int c; char *tmp = NULL; while ((c = fgetc(f)) != EOF) { if (!is_valid(c)) { st.discarded++; if (tmp) { finalize_str(tmp, n, l); free(tmp); tmp = NULL; } n = 0; continue; } if (!tmp) tmp = (char *) malloc(cfg.max_token_size); if (n < cfg.max_token_size) tmp[n++] = (char) c; else { fprintf(stderr, "Max token size %d is not enough.\n", cfg.max_token_size); return -1; } } if (tmp) { finalize_str(tmp, n, l); free(tmp); } return 0; } static void sort_by_count(result * r) { if (cfg.reverse_sort) qsort(&r->tok[0], r->n, sizeof(token *), cmpint_r); else qsort(&r->tok[0], r->n, sizeof(token *), cmpint); } static void get_slist_stats(slist * l) { if (l->n > 0) { char tmp[] = "Total tokens:"; st.tok_results = (char *) malloc((strlen(tmp) + snprintf(NULL, 0, "%d", l->n) + 4) * sizeof(char)); sprintf(st.tok_results, "%s\t\t%d\n", tmp, l->n); } } static void get_search_stats(slist * l) { if (l->n > 0) { char tmp[] = "Total matches:"; st.search_results = (char *) malloc((strlen(tmp) + snprintf(NULL, 0, "%d", l->n) + 4) * sizeof(char)); sprintf(st.search_results, "%s\t\t%d\n", tmp, l->n); } } static void get_result_stats(result * r) { if (r->n > 0) { char tmp[] = "Unique tokens:"; st.unique_results = (char *) malloc((strlen(tmp) + snprintf(NULL, 0, "%d", r->n) + 4) * sizeof(char)); sprintf(st.unique_results, "%s\t\t%d\n", tmp, r->n); } } void clean_stdin(void) { int c; do { c = getchar(); } while (c != '\n' && c != EOF); } static void prompt(slist * l) { int r = 0; char p[20] = ""; char *tmp = NULL; if (cfg.prompt) sprintf(p, "[%d]> ", r); fflush(NULL); while ((tmp = linenoise(p)) != NULL) { slist search_res = { 0 }; if (search(l, tmp, &search_res)) { result count_res = { 0 }; /* sort the results */ qsort(&search_res.s[0], search_res.n, sizeof(char *), cmpstringp); /* count the unique */ count(&search_res, &count_res); sort_by_count(&count_res); r = count_res.n; /* print them */ pc(&count_res); cfree(&count_res); } if (cfg.prompt) sprintf(p, "[%d]> ", r); r = 0; free(search_res.s); free(tmp); tmp = NULL; } if (tmp) free(tmp); } static int set_input(FILE **f) { if (cfg.stdin) { if (cfg.interactive) { fprintf(stderr, "Can't read from stdin in interactive mode.\n"); return -1; } *f = stdin; } else if (cfg.filestream) { if (!cfg.file || strcmp(cfg.file, "") == 0) return -1; *f = fmemopen(cfg.file, strlen(cfg.file), "r"); } else { *f = fopen(cfg.file, "r"); if (!f) { fprintf(stderr, "Couldn't open %s\n", cfg.file); return -1; } } return 0; } static void get_uniq(slist * l, result *r) { /* sort the results */ qsort(&l->s[0], l->n, sizeof(char *), cmpstringp); /* count the unique, put it in result */ count(l, r); sort_by_count(r); if (cfg.stats) get_result_stats(r); } int main(int argc, char *argv[]) { int rc = 0; FILE *f = NULL; slist token_list = { 0 }; slist ref_list = { 0 }; slist *listp = NULL; result count_res = { 0 }; if (parse_cli(argc, argv) || cfg.help) { print_help(argv[0]); if (cfg.help) return 0; return -1; } if (cfg.query == NULL && !cfg.print_all && !cfg.interactive) { fprintf(stderr, "Query missing ... terminating\n"); return -1; } if ((rc = set_input(&f))) goto done; if ((rc = tokenize(f, &token_list))) goto done; if (cfg.stats) get_slist_stats(&token_list); if (cfg.interactive) { prompt(&token_list); goto done; } if (cfg.print_all) { if (cfg.all_uniq) listp = &token_list; else pp(&token_list); } else { /* search for the query */ if (search(&token_list, cfg.query, &ref_list)) { listp = &ref_list; if (cfg.stats) get_search_stats(&ref_list); } } if (listp) { get_uniq(listp, &count_res); pc(&count_res); cfree(&count_res); if (listp == &ref_list) free(ref_list.s); } if (cfg.debug) print_cfg(); if (cfg.stats) print_stats(); done: sfree(&token_list); if (f) fclose(f); return rc; }