From a5a0ddf73b81d3815def20d283bb8de1d01a7230 Mon Sep 17 00:00:00 2001 From: gramanas Date: Tue, 19 Mar 2019 00:32:20 +0200 Subject: Initial.. --- fcomp.c | 728 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 728 insertions(+) create mode 100644 fcomp.c (limited to 'fcomp.c') diff --git a/fcomp.c b/fcomp.c new file mode 100644 index 0000000..f4715f5 --- /dev/null +++ b/fcomp.c @@ -0,0 +1,728 @@ +/** + * fcomp - parse text for string tokens and search for them + * Copyright (C) 2019 Anastasis Grammenos + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include "stdlib.h" +#include "stdio.h" +#include "string.h" +#include "getopt.h" +#include "ctype.h" + +typedef struct slist { + char **s; + unsigned int n; +} slist; + +typedef struct token { + char *s; + unsigned int count; +} token; + +/** + * Count result */ +typedef struct result { + token **tok; + unsigned int n; +} result; + +/** + * search methods */ +static int exact(const char *str, const char *c); +static int starts(const char *str, const char *c); +static int fuz(const char *str, const char *c); + +/** + * stats for debugging */ +typedef struct stats { + char *search_results; + char *tok_results; + char *unique_results; +} stats; + +static stats st = { + NULL, + NULL, + NULL +}; + +/** + * config */ +typedef struct config { + int help; /** 0: off 1: on */ + int debug; /** 0: off 1: on */ + int stats; /** 0: off 1: on */ + int stdin; /** 0: off 1: on */ + int reverse_sort; /** 0: off 1: on */ + int lisp_print; /** 0: off 1: on */ + int print_count; /** 0: off 1: on */ + int print_all; /** 0: off 1: on */ + char *file; + char *query; + unsigned int min_word_size; + int evc; /** 0: off 1: on */ + char *extra_valid_chars; + int eic; /** 0: off 1: on */ + char *extra_invalid_chars; + int (*search_method)(const char *, const char *); + unsigned int max_token_size; +} config; + +/** + * Initial config vector */ +static config cfg = { + 0, + 0, + 0, + 1, + 0, + 0, + 0, + 0, + NULL, + NULL, + 3, + 0, + NULL, + 0, + NULL, + starts, + 1000 +}; + +static void print_help(char *argv0) +{ + fprintf(stderr, + "fcomp - parse text for string tokens and search for them\n"); + fprintf(stderr, "Copyright (C) 2019 Anastasis Grammenos\n"); + fprintf(stderr, + "This program is licenced under GPLv2. See source tree for details.\n\n"); + fprintf(stderr, "Usage:\n~~~~~~\n"); + fprintf(stderr, "%s [query] [-f FILE] [...]\n\n", argv0); + fprintf(stderr, "%8s %4s %15s %50s\n", "Options", "", "Mnemonic", + "Description"); + fprintf(stderr, "%8s %4s %15s %50s\n", "~~~~~~~", "", "~~~~~~~~", + "~~~~~~~~~~~"); + fprintf(stderr, "%8s %4s %15s %50s\n\n", "-h", "", "help", + "Print this help message"); + fprintf(stderr, "%8s %4s %15s %50s\n", "query", "[..]", "", + "The term to be matched agains the tokens"); + fprintf(stderr, "%8s %4s %15s %50s\n", "-f", "[..]", "file", + "Select the input file to use"); + fprintf(stderr, "%8s %4s %15s %50s\n", "stdin", "[..]", "", + "Input can come from rediretion as well as files"); + fprintf(stderr, "%8s %4s %15s %50s\n", "-z", "", "fuzzy", + "Set the search method to fuzzy"); + fprintf(stderr, "%8s %4s %15s %50s\n", "-x", "", "exact", + "Set the search method to exact match"); + fprintf(stderr, "%8s %4s %15s %50s\n\n", "", "", "", + "(Default is starts_with)"); + fprintf(stderr, "%8s %4s %15s %50s\n", "-r", "", "reverse", + "Reverse the sorting of matches"); + fprintf(stderr, "%8s %4s %15s %50s\n", "-c", "", "count", + "Print the count of each matched token"); + fprintf(stderr, "%8s %4s %15s %50s\n", "-l", "", "lisp", + "Print matched tokens in a lisp list"); + fprintf(stderr, "%8s %4s %15s %50s\n", "-a", "", "all", + "Print all tokens of input"); + fprintf(stderr, "%8s %4s %15s %50s\n\n", "", "", "", + "(Performs no search)"); + fprintf(stderr, "%8s %4s %15s %50s\n", "-v", "[..]", "valid chars", + "Set extra valid chars for tokens"); + fprintf(stderr, "%8s %4s %15s %50s\n", "-i", "[..]", "invalid chars", + "Set extra invalid chars for tokens"); + fprintf(stderr, "%8s %4s %15s %50s\n", "-w", "[..]", "word length", + "Set min word length to count as token"); + fprintf(stderr, "%8s %4s %15s %50s\n\n", "-t", "[..]", "toekn length", + "Set max token length to parse"); + fprintf(stderr, "%8s %4s %15s %50s\n", "-d", "", "debug", + "Show debug information"); + fprintf(stderr, "%8s %4s %15s %50s\n", "-s", "", "stats", + "Print some stats"); +} + +static void print_cfg() +{ + fprintf(stderr, "\nValues:\n~~~~~~~\n"); + fprintf(stderr, "Input file: %68s\n", cfg.stdin ? "stdin" : cfg.file); + fprintf(stderr, "Query: %73s\n", cfg.query ? cfg.query : "---"); + fprintf(stderr, "Search method: %65s\n", + cfg.search_method == + starts ? "starts with" : (cfg.search_method == + exact ? "exact match" : "fuzzy")); + fprintf(stderr, "Minimun word size: %61d\n", cfg.min_word_size); + fprintf(stderr, "Extra valid chars: %61s\n", + cfg.evc ? cfg.extra_valid_chars : "---"); + fprintf(stderr, "Extra invalid chars: %59s\n", + cfg.eic ? cfg.extra_invalid_chars : "---"); + fprintf(stderr, "Flags:\n"); + fprintf(stderr, " [-d:%4s] ", cfg.debug ? "on" : "off"); + fprintf(stderr, "[-s:%4s] ", cfg.stats ? "on" : "off"); + fprintf(stderr, "[-f:%4s] ", cfg.stdin ? "off" : "on"); + fprintf(stderr, "[-r:%4s] ", cfg.reverse_sort ? "on" : "off"); + fprintf(stderr, "[-l:%4s]\n", cfg.lisp_print ? "on" : "off"); + fprintf(stderr, " [-c:%4s] ", cfg.print_count ? "on" : "off"); + fprintf(stderr, "[-a:%4s] ", cfg.print_all ? "on" : "off"); + fprintf(stderr, "[-v:%4s] ", cfg.evc ? "on" : "off"); + fprintf(stderr, "[-i:%4s]\n", cfg.eic ? "on" : "off"); +} + +static void print_stats() +{ + fprintf(stderr, "\nStats:\n~~~~~~\n"); + if (st.tok_results) { + fprintf(stderr, "%s", st.tok_results); + free(st.tok_results); + } + if (st.search_results) { + fprintf(stderr, "%s", st.search_results); + free(st.search_results); + } + if (st.unique_results) { + fprintf(stderr, "%s", st.unique_results); + free(st.unique_results); + } +} + +static int parse_cli(int argc, char *argv[]) +{ + if (argc < 2) + return -1; + char c; + while ((c = getopt(argc, argv, "-hxzsadclrf:i:v:w:t:")) != -1) { + switch (c) { + case 'h': + cfg.help = 1; + break; + case 'd': + cfg.debug = 1; + break; + case 's': + cfg.stats = 1; + break; + case 'a': + cfg.print_all = 1; + break; + case 'x': + cfg.search_method = exact; + break; + case 'z': + cfg.search_method = fuz; + break; + case 'c': + cfg.print_count = 1; + break; + case 'l': + cfg.lisp_print = 1; + break; + case 'f': + cfg.stdin = 0; + cfg.file = optarg; + break; + case 'v': + cfg.evc = 1; + cfg.extra_valid_chars = optarg; + break; + case 'i': + cfg.eic = 1; + cfg.extra_invalid_chars = optarg; + break; + case 'w': + cfg.min_word_size = atoi(optarg); + break; + case 't': + cfg.max_token_size = atoi(optarg); + break; + case 'r': + cfg.reverse_sort = 1; + break; + case '\1': + cfg.query = optarg; + break; + case '?': + if (optopt == 'w') { + fprintf(stderr, "Specify minimum word length after -%c\n", + optopt); + } else if (optopt == 'v') { + fprintf(stderr, + "Specify extra valid chars after -%c\n", optopt); + } else if (optopt == 'i') { + fprintf(stderr, + "Specify extra invalid chars after -%c\n", optopt); + } else if (optopt == 'f') { + fprintf(stderr, + "Specify file to read after -%c\n", optopt); + } else if (optopt == 't') { + fprintf(stderr, + "Specify max token size after -%c\n", optopt); + } + default: + return -1; + } + } + return 0; +} + +/** + * Check if a character is valid according to the default + * and extra valid characters */ +static int is_valid(const char c) +{ + if (cfg.eic) { + for (size_t i = 0; i < strlen(cfg.extra_invalid_chars); i++) { + if (cfg.extra_invalid_chars[i] == c) + return 0; + } + } + switch (c) { + case 'a': + case 'b': + case 'c': + case 'd': + case 'e': + case 'f': + case 'g': + case 'h': + case 'i': + case 'j': + case 'k': + case 'l': + case 'm': + case 'n': + case 'o': + case 'p': + case 'q': + case 'r': + case 's': + case 't': + case 'u': + case 'v': + case 'w': + case 'x': + case 'y': + case 'z': + case 'A': + case 'B': + case 'C': + case 'D': + case 'E': + case 'F': + case 'G': + case 'H': + case 'I': + case 'J': + case 'K': + case 'L': + case 'M': + case 'N': + case 'O': + case 'P': + case 'Q': + case 'R': + case 'S': + case 'T': + case 'U': + case 'V': + case 'W': + case 'X': + case 'Y': + case 'Z': + case '-': + case '_': + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + return 1; + default: + if (cfg.evc) { + for (size_t i = 0; i < strlen(cfg.extra_valid_chars); i++) { + if (cfg.extra_valid_chars[i] == c) + return 1; + } + } + return 0; + } +} + +/** + * Add by refference */ +static void radd(slist * l, char *str) +{ + if (l->n == 0) + l->s = (char **) malloc(sizeof(char *)); + else + l->s = (char **) realloc(l->s, (l->n + 1) * sizeof(char *)); + + l->s[l->n] = str; + l->n++; +} + +/** + * Add by value */ +static void sadd(slist * l, const char *tmp) +{ + if (l->n == 0) + l->s = (char **) malloc(sizeof(char *)); + else + l->s = (char **) realloc(l->s, (l->n + 1) * sizeof(char *)); + + l->s[l->n] = strdup(tmp); + l->n++; +} + +/** + * Add by refference to the result */ +static void cadd(result * r, char *str) +{ + if (r->n == 0) { + r->tok = (token **) malloc(sizeof(token *)); + } else { + r->tok = (token **) realloc(r->tok, (r->n + 1) * sizeof(token *)); + } + + struct token *tok; + tok = (token *) malloc(sizeof(token)); + if (!tok) + fprintf(stderr, "Err\n"); + tok->s = str; + tok->count = 1; + + r->tok[r->n] = tok; + + r->n++; +} + +/** + * free result */ +static void cfree(result * res) +{ + for (unsigned int i = 0; i < res->n; i++) { + free(res->tok[i]); + } + free(res->tok); +} + +/** + * free slist */ +static void sfree(slist * l) +{ + for (unsigned int i = 0; i < l->n; i++) { + if (l->s[i]) + free(l->s[i]); + } + if (l->s) + free(l->s); +} + +/** + * print slist */ +static void pp(slist * l) +{ + for (unsigned int i = 0; i < l->n; i++) { + printf("%s\n", l->s[i]); + } +} + +/** + * Check if @str is the same as @c */ +static int exact(const char *str, const char *c) +{ + if (strlen(str) != strlen(c)) + return 0; + for (size_t i = 0; i < strlen(c); i++) + if (str[i] != c[i]) + return 0; + return -1; +} + +/** + * Check if @str starts with @c */ +static int starts(const char *str, const char *c) +{ + for (size_t i = 0; i < strlen(c); i++) + if (str[i] != c[i]) + return 0; + return -1; +} + +/** + * Check if @str contains @c in the correct order + * (e.g str=example, c=xl => e_x_amp_l_e) */ +static int fuz(const char *str, const char *c) +{ + size_t i = 0; + size_t j = 0; + for (; i < strlen(str); i++) { + if (str[i] == c[j]) { + j++; + if (j == strlen(c)) + return -1; + } + } + return 0; +} + +/** + * Search an slist for @query and place the matches on @res + * The search method is in the config struct */ +static int search(slist * l, const char *query, slist * res) +{ + int flag = 0; + for (unsigned int i = 0; i < l->n; i++) { + if ((*cfg.search_method) (l->s[i], query)) { + radd(res, l->s[i]); + flag = 1; + } + } + return flag; +} + +/** + * Count the entries of a __sorted__ slist and fill + * the @res result struct with unique entries along with the count */ +static void count(slist * l, result * res) +{ + cadd(res, l->s[0]); + for (unsigned int i = 1; i < l->n; i++) { + if (strcmp(l->s[i], res->tok[res->n - 1]->s) == 0) { + res->tok[res->n - 1]->count++; + } else + cadd(res, l->s[i]); + } +} + +static void finalize_str(char *str, unsigned int n, slist * l) +{ + if (n > cfg.min_word_size) { + str[n] = '\0'; + sadd(l, str); + } +} + +static int cmpstringp(const void *p1, const void *p2) +{ + /* The actual arguments to this function are "pointers to + pointers to char", but strcmp(3) arguments are "pointers + to char", hence the following cast plus dereference */ + + return strcmp(*(char *const *) p1, *(char *const *) p2); +} + +static int cmpint(const void *p1, const void *p2) +{ + token *a = *(token * const *) p1; + token *b = *(token * const *) p2; + + if (a->count < b->count) + return 1; + if (b->count > a->count) + return -1; + return 0; +} + +static int cmpint_r(const void *p1, const void *p2) +{ + token *a = *(token * const *) p1; + token *b = *(token * const *) p2; + + if (a->count > b->count) + return 1; + if (b->count < a->count) + return -1; + return 0; +} + +static void pc(const result * res) +{ + if (cfg.lisp_print) { + printf("( "); + for (unsigned int i = 0; i < res->n - 1; i++) { + printf("\"%s\", ", res->tok[i]->s); + } + printf("\"%s\" )\n", res->tok[res->n - 1]->s); + } else { + for (unsigned int i = 0; i < res->n; i++) { + if (cfg.print_count) + printf("[%d] ", res->tok[i]->count); + printf("%s\n", res->tok[i]->s); + } + } +} + +/** + * Fill an slist with tokens from @f */ +static int tokenize(FILE * f, slist * l) +{ + unsigned int n = 0; + char c; + char *tmp = NULL; + + while ((c = fgetc(f)) != EOF) { + if (!is_valid(c)) { + if (tmp) { + finalize_str(tmp, n, l); + free(tmp); + tmp = NULL; + } + n = 0; + continue; + } + if (!tmp) + tmp = (char *) malloc(cfg.max_token_size); + if (n < cfg.max_token_size) + tmp[n++] = (char) c; + else { + fprintf(stderr, + "Max token size %d is not enough.\n", + cfg.max_token_size); + return -1; + } + } + if (tmp) { + finalize_str(tmp, n, l); + free(tmp); + } + return 0; +} + +static void sort_by_count(result * r) +{ + if (cfg.reverse_sort) + qsort(&r->tok[0], r->n, sizeof(token *), cmpint_r); + else + qsort(&r->tok[0], r->n, sizeof(token *), cmpint); +} + +static void get_slist_stats(slist * l) +{ + if (l->n > 0) { + char tmp[] = "Total tokens:"; + st.tok_results = + (char *) + malloc((strlen(tmp) + snprintf(NULL, 0, "%d", l->n) + + 3) * sizeof(char)); + sprintf(st.tok_results, "%s\t%d\n", tmp, l->n); + } +} + +static void get_search_stats(slist * l) +{ + if (l->n > 0) { + char tmp[] = "Total matches:"; + st.search_results = + (char *) + malloc((strlen(tmp) + snprintf(NULL, 0, "%d", l->n) + + 3) * sizeof(char)); + sprintf(st.search_results, "%s\t%d\n", tmp, l->n); + } +} + +static void get_result_stats(result * r) +{ + if (r->n > 0) { + char tmp[] = "Unique matches:"; + st.unique_results = + (char *) + malloc((strlen(tmp) + snprintf(NULL, 0, "%d", r->n) + + 3) * sizeof(char)); + sprintf(st.unique_results, "%s\t%d\n", tmp, r->n); + } +} + +int main(int argc, char *argv[]) +{ + FILE *f; + slist list = { 0 }; + slist search_res = { 0 }; + result count_res = { 0 }; + + if (parse_cli(argc, argv) + || cfg.help) { + print_help(argv[0]); + if (cfg.help) + return 0; + return -1; + } + + if (cfg.query == NULL && !cfg.print_all) { + fprintf(stderr, "Query missing ... terminating\n"); + return -1; + } + + /* set input */ + if (cfg.stdin) { + f = stdin; + } else { + f = fopen(cfg.file, "r"); + if (!f) { + fprintf(stderr, "Couldn't open %s\n", cfg.file); + return -1; + } + } + + /* tokenize */ + if (tokenize(f, &list)) + goto err; + + if (cfg.stats) + get_slist_stats(&list); + + if (cfg.print_all) { + pp(&list); + } else { + /* search for the query */ + if (search(&list, cfg.query, &search_res)) { + + if (cfg.stats) + get_search_stats(&search_res); + + /* sort the results */ + qsort(&search_res.s[0], search_res.n, sizeof(char *), + cmpstringp); + + /* count the unique */ + count(&search_res, &count_res); + sort_by_count(&count_res); + + if (cfg.stats) + get_result_stats(&count_res); + + /* print them */ + pc(&count_res); + + cfree(&count_res); + free(search_res.s); + } + } + + if (cfg.debug) + print_cfg(); + if (cfg.stats) + print_stats(); + + err: + sfree(&list); + fclose(f); + return 0; +} -- cgit v1.2.3