/**
* fcomp - parse text for string tokens and search for them
* Copyright (C) 2019 Anastasis Grammenos
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
#include "stdlib.h"
#include "stdio.h"
#include "string.h"
#include "getopt.h"
#include "ctype.h"
typedef struct slist {
char **s;
unsigned int n;
} slist;
typedef struct token {
char *s;
unsigned int count;
} token;
/**
* Count result */
typedef struct result {
token **tok;
unsigned int n;
} result;
/**
* search methods */
static int exact(const char *str, const char *c);
static int starts(const char *str, const char *c);
static int fuz(const char *str, const char *c);
/**
* stats for debugging */
typedef struct stats {
char *search_results;
char *tok_results;
char *unique_results;
} stats;
static stats st = {
NULL,
NULL,
NULL
};
/**
* config */
typedef struct config {
int help; /** 0: off 1: on */
int debug; /** 0: off 1: on */
int stats; /** 0: off 1: on */
int stdin; /** 0: off 1: on */
int reverse_sort; /** 0: off 1: on */
int lisp_print; /** 0: off 1: on */
int print_count; /** 0: off 1: on */
int print_all; /** 0: off 1: on */
char *file;
char *query;
unsigned int min_word_size;
int evc; /** 0: off 1: on */
char *extra_valid_chars;
int eic; /** 0: off 1: on */
char *extra_invalid_chars;
int (*search_method)(const char *, const char *);
unsigned int max_token_size;
} config;
/**
* Initial config vector */
static config cfg = {
0,
0,
0,
1,
0,
0,
0,
0,
NULL,
NULL,
3,
0,
NULL,
0,
NULL,
starts,
1000
};
static void print_help(char *argv0)
{
fprintf(stderr,
"fcomp - parse text for string tokens and search for them\n");
fprintf(stderr, "Copyright (C) 2019 Anastasis Grammenos\n");
fprintf(stderr,
"This program is licenced under GPLv2. See source tree for details.\n\n");
fprintf(stderr, "Usage:\n~~~~~~\n");
fprintf(stderr, "%s [query] [-f FILE] [...]\n\n", argv0);
fprintf(stderr, "%8s %4s %15s %50s\n", "Options", "", "Mnemonic",
"Description");
fprintf(stderr, "%8s %4s %15s %50s\n", "~~~~~~~", "", "~~~~~~~~",
"~~~~~~~~~~~");
fprintf(stderr, "%8s %4s %15s %50s\n\n", "-h", "", "help",
"Print this help message");
fprintf(stderr, "%8s %4s %15s %50s\n", "query", "[..]", "",
"The term to be matched agains the tokens");
fprintf(stderr, "%8s %4s %15s %50s\n", "-f", "[..]", "file",
"Select the input file to use");
fprintf(stderr, "%8s %4s %15s %50s\n", "stdin", "[..]", "",
"Input can come from rediretion as well as files");
fprintf(stderr, "%8s %4s %15s %50s\n", "-z", "", "fuzzy",
"Set the search method to fuzzy");
fprintf(stderr, "%8s %4s %15s %50s\n", "-x", "", "exact",
"Set the search method to exact match");
fprintf(stderr, "%8s %4s %15s %50s\n\n", "", "", "",
"(Default is starts_with)");
fprintf(stderr, "%8s %4s %15s %50s\n", "-r", "", "reverse",
"Reverse the sorting of matches");
fprintf(stderr, "%8s %4s %15s %50s\n", "-c", "", "count",
"Print the count of each matched token");
fprintf(stderr, "%8s %4s %15s %50s\n", "-l", "", "lisp",
"Print matched tokens in a lisp list");
fprintf(stderr, "%8s %4s %15s %50s\n", "-a", "", "all",
"Print all tokens of input");
fprintf(stderr, "%8s %4s %15s %50s\n\n", "", "", "",
"(Performs no search)");
fprintf(stderr, "%8s %4s %15s %50s\n", "-v", "[..]", "valid chars",
"Set extra valid chars for tokens");
fprintf(stderr, "%8s %4s %15s %50s\n", "-i", "[..]", "invalid chars",
"Set extra invalid chars for tokens");
fprintf(stderr, "%8s %4s %15s %50s\n", "-w", "[..]", "word length",
"Set min word length to count as token");
fprintf(stderr, "%8s %4s %15s %50s\n\n", "-t", "[..]", "toekn length",
"Set max token length to parse");
fprintf(stderr, "%8s %4s %15s %50s\n", "-d", "", "debug",
"Show debug information");
fprintf(stderr, "%8s %4s %15s %50s\n", "-s", "", "stats",
"Print some stats");
}
static void print_cfg()
{
fprintf(stderr, "\nValues:\n~~~~~~~\n");
fprintf(stderr, "Input file: %68s\n", cfg.stdin ? "stdin" : cfg.file);
fprintf(stderr, "Query: %73s\n", cfg.query ? cfg.query : "---");
fprintf(stderr, "Search method: %65s\n",
cfg.search_method ==
starts ? "starts with" : (cfg.search_method ==
exact ? "exact match" : "fuzzy"));
fprintf(stderr, "Minimun word size: %61d\n", cfg.min_word_size);
fprintf(stderr, "Extra valid chars: %61s\n",
cfg.evc ? cfg.extra_valid_chars : "---");
fprintf(stderr, "Extra invalid chars: %59s\n",
cfg.eic ? cfg.extra_invalid_chars : "---");
fprintf(stderr, "Flags:\n");
fprintf(stderr, " [-d:%4s] ", cfg.debug ? "on" : "off");
fprintf(stderr, "[-s:%4s] ", cfg.stats ? "on" : "off");
fprintf(stderr, "[-f:%4s] ", cfg.stdin ? "off" : "on");
fprintf(stderr, "[-r:%4s] ", cfg.reverse_sort ? "on" : "off");
fprintf(stderr, "[-l:%4s]\n", cfg.lisp_print ? "on" : "off");
fprintf(stderr, " [-c:%4s] ", cfg.print_count ? "on" : "off");
fprintf(stderr, "[-a:%4s] ", cfg.print_all ? "on" : "off");
fprintf(stderr, "[-v:%4s] ", cfg.evc ? "on" : "off");
fprintf(stderr, "[-i:%4s]\n", cfg.eic ? "on" : "off");
}
static void print_stats()
{
fprintf(stderr, "\nStats:\n~~~~~~\n");
if (st.tok_results) {
fprintf(stderr, "%s", st.tok_results);
free(st.tok_results);
}
if (st.search_results) {
fprintf(stderr, "%s", st.search_results);
free(st.search_results);
}
if (st.unique_results) {
fprintf(stderr, "%s", st.unique_results);
free(st.unique_results);
}
}
static int parse_cli(int argc, char *argv[])
{
if (argc < 2)
return -1;
char c;
while ((c = getopt(argc, argv, "-hxzsadclrf:i:v:w:t:")) != -1) {
switch (c) {
case 'h':
cfg.help = 1;
break;
case 'd':
cfg.debug = 1;
break;
case 's':
cfg.stats = 1;
break;
case 'a':
cfg.print_all = 1;
break;
case 'x':
cfg.search_method = exact;
break;
case 'z':
cfg.search_method = fuz;
break;
case 'c':
cfg.print_count = 1;
break;
case 'l':
cfg.lisp_print = 1;
break;
case 'f':
cfg.stdin = 0;
cfg.file = optarg;
break;
case 'v':
cfg.evc = 1;
cfg.extra_valid_chars = optarg;
break;
case 'i':
cfg.eic = 1;
cfg.extra_invalid_chars = optarg;
break;
case 'w':
cfg.min_word_size = atoi(optarg);
break;
case 't':
cfg.max_token_size = atoi(optarg);
break;
case 'r':
cfg.reverse_sort = 1;
break;
case '\1':
cfg.query = optarg;
break;
case '?':
if (optopt == 'w') {
fprintf(stderr, "Specify minimum word length after -%c\n",
optopt);
} else if (optopt == 'v') {
fprintf(stderr,
"Specify extra valid chars after -%c\n", optopt);
} else if (optopt == 'i') {
fprintf(stderr,
"Specify extra invalid chars after -%c\n", optopt);
} else if (optopt == 'f') {
fprintf(stderr,
"Specify file to read after -%c\n", optopt);
} else if (optopt == 't') {
fprintf(stderr,
"Specify max token size after -%c\n", optopt);
}
default:
return -1;
}
}
return 0;
}
/**
* Check if a character is valid according to the default
* and extra valid characters */
static int is_valid(const char c)
{
if (cfg.eic) {
for (size_t i = 0; i < strlen(cfg.extra_invalid_chars); i++) {
if (cfg.extra_invalid_chars[i] == c)
return 0;
}
}
switch (c) {
case 'a':
case 'b':
case 'c':
case 'd':
case 'e':
case 'f':
case 'g':
case 'h':
case 'i':
case 'j':
case 'k':
case 'l':
case 'm':
case 'n':
case 'o':
case 'p':
case 'q':
case 'r':
case 's':
case 't':
case 'u':
case 'v':
case 'w':
case 'x':
case 'y':
case 'z':
case 'A':
case 'B':
case 'C':
case 'D':
case 'E':
case 'F':
case 'G':
case 'H':
case 'I':
case 'J':
case 'K':
case 'L':
case 'M':
case 'N':
case 'O':
case 'P':
case 'Q':
case 'R':
case 'S':
case 'T':
case 'U':
case 'V':
case 'W':
case 'X':
case 'Y':
case 'Z':
case '-':
case '_':
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
return 1;
default:
if (cfg.evc) {
for (size_t i = 0; i < strlen(cfg.extra_valid_chars); i++) {
if (cfg.extra_valid_chars[i] == c)
return 1;
}
}
return 0;
}
}
/**
* Add by refference */
static void radd(slist * l, char *str)
{
if (l->n == 0)
l->s = (char **) malloc(sizeof(char *));
else
l->s = (char **) realloc(l->s, (l->n + 1) * sizeof(char *));
l->s[l->n] = str;
l->n++;
}
/**
* Add by value */
static void sadd(slist * l, const char *tmp)
{
if (l->n == 0)
l->s = (char **) malloc(sizeof(char *));
else
l->s = (char **) realloc(l->s, (l->n + 1) * sizeof(char *));
l->s[l->n] = strdup(tmp);
l->n++;
}
/**
* Add by refference to the result */
static void cadd(result * r, char *str)
{
if (r->n == 0) {
r->tok = (token **) malloc(sizeof(token *));
} else {
r->tok = (token **) realloc(r->tok, (r->n + 1) * sizeof(token *));
}
struct token *tok;
tok = (token *) malloc(sizeof(token));
if (!tok)
fprintf(stderr, "Err\n");
tok->s = str;
tok->count = 1;
r->tok[r->n] = tok;
r->n++;
}
/**
* free result */
static void cfree(result * res)
{
for (unsigned int i = 0; i < res->n; i++) {
free(res->tok[i]);
}
free(res->tok);
}
/**
* free slist */
static void sfree(slist * l)
{
for (unsigned int i = 0; i < l->n; i++) {
if (l->s[i])
free(l->s[i]);
}
if (l->s)
free(l->s);
}
/**
* print slist */
static void pp(slist * l)
{
for (unsigned int i = 0; i < l->n; i++) {
printf("%s\n", l->s[i]);
}
}
/**
* Check if @str is the same as @c */
static int exact(const char *str, const char *c)
{
if (strlen(str) != strlen(c))
return 0;
for (size_t i = 0; i < strlen(c); i++)
if (str[i] != c[i])
return 0;
return -1;
}
/**
* Check if @str starts with @c */
static int starts(const char *str, const char *c)
{
for (size_t i = 0; i < strlen(c); i++)
if (str[i] != c[i])
return 0;
return -1;
}
/**
* Check if @str contains @c in the correct order
* (e.g str=example, c=xl => e_x_amp_l_e) */
static int fuz(const char *str, const char *c)
{
size_t i = 0;
size_t j = 0;
for (; i < strlen(str); i++) {
if (str[i] == c[j]) {
j++;
if (j == strlen(c))
return -1;
}
}
return 0;
}
/**
* Search an slist for @query and place the matches on @res
* The search method is in the config struct */
static int search(slist * l, const char *query, slist * res)
{
int flag = 0;
for (unsigned int i = 0; i < l->n; i++) {
if ((*cfg.search_method) (l->s[i], query)) {
radd(res, l->s[i]);
flag = 1;
}
}
return flag;
}
/**
* Count the entries of a __sorted__ slist and fill
* the @res result struct with unique entries along with the count */
static void count(slist * l, result * res)
{
cadd(res, l->s[0]);
for (unsigned int i = 1; i < l->n; i++) {
if (strcmp(l->s[i], res->tok[res->n - 1]->s) == 0) {
res->tok[res->n - 1]->count++;
} else
cadd(res, l->s[i]);
}
}
static void finalize_str(char *str, unsigned int n, slist * l)
{
if (n > cfg.min_word_size) {
str[n] = '\0';
sadd(l, str);
}
}
static int cmpstringp(const void *p1, const void *p2)
{
/* The actual arguments to this function are "pointers to
pointers to char", but strcmp(3) arguments are "pointers
to char", hence the following cast plus dereference */
return strcmp(*(char *const *) p1, *(char *const *) p2);
}
static int cmpint(const void *p1, const void *p2)
{
token *a = *(token * const *) p1;
token *b = *(token * const *) p2;
if (a->count < b->count)
return 1;
if (b->count > a->count)
return -1;
return 0;
}
static int cmpint_r(const void *p1, const void *p2)
{
token *a = *(token * const *) p1;
token *b = *(token * const *) p2;
if (a->count > b->count)
return 1;
if (b->count < a->count)
return -1;
return 0;
}
static void pc(const result * res)
{
if (cfg.lisp_print) {
printf("( ");
for (unsigned int i = 0; i < res->n - 1; i++) {
printf("\"%s\", ", res->tok[i]->s);
}
printf("\"%s\" )\n", res->tok[res->n - 1]->s);
} else {
for (unsigned int i = 0; i < res->n; i++) {
if (cfg.print_count)
printf("[%d] ", res->tok[i]->count);
printf("%s\n", res->tok[i]->s);
}
}
}
/**
* Fill an slist with tokens from @f */
static int tokenize(FILE * f, slist * l)
{
unsigned int n = 0;
char c;
char *tmp = NULL;
while ((c = fgetc(f)) != EOF) {
if (!is_valid(c)) {
if (tmp) {
finalize_str(tmp, n, l);
free(tmp);
tmp = NULL;
}
n = 0;
continue;
}
if (!tmp)
tmp = (char *) malloc(cfg.max_token_size);
if (n < cfg.max_token_size)
tmp[n++] = (char) c;
else {
fprintf(stderr,
"Max token size %d is not enough.\n",
cfg.max_token_size);
return -1;
}
}
if (tmp) {
finalize_str(tmp, n, l);
free(tmp);
}
return 0;
}
static void sort_by_count(result * r)
{
if (cfg.reverse_sort)
qsort(&r->tok[0], r->n, sizeof(token *), cmpint_r);
else
qsort(&r->tok[0], r->n, sizeof(token *), cmpint);
}
static void get_slist_stats(slist * l)
{
if (l->n > 0) {
char tmp[] = "Total tokens:";
st.tok_results =
(char *)
malloc((strlen(tmp) + snprintf(NULL, 0, "%d", l->n) +
3) * sizeof(char));
sprintf(st.tok_results, "%s\t%d\n", tmp, l->n);
}
}
static void get_search_stats(slist * l)
{
if (l->n > 0) {
char tmp[] = "Total matches:";
st.search_results =
(char *)
malloc((strlen(tmp) + snprintf(NULL, 0, "%d", l->n) +
3) * sizeof(char));
sprintf(st.search_results, "%s\t%d\n", tmp, l->n);
}
}
static void get_result_stats(result * r)
{
if (r->n > 0) {
char tmp[] = "Unique matches:";
st.unique_results =
(char *)
malloc((strlen(tmp) + snprintf(NULL, 0, "%d", r->n) +
3) * sizeof(char));
sprintf(st.unique_results, "%s\t%d\n", tmp, r->n);
}
}
int main(int argc, char *argv[])
{
FILE *f;
slist list = { 0 };
slist search_res = { 0 };
result count_res = { 0 };
if (parse_cli(argc, argv)
|| cfg.help) {
print_help(argv[0]);
if (cfg.help)
return 0;
return -1;
}
if (cfg.query == NULL && !cfg.print_all) {
fprintf(stderr, "Query missing ... terminating\n");
return -1;
}
/* set input */
if (cfg.stdin) {
f = stdin;
} else {
f = fopen(cfg.file, "r");
if (!f) {
fprintf(stderr, "Couldn't open %s\n", cfg.file);
return -1;
}
}
/* tokenize */
if (tokenize(f, &list))
goto err;
if (cfg.stats)
get_slist_stats(&list);
if (cfg.print_all) {
pp(&list);
} else {
/* search for the query */
if (search(&list, cfg.query, &search_res)) {
if (cfg.stats)
get_search_stats(&search_res);
/* sort the results */
qsort(&search_res.s[0], search_res.n, sizeof(char *),
cmpstringp);
/* count the unique */
count(&search_res, &count_res);
sort_by_count(&count_res);
if (cfg.stats)
get_result_stats(&count_res);
/* print them */
pc(&count_res);
cfree(&count_res);
free(search_res.s);
}
}
if (cfg.debug)
print_cfg();
if (cfg.stats)
print_stats();
err:
sfree(&list);
fclose(f);
return 0;
}