/**
* fcomp - parse text for string tokens and search for them
* Copyright (C) 2019 Anastasis Grammenos
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
#include "stdlib.h"
#include "stdio.h"
#include "string.h"
#include "getopt.h"
#include "unistd.h"
#include "ctype.h"
#include "linenoise.h"
static const char valid[] = { 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x',
'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
'W', 'X', 'Y', 'Z', '-', '_', '0', '1', '2', '3', '4', '5',
'6', '7', '8', '9' };
typedef struct slist {
char **s;
unsigned int n;
} slist;
typedef struct token {
char *s;
unsigned int count;
} token;
/**
* Count result */
typedef struct result {
token **tok;
unsigned int n;
} result;
/**
* search methods */
static int exact(const char *str, const char *c);
static int starts(const char *str, const char *c);
static int fuz(const char *str, const char *c);
/**
* stats for debugging */
typedef struct stats {
unsigned int discarded;
char *search_results;
char *tok_results;
char *unique_results;
} stats;
static stats st = {
0,
NULL,
NULL,
NULL
};
/**
* config */
typedef struct config {
int help; /** 0: off 1: on */
int debug; /** 0: off 1: on */
int stats; /** 0: off 1: on */
int stdin; /** 0: off 1: on */
int reverse_sort; /** 0: off 1: on */
int lisp_print; /** 0: off 1: on */
int print_count; /** 0: off 1: on */
int print_all; /** 0: off 1: on */
int all_uniq; /** 0: off 1: on */
int interactive; /** 0: off 1: on */
int prompt; /** 0: off 1: on */
int filestream; /** 0: off 1: on */
char *file;
char *query;
unsigned int min_word_size;
int evc; /** 0: off 1: on */
char *extra_valid_chars;
int eic; /** 0: off 1: on */
char *extra_invalid_chars;
int (*search_method)(const char *, const char *);
unsigned int max_token_size;
} config;
/**
* Initial config vector */
static config cfg = {
.help = 0,
.debug = 0,
.stats = 0,
.stdin = 1,
.reverse_sort = 0,
.lisp_print = 0,
.print_count = 0,
.print_all = 0,
.all_uniq = 0,
.interactive = 0,
.prompt = 1,
.filestream = 0,
.file = NULL,
.query = NULL,
.min_word_size = 3,
.evc = 0,
.extra_valid_chars = NULL,
.eic = 0,
.extra_invalid_chars = NULL,
.search_method = starts,
.max_token_size = 1000
};
static void print_help(char *argv0)
{
fprintf(stderr,
"fcomp - parse text for string tokens and search for them\n");
fprintf(stderr, "Copyright (C) 2019 Anastasis Grammenos\n");
fprintf(stderr,
"This program is licenced under GPLv2. See source tree for details.\n\n");
fprintf(stderr, "Usage:\n~~~~~~\n");
fprintf(stderr, "%s [query] [-f FILE] [...]\n\n", argv0);
fprintf(stderr, "%8s %4s %15s %50s\n", "Options", "", "Mnemonic",
"Description");
fprintf(stderr, "%8s %4s %15s %50s\n", "~~~~~~~", "", "~~~~~~~~",
"~~~~~~~~~~~");
fprintf(stderr, "%8s %4s %15s %50s\n\n", "-h", "", "help",
"Print this help message");
fprintf(stderr, "%8s %4s %15s %50s\n", "query", "", "",
"The term to be matched agains the tokens");
fprintf(stderr, "%8s %4s %15s %50s\n", "-q", "[..]", "query",
"Same");
fprintf(stderr, "%8s %4s %15s %50s\n", "-f", "[..]", "file",
"Select the input file to use");
fprintf(stderr, "%8s %4s %15s %50s\n", "-F", "[..]", "filestream",
"Input can be provided as a string");
fprintf(stderr, "%8s %4s %15s %50s\n", "stdin", "[..]", "",
"Input can come from rediretion as well as files");
fprintf(stderr, "%8s %4s %15s %50s\n", "-z", "", "fuzzy",
"Set the search method to fuzzy");
fprintf(stderr, "%8s %4s %15s %50s\n", "-x", "", "exact",
"Set the search method to exact match");
fprintf(stderr, "%8s %4s %15s %50s\n\n", "", "", "",
"(Default is starts_with)");
fprintf(stderr, "%8s %4s %15s %50s\n", "-r", "", "reverse",
"Reverse the sorting of matches");
fprintf(stderr, "%8s %4s %15s %50s\n", "-c", "", "count",
"Print the count of each matched token");
fprintf(stderr, "%8s %4s %15s %50s\n", "-l", "", "lisp",
"Print matched tokens in a lisp list");
fprintf(stderr, "%8s %4s %15s %50s\n\n", "-u", "", "unique",
"When -a is passed, don't print duplicates");
fprintf(stderr, "%8s %4s %15s %50s\n", "-a", "", "all",
"Print all tokens of input");
fprintf(stderr, "%8s %4s %15s %50s\n\n", "", "", "",
"(Performs no search)");
fprintf(stderr, "%8s %4s %15s %50s\n", "-v", "[..]", "valid chars",
"Set extra valid chars for tokens");
fprintf(stderr, "%8s %4s %15s %50s\n", "-i", "[..]", "invalid chars",
"Set extra invalid chars for tokens");
fprintf(stderr, "%8s %4s %15s %50s\n", "-w", "[..]", "word length",
"Set min word length to count as token");
fprintf(stderr, "%8s %4s %15s %50s\n\n", "-t", "[..]", "token length",
"Set max token length to parse");
fprintf(stderr, "%8s %4s %15s %50s\n", "-d", "", "debug",
"Show debug information");
fprintf(stderr, "%8s %4s %15s %50s\n", "-s", "", "stats",
"Print some stats");
fprintf(stderr, "%8s %4s %15s %50s\n", "-I", "", "interactive",
"Run with interactive query input");
fprintf(stderr, "%8s %4s %15s %50s\n", "", "", "",
"(Pass it twice to disable prompt)");
}
static void print_cfg()
{
fprintf(stderr, "\nValues:\n~~~~~~~\n");
fprintf(stderr, "Input file: %68s\n", cfg.stdin ? "stdin" : cfg.file);
fprintf(stderr, "Query: %73s\n", cfg.query ? cfg.query : "---");
fprintf(stderr, "Search method: %65s\n",
cfg.search_method ==
starts ? "starts with" : (cfg.search_method ==
exact ? "exact match" : "fuzzy"));
fprintf(stderr, "Minimun word size: %61d\n", cfg.min_word_size);
fprintf(stderr, "Extra valid chars: %61s\n",
cfg.evc ? cfg.extra_valid_chars : "---");
fprintf(stderr, "Extra invalid chars: %59s\n",
cfg.eic ? cfg.extra_invalid_chars : "---");
fprintf(stderr, "Flags:\n");
fprintf(stderr, " [-d:%4s] ", cfg.debug ? "on" : "off");
fprintf(stderr, "[-s:%4s] ", cfg.stats ? "on" : "off");
fprintf(stderr, "[-f:%4s] ", cfg.stdin ? "off" : "on");
fprintf(stderr, "[-r:%4s] ", cfg.reverse_sort ? "on" : "off");
fprintf(stderr, "[-l:%4s]\n", cfg.lisp_print ? "on" : "off");
fprintf(stderr, " [-c:%4s] ", cfg.print_count ? "on" : "off");
fprintf(stderr, "[-a:%4s] ", cfg.print_all ? "on" : "off");
fprintf(stderr, "[-u:%4s] ", cfg.all_uniq ? "on" : "off");
fprintf(stderr, "[-v:%4s] ", cfg.evc ? "on" : "off");
fprintf(stderr, "[-i:%4s]\n", cfg.eic ? "on" : "off");
}
static void print_stats()
{
fprintf(stderr, "\nStats:\n~~~~~~\n");
fprintf(stderr, "Discarded chars:\t%d\n", st.discarded);
if (st.tok_results) {
fprintf(stderr, "%s", st.tok_results);
free(st.tok_results);
}
if (st.search_results) {
fprintf(stderr, "%s", st.search_results);
free(st.search_results);
}
if (st.unique_results) {
fprintf(stderr, "%s", st.unique_results);
free(st.unique_results);
}
}
static int parse_cli(int argc, char *argv[])
{
if (argc < 2)
return -1;
char c;
while ((c = getopt(argc, argv, "-hIxzsaudclrq:F:f:i:v:w:t:")) != -1) {
switch (c) {
case 'I':
if (cfg.interactive) {
cfg.prompt = 0;
}
cfg.interactive = 1;
break;
case 'F':
cfg.stdin = 0;
cfg.filestream = 1;
cfg.file = optarg;
break;
case 'h':
cfg.help = 1;
break;
case 'd':
cfg.debug = 1;
break;
case 'u':
cfg.all_uniq = 1;
break;
case 's':
cfg.stats = 1;
break;
case 'a':
cfg.print_all = 1;
break;
case 'x':
cfg.search_method = exact;
break;
case 'z':
cfg.search_method = fuz;
break;
case 'c':
cfg.print_count = 1;
break;
case 'l':
cfg.lisp_print = 1;
break;
case 'f':
cfg.stdin = 0;
cfg.file = optarg;
break;
case 'v':
cfg.evc = 1;
cfg.extra_valid_chars = optarg;
break;
case 'i':
cfg.eic = 1;
cfg.extra_invalid_chars = optarg;
break;
case 'w':
cfg.min_word_size = atoi(optarg);
break;
case 't':
cfg.max_token_size = atoi(optarg);
break;
case 'r':
cfg.reverse_sort = 1;
break;
case '\1':
case 'q':
cfg.query = optarg;
break;
case '?':
if (optopt == 'w') {
fprintf(stderr, "Specify minimum word length after -%c\n",
optopt);
} else if (optopt == 'v') {
fprintf(stderr,
"Specify extra valid chars after -%c\n", optopt);
} else if (optopt == 'i') {
fprintf(stderr,
"Specify extra invalid chars after -%c\n", optopt);
} else if (optopt == 'f') {
fprintf(stderr,
"Specify file to read after -%c\n", optopt);
} else if (optopt == 't') {
fprintf(stderr,
"Specify max token size after -%c\n", optopt);
}
default:
return -1;
}
}
return 0;
}
/**
* Check if a character is valid according to the default
* and extra valid characters */
static int is_valid(const char c)
{
if (cfg.eic) {
for (size_t i = 0; i < strlen(cfg.extra_invalid_chars); i++) {
if (cfg.extra_invalid_chars[i] == c)
return 0;
}
}
for (size_t i = 0; i < (sizeof(valid) / sizeof(valid[0])); i++) {
if (valid[i] == c)
return 1;
}
if (cfg.evc) {
for (size_t i = 0; i < strlen(cfg.extra_valid_chars); i++) {
if (cfg.extra_valid_chars[i] == c)
return 1;
}
}
return 0;
}
/**
* Add by refference */
static void radd(slist * l, char *str)
{
if (l->n == 0)
l->s = (char **) malloc(sizeof(char *));
else
l->s = (char **) realloc(l->s, (l->n + 1) * sizeof(char *));
l->s[l->n] = str;
l->n++;
}
/**
* Add by value */
static void sadd(slist * l, const char *tmp)
{
if (l->n == 0)
l->s = (char **) malloc(sizeof(char *));
else
l->s = (char **) realloc(l->s, (l->n + 1) * sizeof(char *));
l->s[l->n] = strdup(tmp);
l->n++;
}
/**
* Add by refference to the result */
static void cadd(result * r, char *str)
{
if (r->n == 0) {
r->tok = (token **) malloc(sizeof(token *));
} else {
r->tok = (token **) realloc(r->tok, (r->n + 1) * sizeof(token *));
}
struct token *tok;
tok = (token *) malloc(sizeof(token));
if (!tok)
fprintf(stderr, "Err\n");
tok->s = str;
tok->count = 1;
r->tok[r->n] = tok;
r->n++;
}
/**
* free result */
static void cfree(result * res)
{
for (unsigned int i = 0; i < res->n; i++) {
free(res->tok[i]);
}
free(res->tok);
res->n = 0;
}
/**
* free slist */
static void sfree(slist * l)
{
for (unsigned int i = 0; i < l->n; i++) {
if (l->s[i])
free(l->s[i]);
}
if (l->s)
free(l->s);
l->n = 0;
}
/**
* print slist */
static void pp(slist * l)
{
for (unsigned int i = 0; i < l->n; i++) {
printf("%s\n", l->s[i]);
}
}
/**
* Check if @str is the same as @c */
static int exact(const char *str, const char *c)
{
if (strlen(str) != strlen(c))
return 0;
for (size_t i = 0; i < strlen(c); i++)
if (str[i] != c[i])
return 0;
return -1;
}
/**
* Check if @str starts with @c */
static int starts(const char *str, const char *c)
{
for (size_t i = 0; i < strlen(c); i++)
if (str[i] != c[i])
return 0;
return -1;
}
/**
* Check if @str contains @c in the correct order
* (e.g str=example, c=xl => e_x_amp_l_e) */
static int fuz(const char *str, const char *c)
{
size_t i = 0;
size_t j = 0;
for (; i < strlen(str); i++) {
if (str[i] == c[j]) {
j++;
if (j == strlen(c))
return -1;
}
}
return 0;
}
/**
* Search an slist for @query and place the matches on @res
* The search method is in the config struct */
static int search(const slist * l, const char *query, slist * res)
{
int flag = 0;
for (unsigned int i = 0; i < l->n; i++) {
if ((*cfg.search_method) (l->s[i], query)) {
radd(res, l->s[i]);
flag = 1;
}
}
return flag;
}
/**
* Count the entries of a __sorted__ slist and fill
* the @res result struct with unique entries along with the count */
static void count(slist * l, result * res)
{
cadd(res, l->s[0]);
for (unsigned int i = 1; i < l->n; i++) {
if (strcmp(l->s[i], res->tok[res->n - 1]->s) == 0) {
res->tok[res->n - 1]->count++;
} else
cadd(res, l->s[i]);
}
}
static void finalize_str(char *str, unsigned int n, slist * l)
{
if (n > cfg.min_word_size) {
str[n] = '\0';
sadd(l, str);
}
}
static int cmpstringp(const void *p1, const void *p2)
{
/* The actual arguments to this function are "pointers to
pointers to char", but strcmp(3) arguments are "pointers
to char", hence the following cast plus dereference */
return strcmp(*(char *const *) p1, *(char *const *) p2);
}
static int cmpint(const void *p1, const void *p2)
{
token *a = *(token * const *) p1;
token *b = *(token * const *) p2;
if (a->count < b->count)
return 1;
if (b->count > a->count)
return -1;
return 0;
}
static int cmpint_r(const void *p1, const void *p2)
{
token *a = *(token * const *) p1;
token *b = *(token * const *) p2;
if (a->count > b->count)
return 1;
if (b->count < a->count)
return -1;
return 0;
}
static void pc(const result * res)
{
if (cfg.lisp_print) {
printf("( ");
for (unsigned int i = 0; i < res->n - 1; i++) {
printf("\"%s\", ", res->tok[i]->s);
}
printf("\"%s\" )\n", res->tok[res->n - 1]->s);
} else {
for (unsigned int i = 0; i < res->n; i++) {
if (cfg.print_count)
printf("[%d] ", res->tok[i]->count);
printf("%s\n", res->tok[i]->s);
}
}
}
/**
* Fill an slist with tokens from @f */
static int tokenize(FILE * f, slist * l)
{
unsigned int n = 0;
int c;
char *tmp = NULL;
while ((c = fgetc(f)) != EOF) {
if (!is_valid(c)) {
st.discarded++;
if (tmp) {
finalize_str(tmp, n, l);
free(tmp);
tmp = NULL;
}
n = 0;
continue;
}
if (!tmp)
tmp = (char *) malloc(cfg.max_token_size);
if (n < cfg.max_token_size)
tmp[n++] = (char) c;
else {
fprintf(stderr,
"Max token size %d is not enough.\n",
cfg.max_token_size);
return -1;
}
}
if (tmp) {
finalize_str(tmp, n, l);
free(tmp);
}
return 0;
}
static void sort_by_count(result * r)
{
if (cfg.reverse_sort)
qsort(&r->tok[0], r->n, sizeof(token *), cmpint_r);
else
qsort(&r->tok[0], r->n, sizeof(token *), cmpint);
}
static void get_slist_stats(slist * l)
{
if (l->n > 0) {
char tmp[] = "Total tokens:";
st.tok_results = (char *)
malloc((strlen(tmp) + snprintf(NULL, 0, "%d", l->n) +
4) * sizeof(char));
sprintf(st.tok_results, "%s\t\t%d\n", tmp, l->n);
}
}
static void get_search_stats(slist * l)
{
if (l->n > 0) {
char tmp[] = "Total matches:";
st.search_results = (char *)
malloc((strlen(tmp) + snprintf(NULL, 0, "%d", l->n) +
4) * sizeof(char));
sprintf(st.search_results, "%s\t\t%d\n", tmp, l->n);
}
}
static void get_result_stats(result * r)
{
if (r->n > 0) {
char tmp[] = "Unique tokens:";
st.unique_results = (char *)
malloc((strlen(tmp) + snprintf(NULL, 0, "%d", r->n) +
4) * sizeof(char));
sprintf(st.unique_results, "%s\t\t%d\n", tmp, r->n);
}
}
void clean_stdin(void)
{
int c;
do {
c = getchar();
} while (c != '\n' && c != EOF);
}
static void prompt(slist * l)
{
int r = 0;
char p[20] = "";
char *tmp = NULL;
if (cfg.prompt)
sprintf(p, "[%d]> ", r);
fflush(NULL);
while ((tmp = linenoise(p)) != NULL) {
slist search_res = { 0 };
if (search(l, tmp, &search_res)) {
result count_res = { 0 };
/* sort the results */
qsort(&search_res.s[0], search_res.n, sizeof(char *),
cmpstringp);
/* count the unique */
count(&search_res, &count_res);
sort_by_count(&count_res);
r = count_res.n;
/* print them */
pc(&count_res);
cfree(&count_res);
}
if (cfg.prompt)
sprintf(p, "[%d]> ", r);
r = 0;
free(search_res.s);
free(tmp);
tmp = NULL;
}
if (tmp)
free(tmp);
}
static int set_input(FILE **f)
{
if (cfg.stdin) {
if (cfg.interactive) {
fprintf(stderr,
"Can't read from stdin in interactive mode.\n");
return -1;
}
*f = stdin;
} else if (cfg.filestream) {
if (!cfg.file || strcmp(cfg.file, "") == 0)
return -1;
*f = fmemopen(cfg.file, strlen(cfg.file), "r");
if (!*f) return -1;
}
else {
*f = fopen(cfg.file, "r");
if (!*f) {
fprintf(stderr, "Couldn't open %s\n", cfg.file);
return -1;
}
}
return 0;
}
static void get_uniq(slist * l, result *r)
{
/* sort the results */
qsort(&l->s[0], l->n, sizeof(char *),
cmpstringp);
/* count the unique, put it in result */
count(l, r);
sort_by_count(r);
if (cfg.stats)
get_result_stats(r);
}
int main(int argc, char *argv[])
{
int rc = 0;
FILE *f = NULL;
slist token_list = { 0 };
slist ref_list = { 0 };
slist *listp = NULL;
result count_res = { 0 };
if (parse_cli(argc, argv)
|| cfg.help) {
print_help(argv[0]);
if (cfg.help)
return 0;
return -1;
}
if (cfg.query == NULL && !cfg.print_all && !cfg.interactive) {
fprintf(stderr, "Query missing ... terminating\n");
return -1;
}
if ((rc = set_input(&f)))
goto done;
if ((rc = tokenize(f, &token_list)))
goto done;
if (cfg.stats)
get_slist_stats(&token_list);
if (cfg.interactive) {
prompt(&token_list);
goto done;
}
if (cfg.print_all) {
if (cfg.all_uniq)
listp = &token_list;
else
pp(&token_list);
} else {
/* search for the query */
if (search(&token_list, cfg.query, &ref_list)) {
listp = &ref_list;
if (cfg.stats)
get_search_stats(&ref_list);
}
}
if (listp) {
if (listp->n != 0) {
get_uniq(listp, &count_res);
pc(&count_res);
cfree(&count_res);
}
if (listp == &ref_list)
free(ref_list.s);
}
if (cfg.debug)
print_cfg();
if (cfg.stats)
print_stats();
done:
sfree(&token_list);
if (f) fclose(f);
return rc;
}