summaryrefslogblamecommitdiffstats
path: root/fcomp.c
blob: a41c75df204093553c64cbc9ae598e423d97fb05 (plain) (tree)






















                                                                                  
                   
                  
                      


























                                                  
                           





                         
      















                                         
                                     
                                        
                                   
                                       













                                                     




















                                


































                                                                                      

                                                                 









                                                                          
                                                                           




                                                              



                                                                    























                                                                          
                                                              






                                                          
                                                            


















                                                 
                                                                        
                    
                 


                                  



                                

                               
                  





                          


                             

















































































































































































































                                                                           
               











                                             
             




















































                                                           
                                                                  























































































                                                                     
          



                                   
                           





































                                                           
                                 
                                                                 
                                       
                                                         






                                       
                                    
                                                                 
                                       
                                                            





                                        

                                      
                                                                 
                                       





                                                            



                                    

 
                             



                     

                                
                 




                                          






                                                                 
                            



                              





















                                                                    

                                                   






                                                            
     














                                            



                                
               
                   


                             









                             
                                                                  



                                                           



                                        
                  

                  
                                     
 
                          

                            

     
                        



                                

                                  





                                                        
 



                                    
 

                               

     
 




                      

                       
                     
              
 
/**
 * fcomp - parse text for string tokens and search for them
 * Copyright (C) 2019  Anastasis Grammenos
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 */

#include "stdlib.h"
#include "stdio.h"
#include "string.h"
#include "getopt.h"
#include "unistd.h"
#include "ctype.h"
#include "linenoise.h"

typedef struct slist {
    char **s;
    unsigned int n;
} slist;

typedef struct token {
    char *s;
    unsigned int count;
} token;

/**
 * Count result */
typedef struct result {
    token **tok;
    unsigned int n;
} result;

/**
 * search methods */
static int exact(const char *str, const char *c);
static int starts(const char *str, const char *c);
static int fuz(const char *str, const char *c);

/**
 * stats for debugging */
typedef struct stats {
    unsigned int discarded;
    char *search_results;
    char *tok_results;
    char *unique_results;
} stats;

static stats st = {
    0,
    NULL,
    NULL,
    NULL
};

/**
 * config */
typedef struct config {
    int help; /** 0: off 1: on */
    int debug; /** 0: off 1: on */
    int stats; /** 0: off 1: on */
    int stdin; /** 0: off 1: on */
    int reverse_sort; /** 0: off 1: on */
    int lisp_print; /** 0: off 1: on */
    int print_count; /** 0: off 1: on */
    int print_all; /** 0: off 1: on */
    int all_uniq; /** 0: off 1: on */
    int interactive; /** 0: off 1: on */
    int prompt;	/** 0: off 1: on */
    int filestream; /** 0: off 1: on */
    char *file;
    char *query;
    unsigned int min_word_size;
    int evc; /** 0: off 1: on */
    char *extra_valid_chars;
    int eic; /** 0: off 1: on */
    char *extra_invalid_chars;
    int (*search_method)(const char *, const char *);
    unsigned int max_token_size;
} config;

/**
 * Initial config vector */
static config cfg = {
    .help = 0,
    .debug = 0,
    .stats = 0,
    .stdin = 1,
    .reverse_sort = 0,
    .lisp_print = 0,
    .print_count = 0,
    .print_all = 0,
    .all_uniq = 0,
    .interactive = 0,
    .prompt = 1,
    .filestream = 0,
    .file = NULL,
    .query = NULL,
    .min_word_size = 3,
    .evc = 0,
    .extra_valid_chars = NULL,
    .eic = 0,
    .extra_invalid_chars = NULL,
    .search_method = starts,
    .max_token_size = 1000
};

static void print_help(char *argv0)
{
    fprintf(stderr,
	    "fcomp - parse text for string tokens and search for them\n");
    fprintf(stderr, "Copyright (C) 2019  Anastasis Grammenos\n");
    fprintf(stderr,
	    "This program is licenced under GPLv2. See source tree for details.\n\n");
    fprintf(stderr, "Usage:\n~~~~~~\n");
    fprintf(stderr, "%s [query] [-f FILE] [...]\n\n", argv0);
    fprintf(stderr, "%8s %4s  %15s %50s\n", "Options", "", "Mnemonic",
	    "Description");
    fprintf(stderr, "%8s %4s  %15s %50s\n", "~~~~~~~", "", "~~~~~~~~",
	    "~~~~~~~~~~~");
    fprintf(stderr, "%8s %4s  %15s %50s\n\n", "-h", "", "help",
	    "Print this help message");
    fprintf(stderr, "%8s %4s  %15s %50s\n", "query", "[..]", "",
	    "The term to be matched agains the tokens");
    fprintf(stderr, "%8s %4s  %15s %50s\n", "-f", "[..]", "file",
	    "Select the input file to use");
    fprintf(stderr, "%8s %4s  %15s %50s\n", "stdin", "[..]", "",
	    "Input can come from rediretion as well as files");
    fprintf(stderr, "%8s %4s  %15s %50s\n", "-z", "", "fuzzy",
	    "Set the search method to fuzzy");
    fprintf(stderr, "%8s %4s  %15s %50s\n", "-x", "", "exact",
	    "Set the search method to exact match");
    fprintf(stderr, "%8s %4s  %15s %50s\n\n", "", "", "",
	    "(Default is starts_with)");
    fprintf(stderr, "%8s %4s  %15s %50s\n", "-r", "", "reverse",
	    "Reverse the sorting of matches");
    fprintf(stderr, "%8s %4s  %15s %50s\n", "-c", "", "count",
	    "Print the count of each matched token");
    fprintf(stderr, "%8s %4s  %15s %50s\n", "-l", "", "lisp",
	    "Print matched tokens in a lisp list");
    fprintf(stderr, "%8s %4s  %15s %50s\n\n", "-u", "", "unique",
	    "When -a is passed, don't print duplicates");
    fprintf(stderr, "%8s %4s  %15s %50s\n", "-a", "", "all",
	    "Print all tokens of input");
    fprintf(stderr, "%8s %4s  %15s %50s\n\n", "", "", "",
	    "(Performs no search)");
    fprintf(stderr, "%8s %4s  %15s %50s\n", "-v", "[..]", "valid chars",
	    "Set extra valid chars for tokens");
    fprintf(stderr, "%8s %4s  %15s %50s\n", "-i", "[..]", "invalid chars",
	    "Set extra invalid chars for tokens");
    fprintf(stderr, "%8s %4s  %15s %50s\n", "-w", "[..]", "word length",
	    "Set min word length to count as token");
    fprintf(stderr, "%8s %4s  %15s %50s\n\n", "-t", "[..]", "token length",
	    "Set max token length to parse");
    fprintf(stderr, "%8s %4s  %15s %50s\n", "-d", "", "debug",
	    "Show debug information");
    fprintf(stderr, "%8s %4s  %15s %50s\n", "-s", "", "stats",
	    "Print some stats");
    fprintf(stderr, "%8s %4s  %15s %50s\n", "-I", "", "interactive",
	    "Run with interactive query input");
    fprintf(stderr, "%8s %4s  %15s %50s\n", "", "", "",
	    "(Pass it twice to disable prompt)");
}

static void print_cfg()
{
    fprintf(stderr, "\nValues:\n~~~~~~~\n");
    fprintf(stderr, "Input file: %68s\n", cfg.stdin ? "stdin" : cfg.file);
    fprintf(stderr, "Query: %73s\n", cfg.query ? cfg.query : "---");
    fprintf(stderr, "Search method: %65s\n",
	    cfg.search_method ==
	    starts ? "starts with" : (cfg.search_method ==
				      exact ? "exact match" : "fuzzy"));
    fprintf(stderr, "Minimun word size: %61d\n", cfg.min_word_size);
    fprintf(stderr, "Extra valid chars: %61s\n",
	    cfg.evc ? cfg.extra_valid_chars : "---");
    fprintf(stderr, "Extra invalid chars: %59s\n",
	    cfg.eic ? cfg.extra_invalid_chars : "---");
    fprintf(stderr, "Flags:\n");
    fprintf(stderr, "  [-d:%4s] ", cfg.debug ? "on" : "off");
    fprintf(stderr, "[-s:%4s] ", cfg.stats ? "on" : "off");
    fprintf(stderr, "[-f:%4s] ", cfg.stdin ? "off" : "on");
    fprintf(stderr, "[-r:%4s] ", cfg.reverse_sort ? "on" : "off");
    fprintf(stderr, "[-l:%4s]\n", cfg.lisp_print ? "on" : "off");
    fprintf(stderr, "  [-c:%4s] ", cfg.print_count ? "on" : "off");
    fprintf(stderr, "[-a:%4s] ", cfg.print_all ? "on" : "off");
    fprintf(stderr, "[-u:%4s] ", cfg.all_uniq ? "on" : "off");
    fprintf(stderr, "[-v:%4s] ", cfg.evc ? "on" : "off");
    fprintf(stderr, "[-i:%4s]\n", cfg.eic ? "on" : "off");
}

static void print_stats()
{
    fprintf(stderr, "\nStats:\n~~~~~~\n");
    fprintf(stderr, "Discarded chars:\t%d\n", st.discarded);
    if (st.tok_results) {
	fprintf(stderr, "%s", st.tok_results);
	free(st.tok_results);
    }
    if (st.search_results) {
	fprintf(stderr, "%s", st.search_results);
	free(st.search_results);
    }
    if (st.unique_results) {
	fprintf(stderr, "%s", st.unique_results);
	free(st.unique_results);
    }
}

static int parse_cli(int argc, char *argv[])
{
    if (argc < 2)
	return -1;
    char c;
    while ((c = getopt(argc, argv, "-hIF:xzsaudclrf:i:v:w:t:")) != -1) {
	switch (c) {
	case 'I':
	    if (cfg.interactive) {
		cfg.prompt = 0;
	    }
	    cfg.interactive = 1;
	    break;
	case 'F':
	    cfg.stdin = 0;
	    cfg.filestream = 1;
	    cfg.file = optarg;
	    break;
	case 'h':
	    cfg.help = 1;
	    break;
	case 'd':
	    cfg.debug = 1;
	    break;
	case 'u':
	    cfg.all_uniq = 1;
	    break;
	case 's':
	    cfg.stats = 1;
	    break;
	case 'a':
	    cfg.print_all = 1;
	    break;
	case 'x':
	    cfg.search_method = exact;
	    break;
	case 'z':
	    cfg.search_method = fuz;
	    break;
	case 'c':
	    cfg.print_count = 1;
	    break;
	case 'l':
	    cfg.lisp_print = 1;
	    break;
	case 'f':
	    cfg.stdin = 0;
	    cfg.file = optarg;
	    break;
	case 'v':
	    cfg.evc = 1;
	    cfg.extra_valid_chars = optarg;
	    break;
	case 'i':
	    cfg.eic = 1;
	    cfg.extra_invalid_chars = optarg;
	    break;
	case 'w':
	    cfg.min_word_size = atoi(optarg);
	    break;
	case 't':
	    cfg.max_token_size = atoi(optarg);
	    break;
	case 'r':
	    cfg.reverse_sort = 1;
	    break;
	case '\1':
	    cfg.query = optarg;
	    break;
	case '?':
	    if (optopt == 'w') {
		fprintf(stderr, "Specify minimum word length after -%c\n",
			optopt);
	    } else if (optopt == 'v') {
		fprintf(stderr,
			"Specify extra valid chars after -%c\n", optopt);
	    } else if (optopt == 'i') {
		fprintf(stderr,
			"Specify extra invalid chars after -%c\n", optopt);
	    } else if (optopt == 'f') {
		fprintf(stderr,
			"Specify file to read after -%c\n", optopt);
	    } else if (optopt == 't') {
		fprintf(stderr,
			"Specify max token size after -%c\n", optopt);
	    }
	default:
	    return -1;
	}
    }
    return 0;
}

/**
 * Check if a character is valid according to the default
 * and extra valid characters */
static int is_valid(const char c)
{
    if (cfg.eic) {
	for (size_t i = 0; i < strlen(cfg.extra_invalid_chars); i++) {
	    if (cfg.extra_invalid_chars[i] == c)
		return 0;
	}
    }
    switch (c) {
    case 'a':
    case 'b':
    case 'c':
    case 'd':
    case 'e':
    case 'f':
    case 'g':
    case 'h':
    case 'i':
    case 'j':
    case 'k':
    case 'l':
    case 'm':
    case 'n':
    case 'o':
    case 'p':
    case 'q':
    case 'r':
    case 's':
    case 't':
    case 'u':
    case 'v':
    case 'w':
    case 'x':
    case 'y':
    case 'z':
    case 'A':
    case 'B':
    case 'C':
    case 'D':
    case 'E':
    case 'F':
    case 'G':
    case 'H':
    case 'I':
    case 'J':
    case 'K':
    case 'L':
    case 'M':
    case 'N':
    case 'O':
    case 'P':
    case 'Q':
    case 'R':
    case 'S':
    case 'T':
    case 'U':
    case 'V':
    case 'W':
    case 'X':
    case 'Y':
    case 'Z':
    case '-':
    case '_':
    case '0':
    case '1':
    case '2':
    case '3':
    case '4':
    case '5':
    case '6':
    case '7':
    case '8':
    case '9':
	return 1;
    default:
	if (cfg.evc) {
	    for (size_t i = 0; i < strlen(cfg.extra_valid_chars); i++) {
		if (cfg.extra_valid_chars[i] == c)
		    return 1;
	    }
	}
	return 0;
    }
}

/**
 * Add by refference */
static void radd(slist * l, char *str)
{
    if (l->n == 0)
	l->s = (char **) malloc(sizeof(char *));
    else
	l->s = (char **) realloc(l->s, (l->n + 1) * sizeof(char *));

    l->s[l->n] = str;
    l->n++;
}

/**
 * Add by value */
static void sadd(slist * l, const char *tmp)
{
    if (l->n == 0)
	l->s = (char **) malloc(sizeof(char *));
    else
	l->s = (char **) realloc(l->s, (l->n + 1) * sizeof(char *));

    l->s[l->n] = strdup(tmp);
    l->n++;
}

/**
 * Add by refference to the result */
static void cadd(result * r, char *str)
{
    if (r->n == 0) {
	r->tok = (token **) malloc(sizeof(token *));
    } else {
	r->tok = (token **) realloc(r->tok, (r->n + 1) * sizeof(token *));
    }

    struct token *tok;
    tok = (token *) malloc(sizeof(token));
    if (!tok)
	fprintf(stderr, "Err\n");
    tok->s = str;
    tok->count = 1;

    r->tok[r->n] = tok;

    r->n++;
}

/**
 * free result */
static void cfree(result * res)
{
    for (unsigned int i = 0; i < res->n; i++) {
	free(res->tok[i]);
    }
    free(res->tok);
    res->n = 0;
}

/**
 * free slist */
static void sfree(slist * l)
{
    for (unsigned int i = 0; i < l->n; i++) {
	if (l->s[i])
	    free(l->s[i]);
    }
    if (l->s)
	free(l->s);
    l->n = 0;
}

/**
 * print slist */
static void pp(slist * l)
{
    for (unsigned int i = 0; i < l->n; i++) {
	printf("%s\n", l->s[i]);
    }
}

/**
 * Check if @str is the same as @c */
static int exact(const char *str, const char *c)
{
    if (strlen(str) != strlen(c))
	return 0;
    for (size_t i = 0; i < strlen(c); i++)
	if (str[i] != c[i])
	    return 0;
    return -1;
}

/**
 * Check if @str starts with @c */
static int starts(const char *str, const char *c)
{
    for (size_t i = 0; i < strlen(c); i++)
	if (str[i] != c[i])
	    return 0;
    return -1;
}

/**
 * Check if @str contains @c in the correct order
 * (e.g str=example, c=xl => e_x_amp_l_e) */
static int fuz(const char *str, const char *c)
{
    size_t i = 0;
    size_t j = 0;
    for (; i < strlen(str); i++) {
	if (str[i] == c[j]) {
	    j++;
	    if (j == strlen(c))
		return -1;
	}
    }
    return 0;
}

/**
 * Search an slist for @query and place the matches on @res
 * The search method is in the config struct */
static int search(const slist * l, const char *query, slist * res)
{
    int flag = 0;
    for (unsigned int i = 0; i < l->n; i++) {
	if ((*cfg.search_method) (l->s[i], query)) {
	    radd(res, l->s[i]);
	    flag = 1;
	}
    }
    return flag;
}

/**
 * Count the entries of a __sorted__ slist and fill
 * the @res result struct with unique entries along with the count */
static void count(slist * l, result * res)
{
    cadd(res, l->s[0]);
    for (unsigned int i = 1; i < l->n; i++) {
	if (strcmp(l->s[i], res->tok[res->n - 1]->s) == 0) {
	    res->tok[res->n - 1]->count++;
	} else
	    cadd(res, l->s[i]);
    }
}

static void finalize_str(char *str, unsigned int n, slist * l)
{
    if (n > cfg.min_word_size) {
	str[n] = '\0';
	sadd(l, str);
    }
}

static int cmpstringp(const void *p1, const void *p2)
{
    /* The actual arguments to this function are "pointers to
       pointers to char", but strcmp(3) arguments are "pointers
       to char", hence the following cast plus dereference */

    return strcmp(*(char *const *) p1, *(char *const *) p2);
}

static int cmpint(const void *p1, const void *p2)
{
    token *a = *(token * const *) p1;
    token *b = *(token * const *) p2;

    if (a->count < b->count)
	return 1;
    if (b->count > a->count)
	return -1;
    return 0;
}

static int cmpint_r(const void *p1, const void *p2)
{
    token *a = *(token * const *) p1;
    token *b = *(token * const *) p2;

    if (a->count > b->count)
	return 1;
    if (b->count < a->count)
	return -1;
    return 0;
}

static void pc(const result * res)
{
    if (cfg.lisp_print) {
	printf("( ");
	for (unsigned int i = 0; i < res->n - 1; i++) {
	    printf("\"%s\", ", res->tok[i]->s);
	}
	printf("\"%s\" )\n", res->tok[res->n - 1]->s);
    } else {
	for (unsigned int i = 0; i < res->n; i++) {
	    if (cfg.print_count)
		printf("[%d] ", res->tok[i]->count);
	    printf("%s\n", res->tok[i]->s);
	}
    }
}

/**
 * Fill an slist with tokens from @f */
static int tokenize(FILE * f, slist * l)
{
    unsigned int n = 0;
    int c;
    char *tmp = NULL;

    while ((c = fgetc(f)) != EOF) {
	if (!is_valid(c)) {
	    st.discarded++;
	    if (tmp) {
		finalize_str(tmp, n, l);
		free(tmp);
		tmp = NULL;
	    }
	    n = 0;
	    continue;
	}
	if (!tmp)
	    tmp = (char *) malloc(cfg.max_token_size);
	if (n < cfg.max_token_size)
	    tmp[n++] = (char) c;
	else {
	    fprintf(stderr,
		    "Max token size %d is not enough.\n",
		    cfg.max_token_size);
	    return -1;
	}
    }
    if (tmp) {
	finalize_str(tmp, n, l);
	free(tmp);
    }
    return 0;
}

static void sort_by_count(result * r)
{
    if (cfg.reverse_sort)
	qsort(&r->tok[0], r->n, sizeof(token *), cmpint_r);
    else
	qsort(&r->tok[0], r->n, sizeof(token *), cmpint);
}

static void get_slist_stats(slist * l)
{
    if (l->n > 0) {
	char tmp[] = "Total tokens:";
	st.tok_results = (char *)
	    malloc((strlen(tmp) + snprintf(NULL, 0, "%d", l->n) +
		    4) * sizeof(char));
	sprintf(st.tok_results, "%s\t\t%d\n", tmp, l->n);
    }
}

static void get_search_stats(slist * l)
{
    if (l->n > 0) {
	char tmp[] = "Total matches:";
	st.search_results = (char *)
	    malloc((strlen(tmp) + snprintf(NULL, 0, "%d", l->n) +
		    4) * sizeof(char));
	sprintf(st.search_results, "%s\t\t%d\n", tmp, l->n);
    }
}

static void get_result_stats(result * r)
{
    if (r->n > 0) {
	char tmp[] = "Unique tokens:";
	st.unique_results = (char *)
	    malloc((strlen(tmp) + snprintf(NULL, 0, "%d", r->n) +
		    4) * sizeof(char));
	sprintf(st.unique_results, "%s\t\t%d\n", tmp, r->n);
    }
}

void clean_stdin(void)
{
    int c;
    do {
	c = getchar();
    } while (c != '\n' && c != EOF);
}

static void prompt(slist * l)
{
    int r = 0;
    char p[20] = "";
    char *tmp = NULL;
    if (cfg.prompt)
	sprintf(p, "[%d]> ", r);
    fflush(NULL);
    while ((tmp = linenoise(p)) != NULL) {
	slist search_res = { 0 };
	if (search(l, tmp, &search_res)) {
	    result count_res = { 0 };
	    /* sort the results */
	    qsort(&search_res.s[0], search_res.n, sizeof(char *),
		  cmpstringp);

	    /* count the unique */
	    count(&search_res, &count_res);
	    sort_by_count(&count_res);

	    r = count_res.n;
	    /* print them */
	    pc(&count_res);

	    cfree(&count_res);
	}
	if (cfg.prompt)
	    sprintf(p, "[%d]> ", r);
	r = 0;
	free(search_res.s);
	free(tmp);
	tmp = NULL;
    }
    if (tmp)
	free(tmp);
}

static int set_input(FILE **f)
{
    if (cfg.stdin) {
	if (cfg.interactive) {
	    fprintf(stderr,
		    "Can't read from stdin in interactive mode.\n");
	    return -1;
	}
	*f = stdin;
    } else if (cfg.filestream) {
        if (!cfg.file || strcmp(cfg.file, "") == 0)
            return -1;
	*f = fmemopen(cfg.file, strlen(cfg.file), "r");
    } else {
	*f = fopen(cfg.file, "r");
	if (!f) {
	    fprintf(stderr, "Couldn't open %s\n", cfg.file);
	    return -1;
	}
    }
    return 0;
}

static void get_uniq(slist * l, result *r)
{
    /* sort the results */
    qsort(&l->s[0], l->n, sizeof(char *),
          cmpstringp);

    /* count the unique, put it in result */
    count(l, r);
    sort_by_count(r);

    if (cfg.stats)
        get_result_stats(r);
}

int main(int argc, char *argv[])
{
    int rc = 0;
    FILE *f = NULL;
    slist token_list = { 0 };
    slist ref_list = { 0 };
    slist *listp = NULL;
    result count_res = { 0 };

    if (parse_cli(argc, argv)
	|| cfg.help) {
	print_help(argv[0]);
	if (cfg.help)
	    return 0;
	return -1;
    }

    if (cfg.query == NULL && !cfg.print_all && !cfg.interactive) {
	fprintf(stderr, "Query missing ... terminating\n");
	return -1;
    }

    if ((rc = set_input(&f)))
        goto done;

    if ((rc = tokenize(f, &token_list)))
	goto done;

    if (cfg.stats)
	get_slist_stats(&token_list);

    if (cfg.interactive) {
	prompt(&token_list);
	goto done;
    }

    if (cfg.print_all) {
        if (cfg.all_uniq)
            listp = &token_list;
        else 
            pp(&token_list);
    } else {
	/* search for the query */
	if (search(&token_list, cfg.query, &ref_list)) {
            listp = &ref_list;
            if (cfg.stats)
		get_search_stats(&ref_list);
	}
    }

    if (listp) {
        get_uniq(listp, &count_res);
        pc(&count_res);
        cfree(&count_res);

        if (listp == &ref_list)
            free(ref_list.s);
    }


    if (cfg.debug)
	print_cfg();
    if (cfg.stats)
	print_stats();

  done:
    sfree(&token_list);
    if (f) fclose(f);
    return rc;
}