From a5a0ddf73b81d3815def20d283bb8de1d01a7230 Mon Sep 17 00:00:00 2001 From: gramanas Date: Tue, 19 Mar 2019 00:32:20 +0200 Subject: Initial.. --- .gitignore | 2 + LICENCE | 280 ++++++++++++++++++++++++ Makefile | 20 ++ fcomp.c | 728 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 1030 insertions(+) create mode 100644 .gitignore create mode 100644 LICENCE create mode 100644 Makefile create mode 100644 fcomp.c diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ba6c68b --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +fcomp +*.*~ \ No newline at end of file diff --git a/LICENCE b/LICENCE new file mode 100644 index 0000000..99fb238 --- /dev/null +++ b/LICENCE @@ -0,0 +1,280 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. 
Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. 
+ + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. 
You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. 
+ +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. 
However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. 
+You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. 
+ + END OF TERMS AND CONDITIONS \ No newline at end of file diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..df15d2e --- /dev/null +++ b/Makefile @@ -0,0 +1,20 @@ +TARGET=fcomp +SRC=fcomp.c +CC=gcc +CFLAGS=-D_GNU_SOURCE -std=c99 -pedantic +REL_FLAGS=-O3 +DBG_FLAGS=-Wall -g3 +make: + $(CC) -o $(TARGET) $(SRC) $(CFLAGS) $(REL_FLAGS) + +debug: + $(CC) -o $(TARGET) $(SRC) $(CFLAGS) $(DBG_FLAGS) -fsanitize=address + +noasan: + $(CC) -o $(TARGET) $(SRC) $(CFLAGS) $(DBG_FLAGS) + +.PHONY: clean + +clean: + rm -f *.o + rm -f $(TARGET) diff --git a/fcomp.c b/fcomp.c new file mode 100644 index 0000000..f4715f5 --- /dev/null +++ b/fcomp.c @@ -0,0 +1,728 @@ +/** + * fcomp - parse text for string tokens and search for them + * Copyright (C) 2019 Anastasis Grammenos + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
/**
 * fcomp.c - split input text into tokens and search them for a query.
 *
 * Flow (see main): read a file or stdin, cut it into tokens made of
 * "valid" characters, keep the tokens that match the query, sort them,
 * collapse duplicates into (token, count) pairs, order those by count
 * and print them.
 */

#include <ctype.h>
#include <errno.h>
#include <getopt.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/**
 * Growable list of strings.
 * Whether the strings are owned depends on how they were added:
 * sadd() stores a private copy, radd() stores the caller's pointer. */
typedef struct slist {
    char **s;
    unsigned int n;
} slist;

/**
 * A unique token and how many times it occurred.
 * @s is borrowed (see cadd), never freed through this struct. */
typedef struct token {
    char *s;
    unsigned int count;
} token;

/**
 * Count result */
typedef struct result {
    token **tok;
    unsigned int n;
} result;

/**
 * search methods
 * All three return -1 (non-zero) on a match and 0 otherwise. */
static int exact(const char *str, const char *c);
static int starts(const char *str, const char *c);
static int fuz(const char *str, const char *c);

/**
 * stats for debugging
 * Each member is either NULL or a heap string owned by this struct. */
typedef struct stats {
    char *search_results;
    char *tok_results;
    char *unique_results;
} stats;

static stats st = { NULL, NULL, NULL };

/**
 * config */
typedef struct config {
    int help;                   /** 0: off 1: on */
    int debug;                  /** 0: off 1: on */
    int stats;                  /** 0: off 1: on */
    int use_stdin;              /** 0: read cfg.file 1: read stdin */
    int reverse_sort;           /** 0: off 1: on */
    int lisp_print;             /** 0: off 1: on */
    int print_count;            /** 0: off 1: on */
    int print_all;              /** 0: off 1: on */
    char *file;
    char *query;
    unsigned int min_word_size;
    int evc;                    /** 0: off 1: on */
    char *extra_valid_chars;
    int eic;                    /** 0: off 1: on */
    char *extra_invalid_chars;
    int (*search_method)(const char *, const char *);
    unsigned int max_token_size;
} config;

/**
 * Initial config vector; members not listed start as 0 / NULL. */
static config cfg = {
    .use_stdin = 1,
    .min_word_size = 3,
    .search_method = starts,
    .max_token_size = 1000
};

/**
 * realloc() that terminates the program instead of returning NULL.
 * Passing ptr == NULL makes it behave like malloc(). */
static void *xrealloc(void *ptr, size_t size)
{
    void *p = realloc(ptr, size);

    if (!p) {
        fprintf(stderr, "Out of memory\n");
        exit(EXIT_FAILURE);
    }
    return p;
}

/**
 * malloc() that terminates the program instead of returning NULL. */
static void *xmalloc(size_t size)
{
    return xrealloc(NULL, size);
}

/**
 * Heap copy of @src; the caller owns (and frees) the result. */
static char *xstrdup(const char *src)
{
    size_t len = strlen(src) + 1;

    return memcpy(xmalloc(len), src, len);
}

/**
 * Print the usage text to stderr.
 * @argv0 is shown as the program name in the usage line. */
static void print_help(char *argv0)
{
    /* One row of the option table; @gap adds a blank line after it. */
    static const struct {
        const char *opt;
        const char *arg;
        const char *mnemonic;
        const char *desc;
        int gap;
    } rows[] = {
        { "Options", "", "Mnemonic", "Description", 0 },
        { "~~~~~~~", "", "~~~~~~~~", "~~~~~~~~~~~", 0 },
        { "-h", "", "help", "Print this help message", 1 },
        { "query", "[..]", "", "The term to be matched against the tokens", 0 },
        { "-f", "[..]", "file", "Select the input file to use", 0 },
        { "stdin", "[..]", "",
          "Input can come from redirection as well as files", 0 },
        { "-z", "", "fuzzy", "Set the search method to fuzzy", 0 },
        { "-x", "", "exact", "Set the search method to exact match", 0 },
        { "", "", "", "(Default is starts_with)", 1 },
        { "-r", "", "reverse", "Reverse the sorting of matches", 0 },
        { "-c", "", "count", "Print the count of each matched token", 0 },
        { "-l", "", "lisp", "Print matched tokens in a lisp list", 0 },
        { "-a", "", "all", "Print all tokens of input", 0 },
        { "", "", "", "(Performs no search)", 1 },
        { "-v", "[..]", "valid chars", "Set extra valid chars for tokens", 0 },
        { "-i", "[..]", "invalid chars",
          "Set extra invalid chars for tokens", 0 },
        { "-w", "[..]", "word length",
          "Set min word length to count as token", 0 },
        { "-t", "[..]", "token length", "Set max token length to parse", 1 },
        { "-d", "", "debug", "Show debug information", 0 },
        { "-s", "", "stats", "Print some stats", 0 }
    };

    fprintf(stderr,
            "fcomp - parse text for string tokens and search for them\n");
    fprintf(stderr, "Copyright (C) 2019 Anastasis Grammenos\n");
    fprintf(stderr,
            "This program is licenced under GPLv2. See source tree for details.\n\n");
    fprintf(stderr, "Usage:\n~~~~~~\n");
    fprintf(stderr, "%s [query] [-f FILE] [...]\n\n", argv0);

    for (size_t i = 0; i < sizeof rows / sizeof rows[0]; i++) {
        fprintf(stderr, "%8s %4s %15s %50s\n%s", rows[i].opt, rows[i].arg,
                rows[i].mnemonic, rows[i].desc, rows[i].gap ? "\n" : "");
    }
}

/**
 * Dump the effective configuration to stderr (used by -d). */
static void print_cfg(void)
{
    const char *method = "fuzzy";

    if (cfg.search_method == starts)
        method = "starts with";
    else if (cfg.search_method == exact)
        method = "exact match";

    fprintf(stderr, "\nValues:\n~~~~~~~\n");
    fprintf(stderr, "Input file: %68s\n",
            cfg.use_stdin ? "stdin" : cfg.file);
    fprintf(stderr, "Query: %73s\n", cfg.query ? cfg.query : "---");
    fprintf(stderr, "Search method: %65s\n", method);
    fprintf(stderr, "Minimum word size: %61u\n", cfg.min_word_size);
    fprintf(stderr, "Extra valid chars: %61s\n",
            cfg.evc ? cfg.extra_valid_chars : "---");
    fprintf(stderr, "Extra invalid chars: %59s\n",
            cfg.eic ? cfg.extra_invalid_chars : "---");
    fprintf(stderr, "Flags:\n");
    fprintf(stderr, " [-d:%4s] ", cfg.debug ? "on" : "off");
    fprintf(stderr, "[-s:%4s] ", cfg.stats ? "on" : "off");
    fprintf(stderr, "[-f:%4s] ", cfg.use_stdin ? "off" : "on");
    fprintf(stderr, "[-r:%4s] ", cfg.reverse_sort ? "on" : "off");
    fprintf(stderr, "[-l:%4s]\n", cfg.lisp_print ? "on" : "off");
    fprintf(stderr, " [-c:%4s] ", cfg.print_count ? "on" : "off");
    fprintf(stderr, "[-a:%4s] ", cfg.print_all ? "on" : "off");
    fprintf(stderr, "[-v:%4s] ", cfg.evc ? "on" : "off");
    fprintf(stderr, "[-i:%4s]\n", cfg.eic ? "on" : "off");
}

/**
 * Print the collected stat lines to stderr and release them.
 * The pointers are reset so a second call prints only the heading. */
static void print_stats(void)
{
    char **lines[] = {
        &st.tok_results, &st.search_results, &st.unique_results
    };

    fprintf(stderr, "\nStats:\n~~~~~~\n");
    for (size_t i = 0; i < sizeof lines / sizeof lines[0]; i++) {
        if (*lines[i]) {
            fprintf(stderr, "%s", *lines[i]);
            free(*lines[i]);
            *lines[i] = NULL;
        }
    }
}

/**
 * Parse @text as a plain decimal unsigned int into @out.
 * Returns 0 on success; -1 (leaving @out untouched) when @text is empty,
 * signed, has trailing characters or does not fit in unsigned int. */
static int parse_uint(const char *text, unsigned int *out)
{
    char *end = NULL;
    unsigned long value;

    /* strtoul() itself would accept "-1" and leading blanks */
    if (!isdigit((unsigned char) text[0]))
        return -1;

    errno = 0;
    value = strtoul(text, &end, 10);
    if (errno == ERANGE || *end != '\0' || value > UINT_MAX)
        return -1;

    *out = (unsigned int) value;
    return 0;
}

/**
 * Fill the global cfg from the command line.
 * Returns 0 on success and -1 when there are no arguments, an option is
 * unknown, an option argument is missing or a numeric argument is bad. */
static int parse_cli(int argc, char *argv[])
{
    int c;                      /* int, not char: getopt() ends with -1 */

    if (argc < 2)
        return -1;

    /* the leading '-' makes getopt hand back non-option args as code 1 */
    while ((c = getopt(argc, argv, "-hxzsadclrf:i:v:w:t:")) != -1) {
        switch (c) {
        case 'h':
            cfg.help = 1;
            break;
        case 'd':
            cfg.debug = 1;
            break;
        case 's':
            cfg.stats = 1;
            break;
        case 'a':
            cfg.print_all = 1;
            break;
        case 'x':
            cfg.search_method = exact;
            break;
        case 'z':
            cfg.search_method = fuz;
            break;
        case 'c':
            cfg.print_count = 1;
            break;
        case 'l':
            cfg.lisp_print = 1;
            break;
        case 'r':
            cfg.reverse_sort = 1;
            break;
        case 'f':
            cfg.use_stdin = 0;
            cfg.file = optarg;
            break;
        case 'v':
            cfg.evc = 1;
            cfg.extra_valid_chars = optarg;
            break;
        case 'i':
            cfg.eic = 1;
            cfg.extra_invalid_chars = optarg;
            break;
        case 'w':
            if (parse_uint(optarg, &cfg.min_word_size)) {
                fprintf(stderr, "Invalid minimum word length: %s\n", optarg);
                return -1;
            }
            break;
        case 't':
            if (parse_uint(optarg, &cfg.max_token_size)) {
                fprintf(stderr, "Invalid max token size: %s\n", optarg);
                return -1;
            }
            break;
        case 1:
            cfg.query = optarg;
            break;
        case '?':
            if (optopt == 'w')
                fprintf(stderr, "Specify minimum word length after -%c\n",
                        optopt);
            else if (optopt == 'v')
                fprintf(stderr, "Specify extra valid chars after -%c\n",
                        optopt);
            else if (optopt == 'i')
                fprintf(stderr, "Specify extra invalid chars after -%c\n",
                        optopt);
            else if (optopt == 'f')
                fprintf(stderr, "Specify file to read after -%c\n", optopt);
            else if (optopt == 't')
                fprintf(stderr, "Specify max token size after -%c\n", optopt);
            return -1;
        default:
            return -1;
        }
    }
    return 0;
}

/**
 * Check if a character may be part of a token.
 * Extra invalid chars (-i) win over everything; then ASCII letters,
 * digits, '-' and '_' are valid; then the extra valid chars (-v). */
static int is_valid(const char c)
{
    /* strchr() would "find" the terminator, so rule NUL out first */
    if (c == '\0')
        return 0;
    if (cfg.eic && strchr(cfg.extra_invalid_chars, c))
        return 0;
    /* setlocale() is never called, so isalnum() means [A-Za-z0-9] */
    if (isalnum((unsigned char) c) || c == '-' || c == '_')
        return 1;
    return cfg.evc && strchr(cfg.extra_valid_chars, c) != NULL;
}

/**
 * Append the pointer @str to @l, growing the array by one slot. */
static void slist_push(slist * l, char *str)
{
    l->s = xrealloc(l->s, ((size_t) l->n + 1) * sizeof *l->s);
    l->s[l->n++] = str;
}

/**
 * Add by reference: @l stores @str itself, no copy is made. */
static void radd(slist * l, char *str)
{
    slist_push(l, str);
}

/**
 * Add by value: @l stores its own heap copy of @tmp. */
static void sadd(slist * l, const char *tmp)
{
    slist_push(l, xstrdup(tmp));
}

/**
 * Add by reference to the result: appends a new token with count 1
 * whose string is @str itself (not copied). */
static void cadd(result * r, char *str)
{
    token *tok = xmalloc(sizeof *tok);

    tok->s = str;
    tok->count = 1;

    r->tok = xrealloc(r->tok, ((size_t) r->n + 1) * sizeof *r->tok);
    r->tok[r->n++] = tok;
}

/**
 * free result: releases the token structs and the array, not the
 * strings the tokens point at. */
static void cfree(result * res)
{
    for (unsigned int i = 0; i < res->n; i++)
        free(res->tok[i]);
    free(res->tok);
    res->tok = NULL;
    res->n = 0;
}

/**
 * free slist: releases every string and the array itself, so only use
 * it on lists filled with sadd(). */
static void sfree(slist * l)
{
    for (unsigned int i = 0; i < l->n; i++)
        free(l->s[i]);
    free(l->s);
    l->s = NULL;
    l->n = 0;
}

/**
 * print slist, one entry per line on stdout */
static void pp(slist * l)
{
    for (unsigned int i = 0; i < l->n; i++)
        printf("%s\n", l->s[i]);
}

/**
 * Check if @str is the same as @c */
static int exact(const char *str, const char *c)
{
    return strcmp(str, c) == 0 ? -1 : 0;
}

/**
 * Check if @str starts with @c (an empty @c matches everything) */
static int starts(const char *str, const char *c)
{
    return strncmp(str, c, strlen(c)) == 0 ? -1 : 0;
}

/**
 * Check if @str contains the characters of @c in the same order
 * (e.g str=example, c=xl => e_x_amp_l_e). An empty @c never matches. */
static int fuz(const char *str, const char *c)
{
    if (*c == '\0')
        return 0;
    for (; *str != '\0'; str++) {
        if (*str == *c && *++c == '\0')
            return -1;
    }
    return 0;
}

/**
 * Search an slist for @query and place the matches on @res (by
 * reference). The search method is in the config struct.
 * Returns 1 when at least one entry matched, 0 otherwise. */
static int search(slist * l, const char *query, slist * res)
{
    int found = 0;

    for (unsigned int i = 0; i < l->n; i++) {
        if (cfg.search_method(l->s[i], query)) {
            radd(res, l->s[i]);
            found = 1;
        }
    }
    return found;
}

/**
 * Count the entries of a __sorted__ slist and fill
 * the @res result struct with unique entries along with the count.
 * An empty @l leaves @res untouched. */
static void count(slist * l, result * res)
{
    if (l->n == 0)
        return;

    cadd(res, l->s[0]);
    for (unsigned int i = 1; i < l->n; i++) {
        token *last = res->tok[res->n - 1];

        if (strcmp(l->s[i], last->s) == 0)
            last->count++;
        else
            cadd(res, l->s[i]);
    }
}

/**
 * Terminate the @n collected characters in @str and store a copy in @l
 * when the token is long enough. @str must have room for n + 1 bytes.
 * NOTE(review): the test is strict, so a token needs MORE than
 * min_word_size characters; kept as is because it decides the output. */
static void finalize_str(char *str, unsigned int n, slist * l)
{
    if (n > cfg.min_word_size) {
        str[n] = '\0';
        sadd(l, str);
    }
}

/**
 * qsort comparator for an array of char *: plain strcmp order. */
static int cmpstringp(const void *p1, const void *p2)
{
    /* qsort hands us pointers to the array elements, i.e. char ** */
    return strcmp(*(char *const *) p1, *(char *const *) p2);
}

/**
 * qsort comparator for an array of token *: highest count first. */
static int cmpint(const void *p1, const void *p2)
{
    const token *a = *(token * const *) p1;
    const token *b = *(token * const *) p2;

    return (a->count < b->count) - (a->count > b->count);
}

/**
 * qsort comparator for an array of token *: lowest count first. */
static int cmpint_r(const void *p1, const void *p2)
{
    const token *a = *(token * const *) p1;
    const token *b = *(token * const *) p2;

    return (a->count > b->count) - (a->count < b->count);
}

/**
 * Print the counted result on stdout, either as one lisp-style list
 * (-l) or one token per line, optionally prefixed by its count (-c).
 * Prints nothing for an empty result. */
static void pc(const result * res)
{
    if (res->n == 0)
        return;

    if (cfg.lisp_print) {
        printf("( ");
        for (unsigned int i = 0; i + 1 < res->n; i++)
            printf("\"%s\", ", res->tok[i]->s);
        printf("\"%s\" )\n", res->tok[res->n - 1]->s);
        return;
    }

    for (unsigned int i = 0; i < res->n; i++) {
        if (cfg.print_count)
            printf("[%u] ", res->tok[i]->count);
        printf("%s\n", res->tok[i]->s);
    }
}

/**
 * Fill an slist with tokens from @f.
 * A token is a maximal run of is_valid() characters. Returns 0 on
 * success and -1 when a run exceeds cfg.max_token_size; tokens stored
 * before the failure stay in @l. */
static int tokenize(FILE * f, slist * l)
{
    /* +1 so finalize_str() can terminate a token of maximum length */
    char *buf = xmalloc((size_t) cfg.max_token_size + 1);
    unsigned int n = 0;
    int rc = 0;
    int c;                      /* int, not char: must hold EOF */

    while ((c = fgetc(f)) != EOF) {
        if (!is_valid((char) c)) {
            finalize_str(buf, n, l);    /* no-op for an empty run */
            n = 0;
            continue;
        }
        if (n >= cfg.max_token_size) {
            fprintf(stderr, "Max token size %u is not enough.\n",
                    cfg.max_token_size);
            rc = -1;
            break;
        }
        buf[n++] = (char) c;
    }
    if (rc == 0)
        finalize_str(buf, n, l);

    free(buf);
    return rc;
}

/**
 * Order the result by count: descending, or ascending with -r. */
static void sort_by_count(result * r)
{
    qsort(r->tok, r->n, sizeof *r->tok,
          cfg.reverse_sort ? cmpint_r : cmpint);
}

/**
 * Build the heap string "<label>\t<value>\n"; the caller frees it. */
static char *format_stat(const char *label, unsigned int value)
{
    int len = snprintf(NULL, 0, "%s\t%u\n", label, value);
    char *line;

    if (len < 0)
        return NULL;
    line = xmalloc((size_t) len + 1);
    snprintf(line, (size_t) len + 1, "%s\t%u\n", label, value);
    return line;
}

/**
 * Record the "Total tokens" stat line (skipped for an empty list). */
static void get_slist_stats(slist * l)
{
    if (l->n > 0) {
        free(st.tok_results);
        st.tok_results = format_stat("Total tokens:", l->n);
    }
}

/**
 * Record the "Total matches" stat line (skipped for an empty list). */
static void get_search_stats(slist * l)
{
    if (l->n > 0) {
        free(st.search_results);
        st.search_results = format_stat("Total matches:", l->n);
    }
}

/**
 * Record the "Unique matches" stat line (skipped for an empty result). */
static void get_result_stats(result * r)
{
    if (r->n > 0) {
        free(st.unique_results);
        st.unique_results = format_stat("Unique matches:", r->n);
    }
}

/**
 * Entry point. Returns 0 on success (or after -h) and -1 on a usage
 * error, a missing query, an unreadable input file or an over-long
 * token. */
int main(int argc, char *argv[])
{
    FILE *f;
    slist list = { 0 };
    slist search_res = { 0 };
    result count_res = { 0 };
    int rc = 0;

    if (parse_cli(argc, argv) || cfg.help) {
        print_help(argc > 0 ? argv[0] : "fcomp");
        return cfg.help ? 0 : -1;
    }

    if (cfg.query == NULL && !cfg.print_all) {
        fprintf(stderr, "Query missing ... terminating\n");
        return -1;
    }

    /* set input */
    if (cfg.use_stdin) {
        f = stdin;
    } else {
        f = fopen(cfg.file, "r");
        if (!f) {
            fprintf(stderr, "Couldn't open %s\n", cfg.file);
            return -1;
        }
    }

    /* tokenize */
    if (tokenize(f, &list)) {
        rc = -1;
        goto out;
    }

    if (cfg.stats)
        get_slist_stats(&list);

    if (cfg.print_all) {
        pp(&list);
    } else if (search(&list, cfg.query, &search_res)) {
        if (cfg.stats)
            get_search_stats(&search_res);

        /* sort the matches so equal tokens become neighbours */
        qsort(search_res.s, search_res.n, sizeof *search_res.s, cmpstringp);

        /* count the unique */
        count(&search_res, &count_res);
        sort_by_count(&count_res);

        if (cfg.stats)
            get_result_stats(&count_res);

        /* print them */
        pc(&count_res);

        cfree(&count_res);
        /* the strings belong to list, only the array is ours */
        free(search_res.s);
    }

    if (cfg.debug)
        print_cfg();
    if (cfg.stats)
        print_stats();

  out:
    sfree(&list);
    if (!cfg.use_stdin)
        fclose(f);
    return rc;
}