Web   ·   Wiki   ·   Activities   ·   Blog   ·   Lists   ·   Chat   ·   Meeting   ·   Bugs   ·   Git   ·   Translate   ·   Archive   ·   People   ·   Donate
summaryrefslogtreecommitdiffstats
path: root/woip/c/lsearcher.c
diff options
context:
space:
mode:
Diffstat (limited to 'woip/c/lsearcher.c')
-rw-r--r--woip/c/lsearcher.c338
1 files changed, 338 insertions, 0 deletions
diff --git a/woip/c/lsearcher.c b/woip/c/lsearcher.c
new file mode 100644
index 0000000..6f69b5f
--- /dev/null
+++ b/woip/c/lsearcher.c
@@ -0,0 +1,338 @@
+#include "lsearcher.h"
+
+int kill_switch;
+ucaddr_t addr = 0;
+
+void kill_search() {
+ kill_switch = 1;
+}
+
+void search_progress(lindex *l, double *d) {
+ double prog = ((double) (addr - l->addr)) / ((double) l->size);
+ debug("prog: %lf", prog);
+ *d = prog;
+}
+
+int check_bigram_char(int ch) {
+ /* legal bigram: 0, ASCII_MIN ... ASCII_MAX */
+ if (ch == 0 || (ch >= ASCII_MIN && ch <= ASCII_MAX))
+ return(ch);
+
+ fatal("locate database header corrupt, bigram char outside 0, %d-%d: %d",
+ ASCII_MIN, ASCII_MAX, ch);
+ exit(1);
+}
+
+uchar_t *tolower_word(uchar_t *word) {
+ register uchar_t *p;
+
+ for(p = word; *p != '\0'; p++)
+ *p = TOLOWER(*p);
+
+ return(word);
+}
+
+int matches = 0;
+
+bool handle_match(uchar_t *s) {
+ printf("%s\n", s);
+ return true;
+}
+
+void init_index(lindex *l, ucaddr_t adr, size_t len, uint32_t *pdb) {
+ if (len < (2*NBG))
+ fatal("database too small");
+ uchar_t *p, *s;
+ int c;
+
+ for (c = 0, p = l->bigram1, s = l->bigram2; c < NBG; c++, len-= 2) {
+ p[c] = check_bigram_char(*adr++);
+ s[c] = check_bigram_char(*adr++);
+ }
+
+ l->addr = adr;
+ l->size = len;
+
+ if(pdb) l->prefixdb = pdb;
+}
+
+void load_index(lindex *l, char *path, char *ppath) {
+ size_t size;
+ uchar_t *adr;
+ uint32_t *paddr = NULL;
+
+ debug("load_index(0x%x, %s, %s)", l, path, ppath);
+
+ adr = xmmapf(path, &size);
+ if(ppath) paddr = xmmapf(ppath, 0);
+
+ init_index(l, adr, size, paddr);
+}
+
+void scan(lindex *l, char *path) {
+ int len = l->size;
+ int c;
+ uchar_t *adr = l->addr;
+ int count = 0;
+ uint32_t prefixdb[CHAR_MAX] = {0};
+
+ /* don't bother supporting >2gb... an index file that large
+ * would have search times so high as to be useless
+ */
+
+ debug("scanning through %d chars...", len);
+
+ c = *adr++;
+ len--;
+ while(len > 0) {
+ uchar_t *this_switch = adr - 1;
+
+ if(c == SWITCH) {
+ count += *(int*) adr - OFFSET;
+ len -= sizeof(int);
+ adr += sizeof(int);
+ } else {
+ count += c - OFFSET;
+ }
+
+ if(count == 0) {
+ char x[4];
+ int i = 0;
+ c = *adr++; len--;
+ for(; i < 4; i++) {
+ c = *(adr + i - 1);
+ if(c >= PARITY) {
+ TO7BIT(c);
+ x[i] = l->bigram1[c];
+ } else if(c > UMLAUT)
+ x[i] = c;
+ else if(c == UMLAUT) {
+ x[i] = *(adr + i);
+ adr++; len--;
+ }
+ }
+
+ if(isalnum(x[0])) {
+ /* ignore unicode chars for now */
+ prefixdb[x[0]] = this_switch - l->addr;
+ debug("%c starts at 0x%x (unicode: %4s)", x[0], this_switch - l->addr, x);
+ }
+ }
+
+ while(len-- > 0 && (c = *adr++) >= UMLAUT);
+ }
+
+ FILE *fp = xfopen(path, "w");
+ fwrite(prefixdb, sizeof(uint32_t), CHAR_MAX, fp);
+ fclose(fp);
+}
+
+uchar_t *pathprep(uchar_t *path) {
+ uchar_t *patend;
+ size_t patlen = strlen(path);
+ uchar_t *patptr = xalloc(patlen + 2);
+ *patptr = '\0';
+ strncpy(patptr, path, patlen);
+ patend = patptr + patlen - 1;
+ *(patptr + patlen) = '\0';
+ return patend;
+}
+
+int search(lindex *l, uchar_t *pathpart, resultf f, donef df, bool icase, bool strict) {
+ register uchar_t *p, *s, *patend, *q, *foundchar;
+ register int c, cc;
+ int count, found;
+ size_t len = l->size;
+ addr = l->addr;
+ uchar_t *cutoff, path[MAXSTR];
+
+ kill_switch = 0;
+
+ /* use a lookup table for case insensitive search */
+ uchar_t table[UCHAR_MAX + 1];
+
+ if(icase) tolower_word(pathpart);
+
+ patend = pathprep(pathpart);
+ cc = *patend;
+
+ /* set patend char to true */
+ for (c = 0; c < UCHAR_MAX + 1; c++)
+ table[c] = 0;
+
+ if(icase) {
+ table[TOLOWER(*patend)] = 1;
+ table[toupper(*patend)] = 1;
+ } else
+ table[*patend] = 1;
+
+ /* main loop */
+ found = count = 0;
+ foundchar = 0;
+
+ debug("checking for prefixdb...");
+ int offset = -1;
+ bool skip = false;
+ if(l->prefixdb) offset = *(l->prefixdb + toupper(*pathpart));
+ debug("offset: %d", offset);
+ if(strict && l->prefixdb && (offset > 0)) {
+ debug("using prefix db");
+ addr = l->addr + offset;
+ len -= offset;
+ skip = true;
+ }
+
+ c = *addr++;
+ len--;
+ while(len > 0) {
+ if(kill_switch)
+ return -1;
+
+ if (c == SWITCH && !skip) {
+ count += *(int*) addr - OFFSET;
+ len -= sizeof(int);
+ addr += sizeof(int);
+ } else if(!skip) {
+ count += c - OFFSET;
+ }
+
+ skip = false;
+
+ p = path + count;
+ foundchar = p - 1;
+
+ while(len > 0) {
+ if(kill_switch)
+ return -1;
+
+ c = *addr++;
+ len--;
+
+ if (c < PARITY) {
+ if (c <= UMLAUT) {
+ if (c == UMLAUT) {
+ c = (uchar_t)*addr++;
+ len--;
+ } else
+ break;
+ }
+
+ if (table[c])
+ foundchar = p;
+ *p++ = c;
+ } else {
+ TO7BIT(c);
+
+ if (table[l->bigram1[c]] || table[l->bigram2[c]])
+ foundchar = p + 1;
+
+ *p++ = l->bigram1[c];
+ *p++ = l->bigram2[c];
+ }
+ }
+
+ if (found) {
+ cutoff = path;
+ *p-- = '\0';
+ foundchar = p;
+ } else if (foundchar >= path + count) {
+ *p-- = '\0';
+ cutoff = path + count;
+ } else if(!strict)
+ continue;
+ else
+ *p-- = '\0';
+
+ found = 0;
+ if(strict) {
+ for(s = path, q = pathpart; *q; s++, q++)
+ if((icase && TOLOWER(*s) != *q) || (!icase && *s != *q))
+ break;
+ if(*q == '\0') {
+ if(!f(path))
+ return 0;
+ }
+ } else {
+ for (s = foundchar; s >= cutoff; s--) {
+ if (*s == cc || (icase && TOLOWER(*s) == cc)) {
+ for (p = patend - 1, q = s - 1; *p != '\0'; p--, q--)
+ if (*q != *p && (!icase || TOLOWER(*q) != *p))
+ break;
+ if (*p == '\0' && \
+ (icase ? \
+ strncasecmp(path, pathpart, strlen(pathpart)) : \
+ strncmp(path, pathpart, strlen(pathpart)))) {
+ found = 1;
+ if(!f(path))
+ return 0;
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ if(strict) search(l, pathpart, f, df, icase, false);
+ else if(df) df();
+}
+
+#ifdef INCLUDE_MAIN
+void usage(char *prog) {
+ fatal("%s -f <indexFile> [-c <scanFile>] [-s <search>]", prog);
+}
+
+int main(int argc, char **argv) {
+ size_t size;
+ uchar_t *dbptr;
+ extern char *optarg;
+ char scanFile[MAXSTR], indexFile[MAXSTR], needle[MAXSTR];
+ unsigned char ch;
+ bool doScan = false, doSearch = false, haveScanFile = false, icase = true;
+ lindex l = {0};
+
+ debug = false;
+
+ while ((ch = getopt(argc, argv, "dac:s:f:hn")) != 255) {
+ switch (ch) {
+ case 'c':
+ haveScanFile = true;
+ strncpy(scanFile, optarg, MAXSTR);
+ break;
+ case 'n':
+ doScan = true;
+ case 'd':
+ debug = true;
+ break;
+ case 'f':
+ strncpy(indexFile, optarg, MAXSTR);
+ break;
+ case 's':
+ doSearch = true;
+ strncpy(needle, optarg, MAXSTR);
+ break;
+ case 'a':
+ icase = false;
+ break;
+ case 'h':
+ default:
+ usage(argv[0]);
+ }
+ }
+
+ if(indexFile)
+ load_index(&l, indexFile, (haveScanFile && doSearch) ? scanFile : NULL);
+ else {
+ debug("no index file");
+ usage(argv[0]);
+ }
+
+ if(doScan)
+ scan(&l, scanFile);
+ else if(doSearch)
+ search(&l, (uchar_t *) needle, handle_match, NULL, icase, true);
+ else {
+ debug("no action");
+ usage(argv[0]);
+ }
+}
+#endif