diff options
Diffstat (limited to 'woip/c/lsearcher.c')
-rw-r--r-- | woip/c/lsearcher.c | 338 |
1 files changed, 338 insertions, 0 deletions
diff --git a/woip/c/lsearcher.c b/woip/c/lsearcher.c new file mode 100644 index 0000000..6f69b5f --- /dev/null +++ b/woip/c/lsearcher.c @@ -0,0 +1,338 @@ +#include "lsearcher.h" + +int kill_switch; +ucaddr_t addr = 0; + +void kill_search() { + kill_switch = 1; +} + +void search_progress(lindex *l, double *d) { + double prog = ((double) (addr - l->addr)) / ((double) l->size); + debug("prog: %lf", prog); + *d = prog; +} + +int check_bigram_char(int ch) { + /* legal bigram: 0, ASCII_MIN ... ASCII_MAX */ + if (ch == 0 || (ch >= ASCII_MIN && ch <= ASCII_MAX)) + return(ch); + + fatal("locate database header corrupt, bigram char outside 0, %d-%d: %d", + ASCII_MIN, ASCII_MAX, ch); + exit(1); +} + +uchar_t *tolower_word(uchar_t *word) { + register uchar_t *p; + + for(p = word; *p != '\0'; p++) + *p = TOLOWER(*p); + + return(word); +} + +int matches = 0; + +bool handle_match(uchar_t *s) { + printf("%s\n", s); + return true; +} + +void init_index(lindex *l, ucaddr_t adr, size_t len, uint32_t *pdb) { + if (len < (2*NBG)) + fatal("database too small"); + uchar_t *p, *s; + int c; + + for (c = 0, p = l->bigram1, s = l->bigram2; c < NBG; c++, len-= 2) { + p[c] = check_bigram_char(*adr++); + s[c] = check_bigram_char(*adr++); + } + + l->addr = adr; + l->size = len; + + if(pdb) l->prefixdb = pdb; +} + +void load_index(lindex *l, char *path, char *ppath) { + size_t size; + uchar_t *adr; + uint32_t *paddr = NULL; + + debug("load_index(0x%x, %s, %s)", l, path, ppath); + + adr = xmmapf(path, &size); + if(ppath) paddr = xmmapf(ppath, 0); + + init_index(l, adr, size, paddr); +} + +void scan(lindex *l, char *path) { + int len = l->size; + int c; + uchar_t *adr = l->addr; + int count = 0; + uint32_t prefixdb[CHAR_MAX] = {0}; + + /* don't bother supporting >2gb... an index file that large + * would have search times so high as to be useless + */ + + debug("scanning through %d chars...", len); + + c = *adr++; + len--; + while(len > 0) { + uchar_t *this_switch = adr - 1; + + if(c == SWITCH) { + count += *(int*) adr - OFFSET; + len -= sizeof(int); + adr += sizeof(int); + } else { + count += c - OFFSET; + } + + if(count == 0) { + char x[4]; + int i = 0; + c = *adr++; len--; + for(; i < 4; i++) { + c = *(adr + i - 1); + if(c >= PARITY) { + TO7BIT(c); + x[i] = l->bigram1[c]; + } else if(c > UMLAUT) + x[i] = c; + else if(c == UMLAUT) { + x[i] = *(adr + i); + adr++; len--; + } + } + + if(isalnum(x[0])) { + /* ignore unicode chars for now */ + prefixdb[x[0]] = this_switch - l->addr; + debug("%c starts at 0x%x (unicode: %4s)", x[0], this_switch - l->addr, x); + } + } + + while(len-- > 0 && (c = *adr++) >= UMLAUT); + } + + FILE *fp = xfopen(path, "w"); + fwrite(prefixdb, sizeof(uint32_t), CHAR_MAX, fp); + fclose(fp); +} + +uchar_t *pathprep(uchar_t *path) { + uchar_t *patend; + size_t patlen = strlen(path); + uchar_t *patptr = xalloc(patlen + 2); + *patptr = '\0'; + strncpy(patptr, path, patlen); + patend = patptr + patlen - 1; + *(patptr + patlen) = '\0'; + return patend; +} + +int search(lindex *l, uchar_t *pathpart, resultf f, donef df, bool icase, bool strict) { + register uchar_t *p, *s, *patend, *q, *foundchar; + register int c, cc; + int count, found; + size_t len = l->size; + addr = l->addr; + uchar_t *cutoff, path[MAXSTR]; + + kill_switch = 0; + + /* use a lookup table for case insensitive search */ + uchar_t table[UCHAR_MAX + 1]; + + if(icase) tolower_word(pathpart); + + patend = pathprep(pathpart); + cc = *patend; + + /* set patend char to true */ + for (c = 0; c < UCHAR_MAX + 1; c++) + table[c] = 0; + + if(icase) { + table[TOLOWER(*patend)] = 1; + table[toupper(*patend)] = 1; + } else + table[*patend] = 1; + + /* main loop */ + found = count = 0; + foundchar = 0; + + debug("checking for prefixdb..."); + int offset = -1; + bool skip = false; + if(l->prefixdb) offset = *(l->prefixdb + toupper(*pathpart)); + debug("offset: %d", offset); + if(strict && l->prefixdb && (offset > 0)) { + debug("using prefix db"); + addr = l->addr + offset; + len -= offset; + skip = true; + } + + c = *addr++; + len--; + while(len > 0) { + if(kill_switch) + return -1; + + if (c == SWITCH && !skip) { + count += *(int*) addr - OFFSET; + len -= sizeof(int); + addr += sizeof(int); + } else if(!skip) { + count += c - OFFSET; + } + + skip = false; + + p = path + count; + foundchar = p - 1; + + while(len > 0) { + if(kill_switch) + return -1; + + c = *addr++; + len--; + + if (c < PARITY) { + if (c <= UMLAUT) { + if (c == UMLAUT) { + c = (uchar_t)*addr++; + len--; + } else + break; + } + + if (table[c]) + foundchar = p; + *p++ = c; + } else { + TO7BIT(c); + + if (table[l->bigram1[c]] || table[l->bigram2[c]]) + foundchar = p + 1; + + *p++ = l->bigram1[c]; + *p++ = l->bigram2[c]; + } + } + + if (found) { + cutoff = path; + *p-- = '\0'; + foundchar = p; + } else if (foundchar >= path + count) { + *p-- = '\0'; + cutoff = path + count; + } else if(!strict) + continue; + else + *p-- = '\0'; + + found = 0; + if(strict) { + for(s = path, q = pathpart; *q; s++, q++) + if((icase && TOLOWER(*s) != *q) || (!icase && *s != *q)) + break; + if(*q == '\0') { + if(!f(path)) + return 0; + } + } else { + for (s = foundchar; s >= cutoff; s--) { + if (*s == cc || (icase && TOLOWER(*s) == cc)) { + for (p = patend - 1, q = s - 1; *p != '\0'; p--, q--) + if (*q != *p && (!icase || TOLOWER(*q) != *p)) + break; + if (*p == '\0' && \ + (icase ? \ + strncasecmp(path, pathpart, strlen(pathpart)) : \ + strncmp(path, pathpart, strlen(pathpart)))) { + found = 1; + if(!f(path)) + return 0; + break; + } + } + } + } + } + + if(strict) search(l, pathpart, f, df, icase, false); + else if(df) df(); +} + +#ifdef INCLUDE_MAIN +void usage(char *prog) { + fatal("%s -f <indexFile> [-c <scanFile>] [-s <search>]", prog); +} + +int main(int argc, char **argv) { + size_t size; + uchar_t *dbptr; + extern char *optarg; + char scanFile[MAXSTR], indexFile[MAXSTR], needle[MAXSTR]; + unsigned char ch; + bool doScan = false, doSearch = false, haveScanFile = false, icase = true; + lindex l = {0}; + + debug = false; + + while ((ch = getopt(argc, argv, "dac:s:f:hn")) != 255) { + switch (ch) { + case 'c': + haveScanFile = true; + strncpy(scanFile, optarg, MAXSTR); + break; + case 'n': + doScan = true; + case 'd': + debug = true; + break; + case 'f': + strncpy(indexFile, optarg, MAXSTR); + break; + case 's': + doSearch = true; + strncpy(needle, optarg, MAXSTR); + break; + case 'a': + icase = false; + break; + case 'h': + default: + usage(argv[0]); + } + } + + if(indexFile) + load_index(&l, indexFile, (haveScanFile && doSearch) ? scanFile : NULL); + else { + debug("no index file"); + usage(argv[0]); + } + + if(doScan) + scan(&l, scanFile); + else if(doSearch) + search(&l, (uchar_t *) needle, handle_match, NULL, icase, true); + else { + debug("no action"); + usage(argv[0]); + } +} +#endif |