Web   ·   Wiki   ·   Activities   ·   Blog   ·   Lists   ·   Chat   ·   Meeting   ·   Bugs   ·   Git   ·   Translate   ·   Archive   ·   People   ·   Donate
summary · refs · log · tree · commit · diff · stats
path: root/woip/c/wp.c
diff options
context:
space:
mode:
Diffstat (limited to 'woip/c/wp.c')
-rw-r--r-- woip/c/wp.c 157
1 files changed, 157 insertions, 0 deletions
diff --git a/woip/c/wp.c b/woip/c/wp.c
new file mode 100644
index 0000000..3947c5c
--- /dev/null
+++ b/woip/c/wp.c
@@ -0,0 +1,157 @@
+#include "wp.h"
+
+#define MAXSTR 1024 /* size of the stack copy handle_exact_match() makes of an index line */
+#define BOUNDS_HIT_END -2 /* article_bounds(): buffer ended before END_TEXT was seen */
+#define BOUNDS_NO_START -3 /* article_bounds(): no heading line matching the name was found */
+#define BOUNDS_NO_TEXT -4 /* article_bounds(): another START_HEADING came before START_TEXT */
+#define BOUNDS_FAIL -10 /* NOTE(review): not referenced anywhere in this file section */
+
+char *exact_match; /* title being looked up; set by load_article(), read by handle_exact_match() */
+int exact_block; /* block number of the match; load_article() resets it to -1 before searching */
+
+/*
+ * Open a dump and load the lookup structures needed to read articles from it.
+ *
+ *   d      - dump handle to fill in; fp, block_map and index are set here
+ *   dump   - path of the dump file
+ *   loc    - first path handed to load_index()
+ *   ploc   - second path handed to load_index()
+ *   blocks - path handed to load_block_map()
+ *
+ * The dump is opened in binary mode: block_load() reads compressed blocks from
+ * this stream, and a text-mode stream would corrupt them on platforms that
+ * translate line endings. (Assumes xfopen() forwards the mode string to fopen()
+ * unchanged - TODO confirm.)
+ */
+void load_dump(wp_dump *d, char *dump, char *loc, char *ploc, char *blocks) {
+    d->fp = xfopen(dump, "rb");
+    d->block_map = load_block_map(blocks);
+
+    load_index(&d->index, loc, ploc);
+}
+
+/*
+ * Allocate the text buffer of an article.
+ *
+ *   a - article whose text member receives a fresh BZ_MAX_BLOCK-byte buffer
+ *
+ * Any previous value of a->text is overwritten without being freed. Allocation
+ * failure is reported through fatal(), the same way the rest of this file
+ * reports a failed realloc.
+ */
+void init_article(wp_article *a) {
+    a->text = malloc(BZ_MAX_BLOCK);
+    if(a->text == NULL)
+        fatal("malloc");
+}
+
+/*
+ * Search callback: test one index line against the title in exact_match.
+ *
+ *   s - index line; the part after the last space is parsed as the block
+ *       number, and the title is everything up to (not including) the single
+ *       character that precedes that space
+ *
+ * Returns false after a case-insensitive title match, having stored the block
+ * number in exact_block; returns true otherwise. Lines with no space, or with
+ * nothing in front of the last space, are treated as non-matching.
+ * (Presumably true tells search() to keep going - TODO confirm.)
+ */
+bool handle_exact_match(char *s) {
+    char buf[MAXSTR], *sep;
+
+    /* strncpy() does not terminate when s fills the buffer, so do it by hand */
+    strncpy(buf, s, MAXSTR - 1);
+    buf[MAXSTR - 1] = '\0';
+
+    debug("handle_exact_match(%s)", s);
+
+    sep = strrchr(buf, ' ');
+    if(sep == NULL || sep == buf)
+        return true;
+
+    /* cut the title off at the character just before the separating space */
+    sep[-1] = '\0';
+
+    if(strcasecmp(buf, exact_match))
+        return true;
+
+    /* sep still points at the space; atoi() skips leading whitespace */
+    exact_block = atoi(sep);
+    return false;
+}
+
+/*
+ * Read one compressed block of the dump and decompress it into text.
+ *
+ *   d        - open dump; fp and block_map are used
+ *   block    - index into d->block_map of the block to read
+ *   text     - destination buffer for the decompressed data
+ *   text_len - passed straight to decompressBlock(); presumably capacity in,
+ *              decompressed length out (TODO confirm against decompressBlock)
+ *
+ * Returns 0. A decompression error is reported through fatal().
+ * NOTE(review): the fatal() message prints d->fp with 0x%x; if fatal() is
+ * printf-style that specifier does not match a pointer argument.
+ */
+int block_load(wp_dump *d, int block, char *text, int *text_len) {
+    uint32_t bzres = 0;
+
+    BitBuffer *bb = bbOfSize(BZ_MAX_BLOCK);
+    /* the value returned by readBlock() was never used, so it is not kept */
+    readBlock(d->fp, (d->block_map)[block], bb);
+
+    if((bzres = decompressBlock(bb->buff, bb->pos, text, text_len)) != BZ_OK)
+        fatal("error decompressing block: %d. article_file: 0x%x", bzres, d->fp);
+
+    bbClose(bb);
+
+    /* the function is declared int; give callers a defined value */
+    return 0;
+}
+
+/*
+ * Locate the text of the article called name inside a decompressed buffer.
+ *
+ *   name    - article title, matched case-insensitively
+ *   buf     - NUL-terminated buffer to search
+ *   start   - out: set to the first byte after START_TEXT on success
+ *   bufsize - number of valid bytes in buf
+ *
+ * A title occurrence counts as the article heading when the byte two positions
+ * before it is START_HEADING and the byte right after it is '\n'.
+ *
+ * Returns the number of text bytes between START_TEXT and END_TEXT, or
+ *   BOUNDS_NO_START - no such heading in the buffer (or name is empty)
+ *   BOUNDS_NO_TEXT  - another START_HEADING appeared before START_TEXT
+ *   BOUNDS_HIT_END  - the buffer ended before START_TEXT or END_TEXT was seen
+ *
+ * No byte outside buf[0 .. bufsize-1] is examined by the scans below; the
+ * strcasestr() search itself stops at the terminating NUL.
+ */
+int article_bounds(char *name, char *buf, char **start, int bufsize) {
+    /* log the address of the out-parameter, not *start: callers may pass it uninitialised */
+    debug("find article start: %s, 0x%x, 0x%x, %d", name, buf, start, bufsize);
+
+    int len = 0;
+    int nl = strlen(name);
+    char *bufstart = buf;
+    char *bufend = buf + bufsize;
+    bool found_end = false;
+
+    /* an empty name matches at every position and would walk past the terminator */
+    if(nl == 0) {
+        *start = NULL;
+        return BOUNDS_NO_START;
+    }
+
+    /* start of article */
+    while((*start = strcasestr(buf, name)) != NULL) {
+        /* a hit beyond the valid bytes is stale data, not an article */
+        if(*start >= bufend) {
+            *start = NULL;
+            break;
+        }
+        /* only look two bytes back when those bytes are inside the buffer */
+        if(*start - bufstart >= 2 && *(*start - 2) == START_HEADING && *(*start + nl) == '\n')
+            break;
+        buf = *start + 1;
+    }
+
+    if(*start == NULL)
+        return BOUNDS_NO_START;
+
+    /* start of text */
+    for(;;) {
+        if(*start >= bufend)
+            return BOUNDS_HIT_END;
+        if(*(*start)++ == START_TEXT)
+            break;
+        if(*start < bufend && **start == START_HEADING)
+            return BOUNDS_NO_TEXT;
+    }
+
+    /* end of text: len ends up one past the END_TEXT byte when it is found */
+    while(*start + len < bufend) {
+        if(*(*start + len++) == END_TEXT) {
+            found_end = true;
+            break;
+        }
+    }
+
+    int size = (int)(*start - bufstart) + len;
+    debug("bufsize: %d, size: %d, len: %d, *start: %d, bufstart: %d, char: %d", bufsize, size, len, *start, bufstart, *(*start + len - 1));
+    if(!found_end) {
+        debug("Hit the end early");
+        return BOUNDS_HIT_END;
+    }
+
+    return len - 1;
+}
+
+/*
+ * Look an article up by title and load its text.
+ *
+ *   d    - dump whose index is searched
+ *   name - title to look for
+ *   a    - article that receives the text
+ *
+ * The lookup goes through the file-level exact_match / exact_block pair:
+ * search() is run with handle_exact_match() as its callback, and that callback
+ * stores the block number when it sees the title.
+ *
+ * Returns -1 when no block number was recorded, otherwise whatever
+ * block_load_article() returns.
+ */
+int load_article(wp_dump *d, char *name, wp_article *a) {
+    /* reset the state shared with the search callback */
+    exact_block = -1;
+    exact_match = name;
+
+    debug("load_article(0x%x, %s, 0x%x)", d, name, a);
+
+    search(&d->index, name, handle_exact_match, NULL, false, true);
+
+    return (exact_block >= 0) ? block_load_article(d, name, exact_block, a) : -1;
+}
+
+/*
+ * Load the article called name from a known block of the dump.
+ *
+ *   d     - open dump
+ *   name  - article title to find inside the block
+ *   block - index of the block in which the article starts
+ *   a     - receives the NUL-terminated text in a->text and the block number
+ *           in a->block; a->text is allocated with init_article() when NULL
+ *
+ * Returns the length of the article text in bytes. Every failure (article not
+ * in the block, no following block, allocation failure, text too large for
+ * a->text) is reported through fatal(), which this file already treats as
+ * non-returning.
+ */
+int block_load_article(wp_dump *d, char *name, int block, wp_article *a) {
+    /* one spare byte so the data can be NUL-terminated for the string search */
+    char *text = xalloc(BZ_MAX_BLOCK + 1);
+    int text_len = BZ_MAX_BLOCK, article_len;
+    char *start = NULL;
+
+    debug("opening %s from block %d (%llu)", name, block, (d->block_map)[block]);
+    block_load(d, block, text, &text_len);
+    /* assumes block_load() leaves 0 <= text_len <= BZ_MAX_BLOCK - TODO confirm */
+    text[text_len] = '\0';
+
+    if((article_len = article_bounds(name, text, &start, text_len)) < 0) {
+        if(article_len == BOUNDS_HIT_END) {
+            /* the start was in the block, but we hit the end of the block before finding
+             * the end of the article text. Load the next block and re-enter...
+             * We only consider the case of loading one extra block; no Wikipedia article will span
+             * more than two (it'd have to be >900kb for it to be possible).
+             */
+
+            if(!d->block_map[block + 1])
+                fatal("trying to load another block, but don't have any blocks left. "
+                      "This probably indicates a malformed dump.");
+
+            debug("Loading another block");
+            /* keep the old pointer until the new one is known to be valid */
+            char *grown = realloc(text, BZ_MAX_BLOCK * 2 + 1);
+            if(!grown) fatal("realloc");
+            text = grown;
+            int n_text_len = BZ_MAX_BLOCK;
+
+            block_load(d, block + 1, text + text_len, &n_text_len);
+            text[text_len + n_text_len] = '\0';
+
+            if((article_len = article_bounds(name, text, &start, n_text_len + text_len)) < 0)
+                fatal("additional block loading failure: %d", article_len);
+        } else
+            fatal("couldn't find %s in block %d", name, block);
+    }
+
+    /* callers that zero-initialise the article never allocated a buffer */
+    if(a->text == NULL) {
+        init_article(a);
+        if(a->text == NULL)
+            fatal("malloc");
+    }
+
+    debug("start: 0x%x, len: %d", start, article_len);
+    debug("a->text: 0x%x", a->text);
+
+    /* init_article() gives a->text BZ_MAX_BLOCK bytes; a two-block article can be longer */
+    if(article_len >= BZ_MAX_BLOCK)
+        fatal("article %s is too large for the article buffer: %d bytes", name, article_len);
+
+    memcpy(a->text, start, article_len);
+    a->text[article_len] = '\0';
+    a->block = block;
+    free(text);
+    return article_len;
+}
+
+#ifndef WP_INCLUDE
+/*
+ * Stand-alone driver, compiled only when WP_INCLUDE is not defined.
+ *
+ * Usage: <prog> <dump> <loc> <ploc> <blocks> <article>
+ * The first four arguments are passed to load_dump() in that order; the fifth
+ * is the title handed to load_article(). The article text is printed to
+ * stdout. Exits with 1 on a usage error or when the article is not found.
+ */
+int main(int argc, char **argv) {
+    debug = true;
+
+    if(argc < 6) {
+        fprintf(stderr, "usage: %s <dump> <loc> <ploc> <blocks> <article>\n",
+                argc > 0 ? argv[0] : "wp");
+        return 1;
+    }
+
+    wp_dump d;
+    wp_article a = {0};
+
+    /* block_load_article() copies into a.text, so it needs a buffer first */
+    init_article(&a);
+
+    load_dump(&d, argv[1], argv[2], argv[3], argv[4]);
+    if(load_article(&d, argv[5], &a) < 0) {
+        fprintf(stderr, "article not found: %s\n", argv[5]);
+        free(a.text);
+        return 1;
+    }
+    printf("%s\n", a.text);
+
+    free(a.text);
+    return 0;
+}
+#endif