#include "wp.h"

/* Size of the stack buffer handle_exact_match() copies its argument into. */
#define MAXSTR 1024
/* Negative status codes returned by article_bounds(). */
/* The buffer ended before END_TEXT was seen. */
#define BOUNDS_HIT_END -2
/* No occurrence of the name satisfied the heading check. */
#define BOUNDS_NO_START -3
/* A START_HEADING was met before any START_TEXT. */
#define BOUNDS_NO_TEXT -4
/* NOTE(review): not referenced anywhere in the visible part of this file. */
#define BOUNDS_FAIL -10

/* Title currently being looked up: set by load_article(), compared
 * case-insensitively against each candidate in handle_exact_match(). */
char *exact_match;
/* Block number parsed by handle_exact_match() when a candidate matches.
 * load_article() resets it to -1 before searching and treats a negative
 * value afterwards as "not found". */
int exact_block;

/*
 * Populate `d` from the files that make up a dump:
 *   dump      - opened for reading with xfopen(); handle kept in d->fp
 *   blocks    - handed to load_block_map(); result kept in d->block_map
 *   loc, ploc - handed to load_index() together with &d->index
 */
void load_dump(wp_dump *d, char *dump, char *loc, char *ploc, char *blocks)
{
  /* file handle first, then the block table, then the index */
  d->fp        = xfopen(dump, "r");
  d->block_map = load_block_map(blocks);
  load_index(&(d->index), loc, ploc);
}

/*
 * Give `a` a text buffer of BZ_MAX_BLOCK bytes.
 *
 * The allocation is checked here because block_load_article() later copies
 * article text into a->text; failure is reported through fatal(), the same
 * way the realloc failure in block_load_article() is.
 */
void init_article(wp_article *a) {
  a->text = malloc(BZ_MAX_BLOCK);
  if(a->text == NULL)
    fatal("malloc");
}

/*
 * Search callback used by load_article(): checks whether candidate `s`
 * is the title stored in exact_match.
 *
 * `s` is split at its last space: the character immediately before that
 * space is dropped, everything before it is compared case-insensitively
 * with exact_match, and the number after the space is taken as the block.
 *
 * Returns false on a match (exact_block is then set), true otherwise.
 * A candidate with no space, or with nothing in front of the space, cannot
 * be split and is reported as "no match".
 * NOTE(review): presumably `true` tells search() to keep going - confirm
 * against search()'s callback contract.
 */
bool handle_exact_match(char *s) {
  char buf[MAXSTR], *sep;

  debug("handle_exact_match(%s)", s);

  /* strncpy leaves the destination unterminated when it truncates */
  strncpy(buf, s, MAXSTR - 1);
  buf[MAXSTR - 1] = '\0';

  sep = strrchr(buf, ' ');
  if(sep == NULL || sep == buf)
    return true;

  /* terminate the title one character before the separating space */
  sep[-1] = '\0';

  if(strcasecmp(buf, exact_match))
    return true;

  /* atoi skips the leading space itself */
  exact_block = atoi(sep);
  return false;
}

/*
 * Read block number `block` of the dump and decompress it into `text`.
 *
 *   d        - dump whose fp and block_map are used to locate the block
 *   text     - destination buffer for the decompressed data
 *   text_len - passed through to decompressBlock()
 *              NOTE(review): callers pass the capacity in and use the value
 *              afterwards as the decompressed length - confirm.
 *
 * A decompression error is reported through fatal(). Returns the
 * decompressBlock() status, i.e. BZ_OK whenever the function gets as far
 * as returning normally.
 */
int block_load(wp_dump *d, int block, char *text, int *text_len) {
  int bzres;
  BitBuffer *bb = bbOfSize(BZ_MAX_BLOCK);

  /* the value returned by readBlock() was stored but never consulted */
  (void) readBlock(d->fp, (d->block_map)[block], bb);

  if((bzres = decompressBlock(bb->buff, bb->pos, text, text_len)) != BZ_OK)
    fatal("error decompressing block: %d. article_file: %p", bzres, (void *) d->fp);

  bbClose(bb);
  return bzres;
}

/*
 * Locate the text of article `name` inside `buf`.
 *
 *   name    - title to look for (matched case-insensitively)
 *   buf     - data to search; it is scanned with strcasestr(), so it must be
 *             NUL-terminated
 *   start   - out: on success points at the first character after START_TEXT;
 *             set to NULL when BOUNDS_NO_START is returned
 *   bufsize - number of valid bytes in buf; nothing at or beyond
 *             buf + bufsize is examined as article data
 *
 * An occurrence of `name` counts as the article heading when the character
 * two positions before it is START_HEADING and the character right after it
 * is a newline.
 *
 * Returns the number of bytes between *start and the END_TEXT marker, or
 *   BOUNDS_NO_START - no qualifying heading was found (or name is empty)
 *   BOUNDS_NO_TEXT  - another START_HEADING came before any START_TEXT
 *   BOUNDS_HIT_END  - the valid data ended before START_TEXT or END_TEXT
 */
int article_bounds(char *name, char *buf, char **start, int bufsize) {
  char *bufend = buf + bufsize;
  size_t nl = strlen(name);
  char *hit, *stop;

  debug("find article start: %s, %p, %d", name, (void *) buf, bufsize);

  /* an empty name matches at every position and cannot identify a heading */
  *start = NULL;
  if(nl == 0)
    return BOUNDS_NO_START;

  /* start of article */
  for(hit = strcasestr(buf, name); hit != NULL; hit = strcasestr(hit + 1, name)) {
    /* the match and the byte following it must lie inside the valid data */
    if((long) (bufend - hit) <= (long) nl) {
      hit = NULL;
      break;
    }
    /* only look two bytes back when those bytes belong to the buffer */
    if(hit - buf >= 2 && *(hit - 2) == START_HEADING && *(hit + nl) == '\n')
      break;
  }

  *start = hit;
  if(hit == NULL)
    return BOUNDS_NO_START;

  /* start of text */
  for(;;) {
    if(*start >= bufend)
      return BOUNDS_HIT_END;
    if(*(*start)++ == START_TEXT)
      break;
    if(*start < bufend && **start == START_HEADING)
      return BOUNDS_NO_TEXT;
  }

  /* end of text */
  for(stop = *start; stop < bufend && *stop != END_TEXT; stop++)
    ;

  debug("bufsize: %d, text offset: %ld, scanned: %ld",
        bufsize, (long) (*start - buf), (long) (stop - *start));

  if(stop == bufend) {
    debug("Hit the end early");
    return BOUNDS_HIT_END;
  }

  return (int) (stop - *start);
}

/* Defined further down in this file; declared here so the call below does
 * not rely on an implicit declaration. */
int block_load_article(wp_dump *d, char *name, int block, wp_article *a);

/*
 * Look `name` up in the index of `d` and load its text into `a`.
 *
 * The lookup goes through search() with handle_exact_match() as callback;
 * the two communicate through the globals exact_match and exact_block.
 *
 * Returns -1 when no candidate matched, otherwise whatever
 * block_load_article() returns for the block that was found.
 */
int load_article(wp_dump *d, char *name, wp_article *a) {
  exact_match = name;
  exact_block = -1;

  debug("load_article(%p, %s, %p)", (void *) d, name, (void *) a);

  search(&d->index, name, handle_exact_match, NULL, false, true);

  if(exact_block < 0)
    return -1;

  return block_load_article(d, name, exact_block, a);
}

/*
 * Decompress block `block` of `d`, find article `name` in it and copy the
 * article text, NUL-terminated, into a->text.
 *
 * If the article starts in this block but runs past its end, the following
 * block is decompressed behind the first one and the search is repeated.
 * Any other failure to locate the article is reported through fatal().
 *
 * a->text is allocated here when it is NULL and enlarged when the article
 * does not fit in BZ_MAX_BLOCK bytes (possible for a two-block article), so
 * it must be NULL or a pointer obtained from malloc()/realloc().
 *
 * Sets a->block to `block` and returns the length of the article text.
 */
int block_load_article(wp_dump *d, char *name, int block, wp_article *a) {
  /* one spare byte so the data can always be NUL-terminated for the
   * string search done by article_bounds() */
  char *text = xalloc(BZ_MAX_BLOCK + 1);
  int text_len = BZ_MAX_BLOCK, article_len;
  char *start;

  debug("opening %s from block %d (%llu)", name, block, (d->block_map)[block]);
  block_load(d, block, text, &text_len);
  text[text_len] = '\0';

  if((article_len = article_bounds(name, text, &start, text_len)) < 0) {
    if(article_len == BOUNDS_HIT_END) {
      /* the start was in the block, but we hit the end of the block before finding
       * the end of the article text. Load the next block and re-enter...
       * We only consider the case of loading one extra block; no Wikipedia article will span
       * more than two (it'd have to be >900kb for it to be possible).
       */

      if(!d->block_map[block + 1])
        fatal("trying to load another block, but don't have any blocks left. "
              "This probably indicates a malformed dump.");

      debug("Loading another block");
      if(!(text = realloc(text, BZ_MAX_BLOCK * 2 + 1))) fatal("realloc");
      int n_text_len = BZ_MAX_BLOCK;

      /* the second block goes directly behind the data of the first one */
      block_load(d, block + 1, text + text_len, &n_text_len);
      text[text_len + n_text_len] = '\0';

      if((article_len = article_bounds(name, text, &start, n_text_len + text_len)) < 0)
        fatal("additional block loading failure: %d", article_len);
    } else
      fatal("couldn't find %s in block %d", name, block);
  }

  debug("start: %p, len: %d", (void *) start, article_len);
  debug("a->text: %p", (void *) a->text);

  /* make sure the destination can hold the text plus its terminator */
  if(a->text == NULL || article_len >= BZ_MAX_BLOCK) {
    size_t need = BZ_MAX_BLOCK;
    void *grown;

    if(article_len >= BZ_MAX_BLOCK)
      need = (size_t) article_len + 1;
    if(!(grown = realloc(a->text, need))) fatal("realloc");
    a->text = grown;
  }

  /* overwrite the END_TEXT marker so the article becomes a C string */
  *(start + article_len) = '\0';
  strncpy(a->text, start, article_len + 1);
  a->block = block;
  free(text);
  return article_len;
}

#ifndef WP_INCLUDE
/*
 * Command-line driver: print one article from a dump.
 *
 *   argv[1] dump file, argv[2] and argv[3] the two index files passed to
 *   load_dump() as loc and ploc, argv[4] block map, argv[5] article title.
 *
 * Exits with 1 on a usage error or when the article is not found.
 */
int main(int argc, char **argv) {
  debug = true;

  if(argc < 6) {
    fprintf(stderr, "usage: %s <dump> <loc> <ploc> <blocks> <article>\n",
            argc > 0 ? argv[0] : "wp");
    return 1;
  }

  wp_dump d;
  wp_article a = {0};

  /* a.text must point at a buffer before an article is copied into it */
  init_article(&a);

  load_dump(&d, argv[1], argv[2], argv[3], argv[4]);
  if(load_article(&d, argv[5], &a) < 0) {
    fprintf(stderr, "article not found: %s\n", argv[5]);
    free(a.text);
    return 1;
  }
  printf("%s\n", a.text);

  free(a.text);
  return 0;
}
#endif