1 files changed, 210 insertions, 0 deletions
diff --git a/tools2/create_index.py b/tools2/create_index.py
new file mode 100755
index 0000000..51550b8
--- /dev/null
+++ b/tools2/create_index.py
@@ -0,0 +1,210 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# create index
+# use https://bitbucket.org/james_taylor/seek-bzip2
+
+import codecs
+import os
+import sys
+from subprocess import call, Popen, PIPE, STDOUT
+import shutil
+import re
+import logging
+import config
+
+input_xml_file_name = config.input_xml_file_name
+
+
+def normalize_title(title):
+    return title.strip().replace(' ', '_').capitalize()
+
+
+def create_index(pages_blacklist):
+    output_file = open("%s.processed.idx" % input_xml_file_name, mode='w')
+    num_block = 1
+    index_file = open("%s.processed.bz2t" % input_xml_file_name, mode='r')
+    index_line = index_file.readline()
+    while index_line:
+        parts = index_line.split()
+        block_start = int(parts[0])
+        print "Block %d starts at %d" % (num_block, block_start)
+        position = 0
+        # extract the block
+        bzip_file = open("%s.processed.bz2" % input_xml_file_name, mode='r')
+        cmd = ['../bin/%s/seek-bunzip' % config.system_id, str(block_start)]
+        p = Popen(cmd, stdin=bzip_file, stdout=PIPE, stderr=STDOUT,
+                close_fds=True)
+        data_line = p.stdout.readline()
+        while data_line:
+            position += len(data_line)
+            #print data_line
+            if len(data_line) == 2:
+                if ord(data_line[0]) == 1:
+                    title = p.stdout.readline()
+                    position += len(title)
+                    # read article size
+                    # size
+                    size_line = p.stdout.readline()
+                    position += len(size_line)
+                    # \02
+                    data_line = p.stdout.readline()
+                    position += len(data_line)
+                    title = title[0:-1].strip().capitalize()
+                    if title not in pages_blacklist:
+                        output_file.write("%s %d %d\n" % \
+                            (title, num_block, position))
+                        print "Article %s block %d position %d" % \
+                            (title, num_block, position)
+                    else:
+                        print "* Blacklisted %s " % title
+
+            data_line = p.stdout.readline()
+
+        num_block += 1
+        index_line = index_file.readline()
+
+    output_file.close()
+
+
+class FileListReader():
+
+    def __init__(self, file_name):
+        _file = codecs.open(file_name,
+                                encoding='utf-8', mode='r')
+        self.list = []
+        line = _file.readline()
+        while line:
+            self.list.append(normalize_title(line))
+            line = _file.readline()
+
+
+class RedirectParser:
+
+    def __init__(self, file_name):
+        self.link_re = re.compile('\[\[.*?\]\]')
+        # Load redirects
+        input_redirects = codecs.open('%s.redirects_used' % file_name,
+                encoding='utf-8', mode='r')
+
+        self.redirects = {}
+        for line in input_redirects.readlines():
+            links = self.link_re.findall(unicode(line))
+            if len(links) == 2:
+                origin = links[0][2:-2]
+                destination = links[1][2:-2]
+                self.redirects[origin] = destination
+            #print "Processing %s" % normalize_title(origin)
+        logging.error("Loaded %d redirects" % len(self.redirects))
+        input_redirects.close()
+
+
+def create_sql_index(input_xml_file_name, pages_blacklist):
+    import sqlite3
+    dbpath = './search.db'
+    if os.path.exists(dbpath):
+        return
+    print 'Creating sqlite database file'
+    conn = sqlite3.connect(dbpath)
+    conn.execute("create table articles(title, block INTEGER, " +
+            "position INTEGER, redirect_to)")
+
+    text_index_file = codecs.open("%s.processed.idx" % input_xml_file_name,
+            encoding='utf-8', mode='r')
+    line = text_index_file.readline()
+    while line:
+        parts = line.split()
+        if len(parts) > 0:
+            title_article = parts[0]
+            block_article = parts[1]
+            position_article = parts[2]
+            title_article = normalize_title(title_article)
+            if title_article not in pages_blacklist:
+                if title_article.find("'") > -1:
+                    title_article = title_article.replace("'", "\\'")
+                if title_article.find('"') > -1:
+                    title_article = title_article.replace('"', '')
+
+                command = 'insert into articles values ("%s", %s, %s, "%s")' \
+                     % (unicode(title_article), int(block_article),
+                    int(position_article), unicode(''))
+                print ".",
+                conn.execute(command)
+            else:
+                print "* Blacklisted %s " % title_article
+        line = text_index_file.readline()
+    conn.commit()
+    # add redirects
+    redirects_parser = RedirectParser(input_xml_file_name)
+    for origin in redirects_parser.redirects.keys():
+        origin = normalize_title(origin)
+        try:
+            destination = normalize_title(redirects_parser.redirects[origin])
+            if origin not in pages_blacklist and \
+                    destination not in pages_blacklist:
+                if origin.find("'") > -1:
+                    origin = origin.replace("'", "\\'")
+                if destination.find("'") > -1:
+                    destination = destination.replace("'", "\\'")
+                print ".",
+                conn.execute(
+                    'insert into articles values ("%s", %s, %s, "%s")' %
+                    (unicode(origin), 0, 0, unicode(destination)))
+            else:
+                print "* Blacklisted %s " % origin
+        except:
+            print "ERROR: origin %s destination %s" % (origin, destination)
+    text_index_file.close()
+    conn.commit()
+
+
+def create_bzip_table():
+    """
+    ../seek-bzip2/seek-bzip2/bzip-table <
+    eswiki-20110810-pages-articles.xml.processed.bz2 >
+    eswiki-20110810-pages-articles.xml.processed.bz2t
+    """
+    cmd = ['../bin/%s/bzip-table' % config.system_id]
+    bzip_file = open('%s.processed.bz2' % input_xml_file_name, mode='r')
+    table_file = open('%s.processed.bz2t' % input_xml_file_name, mode='w')
+    call(cmd, stdin=bzip_file, stdout=table_file, close_fds=True)
+
+if len(sys.argv) > 1:
+    if sys.argv[1] == '--delete_old':
+        if os.path.exists('%s.processed.bz2' % input_xml_file_name):
+            os.remove('%s.processed.bz2' % input_xml_file_name)
+        if os.path.exists('%s.processed.bz2t' % input_xml_file_name):
+            os.remove('%s.processed.bz2t' % input_xml_file_name)
+        if os.path.exists('%s.processed.idx' % input_xml_file_name):
+            os.remove('%s.processed.idx' % input_xml_file_name)
+        if os.path.exists('search.db'):
+            os.remove('search.db')
+
+if os.path.exists(config.blacklist_file_name):
+    pages_blacklisted_reader = FileListReader(config.blacklist_file_name)
+    pages_blacklist = pages_blacklisted_reader.list
+    print "Loaded %d blacklisted pages" % len(pages_blacklist)
+else:
+    pages_blacklist = []
+
+print 'Compressing .processed file'
+if not os.path.exists('%s.processed.bz2' % input_xml_file_name):
+    cmd = ['bzip2', '-zk', '%s.processed' % input_xml_file_name]
+    p = call(cmd)
+    if os.path.exists('%s.processed.bz2t' % input_xml_file_name):
+        os.remove('%s.processed.bz2t' % input_xml_file_name)
+else:
+    print '.bz2 already exists. Skipping'
+
+if not os.path.exists('%s.processed.bz2t' % input_xml_file_name):
+    print 'Creating bzip2 table file'
+    create_bzip_table()
+else:
+    print '.bz2t already exists. Skipping'
+
+if not os.path.exists('%s.processed.idx' % input_xml_file_name):
+    print 'Creating index file'
+    create_index(pages_blacklist)
+else:
+    print '.idx already exists. Skipping'
+
+create_sql_index(input_xml_file_name, pages_blacklist)