Web   ·   Wiki   ·   Activities   ·   Blog   ·   Lists   ·   Chat   ·   Meeting   ·   Bugs   ·   Git   ·   Translate   ·   Archive   ·   People   ·   Donate
summaryrefslogtreecommitdiffstats
path: root/tools2/create_index.py
diff options
context:
space:
mode:
Diffstat (limited to 'tools2/create_index.py')
-rwxr-xr-xtools2/create_index.py210
1 file changed, 210 insertions, 0 deletions
diff --git a/tools2/create_index.py b/tools2/create_index.py
new file mode 100755
index 0000000..51550b8
--- /dev/null
+++ b/tools2/create_index.py
@@ -0,0 +1,210 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# create index
+# use https://bitbucket.org/james_taylor/seek-bzip2
+
+import codecs
+import os
+import sys
+from subprocess import call, Popen, PIPE, STDOUT
+import shutil
+import re
+import logging
+import config
+
+input_xml_file_name = config.input_xml_file_name
+
+
def normalize_title(title):
    # Canonical article-title form used throughout this tool:
    # surrounding whitespace trimmed, spaces turned into underscores,
    # first character upper-cased and the remainder lower-cased.
    trimmed = title.strip()
    underscored = trimmed.replace(' ', '_')
    return underscored.capitalize()
+
+
+def create_index(pages_blacklist):
+ output_file = open("%s.processed.idx" % input_xml_file_name, mode='w')
+ num_block = 1
+ index_file = open("%s.processed.bz2t" % input_xml_file_name, mode='r')
+ index_line = index_file.readline()
+ while index_line:
+ parts = index_line.split()
+ block_start = int(parts[0])
+ print "Block %d starts at %d" % (num_block, block_start)
+ position = 0
+ # extract the block
+ bzip_file = open("%s.processed.bz2" % input_xml_file_name, mode='r')
+ cmd = ['../bin/%s/seek-bunzip' % config.system_id, str(block_start)]
+ p = Popen(cmd, stdin=bzip_file, stdout=PIPE, stderr=STDOUT,
+ close_fds=True)
+ data_line = p.stdout.readline()
+ while data_line:
+ position += len(data_line)
+ #print data_line
+ if len(data_line) == 2:
+ if ord(data_line[0]) == 1:
+ title = p.stdout.readline()
+ position += len(title)
+ # read article size
+ # size
+ size_line = p.stdout.readline()
+ position += len(size_line)
+ # \02
+ data_line = p.stdout.readline()
+ position += len(data_line)
+ title = title[0:-1].strip().capitalize()
+ if title not in pages_blacklist:
+ output_file.write("%s %d %d\n" % \
+ (title, num_block, position))
+ print "Article %s block %d position %d" % \
+ (title, num_block, position)
+ else:
+ print "* Blacklisted %s " % title
+
+ data_line = p.stdout.readline()
+
+ num_block += 1
+ index_line = index_file.readline()
+
+ output_file.close()
+
+
class FileListReader():
    """Read a UTF-8 list file (one title per line) into ``self.list``,
    passing every entry through normalize_title()."""

    def __init__(self, file_name):
        self.list = []
        _file = codecs.open(file_name,
                            encoding='utf-8', mode='r')
        try:
            # iterate the file directly instead of a manual readline loop
            for line in _file:
                self.list.append(normalize_title(line))
        finally:
            # fix: the original never closed the file handle
            _file.close()
+
+
class RedirectParser:
    """Load '<file_name>.redirects_used' into ``self.redirects``, a dict
    mapping origin title -> destination title.

    A valid redirect line contains exactly two ``[[...]]`` wiki links;
    the bracket pairs are stripped but the titles are stored
    un-normalized, exactly as they appear in the file.
    """

    def __init__(self, file_name):
        # fix: raw string literal -- '\[' is an invalid escape outside
        # raw strings and only works by accident
        self.link_re = re.compile(r'\[\[.*?\]\]')
        # Load redirects
        input_redirects = codecs.open('%s.redirects_used' % file_name,
                                      encoding='utf-8', mode='r')

        self.redirects = {}
        # fix: iterate lazily instead of readlines(), which slurped the
        # whole (potentially huge) file into memory
        for line in input_redirects:
            links = self.link_re.findall(unicode(line))
            if len(links) == 2:
                origin = links[0][2:-2]        # strip '[[' and ']]'
                destination = links[1][2:-2]
                self.redirects[origin] = destination
        # NOTE(review): this is a status message logged at ERROR level;
        # kept as-is to preserve observable behavior -- consider INFO
        logging.error("Loaded %d redirects" % len(self.redirects))
        input_redirects.close()
+
+
+def create_sql_index(input_xml_file_name, pages_blacklist):
+ import sqlite3
+ dbpath = './search.db'
+ if os.path.exists(dbpath):
+ return
+ print 'Creating sqlite database file'
+ conn = sqlite3.connect(dbpath)
+ conn.execute("create table articles(title, block INTEGER, " +
+ "position INTEGER, redirect_to)")
+
+ text_index_file = codecs.open("%s.processed.idx" % input_xml_file_name,
+ encoding='utf-8', mode='r')
+ line = text_index_file.readline()
+ while line:
+ parts = line.split()
+ if len(parts) > 0:
+ title_article = parts[0]
+ block_article = parts[1]
+ position_article = parts[2]
+ title_article = normalize_title(title_article)
+ if title_article not in pages_blacklist:
+ if title_article.find("'") > -1:
+ title_article = title_article.replace("'", "\\'")
+ if title_article.find('"') > -1:
+ title_article = title_article.replace('"', '')
+
+ command = 'insert into articles values ("%s", %s, %s, "%s")' \
+ % (unicode(title_article), int(block_article),
+ int(position_article), unicode(''))
+ print ".",
+ conn.execute(command)
+ else:
+ print "* Blacklisted %s " % title_article
+ line = text_index_file.readline()
+ conn.commit()
+ # add redirects
+ redirects_parser = RedirectParser(input_xml_file_name)
+ for origin in redirects_parser.redirects.keys():
+ origin = normalize_title(origin)
+ try:
+ destination = normalize_title(redirects_parser.redirects[origin])
+ if origin not in pages_blacklist and \
+ destination not in pages_blacklist:
+ if origin.find("'") > -1:
+ origin = origin.replace("'", "\\'")
+ if destination.find("'") > -1:
+ destination = destination.replace("'", "\\'")
+ print ".",
+ conn.execute(
+ 'insert into articles values ("%s", %s, %s, "%s")' %
+ (unicode(origin), 0, 0, unicode(destination)))
+ else:
+ print "* Blacklisted %s " % origin
+ except:
+ print "ERROR: origin %s destination %s" % (origin, destination)
+ text_index_file.close()
+ conn.commit()
+
+
def create_bzip_table():
    """Write the bzip2 block-offset table <input>.processed.bz2t.

    Equivalent to:
    ../seek-bzip2/seek-bzip2/bzip-table <
    eswiki-20110810-pages-articles.xml.processed.bz2 >
    eswiki-20110810-pages-articles.xml.processed.bz2t
    """
    cmd = ['../bin/%s/bzip-table' % config.system_id]
    bzip_file = open('%s.processed.bz2' % input_xml_file_name, mode='r')
    table_file = open('%s.processed.bz2t' % input_xml_file_name, mode='w')
    try:
        call(cmd, stdin=bzip_file, stdout=table_file, close_fds=True)
    finally:
        # fix: close both handles -- the original leaked them
        bzip_file.close()
        table_file.close()
+
# ---------------------------------------------------------------------------
# Script driver.  Pipeline:
#   1. optionally delete derived artifacts (--delete_old)
#   2. load the blacklist (if config.blacklist_file_name exists)
#   3. bzip2-compress the .processed dump (kept: -k)
#   4. build the block-offset table (.bz2t)
#   5. build the text index (.idx)
#   6. build the sqlite search database (search.db)
# Each compression/indexing step is skipped when its output file already
# exists, so the script can be re-run incrementally.
# ---------------------------------------------------------------------------
if len(sys.argv) > 1:
    if sys.argv[1] == '--delete_old':
        # remove every derived artifact so the whole pipeline runs again
        if os.path.exists('%s.processed.bz2' % input_xml_file_name):
            os.remove('%s.processed.bz2' % input_xml_file_name)
        if os.path.exists('%s.processed.bz2t' % input_xml_file_name):
            os.remove('%s.processed.bz2t' % input_xml_file_name)
        if os.path.exists('%s.processed.idx' % input_xml_file_name):
            os.remove('%s.processed.idx' % input_xml_file_name)
        if os.path.exists('search.db'):
            os.remove('search.db')

if os.path.exists(config.blacklist_file_name):
    pages_blacklisted_reader = FileListReader(config.blacklist_file_name)
    pages_blacklist = pages_blacklisted_reader.list
    print "Loaded %d blacklisted pages" % len(pages_blacklist)
else:
    # no blacklist file: nothing is filtered out
    pages_blacklist = []

print 'Compressing .processed file'
if not os.path.exists('%s.processed.bz2' % input_xml_file_name):
    cmd = ['bzip2', '-zk', '%s.processed' % input_xml_file_name]
    p = call(cmd)
    # a fresh .bz2 invalidates any old offset table, so drop it
    if os.path.exists('%s.processed.bz2t' % input_xml_file_name):
        os.remove('%s.processed.bz2t' % input_xml_file_name)
else:
    print '.bz2 already exists. Skipping'

if not os.path.exists('%s.processed.bz2t' % input_xml_file_name):
    print 'Creating bzip2 table file'
    create_bzip_table()
else:
    print '.bz2t already exists. Skipping'

if not os.path.exists('%s.processed.idx' % input_xml_file_name):
    print 'Creating index file'
    create_index(pages_blacklist)
else:
    print '.idx already exists. Skipping'

# sqlite step does its own "already exists" check on search.db
create_sql_index(input_xml_file_name, pages_blacklist)