#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Create the article index for a processed wiki dump.
# Uses https://bitbucket.org/james_taylor/seek-bzip2 to extract single
# bzip2 blocks without decompressing the whole file.

import codecs
import os
import sys
import re
from subprocess import call, Popen, PIPE, STDOUT

import config

input_xml_file_name = config.input_xml_file_name


def normalize_title(title):
    return title.strip().replace(' ', '_').capitalize()


def create_index(pages_blacklist):
    """Walk every bzip2 block of the .processed.bz2 file and record, for
    each article, the block number and the offset inside the decompressed
    block where the article body starts."""
    output_file = open("%s.processed.idx" % input_xml_file_name, mode='w')
    index_file = open("%s.processed.bz2t" % input_xml_file_name, mode='r')
    num_block = 1
    for index_line in index_file:
        # Every line of the .bz2t table starts with the bit offset of a
        # bzip2 block inside the compressed file.
        block_start = int(index_line.split()[0])
        print "Block %d starts at %d" % (num_block, block_start)
        position = 0
        # Decompress only this block with seek-bunzip.
        bzip_file = open("%s.processed.bz2" % input_xml_file_name, mode='r')
        cmd = ['../bin/%s/seek-bunzip' % config.system_id, str(block_start)]
        p = Popen(cmd, stdin=bzip_file, stdout=PIPE, stderr=STDOUT,
                  close_fds=True)
        data_line = p.stdout.readline()
        while data_line:
            position += len(data_line)
            # print data_line
            # An article record starts with a \x01 line, followed by the
            # title, the article size, and a \x02 delimiter line.
            if len(data_line) == 2 and ord(data_line[0]) == 1:
                title = p.stdout.readline()
                position += len(title)
                # Read the article size line.
                size_line = p.stdout.readline()
                position += len(size_line)
                # Read the \x02 delimiter.
                data_line = p.stdout.readline()
                position += len(data_line)
                # Normalize the title so that it matches the blacklist
                # entries and the space-separated .idx format.
                title = normalize_title(title)
                if title not in pages_blacklist:
                    output_file.write("%s %d %d\n" %
                                      (title, num_block, position))
                    print "Article %s block %d position %d" % \
                        (title, num_block, position)
                else:
                    print "* Blacklisted %s " % title
            data_line = p.stdout.readline()
        p.wait()
        bzip_file.close()
        num_block += 1
    index_file.close()
    output_file.close()


class FileListReader:

    def __init__(self, file_name):
        _file = codecs.open(file_name, encoding='utf-8', mode='r')
        self.list = []
        for line in _file:
            line = line.strip()
            if line:
                self.list.append(normalize_title(line))
        _file.close()


class RedirectParser:

    def __init__(self, file_name):
        self.link_re = re.compile(r'\[\[.*?\]\]')
        # Load the redirects: every line holds two [[...]] links,
        # the origin first and the destination second.
        input_redirects = codecs.open('%s.redirects_used' % file_name,
                                      encoding='utf-8', mode='r')
        self.redirects = {}
        for line in input_redirects:
            links = self.link_re.findall(line)
            if len(links) == 2:
                origin = links[0][2:-2]
                destination = links[1][2:-2]
                self.redirects[origin] = destination
        print "Loaded %d redirects" % len(self.redirects)
        input_redirects.close()


def create_sql_index(input_xml_file_name, pages_blacklist):
    import sqlite3

    dbpath = './search.db'
    if os.path.exists(dbpath):
        return

    print 'Creating sqlite database file'
    conn = sqlite3.connect(dbpath)
    conn.execute("create table articles(title, block INTEGER, "
                 "position INTEGER, redirect_to)")

    text_index_file = codecs.open("%s.processed.idx" % input_xml_file_name,
                                  encoding='utf-8', mode='r')
    for line in text_index_file:
        parts = line.split()
        if len(parts) != 3:
            continue
        title_article, block_article, position_article = parts
        title_article = normalize_title(title_article)
        if title_article not in pages_blacklist:
            # A parameterized query avoids the quoting problems of
            # building the SQL statement by hand.
            print ".",
            conn.execute('insert into articles values (?, ?, ?, ?)',
                         (title_article, int(block_article),
                          int(position_article), u''))
        else:
            print "* Blacklisted %s " % title_article
    text_index_file.close()
    conn.commit()
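
    # Add the redirect pages: they share the articles table, with block
    # and position set to 0, so a reader can resolve them through the
    # redirect_to column instead of a block offset.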
    redirects_parser = RedirectParser(input_xml_file_name)
    for origin, destination in redirects_parser.redirects.items():
        # Normalize after the lookup: the dictionary keys are the raw
        # titles read from the .redirects_used file.
        origin = normalize_title(origin)
        destination = normalize_title(destination)
        if origin not in pages_blacklist and \
                destination not in pages_blacklist:
            print ".",
            try:
                conn.execute('insert into articles values (?, ?, ?, ?)',
                             (origin, 0, 0, destination))
            except sqlite3.Error:
                print "ERROR: origin %s destination %s" % (origin,
                                                           destination)
        else:
            print "* Blacklisted %s " % origin
    conn.commit()
    conn.close()


def create_bzip_table():
    """Create the block table for the compressed dump, e.g.:

    ../seek-bzip2/seek-bzip2/bzip-table \
        < eswiki-20110810-pages-articles.xml.processed.bz2 \
        > eswiki-20110810-pages-articles.xml.processed.bz2t
    """
    cmd = ['../bin/%s/bzip-table' % config.system_id]
    bzip_file = open('%s.processed.bz2' % input_xml_file_name, mode='r')
    table_file = open('%s.processed.bz2t' % input_xml_file_name, mode='w')
    call(cmd, stdin=bzip_file, stdout=table_file, close_fds=True)
    bzip_file.close()
    table_file.close()


if len(sys.argv) > 1 and sys.argv[1] == '--delete_old':
    for path in ('%s.processed.bz2' % input_xml_file_name,
                 '%s.processed.bz2t' % input_xml_file_name,
                 '%s.processed.idx' % input_xml_file_name,
                 'search.db'):
        if os.path.exists(path):
            os.remove(path)

if os.path.exists(config.blacklist_file_name):
    pages_blacklist = FileListReader(config.blacklist_file_name).list
    print "Loaded %d blacklisted pages" % len(pages_blacklist)
else:
    pages_blacklist = []

print 'Compressing .processed file'
if not os.path.exists('%s.processed.bz2' % input_xml_file_name):
    # -k keeps the original .processed file around.
    call(['bzip2', '-zk', '%s.processed' % input_xml_file_name])
    # Any old block table no longer matches the fresh .bz2.
    if os.path.exists('%s.processed.bz2t' % input_xml_file_name):
        os.remove('%s.processed.bz2t' % input_xml_file_name)
else:
    print '.bz2 already exists. Skipping'

if not os.path.exists('%s.processed.bz2t' % input_xml_file_name):
    print 'Creating bzip2 table file'
    create_bzip_table()
else:
    print '.bz2t already exists. Skipping'

if not os.path.exists('%s.processed.idx' % input_xml_file_name):
    print 'Creating index file'
    create_index(pages_blacklist)
else:
    print '.idx already exists. Skipping'

create_sql_index(input_xml_file_name, pages_blacklist)
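
# A minimal lookup sketch (illustrative only, not executed by this script):
# locate an article in the search.db built above, so that seek-bunzip can
# later extract its block. The title u'Portada' is just an example value.
#
#   import sqlite3
#   conn = sqlite3.connect('search.db')
#   row = conn.execute('select block, position, redirect_to from articles'
#                      ' where title = ?', (u'Portada',)).fetchone()
#   if row is not None and row[2]:
#       # Follow one level of redirect (block and position are 0 here).
#       row = conn.execute('select block, position, redirect_to from articles'
#                          ' where title = ?', (row[2],)).fetchone()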