From 1c0b2a4263c0193cf7205ec80033959ac699b6c7 Mon Sep 17 00:00:00 2001 From: Gonzalo Odiard Date: Mon, 05 Dec 2011 07:31:24 +0000 Subject: create_index.py use seek_bzip2 to create the indes needed to decombress blocks --- diff --git a/create_index.py b/create_index.py new file mode 100755 index 0000000..b46663f --- /dev/null +++ b/create_index.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# create index +# use https://bitbucket.org/james_taylor/seek-bzip2 + +import codecs +import os +from subprocess import call, Popen, PIPE, STDOUT + +input_xml_file_name = './eswiki-20111112-pages-articles.xml' + +def create_index(): + output_file = open("%s.processed.idx" % input_xml_file_name, mode='w') + num_block = 1 + index_file = open("%s.processed.bz2t" % input_xml_file_name, mode='r') + index_line = index_file.readline() + while index_line: + parts = index_line.split() + block_start = int(parts[0]) + print "Block %d starts at %d" % (num_block, block_start) + position = 0 + # extract the block + bzip_file = open("%s.processed.bz2" % input_xml_file_name, mode='r') + cmd = ['./seek-bzip2/seek-bunzip', str(block_start)] + p = Popen(cmd, stdin=bzip_file, stdout=PIPE, stderr=STDOUT, + close_fds=True) + data_line = p.stdout.readline() + while data_line: + position += len(data_line) + #print data_line + if len(data_line) == 2: + if ord(data_line[0]) == 1: + title = p.stdout.readline() + position += len(title) + # read article size + # size + size_line = p.stdout.readline() + position += len(size_line) + # \02 + data_line = p.stdout.readline() + position += len(data_line) + output_file.write("%s %d %d\n" % \ + (title[0:-1], num_block, position)) + print "Article %s block %d position %d" % \ + (title[0:-1], num_block, position) + + data_line = p.stdout.readline() + + num_block += 1 + index_line = index_file.readline() + + output_file.close() + +def create_bzip_table(): + """ + ../seek-bzip2/seek-bzip2/bzip-table < + eswiki-20110810-pages-articles.xml.processed.bz2 > + eswiki-20110810-pages-articles.xml.processed.bz2t + """ + cmd = ['./seek-bzip2/bzip-table'] + bzip_file = open('%s.processed.bz2' % input_xml_file_name, mode='r') + table_file = open('%s.processed.bz2t' % input_xml_file_name, mode='w') + call(cmd, stdin=bzip_file, stdout=table_file, close_fds=True) + + +print 'Compressing .processed file' +if not os.path.exists('%s.processed.bz2' % input_xml_file_name): + cmd = ['bzip2', '-zk', '%s.processed' % input_xml_file_name] + p = call(cmd) +else: + print '.bz2 already exists. Skipping' + +print 'Creating bzip2 table file' +create_bzip_table() + +print 'Creating index file' +create_index() + + -- cgit v0.9.1