Web   ·   Wiki   ·   Activities   ·   Blog   ·   Lists   ·   Chat   ·   Meeting   ·   Bugs   ·   Git   ·   Translate   ·   Archive   ·   People   ·   Donate
summary · refs · log · tree · commit · diff · stats
diff options
context:
space:
mode:
author Gonzalo Odiard <godiard@gmail.com> 2011-12-05 07:31:24 (GMT)
committer Gonzalo Odiard <godiard@gmail.com> 2011-12-05 07:31:24 (GMT)
commit 1c0b2a4263c0193cf7205ec80033959ac699b6c7 (patch)
tree a93f83155a725b969f5e0b675f44864fee69f309
parent e7fc1f03b3057ce32483b3e16e172519a037130c (diff)
create_index.py: use seek-bzip2 to create the index needed to decompress blocks
-rwxr-xr-x create_index.py 79
1 files changed, 79 insertions, 0 deletions
diff --git a/create_index.py b/create_index.py
new file mode 100755
index 0000000..b46663f
--- /dev/null
+++ b/create_index.py
@@ -0,0 +1,79 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# create index
+# use https://bitbucket.org/james_taylor/seek-bzip2
+
+import codecs
+import os
+from subprocess import call, Popen, PIPE, STDOUT
+
+input_xml_file_name = './eswiki-20111112-pages-articles.xml'
+
+def create_index():
+ output_file = open("%s.processed.idx" % input_xml_file_name, mode='w')
+ num_block = 1
+ index_file = open("%s.processed.bz2t" % input_xml_file_name, mode='r')
+ index_line = index_file.readline()
+ while index_line:
+ parts = index_line.split()
+ block_start = int(parts[0])
+ print "Block %d starts at %d" % (num_block, block_start)
+ position = 0
+ # extract the block
+ bzip_file = open("%s.processed.bz2" % input_xml_file_name, mode='r')
+ cmd = ['./seek-bzip2/seek-bunzip', str(block_start)]
+ p = Popen(cmd, stdin=bzip_file, stdout=PIPE, stderr=STDOUT,
+ close_fds=True)
+ data_line = p.stdout.readline()
+ while data_line:
+ position += len(data_line)
+ #print data_line
+ if len(data_line) == 2:
+ if ord(data_line[0]) == 1:
+ title = p.stdout.readline()
+ position += len(title)
+ # read article size
+ # size
+ size_line = p.stdout.readline()
+ position += len(size_line)
+ # \02
+ data_line = p.stdout.readline()
+ position += len(data_line)
+ output_file.write("%s %d %d\n" % \
+ (title[0:-1], num_block, position))
+ print "Article %s block %d position %d" % \
+ (title[0:-1], num_block, position)
+
+ data_line = p.stdout.readline()
+
+ num_block += 1
+ index_line = index_file.readline()
+
+ output_file.close()
+
+def create_bzip_table():
+ """
+ ../seek-bzip2/seek-bzip2/bzip-table <
+ eswiki-20110810-pages-articles.xml.processed.bz2 >
+ eswiki-20110810-pages-articles.xml.processed.bz2t
+ """
+ cmd = ['./seek-bzip2/bzip-table']
+ bzip_file = open('%s.processed.bz2' % input_xml_file_name, mode='r')
+ table_file = open('%s.processed.bz2t' % input_xml_file_name, mode='w')
+ call(cmd, stdin=bzip_file, stdout=table_file, close_fds=True)
+
+
+print 'Compressing .processed file'
+if not os.path.exists('%s.processed.bz2' % input_xml_file_name):
+ cmd = ['bzip2', '-zk', '%s.processed' % input_xml_file_name]
+ p = call(cmd)
+else:
+ print '.bz2 already exists. Skipping'
+
+print 'Creating bzip2 table file'
+create_bzip_table()
+
+print 'Creating index file'
+create_index()
+
+