1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
|
#!/usr/bin/env python
# -*- coding: utf-8 -*-
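# Compress a processed dump, then create its bzip2 block table and article
# index. Requires the seek-bzip2 tools (bzip-table, seek-bunzip) from
# https://bitbucket.org/james_taylor/seek-bzip2 in ./seek-bzip2/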
import codecs
import os
from subprocess import call, Popen, PIPE, STDOUT
input_xml_file_name = './eswiki-20111112-pages-articles.xml'
def _index_block(stream, num_block, output_file):
    """Scan one decompressed block and record where each article starts.

    stream      -- file-like object yielding the block's decompressed bytes
                   (only ``readline()`` is used).
    num_block   -- 1-based number of the block, written into every entry.
    output_file -- open index file; one "<title> <block> <position>" line
                   is appended per article header found.

    ``position`` counts the bytes consumed from the start of the block, so
    the value written is the offset of the first byte after the header.
    """
    position = 0
    # iter(readline, b'') instead of ``for line in stream``: the header
    # fields below are fetched with extra readline() calls, and Python 2
    # file objects refuse to mix iteration with readline().
    for data_line in iter(stream.readline, b''):
        position += len(data_line)
        # An article header opens with a two-byte line whose first byte
        # is \x01; everything else is skipped.
        if len(data_line) != 2 or data_line[0:1] != b'\x01':
            continue
        title = stream.readline()
        position += len(title)
        # The article size line is consumed only to keep the offset right.
        size_line = stream.readline()
        position += len(size_line)
        # Presumably the \x02 marker line (per the original comment).
        marker_line = stream.readline()
        position += len(marker_line)
        # title[:-1] drops the trailing newline of the title line.
        entry = (title[:-1], num_block, position)
        output_file.write("%s %d %d\n" % entry)
        print("Article %s block %d position %d" % entry)


def create_index():
    """Build ``<input>.processed.idx`` from the bzip2 block table.

    ``<input>`` is the module-level ``input_xml_file_name``.  For every
    line of ``<input>.processed.bz2t`` the first whitespace-separated field
    is taken as the block start and handed to ``./seek-bzip2/seek-bunzip``,
    which receives ``<input>.processed.bz2`` on stdin.  Its output is
    scanned for article headers and one index line per article is written.

    Blocks are numbered from 1 in table order.  Blank table lines are
    skipped and do not consume a block number.

    Raises IOError/OSError if a file or the helper binary is missing, and
    ValueError if a block start is not an integer.
    """
    base_name = "%s.processed" % input_xml_file_name
    num_block = 1
    with open(base_name + ".idx", mode='w') as output_file:
        with open(base_name + ".bz2t", mode='r') as index_file:
            for index_line in index_file:
                parts = index_line.split()
                if not parts:
                    # Blank line: no block here, and parts[0] would raise.
                    continue
                block_start = int(parts[0])
                print("Block %d starts at %d" % (num_block, block_start))
                cmd = ['./seek-bzip2/seek-bunzip', str(block_start)]
                # A fresh handle per block, closed as soon as the child is
                # done, so descriptors do not pile up across blocks.
                with open(base_name + ".bz2", mode='rb') as bzip_file:
                    # NOTE(review): stderr is merged into stdout, so any
                    # diagnostics from seek-bunzip would be counted in the
                    # positions -- kept as in the original; confirm intent.
                    p = Popen(cmd, stdin=bzip_file, stdout=PIPE,
                              stderr=STDOUT, close_fds=True)
                    try:
                        _index_block(p.stdout, num_block, output_file)
                    finally:
                        # Release the pipe and reap the child so no zombie
                        # process is left behind for each block.
                        p.stdout.close()
                        p.wait()
                num_block += 1
def create_bzip_table():
    """Write the bzip2 block table for the compressed dump.

    Runs ``./seek-bzip2/bzip-table`` with ``<input>.processed.bz2`` on
    stdin and ``<input>.processed.bz2t`` on stdout, where ``<input>`` is
    the module-level ``input_xml_file_name``.  Shell equivalent::

        ./seek-bzip2/bzip-table < <input>.processed.bz2 \
            > <input>.processed.bz2t

    The exit status of the tool is not checked.  Raises IOError/OSError if
    the compressed file or the helper binary cannot be opened.
    """
    cmd = ['./seek-bzip2/bzip-table']
    bzip_name = '%s.processed.bz2' % input_xml_file_name
    table_name = '%s.processed.bz2t' % input_xml_file_name
    # Context managers close both handles even if launching the tool fails.
    with open(bzip_name, mode='rb') as bzip_file:
        with open(table_name, mode='w') as table_file:
            call(cmd, stdin=bzip_file, stdout=table_file, close_fds=True)
# Pipeline driver: compress the processed dump (unless an earlier run already
# produced the .bz2), then build the block table and the article index.
print('Compressing .processed file')
if os.path.exists('%s.processed.bz2' % input_xml_file_name):
    print('.bz2 already exists. Skipping')
else:
    # bzip2 flags: -z compress, -k keep the uncompressed input file.
    cmd = ['bzip2', '-zk', '%s.processed' % input_xml_file_name]
    p = call(cmd)

print('Creating bzip2 table file')
create_bzip_table()

print('Creating index file')
create_index()
|