Web   ·   Wiki   ·   Activities   ·   Blog   ·   Lists   ·   Chat   ·   Meeting   ·   Bugs   ·   Git   ·   Translate   ·   Archive   ·   People   ·   Donate
summaryrefslogtreecommitdiffstats
path: root/create_index.py
blob: b46663f4246ea3707dda53e5d0999df430b634a9 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# create index
# use https://bitbucket.org/james_taylor/seek-bzip2

import codecs
import os
from subprocess import call, Popen, PIPE, STDOUT

input_xml_file_name = './eswiki-20111112-pages-articles.xml'

def create_index():
    output_file  = open("%s.processed.idx" % input_xml_file_name, mode='w')
    num_block = 1
    index_file  = open("%s.processed.bz2t" % input_xml_file_name, mode='r')
    index_line = index_file.readline()
    while index_line:
        parts = index_line.split()
        block_start = int(parts[0])
        print "Block %d starts at %d" % (num_block, block_start) 
        position = 0
        # extract the block
        bzip_file = open("%s.processed.bz2" % input_xml_file_name, mode='r')
        cmd = ['./seek-bzip2/seek-bunzip', str(block_start)]
        p = Popen(cmd, stdin=bzip_file, stdout=PIPE, stderr=STDOUT,
                close_fds=True)
        data_line = p.stdout.readline()
        while data_line:
            position += len(data_line)
            #print data_line
            if len(data_line) == 2:
                if ord(data_line[0]) == 1:
                    title = p.stdout.readline()
                    position += len(title)
                    # read article size
                    # size
                    size_line = p.stdout.readline()
                    position += len(size_line)
                    # \02
                    data_line = p.stdout.readline()
                    position += len(data_line)
                    output_file.write("%s %d %d\n" % \
                        (title[0:-1], num_block, position))
                    print "Article %s block %d position %d" % \
                        (title[0:-1], num_block, position)

            data_line = p.stdout.readline()

        num_block += 1
        index_line = index_file.readline()

    output_file.close()

def create_bzip_table():
    """
    ../seek-bzip2/seek-bzip2/bzip-table <
    eswiki-20110810-pages-articles.xml.processed.bz2 >
    eswiki-20110810-pages-articles.xml.processed.bz2t 
    """
    cmd = ['./seek-bzip2/bzip-table']
    bzip_file = open('%s.processed.bz2' % input_xml_file_name, mode='r')
    table_file = open('%s.processed.bz2t' % input_xml_file_name, mode='w')
    call(cmd, stdin=bzip_file, stdout=table_file, close_fds=True)


print 'Compressing .processed file'
if not os.path.exists('%s.processed.bz2' % input_xml_file_name):
    cmd = ['bzip2', '-zk', '%s.processed' % input_xml_file_name]
    p = call(cmd)
else:
    print '.bz2 already exists. Skipping'

print 'Creating bzip2 table file'
create_bzip_table()

print 'Creating index file'
create_index()