test_index.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# create index

import codecs
import os
from subprocess import Popen, PIPE, STDOUT


class FileListReader():

    def __init__(self, file_name):
        _file = codecs.open(file_name,
                                encoding='utf-8', mode='r')
        self.list = []
        line = _file.readline()
        while line:
            self.list.append(line.strip())
            line = _file.readline()


class DataRetriever():

    def __init__(self, data_files_base):
        self._bzip_file_name = '%s.processed.bz2' % data_files_base
        self._bzip_table_file_name = '%s.processed.bz2t' % data_files_base
        self._index_file_name = '%s.processed.idx' % data_files_base

    def _get_article_position(self, article_title):
        index_file = codecs.open(self._index_file_name, encoding='utf-8',
                mode='r')

        index_line = index_file.readline()
        num_block = -1
        position = -1
        while index_line:
            words = index_line.split()
            article = words[0]
            if article == article_title:
                num_block = int(words[1])
                position = int(words[2])
                break
            index_line = index_file.readline()
        index_file.close()
        return num_block, position

    def _get_block_start(self, num_block):
        bzip_table_file = open(self._bzip_table_file_name, mode='r')
        n = num_block
        while n > 0:
            table_line = bzip_table_file.readline()
            n -= 1

        parts = table_line.split()
        block_start = int(parts[0])
        bzip_table_file.close()
        return block_start

    def get_text_article(self, article_title):
        output = ''
        num_block, position = self._get_article_position(article_title)
        print "Looking for article %s at block %d position %d" % \
                (article_title, num_block, position)

        block_start = self._get_block_start(num_block)
        print "Block %d starts at %d" % (num_block, block_start)

        # extract the block
        bzip_file = open(self._bzip_file_name, mode='r')
        cmd = ['./seek-bzip2/seek-bunzip', str(block_start)]
        p = Popen(cmd, stdin=bzip_file, stdout=PIPE, stderr=STDOUT,
                close_fds=True)

        while position > 0:
            line = p.stdout.readline()
            position -= len(line)

        finish = False
        while not finish:
            line = p.stdout.readline()
            if len(line) == 2:
                if ord(line[0]) == 3:
                    finish = True
                    break
            output += line
        return output

if __name__ == '__main__':
    data_retriever = DataRetriever('./eswiki-20111112-pages-articles.xml')
    print data_retriever.get_text_article('Argentina')