dataretriever.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# create index

from subprocess import Popen, PIPE, STDOUT
import re
import os
import logging

import sqlite3


def normalize_title(title):
    return title.strip().replace(' ', '_').capitalize()


class DataRetriever():

    def __init__(self, system_id, data_files_base):
        self.system_id = system_id
        self._bzip_file_name = '%s.processed.bz2' % data_files_base
        self._bzip_table_file_name = '%s.processed.bz2t' % data_files_base
        self.template_re = re.compile('({{.*?}})')
        base_path = os.path.dirname(data_files_base)
        self._db_path = os.path.join(base_path, "search.db")
        # TODO: I need control cache size
        self.templates_cache = {}

    def check_existence(self, article_title):
        article_title = normalize_title(article_title)
        num_block, posi = self._get_article_position(article_title)
        return num_block > -1 and posi > -1

    def _get_article_position(self, article_title):
        article_title = normalize_title(article_title)
        # look at the title in the index database
        conn = sqlite3.connect(self._db_path)
        if article_title.find('"'):
            article_title = article_title.replace('"', '')

        sql = 'SELECT * from articles where title ="%s"' % article_title
        results = conn.execute(sql)
        try:
            row = results.next()
            num_block = row[1]
            position = row[2]
            redirect_to = row[3]
            logging.error('Search article %s returns %s',
                    article_title, row)
        except:
            num_block = -1
            position = -1
        conn.close()

        if num_block == 0 and position == 0:
            # if block and position = 0 serach with the redirect_to value
            num_block2, position2 = \
                    self._get_article_position(redirect_to)
            if num_block2 == 0 and position2 == 0:
                logging.error('Prevent recursion')
                return -1, -1
            else:
                return num_block2, position2
        return num_block, position

    def check_existence_list(self, article_title_list):
        if not article_title_list:
            return []

        conn = sqlite3.connect(self._db_path)
        search_list = '('
        for article_title in article_title_list:
            search_list = search_list + \
                    '"' + normalize_title(article_title) + '",'
        search_list = search_list[:-1] + ')'
        #logging.error(search_list)
        sql = 'SELECT * from articles where title in %s' % search_list
        #logging.error(sql)
        results = conn.execute(sql)
        row = results.next()
        articles = []
        try:
            while row:
                articles.append(row[0])
                row = results.next()
        except:
            pass
        conn.close()
        return articles

    def search(self, article_title):
        conn = sqlite3.connect(self._db_path)
        search_word = '%' + article_title + '%'
        sql = "SELECT * from articles where title like'%s'" % search_word
        results = conn.execute(sql)
        row = results.next()
        articles = []
        try:
            while row:
                articles.append(row[0])
                row = results.next()
        except:
            pass
        conn.close()
        return articles

    def _get_block_start(self, num_block):
        bzip_table_file = open(self._bzip_table_file_name, mode='r')
        n = num_block
        table_line = ''
        while n > 0:
            table_line = bzip_table_file.readline()
            n -= 1
        if table_line == '':
            return -1
        parts = table_line.split()
        block_start = int(parts[0])
        bzip_table_file.close()
        return block_start

    def get_expanded_article(self, article_title):
        """
        This method does not do real template expansion
        is only used to test all the needed templates and redirects are
        available.
        """
        text_article = self.get_text_article(article_title)
        expanded_article = ''
        parts = self.template_re.split(text_article)
        for part in parts:
            if part.startswith('{{'):
                part = part[2:-2]
                #print "TEMPLATE: %s" % part
                if part.find('|') > -1:
                    template_name = part[:part.find('|')]
                else:
                    template_name = part
                # TODO: Plantilla should be a parameter
                template_name = normalize_title('Plantilla:%s' % template_name)
                if template_name in self.templates_cache:
                    expanded_article += self.templates_cache[template_name]
                else:
                    templates_content = self.get_text_article(template_name)
                    expanded_article += templates_content
                    self.templates_cache[template_name] = templates_content
            else:
                expanded_article += part
        return expanded_article

    def get_text_article(self, article_title):
        #print "Looking for article %s" % article_title
        num_block, position = self._get_article_position(article_title)
        #print "Found at block %d position %d" % (num_block, position)
        return self._get_block_text(num_block, position)

    def _get_block_text(self, num_block, position):
        output = ''
        block_start = self._get_block_start(num_block)
        #print "Block %d starts at %d" % (num_block, block_start)
        if block_start == -1:
            return ""

        # extract the block
        bzip_file = open(self._bzip_file_name, mode='r')
        cmd = ['./bin/%s/seek-bunzip' % self.system_id, str(block_start)]
        p = Popen(cmd, stdin=bzip_file, stdout=PIPE, stderr=STDOUT,
                close_fds=True)

        while position > 0:
            line = p.stdout.readline()
            position -= len(line)

        finish = False
        while not finish:
            line = p.stdout.readline()
            if line == '':
                # end of block?
                output += self._get_block_text(num_block + 1, 0)
                break
            if len(line) == 2:
                if ord(line[0]) == 3:
                    finish = True
                    break
            output += line
        p.stdout.close()
        #logging.error(output)
        return output