1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
|
#!/usr/bin/env python
# -*- coding: utf-8 -*-
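# Retrieve article text from the processed bzip2 dump, using the SQLite
# title index (search.db) to locate each article.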
from subprocess import Popen, PIPE, STDOUT
import re
import os
import logging
import sqlite3
def normalize_title(title):
    """Return *title* in the form used for lookups in the title index.

    Surrounding whitespace is removed, every remaining space becomes an
    underscore, and ``capitalize()`` is applied: the first character is
    upper-cased and all following characters are lower-cased, so
    ``'New York'`` becomes ``'New_york'``.
    """
    return title.strip().replace(' ', '_').capitalize()
class DataRetriever():
    """Read article text out of a processed bzip2 dump.

    A title is resolved to a ``(block number, position)`` pair through the
    ``articles`` table of ``search.db``.  The block number is mapped to a
    start offset through the ``.processed.bz2t`` table file, and the block
    is decompressed by the external ``seek-bunzip`` helper.

    Rows of ``articles`` are read positionally: column 0 is the title,
    1 the block number, 2 the position inside the block and 3 the title a
    redirect points to.
    """

    # Number of bound parameters sent per statement.  SQLite limits how
    # many ``?`` placeholders a statement may carry (999 is the historical
    # default), so long title lists are queried in slices.
    _MAX_SQL_PARAMS = 500

    def __init__(self, system_id, data_files_base):
        """Remember where the data files live.

        system_id -- directory name under ``./bin`` holding ``seek-bunzip``.
        data_files_base -- path prefix of the ``.processed.bz2`` and
            ``.processed.bz2t`` files; ``search.db`` is expected in the
            same directory.
        """
        self.system_id = system_id
        self._bzip_file_name = '%s.processed.bz2' % data_files_base
        self._bzip_table_file_name = '%s.processed.bz2t' % data_files_base
        self.template_re = re.compile('({{.*?}})')
        base_path = os.path.dirname(data_files_base)
        self._db_path = os.path.join(base_path, "search.db")
        # TODO: I need control cache size
        self.templates_cache = {}

    @staticmethod
    def _as_text(value):
        """Return *value* as text so it can be bound as an SQL parameter.

        Python 2's sqlite3 refuses 8-bit byte strings as parameters, so
        byte strings are decoded as UTF-8 (the encoding SQLite itself
        assumes for statement text).  Text values pass through unchanged.
        """
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return value

    def check_existence(self, article_title):
        """Return True when *article_title* resolves to a stored article."""
        # _get_article_position() normalizes the title itself.
        num_block, posi = self._get_article_position(article_title)
        return num_block > -1 and posi > -1

    def _get_article_position(self, article_title):
        """Return ``(num_block, position)`` for *article_title*.

        Redirects (rows whose block and position are both 0) are followed
        through their redirect target.  ``(-1, -1)`` is returned when the
        title is unknown, when a redirect has no target, or when the
        redirects form a cycle.
        """
        # Double quotes are dropped from the lookup key, as this method
        # has always done, so the same rows are matched as before.
        title = normalize_title(article_title).replace('"', '')
        visited = set()
        conn = sqlite3.connect(self._db_path)
        try:
            while True:
                if title in visited:
                    # Redirect cycle: stop instead of looping forever.
                    logging.error('Prevent recursion')
                    return -1, -1
                visited.add(title)
                row = conn.execute(
                    'SELECT * from articles where title = ?',
                    (self._as_text(title),)).fetchone()
                if row is None:
                    return -1, -1
                logging.debug('Search article %s returns %s', title, row)
                num_block, position, redirect_to = row[1], row[2], row[3]
                if num_block != 0 or position != 0:
                    return num_block, position
                # Block and position 0 mark a redirect: search again with
                # the redirect_to value.
                if not redirect_to:
                    return -1, -1
                title = normalize_title(redirect_to).replace('"', '')
        finally:
            conn.close()

    def check_existence_list(self, article_title_list):
        """Return the stored titles matching any title in the given list.

        Every title is normalized before the lookup.  The result holds the
        titles as stored in the index (column 0); an empty list is
        returned when nothing matches.
        """
        if not article_title_list:
            return []
        # Drop duplicates so a title never lands in two query slices and
        # gets reported twice.
        titles = []
        known = set()
        for article_title in article_title_list:
            title = self._as_text(normalize_title(article_title))
            if title not in known:
                known.add(title)
                titles.append(title)
        articles = []
        conn = sqlite3.connect(self._db_path)
        try:
            for start in range(0, len(titles), self._MAX_SQL_PARAMS):
                chunk = titles[start:start + self._MAX_SQL_PARAMS]
                placeholders = ','.join('?' * len(chunk))
                sql = ('SELECT * from articles where title in (%s)'
                       % placeholders)
                articles.extend(row[0] for row in conn.execute(sql, chunk))
        finally:
            conn.close()
        return articles

    def search(self, article_title):
        """Return the stored titles that contain *article_title*.

        The text is used as given (it is not normalized) inside a
        ``LIKE '%...%'`` pattern, so ``%`` and ``_`` in it act as
        wildcards.  An empty list is returned when nothing matches.
        """
        search_word = u'%' + self._as_text(article_title) + u'%'
        conn = sqlite3.connect(self._db_path)
        try:
            rows = conn.execute(
                'SELECT * from articles where title like ?', (search_word,))
            return [row[0] for row in rows]
        finally:
            conn.close()

    def _get_block_start(self, num_block):
        """Return the start value of block *num_block*, or -1.

        Blocks are numbered from 1: line N of the table file describes
        block N and its first whitespace-separated field is returned as
        an int.  -1 is returned when *num_block* is below 1 or beyond the
        end of the table.
        """
        if num_block < 1:
            return -1
        with open(self._bzip_table_file_name, mode='r') as bzip_table_file:
            table_line = ''
            for _ in range(num_block):
                table_line = bzip_table_file.readline()
                if table_line == '':
                    # Ran off the end of the table.
                    return -1
        return int(table_line.split()[0])

    def get_expanded_article(self, article_title,
                             template_namespace='Plantilla'):
        """
        This method does not do real template expansion
        is only used to test all the needed templates and redirects are
        available.

        Every ``{{name|...}}`` found in the article is replaced by the
        text of the article ``<template_namespace>:<name>``; fetched
        templates are kept in ``self.templates_cache``.
        """
        text_article = self.get_text_article(article_title)
        expanded = []
        for part in self.template_re.split(text_article):
            if not part.startswith('{{'):
                expanded.append(part)
                continue
            # Strip the braces and keep only the name before the first
            # parameter separator.
            template_name = part[2:-2].split('|', 1)[0]
            template_name = normalize_title(
                '%s:%s' % (template_namespace, template_name))
            if template_name not in self.templates_cache:
                self.templates_cache[template_name] = \
                    self.get_text_article(template_name)
            expanded.append(self.templates_cache[template_name])
        return ''.join(expanded)

    def get_text_article(self, article_title):
        """Return the text of *article_title*, or '' when it is not found."""
        num_block, position = self._get_article_position(article_title)
        return self._get_block_text(num_block, position)

    def _get_block_text(self, num_block, position):
        """Return the article stored at *position* of block *num_block*.

        When a block ends before the end-of-article marker is seen,
        reading continues at the beginning of the following block.  An
        empty string is returned when the block does not exist.
        """
        chunks = []
        while True:
            block_start = self._get_block_start(num_block)
            if block_start == -1:
                break
            if self._read_block(block_start, position, chunks):
                break
            # End of block reached first: the text goes on in the next.
            num_block += 1
            position = 0
        output = b''.join(chunks)
        if not isinstance(output, str):
            # On Python 3 the pipe yields bytes; hand callers the native
            # str type they receive on Python 2.
            # NOTE(review): assumes the dump is UTF-8 encoded -- confirm.
            output = output.decode('utf-8', 'replace')
        return output

    def _read_block(self, block_start, position, chunks):
        """Decompress one block and append the article lines to *chunks*.

        Returns True when the article is finished (end marker found, or
        *position* lies beyond the block) and False when the block ended
        before the end marker was seen.
        """
        cmd = ['./bin/%s/seek-bunzip' % self.system_id, str(block_start)]
        with open(self._bzip_file_name, mode='rb') as bzip_file:
            p = Popen(cmd, stdin=bzip_file, stdout=PIPE, stderr=STDOUT,
                      close_fds=True)
            try:
                # Skip whole lines until *position* bytes were consumed.
                while position > 0:
                    line = p.stdout.readline()
                    if not line:
                        logging.error('Position beyond end of block at %s',
                                      block_start)
                        return True
                    position -= len(line)
                while True:
                    line = p.stdout.readline()
                    if not line:
                        return False
                    # A two-byte line starting with byte 0x03 ends the
                    # article.
                    if len(line) == 2 and line[:1] == b'\x03':
                        return True
                    chunks.append(line)
            finally:
                p.stdout.close()
                # Do not leave the helper behind once we stop reading.
                if p.poll() is None:
                    p.terminate()
                p.wait()
|