Web   ·   Wiki   ·   Activities   ·   Blog   ·   Lists   ·   Chat   ·   Meeting   ·   Bugs   ·   Git   ·   Translate   ·   Archive   ·   People   ·   Donate
summaryrefslogtreecommitdiffstats
path: root/tools2/test_index.py
diff options
context:
space:
mode:
Diffstat (limited to 'tools2/test_index.py')
-rwxr-xr-xtools2/test_index.py175
1 files changed, 175 insertions, 0 deletions
diff --git a/tools2/test_index.py b/tools2/test_index.py
new file mode 100755
index 0000000..bc35f87
--- /dev/null
+++ b/tools2/test_index.py
@@ -0,0 +1,175 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Test the index: look up an article and print its text with templates inlined
+
+import codecs
+import os
+from subprocess import Popen, PIPE, STDOUT
+import re
+import sys
+import config
+
+input_xml_file_name = config.input_xml_file_name
+
+
def normalize_title(title):
    """Return the canonical lookup form of an article title.

    Surrounding whitespace is removed, each space is turned into an
    underscore, and the result goes through ``capitalize()``, which
    upper-cases the first character and lower-cases all the others.
    """
    trimmed = title.strip()
    underscored = '_'.join(trimmed.split(' '))
    return underscored.capitalize()
+
+
class RedirectParser:
    """Title-to-title redirect table read from ``<file_name>.redirects_used``.

    Every line of the file is scanned for ``[[...]]`` links.  A line holding
    exactly two links is recorded as "first link redirects to second link";
    any other line is ignored.  Both titles are stored in the form produced
    by ``normalize_title``.
    """

    def __init__(self, file_name):
        """Load the redirects file that belongs to *file_name*.

        file_name -- base name; ``.redirects_used`` is appended to it and
                     the resulting file is read as UTF-8.
        """
        # The capture group makes findall() return the text between the
        # brackets.  Keeping the brackets would produce keys such as
        # '[[foo]]' that can never match the plain titles being looked up.
        self.link_re = re.compile(r'\[\[(.*?)\]\]')
        self.redirects = {}
        count = 0
        # 'with' closes the file even if a line fails to decode
        with codecs.open('%s.redirects_used' % file_name,
                         encoding='utf-8', mode='r') as input_redirects:
            for line in input_redirects:
                links = self.link_re.findall(line)
                if len(links) == 2:
                    origin, destination = links
                    self.redirects[normalize_title(origin)] = \
                        normalize_title(destination)
                count += 1
                # Progress counter rewritten in place thanks to the '\r'
                sys.stdout.write("Processing %d\r" % count)

    def get_redirected(self, article_title):
        """Return the redirect destination of *article_title*, or None.

        The title is passed through ``capitalize()`` before the lookup.
        None means no redirect is recorded for it.
        """
        return self.redirects.get(article_title.capitalize())
+
+
class DataRetriever():
    """Read article text out of the processed, block-compressed dump.

    Three files derived from ``data_files_base`` are used:

    * ``.processed.idx``  -- one article per line: title, block number and
      position, separated by whitespace.
    * ``.processed.bz2t`` -- one line per block; the first field of line N
      is the start value of block N that is handed to ``seek-bunzip``.
    * ``.processed.bz2``  -- the compressed data, fed to ``seek-bunzip`` on
      its standard input.
    """

    def __init__(self, data_files_base, redirects_checker,
                 template_namespace='Plantilla'):
        """Remember the file names and the redirect table to use.

        data_files_base    -- common prefix of the three data files.
        redirects_checker  -- object with ``get_redirected(title)`` that
                              returns a destination title or None.
        template_namespace -- prefix put in front of a template name to
                              build the title of the template article.
        """
        self._bzip_file_name = '%s.processed.bz2' % data_files_base
        self._bzip_table_file_name = '%s.processed.bz2t' % data_files_base
        self._index_file_name = '%s.processed.idx' % data_files_base
        self.template_re = re.compile(r'({{.*?}})')
        self.redirects_checker = redirects_checker
        self.template_namespace = template_namespace

    def _find_in_index(self, normalized_title):
        """Scan the index file for an exact title match.

        Returns ``(num_block, position)`` or ``(-1, -1)`` if no line
        matches.
        """
        with open(self._index_file_name, mode='r') as index_file:
            for index_line in index_file:
                words = index_line.split()
                # Blank or truncated lines cannot describe an article
                if len(words) >= 3 and words[0] == normalized_title:
                    return int(words[1]), int(words[2])
        return -1, -1

    def _get_article_position(self, article_title):
        """Return ``(num_block, position)`` of an article, following
        redirects; ``(-1, -1)`` when it cannot be found.
        """
        title = normalize_title(article_title)
        visited = set()
        # 'visited' stops a redirect cycle (A -> B -> A) from looping forever
        while title not in visited:
            visited.add(title)
            num_block, position = self._find_in_index(title)
            if num_block != -1:
                return num_block, position
            redirect = self.redirects_checker.get_redirected(title)
            if redirect is None:
                break
            title = normalize_title(redirect)
        return -1, -1

    def _get_block_start(self, num_block):
        """Return the start value of block *num_block* (1-based line number
        in the block table), or -1 if the table has no such line.
        """
        table_line = ''
        with open(self._bzip_table_file_name, mode='r') as bzip_table_file:
            for _ in range(num_block):
                table_line = bzip_table_file.readline()
        parts = table_line.split()
        if not parts:
            # Past the end of the table, num_block < 1, or a blank line
            return -1
        return int(parts[0])

    def get_expanded_article(self, article_title):
        """
        This method does not do real template expansion
        is only used to test all the needed templates and redirects are
        available.

        Every ``{{name|...}}`` found in the article is replaced by the text
        of the article ``<template_namespace>:<name>``; each template is
        fetched once per call.
        """
        text_article = self.get_text_article(article_title)
        templates_cache = {}
        pieces = []
        parts = self.template_re.split(text_article)
        # split() with one capture group alternates plain text (even index)
        # and matched templates (odd index)
        for index, part in enumerate(parts):
            if index % 2 == 0:
                pieces.append(part)
                continue
            # Drop the braces and keep what precedes the first '|'
            template_name = part[2:-2].split('|', 1)[0]
            template_name = normalize_title(
                '%s:%s' % (self.template_namespace, template_name))
            if template_name not in templates_cache:
                templates_cache[template_name] = \
                    self.get_text_article(template_name)
            pieces.append(templates_cache[template_name])
        return ''.join(pieces)

    def get_text_article(self, article_title):
        """Return the raw text of an article, or '' if it is unavailable.

        The block holding the article is decompressed with the external
        ``seek-bunzip`` tool.  Output lines are skipped until their total
        length reaches the article position, then collected up to a
        two-character line whose first character has code 3, or up to the
        end of the output if that marker never shows up.
        """
        sys.stdout.write("Looking for article %s\n" % article_title)
        num_block, position = self._get_article_position(article_title)
        if num_block == -1:
            sys.stdout.write("Article not found\n")
            return ''
        sys.stdout.write("Found at block %d position %d\n" %
                         (num_block, position))

        block_start = self._get_block_start(num_block)
        if block_start == -1:
            return ''

        # extract the block
        cmd = ['../bin/%s/seek-bunzip' % config.system_id, str(block_start)]
        lines = []
        with open(self._bzip_file_name, mode='rb') as bzip_file:
            # NOTE(review): stderr is merged into stdout as in the original,
            # so tool diagnostics would end up inside the text - confirm
            # that this is wanted.
            p = Popen(cmd, stdin=bzip_file, stdout=PIPE, stderr=STDOUT,
                      close_fds=True)
            try:
                while position > 0:
                    line = p.stdout.readline()
                    if not line:
                        # Output ended early; without this check the loop
                        # would never terminate
                        break
                    position -= len(line)

                while True:
                    line = p.stdout.readline()
                    if not line:
                        break
                    if len(line) == 2 and ord(line[0]) == 3:
                        break
                    lines.append(line)
            finally:
                # Close our end first so a still-writing child is not left
                # blocked, then reap it
                p.stdout.close()
                p.wait()
        return ''.join(lines)
+
+
def main(argv=None):
    """Print the expanded text of the article named on the command line.

    argv -- argument list including the program name; defaults to
            ``sys.argv``.

    Returns 0 after printing the article, or 1 when no page title was
    given (a usage line is printed in that case).
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        sys.stdout.write("Use ../tools2/test_index.py page_title\n")
        return 1
    page_title = argv[1]

    redirects_checker = RedirectParser(input_xml_file_name)
    data_retriever = DataRetriever(input_xml_file_name, redirects_checker)
    sys.stdout.write('%s\n' % data_retriever.get_expanded_article(page_title))
    return 0


if __name__ == '__main__':
    # sys.exit instead of the site-provided exit(), and a non-zero status
    # when the arguments are wrong
    sys.exit(main())