diff options
Diffstat (limited to 'tools2/expandtemplates.py')
-rwxr-xr-x | tools2/expandtemplates.py | 134 |
1 files changed, 134 insertions, 0 deletions
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2007, One Laptop Per Child
#
# License: GPLv2
#
# Usage:
#   ./tools2/expandtemplates.py directory 2>expand.log
# Ex:
#   ./tools2/expandtemplates.py es_lat
"""Expand the templates of the selected articles and dump the results.

Python 2 only (relies on reload(sys), sys.setdefaultencoding and unicode).

Command line:
    expandtemplates.py directory [--only=TITLE] [--start_at=TITLE] [--stdout]

    directory         first argument; joined with the XML file name taken
                      from config.input_xml_file_name to build the base path.
    --only=TITLE      process just this article instead of the selection list.
    --start_at=TITLE  skip the selection list up to (not including) TITLE.
    --stdout          write records to stdout instead of
                      '<path>.processed_expanded'.

Unless --only is given, titles come from FileListReader over
'<path>.pages_selected-level-1'.  For every processed article one record is
written: START_HEADING, the title, len(article_text), START_TEXT, the
expanded text and END_TEXT, each followed by a newline.  Progress and
skip/error messages go to stderr.
"""

import sys
reload(sys)
# Important: stdout and stderr will carry UTF-8 text.  Without forcing the
# default encoding, implicit str/unicode conversions fail.
sys.setdefaultencoding('utf-8')

# The project modules imported below are looked up in the current directory.
sys.path.append('.')

import codecs
import itertools
import os
import re

from server import WPWikiDB
from make_selection import FileListReader
import config

# Control characters used as record delimiters in the output file.
START_HEADING = chr(1)
START_TEXT = chr(2)
END_TEXT = chr(3)

# __main__

only_page = None
start_at = None
stdout = False

if len(sys.argv) > 1:
    # The directory is always the first argument; options may follow it.
    directory = sys.argv[1]

    for arg in sys.argv[1:]:
        if arg.startswith('--only='):
            only_page = arg[len('--only='):]
            print("Processing only article '%s'" % only_page)
        elif arg.startswith('--start_at='):
            start_at = arg[len('--start_at='):]
            print("Starting to process at article '%s'" % start_at)
        elif arg == '--stdout':
            stdout = True
            print("Writing output to stdout")
else:
    print("Use expandtemplates.py directory")
    # sys.exit rather than the site-provided exit(); a usage error must not
    # report success to the calling shell.
    sys.exit(1)

# Drop everything up to and including the first '/' of the configured name.
# NOTE(review): only the first path component is stripped, not the whole
# directory part -- confirm that config.input_xml_file_name never has more
# than one directory level.
xml_file_name = config.input_xml_file_name
_head, _sep, _tail = xml_file_name.partition('/')
if _sep:
    xml_file_name = _tail
path = os.path.join(directory, xml_file_name)

if only_page is not None:
    articles_list = [unicode(only_page)]
else:
    articles_reader = FileListReader('%s.pages_selected-level-1' % path)
    articles_list = articles_reader.list
    if start_at is not None:
        # Keep the first title equal to start_at and everything after it;
        # the result is empty when start_at never appears.
        articles_list = list(itertools.dropwhile(
            lambda candidate: candidate != start_at, articles_list))

# The language code is the first two characters of the XML file name.
lang = os.path.basename(path)[0:2]

templateprefix = config.TEMPLATE_NAMESPACES[0]

# Load the blacklist only once: one entry per line, trailing whitespace
# removed, decoded from UTF-8.
templateblacklist = set()
templateblacklistpath = os.path.join(os.path.dirname(path),
                                     'template_blacklist')
if os.path.exists(templateblacklistpath):
    with open(templateblacklistpath, 'r') as f:
        for line in f:
            templateblacklist.add(line.rstrip().decode('utf8'))

wikidb = WPWikiDB(path, lang, templateprefix, templateblacklist)

# Titles starting with the template namespace prefix or with 'Wikipedia:'
# are skipped.  The prefix is escaped so that it is always matched literally,
# whatever characters the namespace name contains.
rx = re.compile('(' + re.escape(templateprefix) + '|Wikipedia:)')

if stdout:
    _output = sys.stdout
else:
    output_path = '%s.processed_expanded' % path
    # Append when a previous run left a file behind (e.g. when resuming
    # with --start_at), otherwise start a fresh one.
    file_mode = 'a' if os.path.exists(output_path) else 'w'
    _output = codecs.open(output_path, encoding='utf-8', mode=file_mode)

try:
    for title in articles_list:
        hash_pos = title.find('#')
        if hash_pos == 0:
            # Nothing is left of the title once the fragment is removed.
            continue
        if hash_pos > 0:
            # Strip the '#fragment' part and process the page itself.
            title = title[:hash_pos]

        if rx.match(title):
            sys.stderr.write('SKIPPING: ' + title + "\n")
            continue

        sys.stderr.write('PROCESSING: ' + title + "\n")

        article_text = wikidb.getExpandedArticle(title)
        if article_text is None:
            sys.stderr.write('ERROR - SKIPPING: ' + title + "\n")
            continue

        _output.write(START_HEADING + '\n')
        _output.write(title + '\n')
        # NOTE(review): the original comment said len() yields the byte
        # count.  That holds only if article_text is a byte str; for a
        # unicode object len() counts code units, not UTF-8 bytes.  Confirm
        # what getExpandedArticle returns and what the reader of this file
        # expects before porting to Python 3.
        _output.write("%s\n" % len(article_text))
        _output.write(START_TEXT + '\n')
        _output.write(article_text + '\n')
        _output.write(END_TEXT + '\n')
finally:
    # Always release the output, even if an article fails to expand, but
    # never close the interpreter's own stdout.
    if _output is sys.stdout:
        _output.flush()
    else:
        _output.close()