Web   ·   Wiki   ·   Activities   ·   Blog   ·   Lists   ·   Chat   ·   Meeting   ·   Bugs   ·   Git   ·   Translate   ·   Archive   ·   People   ·   Donate
summary · refs · log · tree · commit · diff · stats
path: root/tools2/expandtemplates.py
diff options
context:
space:
mode:
Diffstat (limited to 'tools2/expandtemplates.py')
-rwxr-xr-x  tools2/expandtemplates.py  134
1 files changed, 134 insertions, 0 deletions
diff --git a/tools2/expandtemplates.py b/tools2/expandtemplates.py
new file mode 100755
index 0000000..db30b0b
--- /dev/null
+++ b/tools2/expandtemplates.py
@@ -0,0 +1,134 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2007, One Laptop Per Child
+#
+# License: GPLv2
+#
+# Usage:
+# ./tools2/expandtemplates.py directory 2>expand.log
+# Ex:
+# ./tools2/expandtemplates.py es_lat
+
+import sys
+reload(sys)
+# Important! We'll be using stdout and stderr with
+# UTF-8 chars. Without this, errors galore.
+sys.setdefaultencoding('utf-8')
+
+sys.path.append('.')
+
+import os
+import re
+import codecs
+from server import WPWikiDB
+from make_selection import FileListReader
+
+START_HEADING = chr(1)
+START_TEXT = chr(2)
+END_TEXT = chr(3)
+
+import config
+
+# __main__
+
+only_page = None
+start_at = None
+stdout = False
+
+if len(sys.argv) > 1:
+ directory = sys.argv[1]
+
+ for argn in range(1, len(sys.argv)):
+ arg = sys.argv[argn]
+ if arg.startswith('--only='):
+ only_page = arg[len('--only='):]
+ print "Processing only article '%s'" % only_page
+ if arg.startswith('--start_at='):
+ start_at = arg[len('--start_at='):]
+ print "Starting to process at article '%s'" % start_at
+ if arg == '--stdout':
+ stdout = True
+ print "Writing output to stdout"
+
+else:
+ print "Use expandtemplates.py directory"
+ exit()
+
+
+xml_file_name = config.input_xml_file_name
+if xml_file_name.find('/') > -1:
+ xml_file_name = xml_file_name[xml_file_name.find('/') + 1:]
+path = os.path.join(directory, xml_file_name)
+
+articles_list = []
+if only_page is not None:
+ articles_list = [unicode(only_page)]
+else:
+ articles_reader = FileListReader('%s.pages_selected-level-1' % path)
+
+ articles_list = articles_reader.list
+ if start_at is not None:
+ filtered_list = []
+ found = False
+ for title in articles_list:
+ if title == start_at:
+ found = True
+ if found:
+ filtered_list.append(title)
+ articles_list = filtered_list
+
+lang = os.path.basename(path)[0:2]
+
+templateprefix = config.TEMPLATE_NAMESPACES[0]
+
+# load blacklist only once
+templateblacklist = set()
+templateblacklistpath = os.path.join(os.path.dirname(path),
+ 'template_blacklist')
+if os.path.exists(templateblacklistpath):
+ with open(templateblacklistpath, 'r') as f:
+ for line in f.readlines():
+ templateblacklist.add(line.rstrip().decode('utf8'))
+
+wikidb = WPWikiDB(path, lang, templateprefix, templateblacklist)
+rx = re.compile('(' + templateprefix + '|Wikipedia:)')
+
+if not stdout:
+ file_mode = 'w'
+ if os.path.exists('%s.processed_expanded' % path):
+ file_mode = 'a'
+
+ _output = codecs.open('%s.processed_expanded' % path,
+ encoding='utf-8', mode=file_mode)
+else:
+ _output = sys.stdout
+
+for title in articles_list:
+ if title.find('#') > -1:
+ if title.find('#') == 0:
+ continue
+ else:
+ title = title[:title.find('#')]
+
+ if rx.match(title):
+ sys.stderr.write('SKIPPING: ' + title + "\n")
+ continue
+
+ sys.stderr.write('PROCESSING: ' + title + "\n")
+
+ article_text = wikidb.getExpandedArticle(title)
+ if article_text == None:
+ sys.stderr.write('ERROR - SKIPPING: ' + title + "\n")
+ continue
+
+ _output.write(START_HEADING + '\n')
+ _output.write(title + '\n')
+ # in Python 2.x, len() over a unicode string
+ # gives us the bytecount. Not compat w Python 3.
+ _output.write("%s\n" % len(article_text))
+ _output.write(START_TEXT + '\n')
+ _output.write(article_text + '\n')
+ _output.write(END_TEXT + '\n')
+
+_output.close()