diff options
author | Martin Langhoff <martin@laptop.org> | 2010-12-08 17:14:15 (GMT) |
---|---|---|
committer | Martin Langhoff <martin@laptop.org> | 2010-12-08 17:14:15 (GMT) |
commit | 8f45d8db8820b3a30fd9981230d7a418266efe88 (patch) | |
tree | 71df8878665f50a312d12b91a9927992167a1549 /tools | |
parent | 6e8ec04ff35ee4bb2226d68df5ba02e8aa1446a0 (diff) |
Language portability fixes to expandtemplates.py
Diffstat (limited to 'tools')
-rwxr-xr-x | tools/expandtemplates.py | 40 |
1 files changed, 31 insertions, 9 deletions
diff --git a/tools/expandtemplates.py b/tools/expandtemplates.py
index 6b8a760..58b3905 100755
--- a/tools/expandtemplates.py
+++ b/tools/expandtemplates.py
@@ -68,6 +68,11 @@ class ArticleIndex:
 class WPWikiDB:
     """Retrieves article contents for mwlib."""

+    def __init__(self, lang, templateprefix, templateblacklist):
+        self.lang = lang
+        self.templateprefix = templateprefix
+        self.templateblacklist = templateblacklist
+
     def getRawArticle(self, title, followRedirects=True):
         # Retrieve article text, recursively following #redirects.
         if title == '':
@@ -113,7 +118,10 @@ class WPWikiDB:
         return self.getRawArticle(title)

     def expandArticle(self, article_text, title):
-        template_expander = expander.Expander(article_text, pagename=title, wikidb=self)
+        template_expander = expander.Expander(article_text, pagename=title,
+                                              wikidb=self, lang=self.lang,
+                                              templateprefix = self.templateprefix,
+                                              templateblacklist = self.templateblacklist)
         return template_expander.expandTemplates()

     def getExpandedArticle(self, title):
@@ -178,19 +186,33 @@ def wp_load_article_fork(title):

 # __main__

-# prep a isting of redirects. wp.so hides them from
-# us, which would bloat our
-
-load_db(sys.argv[1])
+path = sys.argv[1]
+load_db(path)
 index = ArticleIndex('%s.index.txt' % sys.argv[1])

 rawindex = index.rawindex()

-wikidb = WPWikiDB()
-rx = re.compile('(Plantilla|Template|Wikipedia):')
-
-for title in rawindex: #['1812 invasion of Russia', '1857 revolt']: #rawindex:
+lang = os.path.basename(path)[0:2]
+## FIXME GETTEXT
+templateprefixes = { 'en': 'Template:',
+                     'es': 'Plantilla:' }
+templateprefix = templateprefixes[ lang ]
+
+# load blacklist only once
+templateblacklist = set()
+templateblacklistpath = os.path.join(os.path.dirname(path),
+                                     'template_blacklist')
+if os.path.exists(templateblacklistpath):
+    with open(templateblacklistpath, 'r') as f:
+        for line in f.readlines():
+            templateblacklist.add(line.rstrip().decode('utf8'))
+
+wikidb = WPWikiDB(lang, templateprefix, templateblacklist)
+rx = re.compile('('+templateprefix+'|Wikipedia:)')
+
+for title in rawindex: #['1812 invasion of Russia', '1857 revolt']:
     if rx.match(title):
+        sys.stderr.write('SKIPPING: ' + title + "\n")
         continue

     sys.stderr.write('PROCESSING: ' + title + "\n")