Web   ·   Wiki   ·   Activities   ·   Blog   ·   Lists   ·   Chat   ·   Meeting   ·   Bugs   ·   Git   ·   Translate   ·   Archive   ·   People   ·   Donate
summaryrefslogtreecommitdiffstats
path: root/tools
diff options
context:
space:
mode:
authorMartin Langhoff <martin@laptop.org>2010-11-29 16:54:21 (GMT)
committer Martin Langhoff <martin@laptop.org>2010-11-29 16:59:44 (GMT)
commitac6e3fc7cf363bb1efb22413a41b69475f5d08ff (patch)
tree8294641372150062f5b36fbd944c1a89dee3d7cf /tools
parentae4209017bb67643140fb2a35026d2ab68eee57c (diff)
expandtemplates.py: expands templates in the compressed file format
Run expandtemplates.py over the data files and it will output a new 'processed' file that is sorted and has the templates expanded. This yields a file that is smaller, and requires only one lookup per Wikipage. The performance impact is huge.
Diffstat (limited to 'tools')
-rwxr-xr-xtools/expandtemplates.py187
-rwxr-xr-xtools/templatestats.py38
2 files changed, 225 insertions, 0 deletions
diff --git a/tools/expandtemplates.py b/tools/expandtemplates.py
new file mode 100755
index 0000000..ab8148c
--- /dev/null
+++ b/tools/expandtemplates.py
@@ -0,0 +1,187 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2007, One Laptop Per Child
+#
+# License: GPLv2
+#
+# Usage:
+# expandtemplates.py <dbdir/dbfile> 2>expand.log | bzip2 -c -9 - > foo/bar.processed
+#
+from __future__ import with_statement
+import sys
+reload(sys)
+# Important! We'll be using stdout and stderr with
+# UTF-8 chars. Without this, errors galore.
+sys.setdefaultencoding('utf-8')
+
+import os
+import subprocess
+import select
+import codecs
+from StringIO import StringIO
+import cgi
+import errno
+import tempfile
+import re
+import wp
+import xml.dom.minidom
+from pylru import lrudecorator
+
+START_HEADING = chr(1)
+START_TEXT = chr(2)
+END_TEXT = chr(3)
+
+# Uncomment to print out a large dump from the template expander.
+#os.environ['DEBUG_EXPANDER'] = '1'
+
+try:
+ from hashlib import md5
+except ImportError:
+ from md5 import md5
+
+import mwlib.htmlwriter
+from mwlib import parser, scanner, expander
+
+
class ArticleIndex(object):
    """In-memory, sorted list of article titles read from an index file.

    Every line of the index file must end in a run of digits, optionally
    preceded by whitespace; whatever comes before that is taken as the
    article title.
    """

    # Title (shortest possible), optional whitespace, trailing number.
    _LINE_RX = re.compile(r'(.*?)\s*\d+$')

    def __init__(self, path):
        """Read the UTF-8 index file at *path* and keep its titles sorted.

        Raises:
            AssertionError: if a line does not end in a number.
        """
        titles = []
        with codecs.open(path, mode='r', encoding='utf-8') as f:
            # Iterate the file directly rather than readlines(), so the
            # whole index is not held in memory twice.
            for line in f:
                m = self._LINE_RX.search(line)
                if m is None:
                    raise AssertionError(
                        "Match didn't work on index line %r" % (line,))
                titles.append(m.group(1))
        titles.sort()
        self.article_index = titles

    def __contains__(self, x):
        """Return True if *x* is one of the indexed titles."""
        # Function-scope import: the class does not rely on the file's
        # import block providing bisect.
        import bisect

        # The list is sorted, so a binary search replaces the linear scan.
        # It is comparison based, like the original `in` test, so the same
        # values are considered equal.
        pos = bisect.bisect_left(self.article_index, x)
        return (pos < len(self.article_index)
                and self.article_index[pos] == x)

    def rawindex(self):
        """Return the sorted list of titles (the list itself, not a copy)."""
        return self.article_index
+
class WPWikiDB(object):
    """Retrieves article contents for mwlib."""

    # A page whose text starts with "#REDIRECT [[Target]]" (the '#' and a
    # ':' after the keyword are optional, case is ignored).  The target is
    # matched non-greedily so that further "]]" on the same line do not
    # end up inside the captured title.
    _REDIRECT_RX = re.compile(
        r'^\s*\#?redirect\s*\:?\s*\[\[(.*?)\]\]',
        re.IGNORECASE | re.MULTILINE)

    def getRawArticle(self, title, followRedirects=True):
        """Return the text stored for *title*, stripped of outer whitespace.

        Underscores in the title are replaced by spaces and its first
        character is capitalized before every lookup.

        Args:
            title: article title; an empty title yields ''.
            followRedirects: when true, keep loading the redirect target
                for as long as the loaded text is a redirect.

        Returns:
            The article text, or '' when the title is empty, a redirect
            points at an empty title, or the redirect chain loops back to
            a title that was already visited.
        """
        if title == '':
            return ''

        # Every normalized title loaded so far.  Remembering only the
        # previous title would miss cycles such as A -> B -> A and spin
        # forever.
        visited = set()

        while True:
            # Replace underscores with spaces in title.
            title = title.replace("_", " ")
            # Capitalize the first letter of the article -- Trac #6991.
            # Slicing (not indexing) keeps an empty redirect target from
            # raising IndexError.
            title = title[:1].capitalize() + title[1:]

            if not title or title in visited:
                article_text = ""
                break
            visited.add(title)

            article_text = unicode(wp_load_article(title.encode('utf8')), 'utf8')

            # To see unmodified article_text, uncomment here.
            # print article_text
            if not followRedirects:
                break

            m = self._REDIRECT_RX.match(article_text)
            if not m:
                break

            title = m.group(1)

        # Stripping leading & trailing whitespace fixes template expansion.
        return article_text.strip()

    def getTemplate(self, title, followRedirects=False):
        """Return the text of the template page *title*.

        NOTE(review): followRedirects is accepted but not forwarded, so
        getRawArticle() runs with its own default and redirects are always
        followed.  Confirm against mwlib's callers before changing this.
        """
        return self.getRawArticle(title)

    def expandArticle(self, article_text, title):
        """Expand the templates in *article_text*, using self as the wikidb."""
        template_expander = expander.Expander(article_text, pagename=title, wikidb=self)
        return template_expander.expandTemplates()

    def getExpandedArticle(self, title):
        """Load *title* (following redirects) and expand its templates."""
        return self.expandArticle(self.getRawArticle(title), title)
+
class HTMLOutputBuffer(object):
    """Buffers output and converts to utf8 as needed.

    Written pieces are collected in a list and joined on demand; growing a
    single string with += on every write copies the whole buffer each time.
    """

    def __init__(self):
        self._chunks = []

    def write(self, obj):
        """Append *obj* to the buffer.

        Byte strings are stored as they are; unicode strings are encoded
        to UTF-8 first.

        Raises:
            TypeError: if *obj* is neither kind of string.
        """
        if isinstance(obj, str):
            chunk = obj
        elif isinstance(obj, unicode):
            chunk = obj.encode('utf8')
        else:
            # Fail at the write() call, not later inside getvalue().
            raise TypeError(
                'HTMLOutputBuffer.write() needs a string, got %r' % type(obj))
        self._chunks.append(chunk)

    def getvalue(self):
        """Return everything written so far as one string."""
        if len(self._chunks) != 1:
            # Collapse the pieces so repeated reads do not join them again.
            self._chunks = [''.join(self._chunks)]
        return self._chunks[0]

    def _set_buffer(self, value):
        # Replace the whole content, as assigning to the old attribute did.
        self._chunks = [value]

    # Kept so code that reads or assigns `.buffer` directly keeps working.
    buffer = property(getvalue, _set_buffer,
                      doc="The buffered content as a single string.")
+
def load_db(dbname):
    """Hand the data files that share the *dbname* prefix to wp.wp_load_dump.

    The four paths are passed in this order: the processed dump, the
    locate db, the locate prefix db and the blocks db.
    """
    suffixes = ('.processed', '.locate.db', '.locate.prefixdb', '.blocks.db')
    wp.wp_load_dump(*[dbname + suffix for suffix in suffixes])
+
# Cache articles, and especially templates.
@lrudecorator(100)
def wp_load_article(title):
    """Return whatever wp.wp_load_article() yields for *title*.

    Results pass through the lrudecorator(100) cache, keyed on *title*.
    """
    article = wp.wp_load_article(title)
    return article
+
# __main__
#
# Walk every title in the index and write one record per article to
# stdout.  Redirect pages are written as they are; every other article is
# written with its templates expanded.

if len(sys.argv) < 2:
    sys.stderr.write('Usage: expandtemplates.py <dbdir/dbfile>\n')
    sys.exit(1)

load_db(sys.argv[1])
index = ArticleIndex('%s.index.txt' % sys.argv[1])

rawindex = index.rawindex()

wikidb = WPWikiDB()

# Titles starting with "Plantilla:" are skipped (presumably the Spanish
# template namespace -- templates get expanded into the articles instead).
rx = re.compile('Plantilla:')

# Same redirect test WPWikiDB uses, compiled once instead of per article.
redirect_rx = re.compile(r'^\s*\#?redirect\s*\:?\s*\[\[(.*)\]\]',
                         re.IGNORECASE | re.MULTILINE)

# The index is sometimes slightly corrupt and
# names articles we don't have
badarts = ['Ciclo hidr', 'Mar de Aral', 'Salario Mínimo Interpr']

for title in rawindex:
    if rx.match(title) or title in badarts:
        continue

    sys.stderr.write('PROCESSING: ' + title + "\n")

    article_text = wikidb.getRawArticle(title, followRedirects=False)

    # we don't expand nor follow redirects
    if not redirect_rx.match(article_text):
        article_text = wikidb.getExpandedArticle(title)

    # len() of a unicode string counts characters, not bytes.  The text
    # goes out as UTF-8, so encode it once and report the length of
    # exactly what is written.
    encoded_text = article_text.encode('utf-8')

    # Record layout: START_HEADING, title, byte length, START_TEXT,
    # text, END_TEXT -- each on its own line.
    sys.stdout.write(START_HEADING + '\n')
    sys.stdout.write(title + '\n')
    sys.stdout.write("%s\n" % len(encoded_text))
    sys.stdout.write(START_TEXT + '\n')
    sys.stdout.write(encoded_text + '\n')
    sys.stdout.write(END_TEXT + '\n')
diff --git a/tools/templatestats.py b/tools/templatestats.py
new file mode 100755
index 0000000..57ed5ff
--- /dev/null
+++ b/tools/templatestats.py
@@ -0,0 +1,38 @@
+#!/usr/bin/python
+#
+# Trivial script -- check usage frequency of templates
+# (Shows templates have a 'long-tail')
+# Usage:
+#
+# bzcat -d -c es_PE/es_PE.xml.bz2.processed | ./templatestats.py > templatestats.txt
+#
+# Author: Martin Langhoff <martin@laptop.org>
+#
import re
import sys

# A template invocation: "{{", then the shortest run of characters up to
# the next "}}" on the same line.
rx = re.compile(r'\{\{.+?\}\}')


def count_templates(lines):
    """Count how often each template invocation occurs in *lines*.

    Args:
        lines: any iterable of text lines.

    Returns:
        A dict mapping the invocation text -- without the outer braces and
        with every run of '{' removed -- to its number of occurrences.
    """
    seen = {}
    for line in lines:
        for p in rx.findall(line):
            # strip away curly braces
            p = p[2:-2]
            # A nested invocation leaves its opening braces behind.
            p = re.sub(r'\{+', '', p)
            seen[p] = seen.get(p, 0) + 1
    return seen


def rank_templates(seen):
    """Return (count, template) tuples from *seen*, most frequent first.

    Entries with equal counts keep the order in which the dict yields them.
    """
    order = [(count, name) for name, count in seen.items()]
    # key= with reverse=True replaces the Python-2-only cmp= argument; the
    # sort stays stable, so ties come out in the same order as before.
    order.sort(key=lambda pair: pair[0], reverse=True)
    return order


def main():
    """Read text from stdin and print one "count : template" line each."""
    for entry in rank_templates(count_templates(sys.stdin)):
        sys.stdout.write("%i : %s\n" % entry)


if __name__ == '__main__':
    main()
+