expandtemplates.py: expands templates in the compressed file format

Run expandtemplates.py over the data files and it will output a new 'processed' file that is sorted and has the templates expanded. This yields a file that is smaller, and requires only one lookup per Wikipage. The performance impact is huge.
author: Martin Langhoff <martin@laptop.org> 2010-11-29 16:54:21 (GMT)
committer: Martin Langhoff <martin@laptop.org> 2010-11-29 16:59:44 (GMT)
commit: ac6e3fc7cf363bb1efb22413a41b69475f5d08ff (patch)
tree: 8294641372150062f5b36fbd944c1a89dee3d7cf /tools
parent: ae4209017bb67643140fb2a35026d2ab68eee57c (diff)
2 files changed, 225 insertions, 0 deletions
diff --git a/tools/expandtemplates.py b/tools/expandtemplates.py
new file mode 100755
index 0000000..ab8148c
--- /dev/null
+++ b/tools/expandtemplates.py
@@ -0,0 +1,187 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2007, One Laptop Per Child
+#
+# License: GPLv2
+#
+# Usage:
+#  expandtemplates.py <dbdir/dbfile> 2>expand.log | bzip2 -c -9 - > foo/bar.processed
+#
+from __future__ import with_statement
+import sys
+reload(sys)
+# Important! We'll be using stdout and stderr with
+# UTF-8 chars. Without this, errors galore.
+sys.setdefaultencoding('utf-8')
+
+import os
+import subprocess
+import select
+import codecs
+from StringIO import StringIO
+import cgi
+import errno
+import tempfile
+import re
+import wp
+import xml.dom.minidom
+from pylru import lrudecorator
+
+START_HEADING = chr(1)
+START_TEXT = chr(2)
+END_TEXT = chr(3)
+
+# Uncomment to print out a large dump from the template expander.
+#os.environ['DEBUG_EXPANDER'] = '1'
+
+try:
+    from hashlib import md5
+except ImportError:
+    from md5 import md5
+
+import mwlib.htmlwriter
+from mwlib import parser, scanner, expander
+
+
+class ArticleIndex:
+    # Prepare an in-memory index, using the already generated 
+    # index file.  
+
+    def __init__(self, path):
+        self.article_index = []
+        with codecs.open(path, mode='r', encoding='utf-8') as f:
+            for line in f.readlines():
+                m = re.search(r'(.*?)\s*\d+$', line)
+                if m is None:
+                    raise AssertionError("Match didn't work")
+                self.article_index.append(m.group(1))
+        self.article_index.sort()
+        
+    def __contains__(self, x):
+        return x in self.article_index
+
+    def rawindex(self):
+        return self.article_index
+    
+class WPWikiDB:
+    """Retrieves article contents for mwlib."""
+
+    def getRawArticle(self, title, followRedirects=True):
+        # Retrieve article text, recursively following #redirects.
+        oldtitle = ""
+
+        if title == '':
+            return ''
+        
+        while True:
+            # Replace underscores with spaces in title.
+            title = title.replace("_", " ")
+            # Capitalize the first letter of the article -- Trac #6991.
+            title = title[0].capitalize() + title[1:]
+
+            if title == oldtitle:
+                article_text = ""
+                break
+
+            article_text = unicode(wp_load_article(title.encode('utf8')), 'utf8')
+            
+            # To see unmodified article_text, uncomment here.
+            # print article_text
+            if not followRedirects:
+                break
+
+            m = re.match(r'^\s*\#?redirect\s*\:?\s*\[\[(.*)\]\]', article_text, re.IGNORECASE|re.MULTILINE)
+            if not m:
+                break
+
+            oldtitle = title
+            title = m.group(1)
+
+        # Stripping leading & trailing whitespace fixes template expansion.
+        article_text = article_text.lstrip()
+        article_text = article_text.rstrip()
+
+        return article_text
+
+    def getTemplate(self, title, followRedirects=False):
+        return self.getRawArticle(title)
+
+    def expandArticle(self, article_text, title):
+        template_expander = expander.Expander(article_text, pagename=title, wikidb=self)
+        return template_expander.expandTemplates()
+        
+    def getExpandedArticle(self, title):
+        return self.expandArticle(self.getRawArticle(title), title)
+
+class HTMLOutputBuffer:
+    """Buffers output and converts to utf8 as needed."""
+
+    def __init__(self):
+        self.buffer = ''
+
+    def write(self, obj):
+        if isinstance(obj, unicode):
+            self.buffer += obj.encode('utf8')
+        else:
+            self.buffer += obj
+    
+    def getvalue(self):
+        return self.buffer
+
+def load_db(dbname):
+    wp.wp_load_dump(
+        dbname + '.processed',
+        dbname + '.locate.db',
+        dbname + '.locate.prefixdb',
+        dbname + '.blocks.db')
+
+# Cache articles, and especially templates
+@lrudecorator(100)
+def wp_load_article(title):
+    
+    return wp.wp_load_article(title)
+
+# __main__
+
+# prep a listing of redirects. wp.so hides them from
+# us, which would bloat our
+
+load_db(sys.argv[1])
+index = ArticleIndex('%s.index.txt' % sys.argv[1])
+
+rawindex = index.rawindex()
+
+wikidb = WPWikiDB()
+rx = re.compile('Plantilla:')
+
+# The index is sometimes slightly corrupt and
+# names articles we don't have
+badarts = ['Ciclo hidr', 'Mar de Aral',  'Salario Mínimo Interpr']
+
+for title in rawindex:
+    if rx.match(title):
+        continue
+    if title in badarts:
+        continue
+    
+    sys.stderr.write('PROCESSING: ' + title + "\n")
+    
+    article_text  = wikidb.getRawArticle(title, followRedirects=False)
+                
+    # we neither expand nor follow redirects
+    m = re.match(r'^\s*\#?redirect\s*\:?\s*\[\[(.*)\]\]',
+                 article_text, re.IGNORECASE|re.MULTILINE)
+    if not m:
+        article_text = wikidb.getExpandedArticle(title)
+
+    sys.stdout.write(START_HEADING + '\n')
+    sys.stdout.write(title + '\n')
+    # NOTE(review): len() over a unicode string counts characters (code
+    # units), not UTF-8 bytes, in Python 2.x and 3 alike; encode first for bytes.
+    sys.stdout.write("%s\n" % len(article_text))
+    sys.stdout.write(START_TEXT + '\n')
+    sys.stdout.write(article_text + '\n')
+    sys.stdout.write(END_TEXT + '\n')
+
+    # break
diff --git a/tools/templatestats.py b/tools/templatestats.py
new file mode 100755
index 0000000..57ed5ff
--- /dev/null
+++ b/tools/templatestats.py
@@ -0,0 +1,38 @@
+#!/usr/bin/python 
+#
+# Trivial script -- check usage frequency of templates
+# (Shows templates have a 'long-tail')
+# Usage:
+#
+#  bzcat -d -c es_PE/es_PE.xml.bz2.processed  | ./templatestats.py > templatestats.txt 
+#
+# Author: Martin Langhoff <martin@laptop.org>
+#
+import sys, re
+
+rx = re.compile('\{\{.+?\}\}')
+seen = {}
+
+while 1:
+    line = sys.stdin.readline()
+    if not line:
+        break
+    m = rx.findall(line)
+    for p in m:
+        # strip away curly braces
+        p = p[2:-2]
+        p = re.sub('\{+', '', p)
+        if p in seen:
+            seen[p] = seen[p]+1
+        else:
+            seen[p] = 1
+
+order = []
+for p in seen.keys():
+    order.append(tuple([seen[p], p]))
+
+order.sort(cmp=lambda x,y: cmp(y[0], x[0]))
+
+for p in order:
+    print "%i : %s" % p 
+
author	Martin Langhoff <martin@laptop.org>	2010-11-29 16:54:21 (GMT)
committer	Martin Langhoff <martin@laptop.org>	2010-11-29 16:59:44 (GMT)
commit	ac6e3fc7cf363bb1efb22413a41b69475f5d08ff (patch)
tree	8294641372150062f5b36fbd944c1a89dee3d7cf /tools
parent	ae4209017bb67643140fb2a35026d2ab68eee57c (diff)