Web   ·   Wiki   ·   Activities   ·   Blog   ·   Lists   ·   Chat   ·   Meeting   ·   Bugs   ·   Git   ·   Translate   ·   Archive   ·   People   ·   Donate
summaryrefslogtreecommitdiffstats
path: root/mwlib/cdbwiki.py
diff options
context:
space:
mode:
Diffstat (limited to 'mwlib/cdbwiki.py')
-rwxr-xr-xmwlib/cdbwiki.py243
1 files changed, 243 insertions, 0 deletions
diff --git a/mwlib/cdbwiki.py b/mwlib/cdbwiki.py
new file mode 100755
index 0000000..98bb6a7
--- /dev/null
+++ b/mwlib/cdbwiki.py
@@ -0,0 +1,243 @@
+#! /usr/bin/env python
+
+# Copyright (c) 2007-2008 PediaPress GmbH
+# See README.txt for additional licensing information.
+
+import sys
+import os
+import zlib
+import re
+
+from mwlib import cdb
+
+try:
+ from xml.etree import cElementTree
+except ImportError:
+ import cElementTree
+
+ns = '{http://www.mediawiki.org/xml/export-0.3/}'
+
+wikiindex = "wikiidx"
+wikidata = "wikidata.bin"
+
+
+
+def normname(name):
+ name = name.strip().replace("_", " ")
+ name = name[:1].upper()+name[1:]
+ return name
+
+class Tags:
+ page = ns + 'page'
+
+ # <title> inside <page>
+ title = ns + 'title'
+
+ # <revision> inside <page>
+ revision = ns + 'revision'
+
+ # <id> inside <revision>
+ revid = ns + 'id'
+
+ # <contributor><username> inside <revision>
+ username = ns + 'contributor/' + ns + 'username'
+
+ # <text> inside <revision>
+ text = ns + 'text'
+
+ # <timestamp> inside <revision>
+ timestamp = ns + 'timestamp'
+
+ # <revision><text> inside <page>
+ revision_text = ns + 'revision/' + ns + 'text'
+
+ siteinfo = ns + "siteinfo"
+
+class DumpParser(object):
+ category_ns = set(['category', 'kategorie'])
+ image_ns = set(['image', 'bild'])
+ template_ns = set(['template', 'vorlage'])
+ wikipedia_ns = set(['wikipedia'])
+
+ tags = Tags()
+
+
+ def __init__(self, xmlfilename):
+ self.xmlfilename = xmlfilename
+
+ def _write(self, msg):
+ sys.stdout.write(msg)
+ sys.stdout.flush()
+
+ def openInputStream(self):
+ if self.xmlfilename.lower().endswith(".bz2"):
+ f = os.popen("bunzip2 -c %s" % self.xmlfilename, "r")
+ elif self.xmlfilename.lower().endswith(".7z"):
+ f = os.popen("7z -so x %s" % self.xmlfilename, "r")
+ else:
+ f = open(self.xmlfilename, "r")
+
+ return f
+
+ def __call__(self):
+ f = self.openInputStream()
+
+ count = 0
+ for event, elem in cElementTree.iterparse(f):
+ if elem.tag != self.tags.page:
+ continue
+ self.handlePageElement(elem)
+ elem.clear()
+ count += 1
+
+ if count % 5000 == 0:
+ self._write(" %s\n" % count)
+ elif count % 100 == 0:
+ self._write(".")
+
+
+ def handlePageElement(self, page):
+ title = page.find(self.tags.title).text
+ revisions = page.findall(self.tags.revision)
+ if not revisions:
+ return
+ revision = revisions[-1]
+
+ texttag = revision.find(self.tags.text)
+ timestamptag = revision.find(self.tags.timestamp)
+ revision.clear()
+
+ if texttag is not None:
+ text = texttag.text
+ texttag.clear()
+ else:
+ text = None
+
+ if timestamptag is not None:
+ timestamp = timestamptag.text
+ timestamptag.clear()
+ else:
+ timestamp = None
+
+ if not text:
+ return
+
+ if isinstance(title, str):
+ title = unicode(title)
+ if isinstance(text, str):
+ text = unicode(text)
+
+
+ if ':' in title:
+ ns, rest = title.split(':', 1)
+ ns = ns.lower()
+ if ns not in self.template_ns:
+ return
+ self.handleTemplate(rest, text, timestamp)
+ else:
+ self.handleArticle(title, text, timestamp)
+
+ def handleArticle(self, title, text, timestamp):
+ print "ART:", repr(title), len(text), timestamp
+
+ def handleTemplate(self, title, text, timestamp):
+ print "TEMPL:", repr(title), len(text), timestamp
+
+class BuildWiki(DumpParser):
+ def __init__(self, xmlfilename, outputdir):
+ DumpParser.__init__(self, xmlfilename)
+ self.outputdir = outputdir
+
+ def __call__(self):
+ if not os.path.exists(self.outputdir):
+ os.makedirs(self.outputdir)
+
+ n = os.path.join(self.outputdir, wikiindex)
+ out = open(os.path.join(self.outputdir, wikidata), "wb")
+ self.out = out
+ f = open(n+'.cdb', 'wb')
+ c = cdb.CdbMake(f)
+ self.cdb = c
+
+ DumpParser.__call__(self)
+ c.finish()
+ f.close()
+
+
+ def _writeobj(self, key, val):
+ key = key.encode("utf-8")
+ val = zlib.compress(val)
+ pos = self.out.tell()
+ self.out.write(val)
+ self.cdb.add(key, "%s %s" % (pos, len(val)))
+
+ def handleArticle(self, title, text, timestamp):
+ self._writeobj(u":"+title, text.encode("utf-8"))
+
+ def handleTemplate(self, title, text, timestamp):
+ self._writeobj(u"T:"+title, text.encode("utf-8"))
+
+
+
+class WikiDB(object):
+ redirect_rex = re.compile(r'^#Redirect:?\s*?\[\[(?P<redirect>.*?)\]\]', re.IGNORECASE)
+
+ def __init__(self, dir):
+ self.dir = dir
+ self.obj2pos_path = os.path.join(self.dir, wikidata)
+ self.cdb = cdb.Cdb(open(os.path.join(self.dir, wikiindex+'.cdb'), 'rb'))
+
+ def _readobj(self, key):
+ key = key.encode("utf-8")
+
+ try:
+ data = self.cdb[key]
+ except KeyError:
+ return None
+
+ pos, len = map(int, data.split())
+
+ f=open(self.obj2pos_path, "rb")
+ f.seek(pos)
+ d=f.read(len)
+ f.close()
+ return zlib.decompress(d)
+
+ def getRawArticle(self, title, raw=None, revision=None):
+ title = normname(title)
+ res = self._readobj(":"+title)
+ if res is None:
+ return None
+
+ res = unicode(res, 'utf-8')
+ mo = self.redirect_rex.search(res)
+ if mo:
+ redirect = mo.group('redirect')
+ redirect = normname(redirect.split("|", 1)[0].split("#", 1)[0])
+
+ return self.getRawArticle(redirect)
+
+ return res
+
+ def getTemplate(self, title, followRedirects=False):
+ if ":" in title:
+ title = title.split(':', 1)[1]
+
+ title = normname(title)
+ res = unicode(self._readobj(u"T:"+title) or "", 'utf-8')
+ if not res:
+ return res
+
+ mo = self.redirect_rex.search(res)
+ if mo:
+ redirect = mo.group('redirect')
+ redirect = normname(redirect.split("|", 1)[0].split("#", 1)[0])
+ return self.getTemplate(redirect)
+ return res
+
+
+ def articles(self):
+ for k, v in self.cdb:
+ if k[0]==':':
+ k = unicode(k[1:], "utf-8")
+ yield k