diff options
Diffstat (limited to 'mwlib/cdbwiki.py')
-rwxr-xr-x | mwlib/cdbwiki.py | 243 |
1 files changed, 243 insertions, 0 deletions
diff --git a/mwlib/cdbwiki.py b/mwlib/cdbwiki.py new file mode 100755 index 0000000..98bb6a7 --- /dev/null +++ b/mwlib/cdbwiki.py @@ -0,0 +1,243 @@ +#! /usr/bin/env python + +# Copyright (c) 2007-2008 PediaPress GmbH +# See README.txt for additional licensing information. + +import sys +import os +import zlib +import re + +from mwlib import cdb + +try: + from xml.etree import cElementTree +except ImportError: + import cElementTree + +ns = '{http://www.mediawiki.org/xml/export-0.3/}' + +wikiindex = "wikiidx" +wikidata = "wikidata.bin" + + + +def normname(name): + name = name.strip().replace("_", " ") + name = name[:1].upper()+name[1:] + return name + +class Tags: + page = ns + 'page' + + # <title> inside <page> + title = ns + 'title' + + # <revision> inside <page> + revision = ns + 'revision' + + # <id> inside <revision> + revid = ns + 'id' + + # <contributor><username> inside <revision> + username = ns + 'contributor/' + ns + 'username' + + # <text> inside <revision> + text = ns + 'text' + + # <timestamp> inside <revision> + timestamp = ns + 'timestamp' + + # <revision><text> inside <page> + revision_text = ns + 'revision/' + ns + 'text' + + siteinfo = ns + "siteinfo" + +class DumpParser(object): + category_ns = set(['category', 'kategorie']) + image_ns = set(['image', 'bild']) + template_ns = set(['template', 'vorlage']) + wikipedia_ns = set(['wikipedia']) + + tags = Tags() + + + def __init__(self, xmlfilename): + self.xmlfilename = xmlfilename + + def _write(self, msg): + sys.stdout.write(msg) + sys.stdout.flush() + + def openInputStream(self): + if self.xmlfilename.lower().endswith(".bz2"): + f = os.popen("bunzip2 -c %s" % self.xmlfilename, "r") + elif self.xmlfilename.lower().endswith(".7z"): + f = os.popen("7z -so x %s" % self.xmlfilename, "r") + else: + f = open(self.xmlfilename, "r") + + return f + + def __call__(self): + f = self.openInputStream() + + count = 0 + for event, elem in cElementTree.iterparse(f): + if elem.tag != self.tags.page: + continue + self.handlePageElement(elem) + elem.clear() + count += 1 + + if count % 5000 == 0: + self._write(" %s\n" % count) + elif count % 100 == 0: + self._write(".") + + + def handlePageElement(self, page): + title = page.find(self.tags.title).text + revisions = page.findall(self.tags.revision) + if not revisions: + return + revision = revisions[-1] + + texttag = revision.find(self.tags.text) + timestamptag = revision.find(self.tags.timestamp) + revision.clear() + + if texttag is not None: + text = texttag.text + texttag.clear() + else: + text = None + + if timestamptag is not None: + timestamp = timestamptag.text + timestamptag.clear() + else: + timestamp = None + + if not text: + return + + if isinstance(title, str): + title = unicode(title) + if isinstance(text, str): + text = unicode(text) + + + if ':' in title: + ns, rest = title.split(':', 1) + ns = ns.lower() + if ns not in self.template_ns: + return + self.handleTemplate(rest, text, timestamp) + else: + self.handleArticle(title, text, timestamp) + + def handleArticle(self, title, text, timestamp): + print "ART:", repr(title), len(text), timestamp + + def handleTemplate(self, title, text, timestamp): + print "TEMPL:", repr(title), len(text), timestamp + +class BuildWiki(DumpParser): + def __init__(self, xmlfilename, outputdir): + DumpParser.__init__(self, xmlfilename) + self.outputdir = outputdir + + def __call__(self): + if not os.path.exists(self.outputdir): + os.makedirs(self.outputdir) + + n = os.path.join(self.outputdir, wikiindex) + out = open(os.path.join(self.outputdir, wikidata), "wb") + self.out = out + f = open(n+'.cdb', 'wb') + c = cdb.CdbMake(f) + self.cdb = c + + DumpParser.__call__(self) + c.finish() + f.close() + + + def _writeobj(self, key, val): + key = key.encode("utf-8") + val = zlib.compress(val) + pos = self.out.tell() + self.out.write(val) + self.cdb.add(key, "%s %s" % (pos, len(val))) + + def handleArticle(self, title, text, timestamp): + self._writeobj(u":"+title, text.encode("utf-8")) + + def handleTemplate(self, title, text, timestamp): + self._writeobj(u"T:"+title, text.encode("utf-8")) + + + +class WikiDB(object): + redirect_rex = re.compile(r'^#Redirect:?\s*?\[\[(?P<redirect>.*?)\]\]', re.IGNORECASE) + + def __init__(self, dir): + self.dir = dir + self.obj2pos_path = os.path.join(self.dir, wikidata) + self.cdb = cdb.Cdb(open(os.path.join(self.dir, wikiindex+'.cdb'), 'rb')) + + def _readobj(self, key): + key = key.encode("utf-8") + + try: + data = self.cdb[key] + except KeyError: + return None + + pos, len = map(int, data.split()) + + f=open(self.obj2pos_path, "rb") + f.seek(pos) + d=f.read(len) + f.close() + return zlib.decompress(d) + + def getRawArticle(self, title, raw=None, revision=None): + title = normname(title) + res = self._readobj(":"+title) + if res is None: + return None + + res = unicode(res, 'utf-8') + mo = self.redirect_rex.search(res) + if mo: + redirect = mo.group('redirect') + redirect = normname(redirect.split("|", 1)[0].split("#", 1)[0]) + + return self.getRawArticle(redirect) + + return res + + def getTemplate(self, title, followRedirects=False): + if ":" in title: + title = title.split(':', 1)[1] + + title = normname(title) + res = unicode(self._readobj(u"T:"+title) or "", 'utf-8') + if not res: + return res + + mo = self.redirect_rex.search(res) + if mo: + redirect = mo.group('redirect') + redirect = normname(redirect.split("|", 1)[0].split("#", 1)[0]) + return self.getTemplate(redirect) + return res + + + def articles(self): + for k, v in self.cdb: + if k[0]==':': + k = unicode(k[1:], "utf-8") + yield k |