%s

#!/usr/bin/env python # -*- coding: utf-8 -*- # # Copyright (C) 2007, One Laptop Per Child # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA # # Web server script for Wikiserver project. # # Usage: server.py # from __future__ import with_statement import sys import os import subprocess import select import codecs from StringIO import StringIO import BaseHTTPServer from SimpleHTTPServer import SimpleHTTPRequestHandler import cgi import errno import urllib import tempfile import re import wp import xml.dom.minidom from pylru import lrudecorator # Uncomment to print out a large dump from the template expander. #os.environ['DEBUG_EXPANDER'] = '1' try: from hashlib import md5 except ImportError: from md5 import md5 import mwlib.htmlwriter from mwlib import parser, scanner, expander class MyHTTPServer(BaseHTTPServer.HTTPServer): def serve_forever(self, poll_interval=0.5): """Overridden version of BaseServer.serve_forever that does not fail to work when EINTR is received. """ self._BaseServer__serving = True self._BaseServer__is_shut_down.clear() while self._BaseServer__serving: # XXX: Consider using another file descriptor or # connecting to the socket to wake this up instead of # polling. Polling reduces our responsiveness to a # shutdown request and wastes cpu at all other times. try: r, w, e = select.select([self], [], [], poll_interval) except select.error, e: if e[0] == errno.EINTR: print "got eintr" continue raise if r: self._handle_request_noblock() self._BaseServer__is_shut_down.set() class LinkStats: allhits = 1 alltotal = 1 pagehits = 1 pagetotal = 1 class ArticleIndex: # Prepare an in-memory index, using the already generated # index file. def __init__(self, path): self.article_index = set() with codecs.open(path, mode='r', encoding='utf-8') as f: for line in f.readlines(): m = re.search(r'(.*?)\s*\d+$', line) if m is None: raise AssertionError("Match didn't work") self.article_index.add(m.group(1)) def __contains__(self, x): return x in self.article_index class WPWikiDB: """Retrieves article contents for mwlib.""" def __init__(self, lang, templateprefix, templateblacklist): self.lang = lang self.templateprefix = templateprefix self.templateblacklist = templateblacklist def getRawArticle(self, title, followRedirects=True): # Retrieve article text, recursively following #redirects. if title == '': return '' oldtitle = "" while True: # Replace underscores with spaces in title. title = title.replace("_", " ") # Capitalize the first letter of the article -- Trac #6991. title = title[0].capitalize() + title[1:] if title == oldtitle: article_text = "" break article_text = wp_load_article(title.encode('utf8')) if article_text == None: # something's wrong return None article_text = unicode(article_text, 'utf8') # To see unmodified article_text, uncomment here. # print article_text if not followRedirects: break m = re.match(r'^\s*\#?redirect\s*\:?\s*\[\[(.*)\]\]', article_text, re.IGNORECASE|re.MULTILINE) if not m: break oldtitle = title title = m.group(1) # Stripping leading & trailing whitespace fixes template expansion. article_text = article_text.lstrip() article_text = article_text.rstrip() return article_text def getTemplate(self, title, followRedirects=False): return self.getRawArticle(title) def expandArticle(self, article_text, title): template_expander = expander.Expander(article_text, pagename=title, wikidb=self, lang=self.lang, templateprefix = self.templateprefix, templateblacklist = self.templateblacklist) return template_expander.expandTemplates() def getExpandedArticle(self, title): return self.expandArticle(self.getRawArticle(title), title) class WPImageDB: """Retrieves images for mwlib.""" def __init__(self, basepath): self.basepath = basepath def hashpath(self, name): name = name.replace(' ', '_') name = name[:1].upper()+name[1:] d = md5(name.encode('utf-8')).hexdigest() return "/".join([d[0], d[:2], name]) def getPath(self, name, size=None): hashed_name = self.hashpath(name).encode('utf8') path = self.basepath + '/%s' % hashed_name #print "getPath: %s -> %s" % (name.encode('utf8'), path.encode('utf8')) return path def getURL(self, name, size=None): hashed_name = self.hashpath(name).encode('utf8') if os.path.exists(self.basepath + hashed_name): url = '/' + self.basepath + hashed_name else: url = 'http://upload.wikimedia.org/wikipedia/commons/' + hashed_name #print "getUrl: %s -> %s" % (name.encode('utf8'), url.encode('utf8')) return url class HTMLOutputBuffer: """Buffers output and converts to utf8 as needed.""" def __init__(self): self.buffer = '' def write(self, obj): if isinstance(obj, unicode): self.buffer += obj.encode('utf8') else: self.buffer += obj def getvalue(self): return self.buffer class WPMathRenderer: def render(self, latex): process = subprocess.Popen(('bin/blahtex', '--mathml', '--texvc-compatible-commands'), stdin=subprocess.PIPE, stdout=subprocess.PIPE) (mathml, err) = process.communicate(latex.encode('utf8')) if process.returncode is not 0: return "" # Ugly! There is certainly a better way to do this, but my DOM skills are weak, and this works. try: dom = xml.dom.minidom.parseString(mathml) dom = dom.getElementsByTagName('blahtex')[0] dom = dom.getElementsByTagName('mathml')[0] dom = dom.getElementsByTagName('markup')[0] mathml = dom.toxml() mathml = mathml.replace('markup', 'math xmlns="http://www.w3.org/1998/Math/MathML" display="inline"') dom.unlink() except: print "BLAHTEX XML PARSING FAILED:\nINPUT: '%s'\nOUTPUT: '%s'" % (latex, mathml) return "" # Straight embedding. Requires parent document to be XHTML. return mathml class WPHTMLWriter(mwlib.htmlwriter.HTMLWriter): """Customizes HTML output from mwlib.""" def __init__(self, index, wfile, images=None, lang='en'): self.index = index self.gallerylevel = 0 self.lang = lang math_renderer = WPMathRenderer() mwlib.htmlwriter.HTMLWriter.__init__(self, wfile, images, math_renderer=math_renderer) def writeLink(self, obj): if obj.target is None: return article = obj.target # Parser appending '/' characters to link targets for some reason. article = article.rstrip('/') title = article title = title[0].capitalize() + title[1:] title = title.replace("_", " ") article_exists = title.encode('utf8') in self.index if article_exists: # Exact match. Internal link. LinkStats.allhits += 1 LinkStats.alltotal += 1 LinkStats.pagehits += 1 LinkStats.pagetotal += 1 link_attr = '' link_baseurl = '/wiki/' else: # No match. External link. Use {lang}.wikipedia.org. # FIXME: Decide between {lang}.w.o and schoolserver. LinkStats.alltotal += 1 LinkStats.pagetotal += 1 link_attr = "class='offsite' " link_baseurl = 'http://' + self.lang + '.wikipedia.org/wiki/' parts = article.encode('utf-8').split('#') parts[0] = parts[0].replace(" ", "_") url = ("#".join([x for x in parts])) self.out.write("" % (link_attr, link_baseurl, url)) if obj.children: for x in obj.children: self.write(x) else: self._write(obj.target) self.out.write("") def writeImageLink(self, obj): if self.images is None: return width = obj.width height = obj.height if width and height: path = self.images.getPath(obj.target, size=max(width, height)) url = self.images.getURL(obj.target, size=max(width, height)) else: path = self.images.getPath(obj.target) url = self.images.getURL(obj.target) if url is None: return # The following HTML generation code is copied closely from InstaView, which seems to # approximate the nest of

tags needed to render images close to right. # It's also been extended to support Gallery tags. if self.imglevel==0: self.imglevel += 1 align = obj.align thumb = obj.thumb frame = obj.frame caption = obj.caption # SVG images must be included using

" + ( self.resultstitle % title.encode('utf8') ) + "