#!/usr/bin/env python # -*- coding: utf-8 -*- # # Copyright (C) 2007, One Laptop Per Child # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA # # Web server script for Wikiserver project. # # Usage: server.py # from __future__ import with_statement import sys import os import subprocess import select import codecs from StringIO import StringIO import BaseHTTPServer from SimpleHTTPServer import SimpleHTTPRequestHandler import cgi import errno import urllib import tempfile import re import wp import xml.dom.minidom from pylru import lrudecorator # Uncomment to print out a large dump from the template expander. #os.environ['DEBUG_EXPANDER'] = '1' try: from hashlib import md5 except ImportError: from md5 import md5 import mwlib.htmlwriter from mwlib import parser, scanner, expander class MyHTTPServer(BaseHTTPServer.HTTPServer): def serve_forever(self, poll_interval=0.5): """Overridden version of BaseServer.serve_forever that does not fail to work when EINTR is received. """ self._BaseServer__serving = True self._BaseServer__is_shut_down.clear() while self._BaseServer__serving: # XXX: Consider using another file descriptor or # connecting to the socket to wake this up instead of # polling. Polling reduces our responsiveness to a # shutdown request and wastes cpu at all other times. try: r, w, e = select.select([self], [], [], poll_interval) except select.error, e: if e[0] == errno.EINTR: print "got eintr" continue raise if r: self._handle_request_noblock() self._BaseServer__is_shut_down.set() class LinkStats: allhits = 1 alltotal = 1 pagehits = 1 pagetotal = 1 class ArticleIndex: # Prepare an in-memory index, using the already generated # index file. def __init__(self, path): self.article_index = set() with codecs.open(path, mode='r', encoding='utf-8') as f: for line in f.readlines(): m = re.search(r'(.*?)\s*\d+$', line) if m is None: raise AssertionError("Match didn't work") self.article_index.add(m.group(1)) def __contains__(self, x): return x in self.article_index class WPWikiDB: """Retrieves article contents for mwlib.""" def __init__(self, lang, templateprefix, templateblacklist): self.lang = lang self.templateprefix = templateprefix self.templateblacklist = templateblacklist def getRawArticle(self, title, followRedirects=True): # Retrieve article text, recursively following #redirects. if title == '': return '' oldtitle = "" while True: # Replace underscores with spaces in title. title = title.replace("_", " ") # Capitalize the first letter of the article -- Trac #6991. title = title[0].capitalize() + title[1:] if title == oldtitle: article_text = "" break article_text = wp_load_article(title.encode('utf8')) if article_text == None: # something's wrong return None article_text = unicode(article_text, 'utf8') # To see unmodified article_text, uncomment here. # print article_text if not followRedirects: break m = re.match(r'^\s*\#?redirect\s*\:?\s*\[\[(.*)\]\]', article_text, re.IGNORECASE|re.MULTILINE) if not m: break oldtitle = title title = m.group(1) # Stripping leading & trailing whitespace fixes template expansion. article_text = article_text.lstrip() article_text = article_text.rstrip() return article_text def getTemplate(self, title, followRedirects=False): return self.getRawArticle(title) def expandArticle(self, article_text, title): template_expander = expander.Expander(article_text, pagename=title, wikidb=self, lang=self.lang, templateprefix = self.templateprefix, templateblacklist = self.templateblacklist) return template_expander.expandTemplates() def getExpandedArticle(self, title): return self.expandArticle(self.getRawArticle(title), title) class WPImageDB: """Retrieves images for mwlib.""" def __init__(self, basepath): self.basepath = basepath def hashpath(self, name): name = name.replace(' ', '_') name = name[:1].upper()+name[1:] d = md5(name.encode('utf-8')).hexdigest() return "/".join([d[0], d[:2], name]) def getPath(self, name, size=None): hashed_name = self.hashpath(name).encode('utf8') path = self.basepath + '/%s' % hashed_name #print "getPath: %s -> %s" % (name.encode('utf8'), path.encode('utf8')) return path def getURL(self, name, size=None): hashed_name = self.hashpath(name).encode('utf8') if os.path.exists(self.basepath + hashed_name): url = '/' + self.basepath + hashed_name else: url = 'http://upload.wikimedia.org/wikipedia/commons/' + hashed_name #print "getUrl: %s -> %s" % (name.encode('utf8'), url.encode('utf8')) return url class HTMLOutputBuffer: """Buffers output and converts to utf8 as needed.""" def __init__(self): self.buffer = '' def write(self, obj): if isinstance(obj, unicode): self.buffer += obj.encode('utf8') else: self.buffer += obj def getvalue(self): return self.buffer class WPMathRenderer: def render(self, latex): process = subprocess.Popen(('bin/blahtex', '--mathml', '--texvc-compatible-commands'), stdin=subprocess.PIPE, stdout=subprocess.PIPE) (mathml, err) = process.communicate(latex.encode('utf8')) if process.returncode is not 0: return "" # Ugly! There is certainly a better way to do this, but my DOM skills are weak, and this works. try: dom = xml.dom.minidom.parseString(mathml) dom = dom.getElementsByTagName('blahtex')[0] dom = dom.getElementsByTagName('mathml')[0] dom = dom.getElementsByTagName('markup')[0] mathml = dom.toxml() mathml = mathml.replace('markup', 'math xmlns="http://www.w3.org/1998/Math/MathML" display="inline"') dom.unlink() except: print "BLAHTEX XML PARSING FAILED:\nINPUT: '%s'\nOUTPUT: '%s'" % (latex, mathml) return "" # Straight embedding. Requires parent document to be XHTML. return mathml class WPHTMLWriter(mwlib.htmlwriter.HTMLWriter): """Customizes HTML output from mwlib.""" def __init__(self, index, wfile, images=None, lang='en'): self.index = index self.gallerylevel = 0 self.lang = lang math_renderer = WPMathRenderer() mwlib.htmlwriter.HTMLWriter.__init__(self, wfile, images, math_renderer=math_renderer) def writeLink(self, obj): if obj.target is None: return article = obj.target # Parser appending '/' characters to link targets for some reason. article = article.rstrip('/') title = article title = title[0].capitalize() + title[1:] title = title.replace("_", " ") article_exists = title.encode('utf8') in self.index if article_exists: # Exact match. Internal link. LinkStats.allhits += 1 LinkStats.alltotal += 1 LinkStats.pagehits += 1 LinkStats.pagetotal += 1 link_attr = '' link_baseurl = '/wiki/' else: # No match. External link. Use {lang}.wikipedia.org. # FIXME: Decide between {lang}.w.o and schoolserver. LinkStats.alltotal += 1 LinkStats.pagetotal += 1 link_attr = "class='offsite' " link_baseurl = 'http://' + self.lang + '.wikipedia.org/wiki/' parts = article.encode('utf-8').split('#') parts[0] = parts[0].replace(" ", "_") url = ("#".join([x for x in parts])) self.out.write("" % (link_attr, link_baseurl, url)) if obj.children: for x in obj.children: self.write(x) else: self._write(obj.target) self.out.write("") def writeImageLink(self, obj): if self.images is None: return width = obj.width height = obj.height if width and height: path = self.images.getPath(obj.target, size=max(width, height)) url = self.images.getURL(obj.target, size=max(width, height)) else: path = self.images.getPath(obj.target) url = self.images.getURL(obj.target) if url is None: return # The following HTML generation code is copied closely from InstaView, which seems to # approximate the nest of
tags needed to render images close to right. # It's also been extended to support Gallery tags. if self.imglevel==0: self.imglevel += 1 align = obj.align thumb = obj.thumb frame = obj.frame caption = obj.caption # SVG images must be included using rather than . if re.match(r'.*\.svg$', url, re.IGNORECASE): tag = 'object' ref = 'data' else: tag = 'img' ref = 'src' # Hack to get galleries to look okay, in the absence of image dimensions. if self.gallerylevel > 0: width = 120 if thumb and not width: width = 180 #FIXME: This should not be hardcoded attr = '' if width: attr += 'width="%d" ' % width img = '<%(tag)s %(ref)s="%(url)s" longdesc="%(caption)s" %(attr)s>' % \ {'tag':tag, 'ref':ref, 'url':url, 'caption':caption, 'attr':attr} center = False if align == 'center': center = True align = None if center: self.out.write('
'); if self.gallerylevel > 0: self.out.write('
') self.out.write('
') self.out.write('
') self.out.write('' % (url, caption)) self.out.write(img) self.out.write('') self.out.write('
') self.out.write('
') self.out.write('
') self.out.write('

') for x in obj.children: self.write(x) self.out.write('

') self.out.write('
') self.out.write('
') elif frame or thumb: if not align: align = "right" self.out.write('
' % align) if not width: width = 180 # default thumb width self.out.write('
' % (int(width)+2)) if thumb: self.out.write(img) self.out.write('
') self.out.write('
') self.out.write('' % url) self.out.write('') self.out.write('') self.out.write('
') for x in obj.children: self.write(x) self.out.write('
') else: self.out.write(img) self.out.write('
') for x in obj.children: self.write(x) self.out.write('
') self.out.write('
') self.out.write('
') elif align: self.out.write('
' % align) self.out.write(img) self.out.write('
') else: self.out.write(img) if center: self.out.write('
'); self.imglevel -= 1 else: self.out.write('' % url.encode('utf8')) for x in obj.children: self.write(x) self.out.write('') def writeTagNode(self, t): if t.caption == 'gallery': self.out.write('') self.gallerylevel += 1 # TODO: More than one row. self.out.write('') for x in t.children: self.out.write('') self.out.write('') self.gallerylevel -= 1 self.out.write('') else: # All others handled by base class. mwlib.htmlwriter.HTMLWriter.writeTagNode(self, t) class WikiRequestHandler(SimpleHTTPRequestHandler): def __init__(self, index, conf, request, client_address, server): # pullcord is currently offline # self.reporturl = 'pullcord.laptop.org:8000' self.reporturl = False self.index = index self.port = conf['port'] self.lang = conf['lang'] self.flang = conf['flang'] self.templateprefix = conf['templateprefix'] self.templateblacklist = set() self.imgbasepath = self.flang + '/images/' self.wpheader = conf['wpheader'] self.wpfooter = conf['wpfooter'] self.resultstitle = conf['resultstitle'] if conf.has_key('editdir'): self.editdir = conf['editdir'] else: self.editdir = False if conf.has_key('giturl'): self.giturl = conf['giturl'] else: self.giturl = False self.wikidb = WPWikiDB(self.lang, self.templateprefix, self.templateblacklist) self.client_address = client_address SimpleHTTPRequestHandler.__init__( self, request, client_address, server) def get_wikitext(self, title): article_text = self.wikidb.getRawArticle(title) if self.editdir: edited = self.get_editedarticle(title) if edited: article_text = edited # Pass ?override=1 in the url to replace wikitext for testing the renderer. if self.params.get('override', 0): override = codecs.open('override.txt', 'r', 'utf-8') article_text = override.read() override.close() # Pass ?noexpand=1 in the url to disable template expansion. if not self.params.get('noexpand', 0) \ and not self.params.get('edit', 0): article_text = self.wikidb.expandArticle(article_text, title) return article_text def write_wiki_html(self, htmlout, title, article_text): tokens = scanner.tokenize(article_text, title) wiki_parsed = parser.Parser(tokens, title).parse() wiki_parsed.caption = title imagedb = WPImageDB(self.flang + '/images/') writer = WPHTMLWriter(self.index, htmlout, images=imagedb, lang=self.lang) writer.write(wiki_parsed) def send_article(self, title): article_text = self.get_wikitext(title) # Capitalize the first letter of the article -- Trac #6991. title = title[0].capitalize() + title[1:] # Replace underscores with spaces in title. title = title.replace("_", " ") # Redirect to Wikipedia if the article text is empty (e.g. an image link) if article_text == "": self.send_response(301) self.send_header("Location", 'http://' + self.lang + '.wikipedia.org/wiki/' + title.encode('utf8')) self.end_headers() return # Pass ?raw=1 in the URL to see the raw wikitext (post expansion, unless noexpand=1 is also set). if self.params.get('raw', 0): self.send_response(200) self.send_header("Content-Type", "text/plain; charset=utf-8") self.end_headers() self.wfile.write(article_text.encode('utf8')) elif self.params.get('edit', 0): self.send_response(200) self.send_header("Content-Type", "text/html; charset=utf-8") self.end_headers() self.wfile.write('
') # self.wfile.write('User:
') # self.wfile.write('Comment:
') self.wfile.write('
') self.wfile.write('
") else: htmlout = HTMLOutputBuffer() self.send_response(200) self.send_header("Content-Type", "text/xml; charset=utf-8") self.end_headers() htmlout.write( ''\ ' ]> ') htmlout.write(' ') htmlout.write("") htmlout.write("%s" % title.encode('utf8')) htmlout.write("") htmlout.write("") htmlout.write("") htmlout.write("

") htmlout.write(title) htmlout.write(' · '+ self.wpheader + ' ') if self.reporturl: # Report rendering problem. htmlout.write('· Haz clic aquí si esta página contiene errores de presentación ') # Report inappropriate content. htmlout.write(' · Esta página contiene material inapropiado') if self.editdir: htmlout.write(' · [ Editar ]') htmlout.write(' · [ Vista OK ]') if self.giturl: htmlout.write(' · [ Historial ]') htmlout.write("") htmlout.write('

') self.write_wiki_html(htmlout, title, article_text) htmlout.write('
' + self.wpfooter + '
') htmlout.write("") htmlout.write("") html = htmlout.getvalue() # Fix any non-XHTML tags using tidy. process = subprocess.Popen(('bin/tidy', '-q', '-config', 'bin/tidy.conf', '-numeric', '-utf8', '-asxhtml'), stdin=subprocess.PIPE, stdout=subprocess.PIPE) (xhtml, err) = process.communicate(html) if len(xhtml): html = xhtml else: print "FAILED to tidy '%s'" % title self.wfile.write(html) def do_POST(self): real_path = urllib.unquote(self.path) real_path = unicode(real_path, 'utf8') (real_path, sep, param_text) = real_path.partition('?') # Wiki requests return article contents or redirect to Wikipedia. m = re.match(r'^/wiki/(.+)$', real_path) if self.editdir and m: title = m.group(1) self._save_page(title) self.send_response(200) self.send_header("Content-Type", "text/html; charset=utf-8") self.end_headers() htmlout = HTMLOutputBuffer() htmlout.write(title.encode('utf8')) self.wfile.write('Editado: ') self.wfile.write('') self.wfile.write(htmlout.getvalue()) self.wfile.write('') return # Any other request redirects to the index page. self.send_response(301) self.send_header("Location", "/static/") self.end_headers() def _save_page(self, title): formdata = cgi.FieldStorage(fp=self.rfile, headers=self.headers, environ = {'REQUEST_METHOD':'POST'}, keep_blank_values = 1) user = formdata.getfirst('user') comment = formdata.getfirst('comment') wmcontent = formdata.getfirst('wmcontent') # fix newlines wmcontent = re.sub('\r', '', wmcontent) fpath = self.getfpath('wiki', title) # UGLY: racy. if not os.path.exists(fpath): self._saveorig(title) (fh, tmpfpath) = tempfile.mkstemp(dir=os.path.dirname(fpath)) os.write(fh, wmcontent) os.close(fh) os.rename(tmpfpath, fpath) return True def getfpath(self, dir, title): # may want to hash it fpath = os.path.join(self.editdir, dir, title) return fpath def _saveorig(self, title): article_text = self.wikidb.getRawArticle(title) fpath = self.getfpath('wiki.orig', title) fh = codecs.open(fpath, 'w', encoding='utf-8') fh.write(article_text) fh.close() def get_editedarticle(self, title): buf = None fpath = self.getfpath('wiki', title) if os.path.exists(fpath): buf = codecs.open(fpath, 'r', encoding='utf-8').read() return buf def send_searchresult(self, title): self.send_response(200) self.send_header("Content-Type", "text/html; charset=utf-8") self.end_headers() self.wfile.write("" + ( self.resultstitle % title.encode('utf8') ) + "") self.wfile.write("") self.wfile.write("") self.wfile.write("") self.wfile.write("

" + ( self.resultstitle % title.encode('utf8') ) + "

") self.wfile.write("
") self.wfile.write("") def send_image(self, path): if os.path.exists(self.imgbasepath + path.encode('utf8')): # If image exists locally, serve it as normal. SimpleHTTPRequestHandler.do_GET(self) else: # If not, redirect to wikimedia. redirect_url = "http://upload.wikimedia.org/wikipedia/commons/%s" \ % path.encode('utf8') self.send_response(301) self.send_header("Location", redirect_url.encode('utf8')) self.end_headers() def handle_feedback(self, feedtype, article): with codecs.open("feedback.log", "a", "utf-8") as f: f.write(feedtype +"\t"+ article +"\t" + self.client_address[0] +"\n") f.close() self.send_response(200) self.send_header("Content-Type", "text/html; charset=utf-8") self.end_headers() if feedtype == "render": strtype = "un error de presentación" elif feedtype == "report": strtype = "material inapropriado" self.wfile.write("Comentario recibidoGracias por reportar %s en la pagina %s." % (strtype, article.encode('utf8'))) def do_GET(self): real_path = urllib.unquote(self.path) real_path = unicode(real_path, 'utf8') (real_path, sep, param_text) = real_path.partition('?') self.params = {} for p in param_text.split('&'): (key, sep, value) = p.partition('=') self.params[key] = value # Wiki requests return article contents or redirect to Wikipedia. m = re.match(r'^/wiki/(.+)$', real_path) if m: self.send_article(m.group(1)) return # Search requests return search results. m = re.match(r'^/search$', real_path) if m: self.send_searchresult(self.params.get('q', '')) return # Image requests are handled locally or are referenced from Wikipedia. # matches /es_PE/images/, /en_US/images/ etc m = re.match(r'^/\w\w_\w\w/images/(.+)$', real_path) if m: self.send_image(m.group(1)) return # Static requests handed off to SimpleHTTPServer. m = re.match(r'^/(static|generated)/(.*)$', real_path) if m: SimpleHTTPRequestHandler.do_GET(self) return # Feedback links. m = re.match(r'^/(report|render)$', real_path) if m: self.handle_feedback(m.group(1), self.params.get('q', '')) return # Any other request redirects to the index page. self.send_response(301) self.send_header("Location", "/static/") self.end_headers() def load_db(dbname): wp.wp_load_dump( dbname + '.processed', dbname + '.locate.db', dbname + '.locate.prefixdb', dbname + '.blocks.db') # Cache articles and specially templates @lrudecorator(100) def wp_load_article(title): return wp.wp_load_article(title) def run_server(confvars): index = ArticleIndex('%s.index.txt' % confvars['path']) if confvars.has_key('editdir'): try: for dir in ['wiki', 'wiki.orig']: fdirpath = os.path.join(confvars['editdir'], dir) if not os.path.exists(fdirpath): os.mkdir(fdirpath) except: print "Error setting up directories:" print "%s must be a writable directory" % confvars['editdir'] blacklistpath = os.path.join(os.path.dirname(confvars['path']), 'template_blacklist') blacklist = set() if os.path.exists(blacklistpath): with open(blacklistpath, 'r') as f: for line in f.readlines(): blacklist.add(line.rstrip().decode('utf8')) confvars['templateblacklist'] = blacklist confvars['lang'] = os.path.basename(confvars['path'])[0:2] confvars['flang'] = os.path.basename(confvars['path'])[0:5] ## FIXME GETTEXT templateprefixes = { 'en': 'Template:', 'es': 'Plantilla:' } wpheader = {'en': 'From Wikipedia, The Free Encyclopedia', 'es': 'De Wikipedia, la enciclopedia libre'} wpfooter = {'en': 'Content available under the GNU Free Documentation License.
Wikipedia is a registered trademark of the non-profit Wikimedia Foundation, Inc. ', 'es': 'Contenido disponible bajo los términos de la Licencia de documentación libre de GNU.
Wikipedia es una marca registrada de la organización sin ánimo de lucro Wikimedia Foundation, Inc.
Acerca de Wikipedia'} resultstitle = { 'en': "Search results for '%s'.", 'es': "Resultados de la búsqueda sobre '%s'." } confvars['templateprefix'] = templateprefixes[ confvars['lang'] ] confvars['wpheader'] = wpheader[ confvars['lang'] ] confvars['wpfooter'] = wpfooter[ confvars['lang'] ] confvars['resultstitle'] = resultstitle[confvars['lang']] httpd = MyHTTPServer(('', confvars['port']), lambda *args: WikiRequestHandler(index, confvars, *args)) if __name__ == '__main__': httpd.serve_forever() else: from threading import Thread server = Thread(target=httpd.serve_forever) server.setDaemon(True) server.start() # Tell the world that we're ready to accept request. print 'ready' if __name__ == '__main__': conf = {'path': sys.argv[1], 'port': int(sys.argv[2])} if len(sys.argv) > 3: conf['editdir'] = sys.argv[3] if len(sys.argv) > 4: conf['giturl'] = sys.argv[4] load_db(conf['path']) run_server(conf)