#!/usr/bin/env python # -*- coding: utf-8 -*- # # Copyright (C) 2007, One Laptop Per Child # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA # # Web server script for Wikiserver project. # # Usage: server.py # ## Standard libs from __future__ import with_statement import logging import sys import os import platform import select import codecs import BaseHTTPServer from SimpleHTTPServer import SimpleHTTPRequestHandler import SocketServer import socket import cgi import errno import urllib import tempfile import re try: from hashlib import md5 except ImportError: from md5 import md5 import dataretriever import pylru import simplejson ## ## Libs we ship -- add lib path for ## shared objects ## _root_path = os.path.dirname(__file__) # linux32_27" for Linux 32bits Python 2.7 system_id = "%s%s" % (platform.system().lower(), platform.architecture()[0][0:2]) if platform.processor().startswith('arm'): system_id = platform.processor() platform_dir = "%s_%s%s" % (system_id, sys.version_info[0], # major sys.version_info[1]) # minor sys.path.append(os.path.join(_root_path, 'binarylibs', platform_dir)) import mwlib.htmlwriter from mwlib import parser, scanner, expander # Uncomment to print out a large dump from the template expander. #os.environ['DEBUG_EXPANDER'] = '1' class MyHTTPServer(BaseHTTPServer.HTTPServer): def serve_forever(self, poll_interval=0.5): """Overridden version of BaseServer.serve_forever that does not fail to work when EINTR is received. """ self._BaseServer__serving = True self._BaseServer__is_shut_down.clear() while self._BaseServer__serving: # XXX: Consider using another file descriptor or # connecting to the socket to wake this up instead of # polling. Polling reduces our responsiveness to a # shutdown request and wastes cpu at all other times. try: r, w, e = select.select([self], [], [], poll_interval) except select.error, e: if e[0] == errno.EINTR: logging.debug("got eintr") continue raise if r: self._handle_request_noblock() self._BaseServer__is_shut_down.set() def server_bind(self): """Override server_bind in HTTPServer to not use getfqdn to get the server name because is very slow.""" SocketServer.TCPServer.server_bind(self) host, port = self.socket.getsockname()[:2] self.server_name = 'localhost' self.server_port = port class WPWikiDB: """Retrieves article contents for mwlib.""" def __init__(self, path, lang, templateprefix, templateblacklist): self.lang = lang self.templateprefix = templateprefix self.templateblacklist = templateblacklist self.dataretriever = dataretriever.DataRetriever(system_id, path) self.templates_cache = {'!' : '|', u'!': '|'} # a special case def getRawArticle(self, title, followRedirects=True): # Retrieve article text, recursively following #redirects. if title == '': return '' article_text = \ self.dataretriever.get_text_article(title).decode('utf-8') # Stripping leading & trailing whitespace fixes template expansion. article_text = article_text.lstrip() article_text = article_text.rstrip() return article_text def getTemplate(self, title, followRedirects=False): if title in self.templates_cache: return self.templates_cache[title] else: try: template_content = self.getRawArticle(title) # check recursion in templates template_name = title[title.find(':') + 1:] # Remove because expandtemplates doesn't detect it # and follow recursions lower_content = template_content.lower() start_noinclude = lower_content.find('') while start_noinclude > -1: end_noinclude = lower_content.find('') content = template_content[:start_noinclude] if end_noinclude > -1: content = content + template_content[end_noinclude + \ len(''):] template_content = content lower_content = template_content.lower() start_noinclude = lower_content.find('') if re.search('{{' + template_name, template_content, \ re.IGNORECASE) is not None: logging.error("Found recursion template %s" % title) template_content = re.sub(template_name, '_not_found_', template_content, re.IGNORECASE) # Search again if re.search('{{' + template_name, template_content, \ re.IGNORECASE) is not None: template_content = '' except: template_content = '' self.templates_cache[title] = template_content return template_content def expandArticle(self, article_text, title): template_expander = expander.Expander(article_text, pagename=title, wikidb=self, lang=self.lang, templateprefix=self.templateprefix, templateblacklist=self.templateblacklist) expanded_article = template_expander.expandTemplates() return expanded_article def getExpandedArticle(self, title): return self.expandArticle(self.getRawArticle(title), title) class WPImageDB: """Retrieves images for mwlib.""" def __init__(self, basepath): self.basepath = basepath def hashpath(self, name): name = name.replace(' ', '_') name = name[:1].upper() + name[1:] d = md5(name.encode('utf-8')).hexdigest() return "/".join([d[0], d[:2], name]) def hashpath_dir(self, name): name = name.replace(' ', '_') name = name[:1].upper() + name[1:] d = md5(name.encode('utf-8')).hexdigest() return "/".join([d[0], d[:2]]) def getPath(self, name, size=None): hashed_name = self.hashpath(name).encode('utf8') path = self.basepath + '/%s' % hashed_name return path def getURL(self, name, size=None): hashed_name = self.hashpath(name).encode('utf8') if size is not None: file_name = self.basepath + self.hashpath_dir(name) + '/' + \ ('%dpx-' % size) + name.replace(' ', '_') else: file_name = self.basepath + self.hashpath_dir(name) + '/' + \ name.replace(' ', '_') if os.path.exists(file_name): url = '/' + file_name else: if size is None: url = 'http://upload.wikimedia.org/wikipedia/commons/' + \ hashed_name else: url = 'http://upload.wikimedia.org/wikipedia/commons/thumb/' \ + hashed_name + ('/%dpx-' % size) + name.replace(' ', '_') if re.match(r'.*\.svg$', url, re.IGNORECASE): url = url + '.png' #print "getUrl: %s -> %s" % (name.encode('utf8'), url.encode('utf8')) return url class HTMLOutputBuffer: """Buffers output and converts to utf8 as needed.""" def __init__(self): self.buffer = '' def write(self, obj): if isinstance(obj, unicode): self.buffer += obj.encode('utf8') else: self.buffer += obj def getvalue(self): return self.buffer class WPMathRenderer: def __init__(self, html_writer): self.writer = html_writer def render(self, latex): logging.debug("MathRenderer %s" % latex) latex = latex.replace('\f', '\\f') latex = latex.replace('\t', '\\t') # \bold gives a error latex = latex.replace('\\bold', '') # postpone the process to do it with javascript at client side mathml = '' self.writer.math_processed = True return mathml class WPHTMLWriter(mwlib.htmlwriter.HTMLWriter): """Customizes HTML output from mwlib.""" def __init__(self, dataretriever, wfile, images=None, lang='en'): self.dataretriever = dataretriever self.gallerylevel = 0 self.lang = lang self.math_processed = False self.links_list = [] math_renderer = WPMathRenderer(self) mwlib.htmlwriter.HTMLWriter.__init__(self, wfile, images, math_renderer=math_renderer) def writeLink(self, obj): if obj.target is None: return article = obj.target #print "writeLink", article, obj.caption if article.startswith('#'): #print "----> " % article self.out.write("" % article) else: # Parser appending '/' characters to link targets for some reason. article = article.rstrip('/') title = article title = title[0].capitalize() + title[1:] title = title.replace("_", " ") self.links_list.append(article) parts = article.encode('utf-8').split('#') parts[0] = parts[0].replace(" ", "_") url = ("#".join([x for x in parts])) self.out.write("" % url) if obj.children: for x in obj.children: self.write(x) else: self._write(obj.target) self.out.write("") def writeImageLink(self, obj): if self.images is None: return width = obj.width height = obj.height is_svg = re.match(r'.*\.svg$', obj.target, re.IGNORECASE) is_thumb = obj.thumb or obj.frame or (self.gallerylevel > 0) if (width or height) or is_thumb: max_length = max(width, height) if obj.thumb: max_length = 180 if self.gallerylevel > 0: max_length = 120 path = self.images.getPath(obj.target, size=max_length) url_thumb = self.images.getURL(obj.target, size=max_length) url = self.images.getURL(obj.target) else: path = self.images.getPath(obj.target) url_thumb = self.images.getURL(obj.target) url = url_thumb if url_thumb is None: return # The following HTML generation code is copied closely from InstaView, # which seems to approximate the nest of
tags needed to render # images close to right. # It's also been extended to support Gallery tags. if self.imglevel == 0: self.imglevel += 1 align = obj.align thumb = obj.thumb frame = obj.frame caption = obj.caption # SVG images must be included using rather than # . if re.match(r'.*\.svg$', url_thumb, re.IGNORECASE): tag = 'object' ref = 'data' else: tag = 'img' ref = 'src' # Hack to get galleries to look okay, in the absence of image # dimensions. if self.gallerylevel > 0: width = 120 if thumb and not width: width = 180 # FIXME: This should not be hardcoded attr = '' if width: attr += 'width="%d" ' % width img = '<%(tag)s %(ref)s="%(url)s" longdesc="%(cap)s" %(att)s>' % \ {'tag': tag, 'ref': ref, 'url': url_thumb, 'cap': caption, 'att': attr} + '' % {'tag': tag} center = False if align == 'center': center = True align = None if center: self.out.write('
') if self.gallerylevel > 0: self.out.write('
') self.out.write('
') self.out.write('
') self.out.write('' % (url, caption)) self.out.write(img) self.out.write('') self.out.write('
') self.out.write('
') self.out.write('
') self.out.write('

') for x in obj.children: self.write(x) self.out.write('

') self.out.write('
') self.out.write('
') elif frame or thumb: if not align: align = "right" self.out.write('
' % align) if not width: width = 180 # default thumb width self.out.write('
' % (int(width) + 2)) if thumb: self.out.write(img) self.out.write('
') self.out.write('
') self.out.write('') self.out.write('' + '') self.out.write('') self.out.write('
') for x in obj.children: self.write(x) self.out.write('
') else: self.out.write(img) self.out.write('
') for x in obj.children: self.write(x) self.out.write('
') self.out.write('
') self.out.write('
') elif align: self.out.write('
' % align) self.out.write(img) self.out.write('
') else: self.out.write(img) if center: self.out.write('
') self.imglevel -= 1 else: self.out.write('' % url.encode('utf8')) for x in obj.children: self.write(x) self.out.write('') def writeTagNode(self, t): if t.caption == 'gallery': self.out.write('') self.gallerylevel += 1 # TODO: More than one row. self.out.write('') for x in t.children: self.out.write('') self.out.write('') self.gallerylevel -= 1 self.out.write('') else: # All others handled by base class. mwlib.htmlwriter.HTMLWriter.writeTagNode(self, t) class WikiRequestHandler(SimpleHTTPRequestHandler): def __init__(self, wikidb, conf, links_cache, request, client_address, server): # pullcord is currently offline # self.reporturl = 'pullcord.laptop.org:8000' self.reporturl = False self.port = conf['port'] self.lang = conf['lang'] self.templateprefix = conf['templateprefix'] self.templateblacklist = set(conf['templateblacklist']) self.wpheader = conf['wpheader'] self.wpfooter = conf['wpfooter'] self.resultstitle = conf['resultstitle'] self.base_path = os.path.dirname(conf['path']) self.links_cache = links_cache if 'editdir' in conf: self.editdir = conf['editdir'] else: self.editdir = False if 'giturl' in conf: self.giturl = conf['giturl'] else: self.giturl = False self.wikidb = wikidb self.client_address = client_address SimpleHTTPRequestHandler.__init__( self, request, client_address, server) def get_wikitext(self, title): article_text = self.wikidb.getRawArticle(title) #print article_text if self.editdir: edited = self.get_editedarticle(title) if edited: article_text = edited # Pass ?override=1 in the url to replace wikitext for testing # the renderer. if self.params.get('override', 0): override = codecs.open('override.txt', 'r', 'utf-8') article_text = override.read() override.close() # Pass ?noexpand=1 in the url to disable template expansion. if not self.params.get('noexpand', 0) \ and not self.params.get('edit', 0): article_text = self.wikidb.expandArticle(article_text, title) return article_text def write_wiki_html(self, htmlout, title, article_text): tokens = scanner.tokenize(article_text, title) wiki_parsed = parser.Parser(tokens, title).parse() wiki_parsed.caption = title imagedb = WPImageDB(self.base_path + '/images/') writer = WPHTMLWriter(self.wikidb.dataretriever, htmlout, images=imagedb, lang=self.lang) writer.write(wiki_parsed) self.links_cache[title] = writer.links_list return writer.math_processed def send_article(self, title): article_text = self.get_wikitext(title) # Capitalize the first letter of the article -- Trac #6991. title = title[0].capitalize() + title[1:] # Replace underscores with spaces in title. title = title.replace("_", " ") # Redirect to Wikipedia if the article text is empty # (e.g. an image link) if article_text == "": self.send_response(301) self.send_header("Location", 'http://' + self.lang + '.wikipedia.org/wiki/' + title.encode('utf8')) self.end_headers() return # Pass ?raw=1 in the URL to see the raw wikitext (post expansion, # unless noexpand=1 is also set). if self.params.get('raw', 0): self.send_response(200) self.send_header("Content-Type", "text/plain; charset=utf-8") self.end_headers() self.wfile.write(article_text.encode('utf8')) elif self.params.get('edit', 0): self.send_response(200) self.send_header("Content-Type", "text/html; charset=utf-8") self.end_headers() self.wfile.write('
') self.wfile.write('
') self.wfile.write('
") else: htmlout = HTMLOutputBuffer() self.send_response(200) self.send_header("Content-Type", "text/html; charset=utf-8") self.end_headers() htmlout.write(' ') htmlout.write("") htmlout.write("%s" % title.encode('utf8')) htmlout.write("") htmlout.write("") htmlout.write("") htmlout.write("

") htmlout.write(title) htmlout.write(' · ' + self.wpheader + ' ') if self.reporturl: # Report rendering problem. htmlout.write('· Haz clic aquí si esta página contiene ' + 'errores de presentación ') # Report inappropriate content. htmlout.write(' · Esta página contiene material inapropiado' + '') if self.editdir: htmlout.write(' · [ Editar ]') htmlout.write(' · [ Vista OK ]') if self.giturl: htmlout.write(' · [ Historial ]') htmlout.write("") htmlout.write('

') needs_math = self.write_wiki_html(htmlout, title, article_text) if needs_math: # MathJs config htmlout.write('') htmlout.write("") # validate links self.write_process_links_js(htmlout, title) htmlout.write('
' + self.wpfooter + '
') htmlout.write("") htmlout.write("") html = htmlout.getvalue() self.wfile.write(html) def write_process_links_js(self, htmlout, title): """ write javascript to request a array of external links using ajax and compare with the links in the page, if one link is external change the url and the className """ htmlout.write("") def send_links(self, title): """ send a json array of string with the list of url not availables in the local database """ links = self.links_cache[title] # validate the links external_links = [] articles_found = self.wikidb.dataretriever.check_existence_list(links) for article in links: if not dataretriever.normalize_title(article) in articles_found: article = article.replace(" ", "_").encode('utf8') # needed to have the same format than url in the page # when is compared in javascript quoted = urllib.quote(article, safe='~@#$&()*!+=:;,.?/\'') external_links.append(quoted) self.send_response(200) self.send_header("Content-Type", "text/html; charset=utf-8") self.end_headers() self.wfile.write(simplejson.dumps(external_links)) def do_POST(self): real_path = urllib.unquote(self.path) real_path = unicode(real_path, 'utf8') (real_path, sep, param_text) = real_path.partition('?') # Wiki requests return article contents or redirect to Wikipedia. m = re.match(r'^/wiki/(.+)$', real_path) if self.editdir and m: title = m.group(1) self._save_page(title) self.send_response(200) self.send_header("Content-Type", "text/html; charset=utf-8") self.end_headers() htmlout = HTMLOutputBuffer() htmlout.write(title.encode('utf8')) self.wfile.write('Editado: ') self.wfile.write('') self.wfile.write(htmlout.getvalue()) self.wfile.write('') return # Any other request redirects to the index page. self.send_response(301) self.send_header("Location", "/static/") self.end_headers() def _save_page(self, title): formdata = cgi.FieldStorage(fp=self.rfile, headers=self.headers, environ={'REQUEST_METHOD': 'POST'}, keep_blank_values=1) user = formdata.getfirst('user') comment = formdata.getfirst('comment') wmcontent = formdata.getfirst('wmcontent') # fix newlines wmcontent = re.sub('\r', '', wmcontent) fpath = self.getfpath('wiki', title) # UGLY: racy. if not os.path.exists(fpath): self._saveorig(title) (fh, tmpfpath) = tempfile.mkstemp(dir=os.path.dirname(fpath)) os.write(fh, wmcontent) os.close(fh) os.rename(tmpfpath, fpath) return True def getfpath(self, dir, title): # may want to hash it fpath = os.path.join(self.editdir, dir, title) return fpath def _saveorig(self, title): article_text = self.wikidb.getRawArticle(title) fpath = self.getfpath('wiki.orig', title) fh = codecs.open(fpath, 'w', encoding='utf-8') fh.write(article_text) fh.close() def get_editedarticle(self, title): buf = None fpath = self.getfpath('wiki', title) if os.path.exists(fpath): buf = codecs.open(fpath, 'r', encoding='utf-8').read() return buf def send_searchresult(self, title): self.send_response(200) self.send_header("Content-Type", "text/html; charset=utf-8") self.end_headers() self.wfile.write("" + (self.resultstitle % title.encode('utf8')) + "") self.wfile.write("") self.wfile.write("") self.wfile.write("") self.wfile.write("

" + (self.resultstitle % title.encode('utf8')) + "

") self.wfile.write("
") self.wfile.write("") def search(self, article_title): return self.wikidb.dataretriever.search(article_title) def send_image(self, path): if os.path.exists(path.encode('utf8')[1:]): # If image exists locally, serve it as normal. SimpleHTTPRequestHandler.do_GET(self) else: # If not, redirect to wikimedia. redirect_url = "http://upload.wikimedia.org/wikipedia/commons/%s" \ % path.encode('utf8') self.send_response(301) self.send_header("Location", redirect_url.encode('utf8')) self.end_headers() def handle_feedback(self, feedtype, article): with codecs.open("feedback.log", "a", "utf-8") as f: f.write(feedtype + "\t" + article + "\t" + self.client_address[0] + "\n") f.close() self.send_response(200) self.send_header("Content-Type", "text/html; charset=utf-8") self.end_headers() if feedtype == "render": strtype = "un error de presentación" elif feedtype == "report": strtype = "material inapropriado" self.wfile.write("Comentario recibido" + "Gracias por reportar %s en la pagina %s." % (strtype, article.encode('utf8'))) def do_GET(self): real_path = urllib.unquote(self.path) real_path = unicode(real_path, 'utf8') (real_path, sep, param_text) = real_path.partition('?') self.params = {} for p in param_text.split('&'): (key, sep, value) = p.partition('=') self.params[key] = value # Wiki requests return article contents or redirect to Wikipedia. m = re.match(r'^/wiki/(.+)$', real_path) if m: self.send_article(m.group(1)) return # Search requests return search results. m = re.match(r'^/search$', real_path) if m: self.send_searchresult(self.params.get('q', '')) return # Image requests are handled locally or are referenced from Wikipedia. # matches /es_PE/images/, /en_US/images/ etc m = re.match(r'^/\w*/images/(.+)$', real_path) if m: self.send_image(real_path) return # Static requests handed off to SimpleHTTPServer. m = re.match(r'^/(static|generated)/(.*)$', real_path) if m: SimpleHTTPRequestHandler.do_GET(self) return # Handle link validation requests m = re.match(r'^/links/(.*)$', real_path) if m: self.send_links(m.group(1)) return # Feedback links. m = re.match(r'^/(report|render)$', real_path) if m: self.handle_feedback(m.group(1), self.params.get('q', '')) return # Any other request redirects to the index page. self.send_response(301) self.send_header("Location", "/static/") self.end_headers() def run_server(confvars): if 'editdir' in confvars: try: for dir in ['wiki', 'wiki.orig']: fdirpath = os.path.join(confvars['editdir'], dir) if not os.path.exists(fdirpath): os.mkdir(fdirpath) except: logging.error("Error setting up directories:") logging.debug("%s must be a writable directory" % confvars['editdir']) blacklistpath = os.path.join(os.path.dirname(confvars['path']), 'template_blacklist') logging.debug("Reading template_blacklist %s" % blacklistpath) blacklist = set() if os.path.exists(blacklistpath): with open(blacklistpath, 'r') as f: for line in f.readlines(): blacklist.add(line.rstrip().decode('utf8')) logging.debug("Read %d blacklisted templates" % len(blacklist)) confvars['templateblacklist'] = blacklist confvars['lang'] = confvars['path'][0:2] confvars['flang'] = os.path.basename(confvars['path'])[0:5] wikidb = WPWikiDB(confvars['path'], confvars['lang'], confvars['templateprefix'], confvars['templateblacklist']) links_cache = pylru.lrucache(10) httpd = MyHTTPServer(('', confvars['port']), lambda *args: WikiRequestHandler(wikidb, confvars, links_cache, *args)) if confvars['comandline']: httpd.serve_forever() else: from threading import Thread server = Thread(target=httpd.serve_forever) server.setDaemon(True) logging.debug("Before start server") server.start() logging.debug("After start server") # Tell the world that we're ready to accept request. logging.debug('Ready') if __name__ == '__main__': logging.error("Execute the starting class for your language wikipedia") logging.error("Ex: activity_es.py")