diff options
Diffstat (limited to 'server.py')
-rw-r--r-- | server.py | 642 |
1 files changed, 642 insertions, 0 deletions
diff --git a/server.py b/server.py new file mode 100644 index 0000000..882a53f --- /dev/null +++ b/server.py @@ -0,0 +1,642 @@ +# -*- coding: utf-8 -*- +# +# Web server script for Wikiserver project. +# +# Usage: server.py <dbfile> <port> +# +from __future__ import with_statement +import sys +import os +from StringIO import StringIO +import BaseHTTPServer +from SimpleHTTPServer import SimpleHTTPRequestHandler +import urllib +import re +import wp + +# Uncomment to print out a large dump from the template expander. +#os.environ['DEBUG_EXPANDER'] = '1' + +try: + from hashlib import md5 +except ImportError: + from md5 import md5 + +import mwlib.htmlwriter +from mwlib import parser, scanner, expander + +parsers = [ + '/js/wiki2html.js', + '/js/instaview-0.6.1.js', + '/js/instaview-0.6.4.js', + 'mwlib', +] + +default_parser = 3 + +class LinkStats: + allhits = 1 + alltotal = 1 + pagehits = 1 + pagetotal = 1 + +class ArticleIndex: + # Prepare an in-memory index, using the already generated + # index file. + + def __init__(self, path): + self.article_index = set() + with open(path, 'r') as f: + for line in f.readlines(): + m = re.search(r'(.*?)\s*\d+$', line) + if m is None: + raise AssertionError("Match didn't work") + self.article_index.add(m.group(1)) + + def __contains__(self, x): + return x in self.article_index + +class WPWikiDB: + """Retrieves article contents for mwlib.""" + + def getRawArticle(self, title): + # Retrieve article text, recursively following #redirects. + while True: + # Capitalize the first letter of the article -- Trac #6991. + title = title[0].capitalize() + title[1:] + # Replace underscores with spaces in title. + title = title.replace("_", " ") + article_text = unicode(wp.wp_load_article(title.encode('utf8')), 'utf8') + + # To see unmodified article_text, uncomment here. + # print article_text + + m = re.match(r'^\s*\#?redirect\s*\:?\s*\[\[(.*)\]\]', article_text, re.IGNORECASE|re.MULTILINE) + if not m: break + title = m.group(1) + + # WTB: Stripping whitespace improves template expansion. + # TODO: Where is it coming from? + article_text = article_text.lstrip() + article_text = article_text.rstrip() + + return article_text + + def getTemplate(self, title, followRedirects=False): + return self.getRawArticle(title) + + def getExpandedArticle(self, title): + article_text = self.getRawArticle(title) + template_expander = expander.Expander(article_text, pagename=title, wikidb=self) + article_text = template_expander.expandTemplates() + return article_text + +class WPImageDB: + """Retrieves images for mwlib.""" + + def hashpath(self, name): + name = name.replace(' ', '_') + name = name[:1].upper()+name[1:] + d = md5(name.encode('utf-8')).hexdigest() + return "/".join([d[0], d[:2], name]) + + def getPath(self, name, size=None): + hashed_name = self.hashpath(name) + path = 'images/%s' % hashed_name + #print "getPath: %s -> %s" % (name.encode('utf8'), path.encode('utf8')) + return path + + def getURL(self, name, size=None): + hashed_name = self.hashpath(name) + if os.path.exists('images/' + hashed_name): + url = '/images/' + hashed_name + else: + url = 'http://upload.wikimedia.org/wikipedia/commons/' + hashed_name + #print "getUrl: %s -> %s" % (name.encode('utf8'), url.encode('utf8')) + return url + +class HTMLOutputBuffer: + """Buffers output and converts to utf8 as needed.""" + def __init__(self): + self.buffer = '' + + def write(self, obj): + if isinstance(obj, unicode): + self.buffer += str(obj).encode('utf8') + else: + self.buffer += str(obj) + + def getvalue(self): + return self.buffer + +class WPHTMLWriter(mwlib.htmlwriter.HTMLWriter): + """Customizes HTML output from mwlib.""" + + def __init__(self, index, wfile, images=None, math_renderer=None): + self.index = index + self.gallerylevel = 0 + mwlib.htmlwriter.HTMLWriter.__init__(self, wfile, images, math_renderer) + + def writeLink(self, obj): + if obj.target is None: + return + + article = obj.target + + # Parser appending '/' characters to link targets for some reason. + article = article.rstrip('/') + + title = article + title = title[0].capitalize() + title[1:] + title = title.replace("_", " ") + + #article_exists = wp.wp_article_exists(title.encode('utf8')) + article_exists = title.encode('utf8') in self.index + + if article_exists: + # Exact match. Internal link. + LinkStats.allhits += 1 + LinkStats.alltotal += 1 + LinkStats.pagehits += 1 + LinkStats.pagetotal += 1 + link_attr = '' + link_baseurl = '/wiki/' + else: + # No match. External link. Use es.wikipedia.org. + # FIXME: Decide between es.w.o and schoolserver. + LinkStats.alltotal += 1 + LinkStats.pagetotal += 1 + link_attr = "class='offsite' " + link_baseurl = "http://es.wikipedia.org/wiki/" + + parts = article.encode('utf-8').split('#') + parts[0] = parts[0].replace(" ", "_") + url = ("#".join([x for x in parts])) + + self.out.write("<a %s href='%s%s'>" % (link_attr, link_baseurl, url)) + + if obj.children: + for x in obj.children: + self.write(x) + else: + self._write(obj.target) + + self.out.write("</a>") + + def writeImageLink(self, obj): + if self.images is None: + return + + width = obj.width + height = obj.height + + if width and height: + path = self.images.getPath(obj.target, size=max(width, height)) + url = self.images.getURL(obj.target, size=max(width, height)) + else: + path = self.images.getPath(obj.target) + url = self.images.getURL(obj.target) + + if url is None: + return + + # The following HTML generation code is copied closely from InstaView, which seems to + # approximate the nest of <div> tags needed to render images close to right. + # It's also been extended to support Gallery tags. + if self.imglevel==0: + self.imglevel += 1 + + align = obj.align + thumb = obj.thumb + frame = obj.frame + caption = obj.caption + + # SVG images must be included using <object data=''> rather than <img src=''>. + if re.match(r'.*\.svg$', url, re.IGNORECASE): + tag = 'object' + ref = 'data' + else: + tag = 'img' + ref = 'src' + + # Hack to get galleries to look okay, in the absence of image dimensions. + if self.gallerylevel > 0: + width = 120 + + if thumb and not width: + width = 180 #FIXME: This should not be hardcoded + + attr = '' + if width: + attr += 'width="%d" ' % width + + img = '<%(tag)s %(ref)s="%(url)s" longdesc="%(caption)s" %(attr)s></%(tag)s>' % \ + {'tag':tag, 'ref':ref, 'url':url.encode('utf8'), 'caption':caption.encode('utf8'), 'attr':attr} + + if thumb: + frame = True + + center = False + if align == 'center': + center = True + align = None + + if center: + self.out.write('<div class="center">'); + + if frame: + if not align: + align = "right" + self.out.write('<div class="thumb t%s">' % align) + if thumb: + if not width: + width = 180 # default thumb width + + self.out.write('<div style="width:%dpx;">' % (int(width)+2)) + self.out.write(img) + self.out.write('<div class="thumbcaption">') + self.out.write('<div class="magnify" style="float:right">') + self.out.write('<a href="%s" class="internal" title="Enlarge">' % url.encode("utf8")) + self.out.write('<img src="/static/magnify-clip.png">') + self.out.write('</a>') + self.out.write('</div>') + for x in obj.children: + self.write(x) + self.out.write('</div>') + self.out.write('</div>') + else: + self.out.write('<div>') + self.out.write(img) + self.out.write('<div class="thumbcaption">') + for x in obj.children: + self.write(x) + self.out.write('</div>') + self.out.write('</div>') + self.out.write('</div>') + elif align: + self.out.write('<div class="float%s">' % align) + self.out.write(img) + self.out.write('</div>') + elif self.gallerylevel > 0: + self.out.write('<div class="gallerybox" style="width: 155px;">') + + self.out.write('<div class="thumb" style="padding: 13px 0; width: 150px;">') + self.out.write('<div style="margin-left: auto; margin-right: auto; width: 120px;">') + self.out.write('<a href="%s" class="image" title="%s">' % (url.encode("utf8"), caption.encode('utf8'))) + self.out.write(img) + self.out.write('</a>') + self.out.write('</div>') + self.out.write('</div>') + + self.out.write('<div class="gallerytext">') + self.out.write('<p>') + for x in obj.children: + self.write(x) + self.out.write('</p>') + self.out.write('</div>') + + self.out.write('</div>') + else: + self.out.write(img) + + if center: + self.out.write('</div>'); + + self.imglevel -= 1 + else: + self.out.write('<a href="%s">' % url.encode('utf8')) + + for x in obj.children: + self.write(x) + + self.out.write('</a>') + + def writeTagNode(self, t): + if t.caption == 'gallery': + self.out.write('<table class="gallery" cellspacing="0" cellpadding="0">') + + self.gallerylevel += 1 + + # TODO: More than one row. + self.out.write('<tr>') + + for x in t.children: + self.out.write('<td>') + self.write(x) + self.out.write('</td>') + + self.out.write('</tr>') + + self.gallerylevel -= 1 + + self.out.write('</table>') + else: + # All others handled by base class. + mwlib.htmlwriter.HTMLWriter.writeTagNode(self, t) + +class WikiRequestHandler(SimpleHTTPRequestHandler): + def __init__(self, index, request, client_address, server): + self.index = index + SimpleHTTPRequestHandler.__init__( + self, request, client_address, server) + + def resolve_links(self, article_prelinks): + LinkStats.pagehits = 1 + LinkStats.pagetotal = 1 + + for match in re.finditer(r"\[\[(.*?)\]\]", article_prelinks): + if match: + link = match.group(1) + pipes = link.count("|") + toreplace = "[[" + link + "]]" + + if pipes > 1: + continue + + # First, see if we have a [[foo|bar]]-style link. + pipematch = re.search(r"(.*?)\|(.*)", link) + + if pipematch: + prepipe = pipematch.group(1) + postpipe = pipematch.group(2) + + title = prepipe + title = title[0].capitalize() + title[1:] + title = title.replace("_", " ") + #article_exists = wp.wp_article_exists(title.encode('utf8')) + article_exists = title.encode('utf8') in self.index + + if article_exists: + # Exact match. Internal link. + LinkStats.allhits += 1 + LinkStats.alltotal += 1 + LinkStats.pagehits += 1 + LinkStats.pagetotal += 1 + article_prelinks = article_prelinks.replace(toreplace, "<a href='/wiki/%s'>%s</a>" % (prepipe, postpipe)) + else: + # No match. External link. Use es.wikipedia.org. + # FIXME: Decide between es.w.o and schoolserver. + LinkStats.alltotal += 1 + LinkStats.pagetotal += 1 + article_prelinks = article_prelinks.replace(toreplace, "<a class='offsite' href='http://es.wikipedia.org/wiki/%s'>%s</a>" % (prepipe, postpipe)) + + else: + # [[foo]]-style link. + title = link + title = title[0].capitalize() + title[1:] + title = title.replace("_", " ") + #article_exists = wp.wp_article_exists(title.encode('utf8')) + article_exists = title.encode('utf8') in self.index + + if article_exists: + LinkStats.allhits += 1 + LinkStats.alltotal += 1 + LinkStats.pagehits += 1 + LinkStats.pagetotal += 1 + else: + article_prelinks = article_prelinks.replace(toreplace, "<a class='offsite' href='http://es.wikipedia.org/wiki/%s'>%s</a>" % (link, link)) + LinkStats.alltotal += 1 + LinkStats.pagetotal += 1 + + return article_prelinks + + def strip_templates(self, wikitext): + """Recursively strips all {{ }} style templates from 'wikitext'.""" + output = '' + nest_level = 0 + i = 0 + while i < len(wikitext)-1: + if wikitext[i] == '{' and wikitext[i+1] == '{': + nest_level += 1 + i += 2 + elif wikitext[i] == '}' and wikitext[i+1] == '}': + nest_level -= 1 + if nest_level < 0: + nest_level = 0 + i += 2 + else: + if nest_level == 0: + output += wikitext[i] + i += 1 + return output + + def get_wikitext(self, title): + wikidb = WPWikiDB() + article_text = wikidb.getRawArticle(title) + + # Pass ?noexpand=1 in the url to disable template expansion. + if self.params.get('noexpand', 0): + article_text = wikidb.getRawArticle(title) + article_text = self.strip_templates(article_text) + else: + article_text = wikidb.getExpandedArticle(title) + + # Pass ?override=1 in the url to replace wikitext for testing the renderer. + if self.params.get('override', 0): + try: + override = open('override.txt', 'r') + article_text = unicode(override.read(), 'utf8') + override.close() + except: + pass + + return article_text + + def send_wiki_html_js(self, article_text, parser): + self.wfile.write("<script type='text/javascript' src='%s'></script>" % parser) + + #self.wfile.write("Internal hits on this page: %d<br>" % LinkStats.pagehits) + #self.wfile.write("Total links on this page: %d<br>" % LinkStats.pagetotal) + #page_percent = ((1.0 * LinkStats.pagehits / LinkStats.pagetotal) * 100) + #self.wfile.write("Percentage: %.2f<br>" % page_percent) + #self.wfile.write("Internal hits so far: %d<br>" % LinkStats.allhits) + #self.wfile.write("Total links so far: %d<br>" % LinkStats.alltotal) + #total_percent = ((1.0 * LinkStats.allhits / LinkStats.alltotal) * 100) + #self.wfile.write("Percentage: %.2f<br>" % total_percent) + + # Link resolution. + article_text = self.resolve_links(article_text) + + # Embed article text and call parser. + jstext = '' + for l in article_text.split('\n'): + jstext += re.escape(l) + '\\n\\\n' + + self.wfile.write("<script type='text/javascript'>"); + self.wfile.write("var wikitext = \"%s\";" % jstext.encode('utf8')); + self.wfile.write("document.write(convert_wiki_to_html(unescape(wikitext)));"); + self.wfile.write("</script>") + + def send_wiki_html_mwlib(self, title, article_text): + tokens = scanner.tokenize(article_text, title) + + wiki_parsed = parser.Parser(tokens, title).parse() + wiki_parsed.caption = title + + htmlbuf = HTMLOutputBuffer() + + imagedb = WPImageDB() + writer = WPHTMLWriter(self.index, htmlbuf, images=imagedb) + writer.write(wiki_parsed) + + self.wfile.write(htmlbuf.getvalue()) + + def send_article(self, title): + article_text = self.get_wikitext(title) + + # Capitalize the first letter of the article -- Trac #6991. + title = title[0].capitalize() + title[1:] + # Replace underscores with spaces in title. + title = title.replace("_", " ") + + # Redirect to Wikipedia if the article text is empty (e.g. an image link) + if article_text == "": + self.send_response(301) + self.send_header("Location", + "http://es.wikipedia.org/wiki/" + title.encode('utf8')) + self.end_headers() + return + + # Pass ?raw=1 in the URL to see the raw wikitext (post expansion, unless noexpand=1 is also set). + if self.params.get('raw', 0): + self.send_response(200) + self.send_header("Content-Type", "text/plain; charset=utf-8") + self.end_headers() + + self.wfile.write(article_text.encode('utf8')) + else: + self.send_response(200) + self.send_header("Content-Type", "text/html; charset=utf-8") + self.end_headers() + + self.wfile.write("<html><head><title>%s</title>" % title.encode('utf8')) + + self.wfile.write("<style type='text/css' media='screen, projection'>"\ + "@import '/static/common.css';"\ + "@import '/static/monobook.css';"\ + "@import '/static/styles.css';"\ + "@import '/static/shared.css';"\ + "</style>") + + self.wfile.write("</head>") + + self.wfile.write("<body>") + + parser_index = int(self.params.get('parser', default_parser)) + parser = parsers[parser_index] + + if parser == 'mwlib': + self.send_wiki_html_mwlib(title, article_text) + else: + self.send_wiki_html_js(article_text, parser) + + self.wfile.write('<center>Contenido disponible bajo los términos de la <a href="/static/es-gfdl.html">Licencia de documentación libre de GNU</a>. <br/> Wikipedia es una marca registrada de la organización sin ánimo de lucro Wikimedia Foundation, Inc.<br/><a href="/static/acerca.html">Acerca de Wikipedia</a> </center>') + self.wfile.write("</body></html>") + + def send_searchresult(self, title): + self.send_response(200) + self.send_header("Content-Type", "text/html; charset=utf-8") + self.end_headers() + + self.wfile.write("<html><head><title>Search Results for '%s'</title></head>" % title.encode('utf8')) + + self.wfile.write("<style type='text/css' media='screen, projection'>"\ + "@import '/static/monobook.css';</style>") + + self.wfile.write("</head>") + + self.wfile.write("<body>") + + self.wfile.write("<h1>Search Results for '%s'.</h1>" % title.encode('utf8')) + self.wfile.write("<ul>") + + num_results = wp.wp_search(title.encode('utf8')) + for i in xrange(0, num_results): + result = unicode(wp.wp_result(i), 'utf8') + self.wfile.write('<li><a href="/wiki/%s">%s</a></li>' % + (result.encode('utf8'), result.encode('utf8'))) + + self.wfile.write("</ul>") + + self.wfile.write("</body></html>") + + def send_image(self, path): + if os.path.exists('images/' + path): + # If image exists locally, serve it as normal. + SimpleHTTPRequestHandler.do_GET(self) + else: + # If not, redirect to wikimedia. + redirect_url = "http://upload.wikimedia.org/wikipedia/commons/" + path + self.send_response(301) + self.send_header("Location", redirect_url.encode('utf8')) + self.end_headers() + + def do_GET(self): + real_path = urllib.unquote(self.path) + real_path = unicode(real_path, 'utf8') + + (real_path, sep, param_text) = real_path.partition('?') + self.params = {} + for p in param_text.split('&'): + (key, sep, value) = p.partition('=') + self.params[key] = value + + # Wiki requests return article contents or redirect to Wikipedia. + m = re.match(r'^/wiki/(.+)$', real_path) + if m: + self.send_article(m.group(1)) + return + + # Search requests return search results. + m = re.match(r'^/search$', real_path) + if m: + self.send_searchresult(self.params.get('q', '')) + return + + # Image requests are handled locally or are referenced from Wikipedia. + m = re.match(r'^/images/(.+)$', real_path) + if m: + self.send_image(m.group(1)) + return + + # Static requests handed off to SimpleHTTPServer. + m = re.match(r'^/static/(.*)$', real_path) + if m: + SimpleHTTPRequestHandler.do_GET(self) + return + + # Any other request redirects to the index page. + self.send_response(301) + self.send_header("Location", "/static/") + self.end_headers() + +def load_db(dbname): + wp.wp_load_dump( + dbname + '.processed', + dbname + '.locate.db', + dbname + '.locate.prefixdb', + dbname + '.blocks.db') + +def run_server(path, port): + index = ArticleIndex('%s.index.txt' % path) + + httpd = BaseHTTPServer.HTTPServer(('', port), + lambda *args: WikiRequestHandler(index, *args)) + + from threading import Thread + server = Thread(target=httpd.serve_forever) + server.run() + + # Tell the world that we're ready to accept request. + print 'ready' + + +if __name__ == '__main__': + load_db(sys.argv[1]) + + # This is an attempt to work around a race condition where Browse starts up before + # the server has loaded the index. Not working yet, though. + #if os.fork(): + # sys.exit(0) + + run_server(sys.argv[1], int(sys.argv[2])) |