Web   ·   Wiki   ·   Activities   ·   Blog   ·   Lists   ·   Chat   ·   Meeting   ·   Bugs   ·   Git   ·   Translate   ·   Archive   ·   People   ·   Donate
path: root/server.py
diff options
Diffstat (limited to 'server.py')
1 files changed, 642 insertions, 0 deletions
diff --git a/server.py b/server.py
new file mode 100644
index 0000000..882a53f
--- /dev/null
+++ b/server.py
@@ -0,0 +1,642 @@
+# -*- coding: utf-8 -*-
+# Web server script for Wikiserver project.
+# Usage: server.py <dbfile> <port>
+from __future__ import with_statement
+import sys
+import os
+from StringIO import StringIO
+import BaseHTTPServer
+from SimpleHTTPServer import SimpleHTTPRequestHandler
+import urllib
+import re
+import wp
+# Uncomment to print out a large dump from the template expander.
+#os.environ['DEBUG_EXPANDER'] = '1'
# hashlib is the modern home of md5; fall back to the standalone md5 module
# on interpreters that predate hashlib.
try:
    from hashlib import md5
except ImportError:
    from md5 import md5
+import mwlib.htmlwriter
+from mwlib import parser, scanner, expander
# Available wikitext renderers, selected with ?parser=<index> in the URL.
# Entries that are paths name a JavaScript renderer that is loaded into the
# page; the special entry 'mwlib' renders on the server.
parsers = [
    '/js/wiki2html.js',
    '/js/instaview-0.6.1.js',
    '/js/instaview-0.6.4.js',
    'mwlib',
]

# Index into `parsers` used when the request does not pass ?parser=.
default_parser = 3
class LinkStats:
    """Shared tallies of wiki links encountered while rendering.

    The ``all*`` counters accumulate for as long as the process runs; the
    ``page*`` counters are reset by WikiRequestHandler.resolve_links at the
    start of each page it processes.  ``*hits`` count links whose target is
    in the local article index, ``*total`` count every link examined.
    """

    # Every counter starts at 1 rather than 0 -- presumably so a hits/total
    # ratio can be computed without a division-by-zero guard.
    allhits = alltotal = 1
    pagehits = pagetotal = 1
class ArticleIndex:
    """In-memory set of article titles built from a pre-generated index file.

    Every line of the index file must end in a run of digits; whatever
    precedes the digits (minus the separating whitespace) is stored as the
    article title.
    """

    # Lazy title, optional whitespace, then the trailing digits.
    _LINE_RE = re.compile(r'(.*?)\s*\d+$')

    def __init__(self, path):
        """Read the index file at *path* and collect its titles.

        Raises AssertionError if a line does not end in digits.
        """
        self.article_index = set()
        with open(path, 'r') as f:
            # Iterate the file object directly so the whole index is never
            # held in memory as a list of lines.
            for line in f:
                m = self._LINE_RE.search(line)
                if m is None:
                    raise AssertionError(
                        "Match didn't work (%s: %r)" % (path, line))
                self.article_index.add(m.group(1))

    def __contains__(self, x):
        """Return True if title *x* was listed in the index file."""
        return x in self.article_index
class WPWikiDB:
    """Retrieves article contents for mwlib."""

    # Matches a leading "#redirect [[Target]]" directive.  The target group
    # is non-greedy so that further "]]" on the same line (for example a
    # trailing category link) is not swallowed into the target.
    _REDIRECT_RE = re.compile(r'^\s*\#?redirect\s*\:?\s*\[\[(.*?)\]\]',
                              re.IGNORECASE | re.MULTILINE)

    def _normalize(self, title):
        """Capitalize the first letter and turn underscores into spaces."""
        # Capitalize the first letter of the article -- Trac #6991.
        # Slicing (rather than indexing) keeps an empty title from raising.
        title = title[:1].capitalize() + title[1:]
        return title.replace("_", " ")

    def getRawArticle(self, title):
        """Return the wikitext of *title*, following #redirect directives.

        The title is normalized before each lookup.  If the chain of
        redirects leads back to a title that was already visited, following
        stops and the text of the last article loaded is returned.
        Leading and trailing whitespace is stripped from the result.
        """
        visited = set()
        while True:
            title = self._normalize(title)
            visited.add(title)
            article_text = unicode(
                wp.wp_load_article(title.encode('utf8')), 'utf8')
            m = self._REDIRECT_RE.match(article_text)
            if not m:
                break
            target = m.group(1)
            if self._normalize(target) in visited:
                # Redirect cycle: stop here instead of looping forever.
                break
            title = target
        # WTB: Stripping whitespace improves template expansion.
        # TODO: Where is it coming from?
        return article_text.strip()

    def getTemplate(self, title, followRedirects=False):
        """Return template wikitext for *title*.

        *followRedirects* is accepted but ignored: lookup is delegated to
        getRawArticle, which always follows redirects.
        """
        return self.getRawArticle(title)

    def getExpandedArticle(self, title):
        """Return the wikitext of *title* with its templates expanded."""
        article_text = self.getRawArticle(title)
        template_expander = expander.Expander(
            article_text, pagename=title, wikidb=self)
        return template_expander.expandTemplates()
class WPImageDB:
    """Retrieves images for mwlib."""

    def hashpath(self, name):
        """Return the hashed relative path ``<h>/<hh>/<Name>`` for *name*.

        Spaces become underscores and the first letter is upper-cased; the
        two directory components are the first one and first two hex digits
        of the MD5 of the resulting UTF-8 encoded name.
        """
        canonical = name.replace(' ', '_')
        canonical = canonical[:1].upper() + canonical[1:]
        digest = md5(canonical.encode('utf-8')).hexdigest()
        return "/".join([digest[0], digest[:2], canonical])

    def getPath(self, name, size=None):
        """Return the path of image *name* under ``images/``.

        *size* is accepted but not used.
        """
        return 'images/%s' % self.hashpath(name)

    def getURL(self, name, size=None):
        """Return the URL from which image *name* should be fetched.

        A server-relative ``/images/`` URL is returned when the file exists
        under the local ``images/`` directory, otherwise a URL on
        upload.wikimedia.org.  *size* is accepted but not used.
        """
        relative = self.hashpath(name)
        if not os.path.exists('images/' + relative):
            return 'http://upload.wikimedia.org/wikipedia/commons/' + relative
        return '/images/' + relative
class HTMLOutputBuffer:
    """Buffers output and converts to utf8 as needed."""

    def __init__(self):
        self.buffer = ''

    def write(self, obj):
        """Append *obj* to the buffer.

        Byte strings are appended unchanged, unicode objects are encoded as
        UTF-8, and anything else is converted with str().
        """
        if isinstance(obj, str):
            self.buffer += obj
        elif isinstance(obj, unicode):
            # Encode the unicode object directly.  Passing it through str()
            # first would use the default ASCII codec and raise
            # UnicodeEncodeError on any non-ASCII character.
            self.buffer += obj.encode('utf8')
        else:
            self.buffer += str(obj)

    def getvalue(self):
        """Return everything written so far."""
        return self.buffer
class WPHTMLWriter(mwlib.htmlwriter.HTMLWriter):
    """Customizes HTML output from mwlib."""

    def __init__(self, index, wfile, images=None, math_renderer=None):
        """Create a writer.

        index -- container of known article titles (UTF-8 byte strings),
                 used to decide whether a link is served locally.
        wfile, images, math_renderer -- passed through to mwlib's HTMLWriter.
        """
        self.index = index
        # Nesting depth of <gallery> tags currently being written.
        self.gallerylevel = 0
        mwlib.htmlwriter.HTMLWriter.__init__(self, wfile, images, math_renderer)

    @staticmethod
    def _attr_escape(text):
        """Escape *text* for use inside a double-quoted HTML attribute."""
        return (text.replace('&', '&amp;')
                    .replace('<', '&lt;')
                    .replace('>', '&gt;')
                    .replace('"', '&quot;'))

    def writeLink(self, obj):
        """Write an article link as an <a> element.

        Targets found in the index link to /wiki/ on this server; all others
        are marked class='offsite' and point at es.wikipedia.org.  LinkStats
        is updated for every link written.
        """
        if obj.target is None:
            return
        # Parser appending '/' characters to link targets for some reason.
        article = obj.target.rstrip('/')
        if not article:
            # Nothing left to link to; emit the link text without an anchor
            # instead of failing on the empty title below.
            for x in obj.children:
                self.write(x)
            return
        title = article[0].capitalize() + article[1:]
        title = title.replace("_", " ")
        # NOTE(review): the lookup uses the whole target, so a link such as
        # "Foo#Section" is looked up with its fragment -- confirm intended.
        #article_exists = wp.wp_article_exists(title.encode('utf8'))
        article_exists = title.encode('utf8') in self.index
        LinkStats.alltotal += 1
        LinkStats.pagetotal += 1
        if article_exists:
            # Exact match. Internal link.
            LinkStats.allhits += 1
            LinkStats.pagehits += 1
            link_attr = ''
            link_baseurl = '/wiki/'
        else:
            # No match. External link. Use es.wikipedia.org.
            # FIXME: Decide between es.w.o and schoolserver.
            link_attr = "class='offsite' "
            link_baseurl = "http://es.wikipedia.org/wiki/"
        parts = article.encode('utf-8').split('#')
        parts[0] = parts[0].replace(" ", "_")
        # Percent-encode each piece so that characters such as an apostrophe
        # cannot terminate the single-quoted href attribute.
        url = "#".join([urllib.quote(x, safe='/:') for x in parts])
        self.out.write("<a %s href='%s%s'>" % (link_attr, link_baseurl, url))
        if obj.children:
            for x in obj.children:
                self.write(x)
        else:
            self._write(obj.target)
        self.out.write("</a>")

    def writeImageLink(self, obj):
        """Write an image, including thumbnail/frame/gallery wrappers.

        An image met while another image is already being written is
        rendered as a plain link to the image instead of being embedded.
        Nothing is written when no image database was supplied or it
        returns no URL.
        """
        if self.images is None:
            return
        width = obj.width
        height = obj.height
        if width and height:
            url = self.images.getURL(obj.target, size=max(width, height))
        else:
            url = self.images.getURL(obj.target)
        if url is None:
            return
        # Percent-encode once; the result is safe inside quoted attributes.
        href = urllib.quote(url.encode('utf8'), safe='/:')
        # NOTE(review): imglevel is not initialised in this class; presumably
        # mwlib's HTMLWriter sets it up -- confirm.
        if self.imglevel != 0:
            self.out.write('<a href="%s">' % href)
            for x in obj.children:
                self.write(x)
            self.out.write('</a>')
            return
        self.imglevel += 1
        try:
            self._writeEmbeddedImage(obj, href, width)
        finally:
            # Restore the nesting level even if a child fails to render.
            self.imglevel -= 1

    def _writeEmbeddedImage(self, obj, href, width):
        """Write the markup for a top-level (non-nested) image."""
        # The following HTML generation code is copied closely from InstaView, which seems to
        # approximate the nest of <div> tags needed to render images close to right.
        # It's also been extended to support Gallery tags.
        align = obj.align
        thumb = obj.thumb
        frame = obj.frame
        # The caption ends up inside attribute values, so escape it.
        caption = self._attr_escape(obj.caption or u'').encode('utf8')
        # SVG images must be included using <object data=''> rather than <img src=''>.
        if re.match(r'.*\.svg$', href, re.IGNORECASE):
            tag = 'object'
            ref = 'data'
        else:
            tag = 'img'
            ref = 'src'
        # Hack to get galleries to look okay, in the absence of image dimensions.
        if self.gallerylevel > 0:
            width = 120
        if thumb and not width:
            width = 180 #FIXME: This should not be hardcoded
        attr = ''
        if width:
            attr += 'width="%d" ' % width
        img = '<%(tag)s %(ref)s="%(url)s" longdesc="%(caption)s" %(attr)s></%(tag)s>' % \
            {'tag':tag, 'ref':ref, 'url':href, 'caption':caption, 'attr':attr}
        if thumb:
            # A thumbnail is always drawn inside a frame.
            frame = True
        center = False
        if align == 'center':
            center = True
            align = None
        if center:
            self.out.write('<div class="center">')
        if frame:
            if not align:
                align = "right"
            self.out.write('<div class="thumb t%s">' % align)
            if thumb:
                # width is always set here: thumbs without one got the
                # default above.  The extra 2px presumably leave room for
                # the image border.
                self.out.write('<div style="width:%dpx;">' % (int(width)+2))
                self.out.write(img)
                self.out.write('<div class="thumbcaption">')
                self.out.write('<div class="magnify" style="float:right">')
                self.out.write('<a href="%s" class="internal" title="Enlarge">' % href)
                self.out.write('<img src="/static/magnify-clip.png">')
                self.out.write('</a>')
                self.out.write('</div>')
                for x in obj.children:
                    self.write(x)
                self.out.write('</div>')
                self.out.write('</div>')
            else:
                self.out.write('<div>')
                self.out.write(img)
                self.out.write('<div class="thumbcaption">')
                for x in obj.children:
                    self.write(x)
                self.out.write('</div>')
                self.out.write('</div>')
            self.out.write('</div>')
        elif align:
            self.out.write('<div class="float%s">' % align)
            self.out.write(img)
            self.out.write('</div>')
        elif self.gallerylevel > 0:
            self.out.write('<div class="gallerybox" style="width: 155px;">')
            self.out.write('<div class="thumb" style="padding: 13px 0; width: 150px;">')
            self.out.write('<div style="margin-left: auto; margin-right: auto; width: 120px;">')
            self.out.write('<a href="%s" class="image" title="%s">' % (href, caption))
            self.out.write(img)
            self.out.write('</a>')
            self.out.write('</div>')
            self.out.write('</div>')
            self.out.write('<div class="gallerytext">')
            self.out.write('<p>')
            for x in obj.children:
                self.write(x)
            self.out.write('</p>')
            self.out.write('</div>')
            self.out.write('</div>')
        else:
            self.out.write(img)
        if center:
            self.out.write('</div>')

    def writeTagNode(self, t):
        """Write a tag node, rendering <gallery> as a one-row table.

        Every other tag is delegated to mwlib's HTMLWriter.
        """
        if t.caption == 'gallery':
            self.out.write('<table class="gallery" cellspacing="0" cellpadding="0">')
            # Images written while gallerylevel > 0 use the gallery layout.
            self.gallerylevel += 1
            # TODO: More than one row.
            self.out.write('<tr>')
            for x in t.children:
                self.out.write('<td>')
                self.write(x)
                self.out.write('</td>')
            self.out.write('</tr>')
            self.gallerylevel -= 1
            self.out.write('</table>')
        else:
            # All others handled by base class.
            mwlib.htmlwriter.HTMLWriter.writeTagNode(self, t)
class WikiRequestHandler(SimpleHTTPRequestHandler):
    """Serves wiki articles, search results, images and static files.

    URL layout handled by do_GET:
      /wiki/<title>   rendered article (or a redirect to es.wikipedia.org)
      /search?q=...   search results
      /images/<path>  local image, or a redirect to upload.wikimedia.org
      /static/...     files served by SimpleHTTPRequestHandler
    Anything else redirects to /static/.
    """

    def __init__(self, index, request, client_address, server):
        # The index must be stored first: the base class processes the
        # request from inside its own initializer.
        self.index = index
        SimpleHTTPRequestHandler.__init__(
            self, request, client_address, server)

    @staticmethod
    def _escape_html(text):
        """Escape *text* for inclusion in HTML text or a quoted attribute."""
        return (text.replace('&', '&amp;')
                    .replace('<', '&lt;')
                    .replace('>', '&gt;')
                    .replace('"', '&quot;')
                    .replace("'", '&#39;'))

    @staticmethod
    def _normalize_title(title):
        """Capitalize the first letter and turn underscores into spaces."""
        # Capitalize the first letter of the article -- Trac #6991.
        title = title[:1].capitalize() + title[1:]
        # Replace underscores with spaces in title.
        return title.replace("_", " ")

    def _select_parser(self):
        """Return the `parsers` entry chosen by ?parser=, or the default.

        A value that is not an integer or is out of range falls back to
        `default_parser` rather than failing after headers were sent.
        """
        try:
            choice = int(self.params.get('parser', default_parser))
        except ValueError:
            choice = default_parser
        if not -len(parsers) <= choice < len(parsers):
            choice = default_parser
        return parsers[choice]

    def resolve_links(self, article_prelinks):
        """Rewrite [[...]] wiki links in *article_prelinks* as HTML anchors.

        Links with more than one pipe are left alone.  A [[target|label]]
        link to an indexed article becomes a local /wiki/ anchor; a plain
        [[target]] link to an indexed article is left as wikitext; links to
        anything not in the index become 'offsite' anchors pointing at
        es.wikipedia.org.  Per-page LinkStats are reset at the start and all
        counters are updated for each link examined.
        """
        LinkStats.pagehits = 1
        LinkStats.pagetotal = 1
        # finditer scans the original string; replacements are applied to
        # the rebound copy and do not disturb the iteration.
        for match in re.finditer(r"\[\[(.*?)\]\]", article_prelinks):
            link = match.group(1)
            if link.count("|") > 1:
                continue
            # [[foo|bar]] gives target 'foo' and label 'bar'; [[foo]] uses
            # the target as its own label.
            target, pipe, label = link.partition("|")
            if not target:
                # "[[]]" or "[[|x]]": there is no title to look up.
                continue
            if not pipe:
                label = target
            title = self._normalize_title(target)
            #article_exists = wp.wp_article_exists(title.encode('utf8'))
            article_exists = title.encode('utf8') in self.index
            LinkStats.alltotal += 1
            LinkStats.pagetotal += 1
            if article_exists:
                # Exact match. Internal link.
                LinkStats.allhits += 1
                LinkStats.pagehits += 1
                if not pipe:
                    continue
                anchor = "<a href='/wiki/%s'>%s</a>" % (target, label)
            else:
                # No match. External link. Use es.wikipedia.org.
                # FIXME: Decide between es.w.o and schoolserver.
                anchor = "<a class='offsite' href='http://es.wikipedia.org/wiki/%s'>%s</a>" % (target, label)
            article_prelinks = article_prelinks.replace(match.group(0), anchor)
        return article_prelinks

    def strip_templates(self, wikitext):
        """Return *wikitext* with all {{ }} templates, nested or not, removed.

        Unbalanced closing braces are dropped without affecting the nesting
        count.
        """
        kept = []
        nest_level = 0
        i = 0
        end = len(wikitext)
        # Scan to the very end: stopping one short would lose the final
        # character of the text.
        while i < end:
            if wikitext.startswith('{{', i):
                nest_level += 1
                i += 2
            elif wikitext.startswith('}}', i):
                if nest_level > 0:
                    nest_level -= 1
                i += 2
            else:
                if nest_level == 0:
                    kept.append(wikitext[i])
                i += 1
        return ''.join(kept)

    def get_wikitext(self, title):
        """Return the wikitext to render for *title*.

        Request parameters read from self.params:
          noexpand -- strip templates instead of expanding them
          override -- replace the text with the contents of override.txt
        """
        wikidb = WPWikiDB()
        # Pass ?noexpand=1 in the url to disable template expansion.
        if self.params.get('noexpand', 0):
            article_text = self.strip_templates(wikidb.getRawArticle(title))
        else:
            article_text = wikidb.getExpandedArticle(title)
        # Pass ?override=1 in the url to replace wikitext for testing the renderer.
        if self.params.get('override', 0):
            try:
                with open('override.txt', 'r') as override:
                    article_text = unicode(override.read(), 'utf8')
            except (IOError, UnicodeError):
                # Best effort: a missing or undecodable override file leaves
                # the real article text in place.
                pass
        return article_text

    def send_wiki_html_js(self, article_text, parser):
        """Write a page body that renders *article_text* in the browser.

        *parser* is the URL of the JavaScript renderer to load.
        """
        self.wfile.write("<script type='text/javascript' src='%s'></script>" % parser)
        # Link resolution.
        article_text = self.resolve_links(article_text)
        # Embed article text and call parser.  Each line is followed by an
        # escaped newline and a backslash line continuation so the text can
        # sit inside a single JavaScript string literal.
        jstext = ''.join(
            [re.escape(l) + '\\n\\\n' for l in article_text.split('\n')])
        self.wfile.write("<script type='text/javascript'>")
        self.wfile.write("var wikitext = \"%s\";" % jstext.encode('utf8'))
        self.wfile.write("document.write(convert_wiki_to_html(unescape(wikitext)));")
        self.wfile.write("</script>")

    def send_wiki_html_mwlib(self, title, article_text):
        """Render *article_text* to HTML with mwlib and write it out."""
        tokens = scanner.tokenize(article_text, title)
        wiki_parsed = parser.Parser(tokens, title).parse()
        wiki_parsed.caption = title
        htmlbuf = HTMLOutputBuffer()
        imagedb = WPImageDB()
        writer = WPHTMLWriter(self.index, htmlbuf, images=imagedb)
        writer.write(wiki_parsed)
        self.wfile.write(htmlbuf.getvalue())

    def send_article(self, title):
        """Send the article *title*, or redirect to Wikipedia if it is empty."""
        article_text = self.get_wikitext(title)
        title = self._normalize_title(title)
        # Redirect to Wikipedia if the article text is empty (e.g. an image link)
        if article_text == "":
            # The Location value must be a valid URL, so use underscores
            # for spaces and percent-encode everything else.
            target = urllib.quote(
                title.replace(" ", "_").encode('utf8'), safe='/:')
            self.send_response(301)
            self.send_header("Location",
                             "http://es.wikipedia.org/wiki/" + target)
            self.end_headers()
            return
        # Pass ?raw=1 in the URL to see the raw wikitext (post expansion, unless noexpand=1 is also set).
        if self.params.get('raw', 0):
            self.send_response(200)
            self.send_header("Content-Type", "text/plain; charset=utf-8")
            self.end_headers()
            self.wfile.write(article_text.encode('utf8'))
            return
        renderer = self._select_parser()
        self.send_response(200)
        self.send_header("Content-Type", "text/html; charset=utf-8")
        self.end_headers()
        # The title comes from the request URL, so escape it.
        self.wfile.write("<html><head><title>%s</title>"
                         % self._escape_html(title).encode('utf8'))
        self.wfile.write("<style type='text/css' media='screen, projection'>"\
                         "@import '/static/common.css';"\
                         "@import '/static/monobook.css';"\
                         "@import '/static/styles.css';"\
                         "@import '/static/shared.css';"\
                         "</style>")
        self.wfile.write("</head>")
        self.wfile.write("<body>")
        if renderer == 'mwlib':
            self.send_wiki_html_mwlib(title, article_text)
        else:
            self.send_wiki_html_js(article_text, renderer)
        self.wfile.write('<center>Contenido disponible bajo los términos de la <a href="/static/es-gfdl.html">Licencia de documentación libre de GNU</a>. <br/> Wikipedia es una marca registrada de la organización sin ánimo de lucro Wikimedia Foundation, Inc.<br/><a href="/static/acerca.html">Acerca de Wikipedia</a> </center>')
        self.wfile.write("</body></html>")

    def send_searchresult(self, title):
        """Send an HTML list of the search results for the query *title*."""
        # The query is untrusted input; escape it before echoing it back.
        shown = self._escape_html(title).encode('utf8')
        self.send_response(200)
        self.send_header("Content-Type", "text/html; charset=utf-8")
        self.end_headers()
        self.wfile.write("<html><head><title>Search Results for '%s'</title>" % shown)
        self.wfile.write("<style type='text/css' media='screen, projection'>"\
                         "@import '/static/monobook.css';</style>")
        self.wfile.write("</head>")
        self.wfile.write("<body>")
        self.wfile.write("<h1>Search Results for '%s'.</h1>" % shown)
        self.wfile.write("<ul>")
        num_results = wp.wp_search(title.encode('utf8'))
        for i in xrange(0, num_results):
            result = unicode(wp.wp_result(i), 'utf8')
            self.wfile.write('<li><a href="/wiki/%s">%s</a></li>' %
                             (urllib.quote(result.encode('utf8'), safe='/:'),
                              self._escape_html(result).encode('utf8')))
        self.wfile.write("</ul>")
        self.wfile.write("</body></html>")

    def send_image(self, path):
        """Serve the image at *path*, or redirect to Wikimedia if absent."""
        if os.path.exists('images/' + path):
            # If image exists locally, serve it as normal.
            SimpleHTTPRequestHandler.do_GET(self)
        else:
            # If not, redirect to wikimedia.
            redirect_url = "http://upload.wikimedia.org/wikipedia/commons/" + path
            self.send_response(301)
            self.send_header("Location", redirect_url.encode('utf8'))
            self.end_headers()

    def do_GET(self):
        """Dispatch a GET request according to its path prefix.

        Query parameters are stored in self.params as unicode keys/values.
        """
        # Split off the query string before percent-decoding, so that an
        # encoded '?' or '&' inside a title is not mistaken for a separator.
        raw_path, sep, param_text = self.path.partition('?')
        real_path = unicode(urllib.unquote(raw_path), 'utf8', 'replace')
        self.params = {}
        for p in param_text.split('&'):
            (key, sep, value) = p.partition('=')
            if not key:
                continue
            # unquote_plus: form submissions encode spaces as '+'.
            key = unicode(urllib.unquote_plus(key), 'utf8', 'replace')
            value = unicode(urllib.unquote_plus(value), 'utf8', 'replace')
            self.params[key] = value
        # Wiki requests return article contents or redirect to Wikipedia.
        m = re.match(r'^/wiki/(.+)$', real_path)
        if m:
            self.send_article(m.group(1))
            return
        # Search requests return search results.
        m = re.match(r'^/search$', real_path)
        if m:
            self.send_searchresult(self.params.get('q', ''))
            return
        # Image requests are handled locally or are referenced from Wikipedia.
        m = re.match(r'^/images/(.+)$', real_path)
        if m:
            self.send_image(m.group(1))
            return
        # Static requests handed off to SimpleHTTPServer.
        m = re.match(r'^/static/(.*)$', real_path)
        if m:
            SimpleHTTPRequestHandler.do_GET(self)
            return
        # Any other request redirects to the index page.
        self.send_response(301)
        self.send_header("Location", "/static/")
        self.end_headers()
def load_db(dbname):
    """Load the article dump identified by *dbname* into the wp module.

    The four file names handed to wp.wp_load_dump are formed by appending
    fixed suffixes to *dbname*.
    """
    suffixes = ('.processed', '.locate.db', '.locate.prefixdb', '.blocks.db')
    wp.wp_load_dump(*[dbname + suffix for suffix in suffixes])
def run_server(path, port):
    """Serve wiki requests on *port* until the process is stopped.

    path -- database base name; the article index is read from
            '<path>.index.txt'.
    port -- TCP port to listen on (all interfaces).

    Writes the line 'ready' to stdout once the server is listening, then
    blocks serving requests.
    """
    index = ArticleIndex('%s.index.txt' % path)
    httpd = BaseHTTPServer.HTTPServer(
        ('', port),
        lambda *args: WikiRequestHandler(index, *args))
    # Tell the world that we're ready to accept request.  The HTTPServer
    # constructor has already bound and activated the listening socket, so
    # announce before entering the blocking serve loop; announcing after it
    # would never happen.  Flush so the line is not held back in a buffer
    # when stdout is not a terminal.
    sys.stdout.write('ready\n')
    sys.stdout.flush()
    httpd.serve_forever()
if __name__ == '__main__':
    # Fail with a usage message rather than an IndexError traceback when
    # the arguments are missing.
    if len(sys.argv) < 3:
        sys.exit('Usage: server.py <dbfile> <port>')
    load_db(sys.argv[1])
    # This is an attempt to work around a race condition where Browse starts up before
    # the server has loaded the index. Not working yet, though.
    #if os.fork():
    #    sys.exit(0)
    run_server(sys.argv[1], int(sys.argv[2]))