1 files changed, 529 insertions, 0 deletions
diff --git a/websdk/hatta/parser.py b/websdk/hatta/parser.py
new file mode 100644
index 0000000..a76fa78
--- /dev/null
+++ b/websdk/hatta/parser.py
@@ -0,0 +1,529 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+import re
+import sys
+import unicodedata
+import itertools
+import werkzeug
+
+# Addresses starting with "<letters>://" or "mailto:" count as external.
+EXTERNAL_URL_RE = re.compile(ur'^[a-z]+://|^mailto:', re.I | re.U)
+
+
+def external_link(addr):
+    """
+    Decide whether a link is absolute or internal.
+
+    >>> external_link('http://example.com')
+    True
+    >>> external_link('https://example.com')
+    True
+    >>> external_link('ftp://example.com')
+    True
+    >>> external_link('mailto:user@example.com')
+    True
+    >>> external_link('PageTitle')
+    False
+    >>> external_link(u'ąęśćUnicodePage')
+    False
+
+    """
+
+    return EXTERNAL_URL_RE.match(addr)
+
+
+class WikiParser(object):
+    r"""
+    Responsible for generating HTML markup from the wiki markup.
+
+    The parser works on two levels. On the block level, it analyzes lines
+    of text and decides what kind of block element they belong to (block
+    elements include paragraphs, lists, headings, preformatted blocks).
+    Lines belonging to the same block are joined together, and a second
+    pass is made using regular expressions to parse line-level elements,
+    such as links, bold and italic text and smileys.
+
+    Some block-level elements, such as preformatted blocks, consume additional
+    lines from the input until they encounter the end-of-block marker, using
+    lines_until. Most block-level elements are just runs of marked up lines
+    though.
+
+    Iterating over a parser instance yields the generated HTML in pieces.
+    """
+
+    list_pat = ur"^\s*[*#]+\s+"
+    heading_pat = ur"^\s*=+"
+    quote_pat = ur"^[>]+\s+"
+    block = {  # block-level line patterns, tried in priority order
+        # "name": (priority, ur"pattern"),
+        "list": (10, list_pat),
+        "code": (20, ur"^[{][{][{]+\s*$"),
+        "conflict": (30, ur"^<<<<<<< local\s*$"),
+        "empty": (40, ur"^\s*$"),
+        "heading": (50, heading_pat),
+        "indent": (60, ur"^[ \t]+"),
+        "macro": (70, ur"^<<\w+\s*$"),
+        "quote": (80, quote_pat),
+        "rule": (90, ur"^\s*---+\s*$"),
+        "syntax": (100, ur"^\{\{\{\#![\w+#.-]+\s*$"),
+        "table": (110, ur"^\|"),
+    }
+    image_pat = (ur"\{\{(?P<image_target>([^|}]|}[^|}])*)"
+                 ur"(\|(?P<image_text>([^}]|}[^}])*))?}}")
+    smilies = {  # smiley text -> image address handed to wiki_image
+        r':)': "smile.png",
+        r':(': "frown.png",
+        r':P': "tongue.png",
+        r':D': "grin.png",
+        r';)': "wink.png",
+    }
+    punct = {  # typed sequence -> HTML entity emitted in its place
+        r'...': "&hellip;",
+        r'--': "&ndash;",
+        r'---': "&mdash;",
+        r'~': "&nbsp;",
+        r'\~': "~",
+        r'~~': "&sim;",
+        r'(C)': "&copy;",
+        r'-->': "&rarr;",
+        r'<--': "&larr;",
+        r'(R)': "&reg;",
+        r'(TM)': "&trade;",
+        r'%%': "&permil;",
+        r'``': "&ldquo;",
+        r"''": "&rdquo;",
+        r",,": "&bdquo;",
+    }
+    markup = {
+        # "name": (priority, ur"pattern"),
+        "bold": (10, ur"[*][*]"),
+        "code": (20, ur"[{][{][{](?P<code_text>([^}]|[^}][}]|[^}][}][}])"
+                ur"*[}]*)[}][}][}]"),
+        "free_link": (30, ur"""[a-zA-Z]+://\S+[^\s.,:;!?()'"\*/=+<>-]"""),
+        "italic": (40, ur"//"),
+        "link": (50, ur"\[\[(?P<link_target>([^|\]]|\][^|\]])+)"
+                ur"(\|(?P<link_text>([^\]]|\][^\]])+))?\]\]"),
+        "image": (60, image_pat),
+        "linebreak": (70, ur"\\\\"),
+        "macro": (80, ur"[<][<](?P<macro_name>\w+)\s+"
+                 ur"(?P<macro_text>([^>]|[^>][>])+)[>][>]"),
+        "mail": (90, ur"""(mailto:)?\S+@\S+(\.[^\s.,:;!?()'"\*/=+<>-]+)+"""),
+        "math": (100, ur"\$\$(?P<math_text>[^$]+)\$\$"),
+        "mono": (110, ur"##"),
+        "newline": (120, ur"\n"),
+        "punct": (130,
+                  ur'(^|\b|(?<=\s))(%s)((?=[\s.,:;!?)/&=+"\'—-])|\b|$)' %
+                  ur"|".join(re.escape(k) for k in punct)),
+        "table": (140, ur"=?\|=?"),
+        "text": (150, ur".+?"),
+    }
+
+    def __init__(self, lines, wiki_link, wiki_image,
+                 wiki_syntax=None, wiki_math=None, smilies=None):
+        self.wiki_link = wiki_link
+        self.wiki_image = wiki_image
+        self.wiki_syntax = wiki_syntax
+        self.wiki_math = wiki_math
+        self.enumerated_lines = enumerate(lines)
+        if smilies is not None:
+            self.smilies = smilies
+        self.compile_patterns()
+        self.headings = {}
+        self.stack = []
+        self.line_no = 0
+
+    def compile_patterns(self):
+        self.quote_re = re.compile(self.quote_pat, re.U)
+        self.heading_re = re.compile(self.heading_pat, re.U)
+        self.list_re = re.compile(self.list_pat, re.U)
+        patterns = ((k, p) for (k, (x, p)) in
+                    sorted(self.block.iteritems(), key=lambda x: x[1][0]))
+        self.block_re = re.compile(ur"|".join("(?P<%s>%s)" % pat
+                                   for pat in patterns), re.U)
+        self.code_close_re = re.compile(ur"^\}\}\}\s*$", re.U)
+        self.macro_close_re = re.compile(ur"^>>\s*$", re.U)
+        self.conflict_close_re = re.compile(ur"^>>>>>>> other\s*$", re.U)
+        self.conflict_sep_re = re.compile(ur"^=======\s*$", re.U)
+        self.image_re = re.compile(self.image_pat, re.U)
+        smileys = ur"|".join(re.escape(k) for k in self.smilies)
+        smiley_pat = (ur"(^|\b|(?<=\s))(?P<smiley_face>%s)"
+                      ur"((?=[\s.,:;!?)/&=+-])|$)" % smileys)
+        self.markup['smiley'] = (125, smiley_pat)
+        patterns = ((k, p) for (k, (x, p)) in
+                    sorted(self.markup.iteritems(), key=lambda x: x[1][0]))
+        self.markup_re = re.compile(ur"|".join("(?P<%s>%s)" % pat
+                                    for pat in patterns), re.U)
+
+    def __iter__(self):  # iterating a parser runs parse() on its lines
+        return self.parse()
+
+    @classmethod
+    def extract_links(cls, text):
+        links = []
+
+        def link(addr, label=None, class_=None, image=None, alt=None,
+                 lineno=0):
+            addr = addr.strip()
+            if external_link(addr):
+                # Don't index external links
+                return u''
+            if '#' in addr:
+                addr, chunk = addr.split('#', 1)
+            if addr == u'':
+                return u''
+            links.append((addr, label))
+            return u''
+        lines = text.split('\n')
+        for part in cls(lines, link, link):
+            for ret in links:
+                yield ret
+            links[:] = []
+
+    def parse(self):
+        """Parse a list of lines of wiki markup, yielding HTML for it."""
+
+        self.headings = {}
+        self.stack = []
+        self.line_no = 0
+
+        def key(enumerated_line):
+            line_no, line = enumerated_line
+            match = self.block_re.match(line)
+            if match:
+                return match.lastgroup
+            return "paragraph"
+
+        for kind, block in itertools.groupby(self.enumerated_lines, key):
+            func = getattr(self, "_block_%s" % kind)
+            for part in func(block):
+                yield part
+
+    def parse_line(self, line):
+        """
+        Find all the line-level markup and return HTML for it.
+
+        """
+
+        for match in self.markup_re.finditer(line):
+            func = getattr(self, "_line_%s" % match.lastgroup)
+            yield func(match.groupdict())
+
+    def pop_to(self, stop):
+        """
+            Pop from the stack until the specified tag is encoutered.
+            Return string containing closing tags of everything popped.
+        """
+        tags = []
+        tag = None
+        try:
+            while tag != stop:
+                tag = self.stack.pop()
+                tags.append(tag)
+        except IndexError:
+            pass
+        return u"".join(u"</%s>" % tag for tag in tags)
+
+    def lines_until(self, close_re):
+        """Get lines from input until the closing markup is encountered."""
+
+        self.line_no, line = self.enumerated_lines.next()
+        while not close_re.match(line):
+            yield line.rstrip()
+            line_no, line = self.enumerated_lines.next()
+
+# methods for the markup inside lines:
+
+    def _line_table(self, groups):  # cell separators pass through as-is
+        return groups["table"]
+
+    def _line_linebreak(self, groups):  # two backslashes force a line break
+        return u'<br>'
+
+    def _line_smiley(self, groups):  # render a smiley through wiki_image
+        smiley = groups["smiley_face"]
+        try:
+            url = self.smilies[smiley]
+        except KeyError:
+            url = ''  # face missing from the table: empty image address
+        return self.wiki_image(url, smiley, class_="smiley")
+
+    def _line_bold(self, groups):
+        if 'b' in self.stack:
+            return self.pop_to('b')
+        else:
+            self.stack.append('b')
+            return u"<b>"
+
+    def _line_italic(self, groups):
+        if 'i' in self.stack:
+            return self.pop_to('i')
+        else:
+            self.stack.append('i')
+            return u"<i>"
+
+    def _line_mono(self, groups):
+        if 'tt' in self.stack:
+            return self.pop_to('tt')
+        else:
+            self.stack.append('tt')
+            return u"<tt>"
+
+    def _line_punct(self, groups):
+        text = groups["punct"]
+        return self.punct.get(text, text)
+
+    def _line_newline(self, groups):  # newlines are kept in the output
+        return "\n"
+
+    def _line_text(self, groups):  # plain text only gets HTML-escaped
+        return werkzeug.escape(groups["text"])
+
+    def _line_math(self, groups):
+        if self.wiki_math:
+            return self.wiki_math(groups["math_text"])
+        else:
+            return "<var>%s</var>" % werkzeug.escape(groups["math_text"])
+
+    def _line_code(self, groups):  # inline {{{...}}} becomes escaped <code>
+        return u'<code>%s</code>' % werkzeug.escape(groups["code_text"])
+
+    def _line_free_link(self, groups):  # a bare URL links to itself
+        groups['link_target'] = groups['free_link']
+        return self._line_link(groups)
+
+    def _line_mail(self, groups):
+        addr = groups['mail']
+        groups['link_text'] = addr
+        if not addr.startswith(u'mailto:'):
+            addr = u'mailto:%s' % addr
+        groups['link_target'] = addr
+        return self._line_link(groups)
+
+    def _line_link(self, groups):
+        target = groups['link_target']
+        text = groups.get('link_text')
+        if not text:
+            text = target
+            if '#' in text:
+                text, chunk = text.split('#', 1)
+        match = self.image_re.match(text)
+        if match:
+            image = self._line_image(match.groupdict())
+            return self.wiki_link(target, text, image=image)
+        return self.wiki_link(target, text)
+
+    def _line_image(self, groups):
+        target = groups['image_target']
+        alt = groups.get('image_text')
+        if alt is None:
+            alt = target
+        return self.wiki_image(target, alt)
+
+    def _line_macro(self, groups):
+        name = groups['macro_name']
+        text = groups['macro_text'].strip()
+        return u'<span class="%s">%s</span>' % (
+            werkzeug.escape(name, quote=True),
+            werkzeug.escape(text))
+
+# methods for the block (multiline) markup:
+
+    def _block_code(self, block):  # render "{{{ ... }}}" blocks as <pre>
+        for self.line_no, part in block:  # one opener line per iteration
+            inside = u"\n".join(self.lines_until(self.code_close_re))
+            yield werkzeug.html.pre(werkzeug.html(inside), class_="code",
+                                    id="line_%d" % self.line_no)
+
+    def _block_syntax(self, block):
+        for self.line_no, part in block:
+            syntax = part.lstrip('{#!').strip()
+            inside = u"\n".join(self.lines_until(self.code_close_re))
+            if self.wiki_syntax:
+                return self.wiki_syntax(inside, syntax=syntax,
+                                        line_no=self.line_no)
+            else:
+                return [werkzeug.html.div(werkzeug.html.pre(
+                    werkzeug.html(inside), id="line_%d" % self.line_no),
+                    class_="highlight")]
+
+    def _block_macro(self, block):
+        for self.line_no, part in block:
+            name = part.lstrip('<').strip()
+            inside = u"\n".join(self.lines_until(self.macro_close_re))
+            yield u'<div class="%s">%s</div>' % (
+                werkzeug.escape(name, quote=True),
+                werkzeug.escape(inside))
+
+    def _block_paragraph(self, block):
+        parts = []
+        first_line = None
+        for self.line_no, part in block:
+            if first_line is None:
+                first_line = self.line_no
+            parts.append(part)
+        text = u"".join(self.parse_line(u"".join(parts)))
+        yield werkzeug.html.p(text, self.pop_to(""), id="line_%d" % first_line)
+
+    def _block_indent(self, block):
+        parts = []
+        first_line = None
+        for self.line_no, part in block:
+            if first_line is None:
+                first_line = self.line_no
+            parts.append(part.rstrip())
+        text = u"\n".join(parts)
+        yield werkzeug.html.pre(werkzeug.html(text), id="line_%d" % first_line)
+
+    def _block_table(self, block):
+        first_line = None
+        in_head = False
+        for self.line_no, line in block:
+            if first_line is None:
+                first_line = self.line_no
+                yield u'<table id="line_%d">' % first_line
+            table_row = line.strip()
+            is_header = table_row.startswith('|=') and table_row.endswith('=|')
+            if not in_head and is_header:
+                in_head = True
+                yield '<thead>'
+            elif in_head and not is_header:
+                in_head = False
+                yield '</thead>'
+            yield '<tr>'
+            in_cell = False
+            in_th = False
+
+            for part in self.parse_line(table_row):
+                if part in ('=|', '|', '=|=', '|='):
+                    if in_cell:
+                        if in_th:
+                            yield '</th>'
+                        else:
+                            yield '</td>'
+                        in_cell = False
+                    if part in ('=|=', '|='):
+                        in_th = True
+                    else:
+                        in_th = False
+                else:
+                    if not in_cell:
+                        if in_th:
+                            yield '<th>'
+                        else:
+                            yield '<td>'
+                        in_cell = True
+                    yield part
+            if in_cell:
+                if in_th:
+                    yield '</th>'
+                else:
+                    yield '</td>'
+            yield '</tr>'
+        yield u'</table>'
+
+    def _block_empty(self, block):  # a run of blank lines: one empty string
+        yield u''
+
+    def _block_rule(self, block):  # each rule line ("---") yields one hr
+        for self.line_no, line in block:
+            yield werkzeug.html.hr()
+
+    def _block_heading(self, block):
+        for self.line_no, line in block:
+            level = min(len(self.heading_re.match(line).group(0).strip()), 5)
+            self.headings[level - 1] = self.headings.get(level - 1, 0) + 1
+            label = u"-".join(str(self.headings.get(i, 0))
+                              for i in range(level))
+            yield werkzeug.html.a(name="head-%s" % label)
+            yield u'<h%d id="line_%d">%s</h%d>' % (level, self.line_no,
+                werkzeug.escape(line.strip("= \t\n\r\v")), level)
+
+    def _block_list(self, block):  # render a run of "*" / "#" list lines
+        level = 0  # number of list tags currently open
+        in_ul = False  # True right after opening a list, before its first <li>
+        kind = None
+        for self.line_no, line in block:
+            bullets = self.list_re.match(line).group(0).strip()
+            nest = len(bullets)  # nesting depth = number of bullet characters
+            if kind is None:  # first line fixes the tag for the whole run
+                if bullets.startswith('*'):
+                    kind = 'ul'
+                else:
+                    kind = 'ol'
+            while nest > level:  # deeper: open nested lists
+                if in_ul:
+                    yield '<li>'
+                yield '<%s id="line_%d">' % (kind, self.line_no)
+                in_ul = True
+                level += 1
+            while nest < level:  # shallower: close item and list per level
+                yield '</li></%s>' % kind
+                in_ul = False
+                level -= 1
+            if nest == level and not in_ul:  # close the previous sibling item
+                yield '</li>'
+            content = line.lstrip().lstrip('*#').strip()
+            yield '<li>%s%s' % (u"".join(self.parse_line(content)),
+                                self.pop_to(""))  # close leftover inline tags
+            in_ul = False
+        yield ('</li></%s>' % kind) * level
+
+    def _block_quote(self, block):  # render a run of ">"-prefixed lines
+        level = 0  # number of <blockquote> tags currently open
+        in_p = False  # True while a <p> is open
+        for self.line_no, line in block:
+            nest = len(self.quote_re.match(line).group(0).strip())
+            if nest == level:  # same depth as the line before
+                yield u'\n'
+            while nest > level:  # deeper: end the paragraph, open quotes
+                if in_p:
+                    yield '%s</p>' % self.pop_to("")
+                    in_p = False
+                yield '<blockquote>'
+                level += 1
+            while nest < level:  # shallower: end the paragraph, close quotes
+                if in_p:
+                    yield '%s</p>' % self.pop_to("")
+                    in_p = False
+                yield '</blockquote>'
+                level -= 1
+            content = line.lstrip().lstrip('>').strip()
+            if not in_p:
+                yield '<p id="line_%d">' % self.line_no
+                in_p = True
+            yield u"".join(self.parse_line(content))
+        if in_p:
+            yield '%s</p>' % self.pop_to("")
+        yield '</blockquote>' * level
+
+    def _block_conflict(self, block):
+        for self.line_no, part in block:
+            yield u'<div class="conflict">'
+            local = u"\n".join(self.lines_until(self.conflict_sep_re))
+            yield werkzeug.html.pre(werkzeug.html(local),
+                                    class_="local",
+                                    id="line_%d" % self.line_no)
+            other = u"\n".join(self.lines_until(self.conflict_close_re))
+            yield werkzeug.html.pre(werkzeug.html(other),
+                                    class_="other",
+                                    id="line_%d" % self.line_no)
+            yield u'</div>'
+
+
+class WikiWikiParser(WikiParser):
+    """A version of WikiParser that recognizes WikiWord links."""
+
+    markup = dict(WikiParser.markup)
+    camel_link = ur"\w+[%s]\w+" % re.escape(
+        u''.join(unichr(i) for i in xrange(sys.maxunicode)
+        if unicodedata.category(unichr(i)) == 'Lu'))
+    markup["camel_link"] = (105, camel_link)
+    markup["camel_nolink"] = (106, ur"[!~](?P<camel_text>%s)" % camel_link)
+
+    def _line_camel_link(self, groups):
+        groups['link_target'] = groups['camel_link']
+        return self._line_link(groups)
+
+    def _line_camel_nolink(self, groups):
+        return werkzeug.escape(groups["camel_text"])