From dd58bf72d6799438d8033cf7de6bc26a711734c3 Mon Sep 17 00:00:00 2001
From: Wade Brainerd
Date: Fri, 23 May 2008 22:58:23 +0000
Subject: Rename step 1. Breaking the renames up because Git fails to recognize them when too many are done at once.
---
(limited to 'mwlib/mwscan.py')

diff --git a/mwlib/mwscan.py b/mwlib/mwscan.py
new file mode 100755
index 0000000..100ea35
--- /dev/null
+++ b/mwlib/mwscan.py
@@ -0,0 +1,315 @@
+#! /usr/bin/env python
+
+# Copyright (c) 2007-2008 PediaPress GmbH
+# See README.txt for additional licensing information.
+
+import sys
+import time
+import _mwscan
+import htmlentitydefs
+
+class token(object):
+    t_end = 0
+    t_text = 1
+    t_entity = 2
+    t_special = 3
+    t_magicword = 4
+    t_comment = 5
+    t_2box_open = 6
+    t_2box_close = 7
+    t_http_url = 8
+    t_break = 9
+    t_begin_table = 10
+    t_end_table = 11
+    t_html_tag = 12
+    t_style = 13
+    t_pre = 14
+    t_section = 15
+    t_section_end = 16
+    t_item = 17
+    t_colon = 18
+    t_semicolon = 19
+    t_hrule = 20
+    t_newline = 21
+    t_column = 22
+    t_row = 23
+    t_tablecaption = 24
+    t_urllink = 25
+
+    token2name = {}
+
+for d in dir(token):
+    token2name = token.token2name
+    if d.startswith("t_"):
+        token2name[getattr(token, d)] = d
+del d
+
+
+def dump_tokens(text, tokens):
+    for type, start, len in tokens:
+        print type, repr(text[start:start+len])
+
+def scan(text):
+    stime=time.time()
+    text += u"\0"*32
+    tokens = _mwscan.scan(text)
+    return scan_result(text, tokens)
+
+def resolve_entity(e):
+    if e[1]=='#':
+        if e[2]=='x' or e[2]=='X':
+            return unichr(int(e[3:-1], 16))
+        else:
+            return unichr(int(e[2:-1]))
+
+    else:
+        try:
+            return unichr(htmlentitydefs.name2codepoint[e[1:-1]])
+        except KeyError:
+            return e
+
+
+class scan_result(object):
+    def __init__(self, source, toks):
+        self.source = source
+        self.toks = toks
+
+    def rawtext(self, (type, start, tlen)):
+        return self.source[start:start+tlen]
+
+    def text(self, t):
+        r=self.rawtext(t)
+        if t[0] == token.t_entity:
+            return resolve_entity(r)
+        else:
+            return r
+
+    def dump(self, out=None):
+        if out is None:
+            out = sys.stdout
+        for x in self:
+            out.write("%s\n" % self.repr(x))
+
+    def repr(self, t):
+        return "(%s, %r)" % (token.token2name.get(t[0]), self.rawtext(t))
+
+    def __len__(self):
+        return len(self.toks)
+
+    def __iter__(self):
+        return iter(self.toks)
+
+    def __getitem__(self, idx):
+        return self.toks[idx]
+
+
+class _compat_scanner(object):
+    class ignore: pass
+    tok2compat = {
+        token.t_text: "TEXT",
+        token.t_special: "SPECIAL",
+        token.t_2box_open: "[[",
+        token.t_2box_close: "]]",
+        token.t_http_url: "URL",
+        token.t_break: "BREAK",
+        token.t_style: "STYLE",
+        token.t_pre: "PRE",
+        token.t_section: "SECTION",
+        token.t_section_end: "ENDSECTION",
+        token.t_magicword: ignore,
+        token.t_comment: ignore,
+        token.t_end: ignore,
+        token.t_item: "ITEM",
+        token.t_colon: "EOLSTYLE",
+        token.t_semicolon: "EOLSTYLE",
+        token.t_newline: "\n",
+        token.t_begin_table: "BEGINTABLE",
+        token.t_end_table: "ENDTABLE",
+        token.t_column: "COLUMN",
+        token.t_row: "ROW",
+        token.t_tablecaption: "TABLECAPTION",
+        token.t_urllink: "URLLINK",
+    }
+
+    def __call__(self, text):
+        tokens = scan(text)
+        scanres = scan_result(text, tokens)
+
+        res = []
+
+        def g():
+            return text[start:start+tlen]
+        a = lambda x: res.append((x,g()))
+
+        ignore = self.ignore
+        tok2compat = self.tok2compat
+
+        i = 0
+        numtokens = len(tokens)
+        while i < numtokens:
+            type, start, tlen = tokens[i]
+            n=tok2compat.get(type)
+            if n is ignore:
+                pass
+            elif n is not None:
+                a(n)
+            elif type==token.t_entity:
+                res.append(("TEXT", resolve_entity(g())))
+            elif type==token.t_hrule:
+                res.append((self.tagtoken("<hr/>"), g()))
+            elif type==token.t_html_tag:
+                s = g()
+
+                tt = self.tagtoken(s)
+                isEndToken = isinstance(tt, EndTagToken)
+                closingOrSelfClosing = isEndToken or tt.selfClosing
+
+                if tt.t=="math":
+                    if closingOrSelfClosing:
+                        i+=1
+                        continue
+
+                    res.append(("MATH", g()))
+                    i+=1
+                    while i
+
+    def tagtoken(self, text):
+        selfClosing = False
+        if text.startswith("</"):
+            name = text[2:-1]
+            klass = EndTagToken
+            isEndToken = True
+        elif text.endswith("/>"):
+            name = text[1:-2]
+            klass = TagToken
+            selfClosing = True
+            isEndToken = False # ???
+        else:
+            name = text[1:-1]
+            klass = TagToken
+            isEndToken = False
+
+        name, values = (name.split(None, 1)+[u''])[:2]
+        from mwlib.parser import paramrx
+        values = dict(paramrx.findall(values))
+        name = name.lower()
+
+        if name=='br' or name=='references':
+            isEndToken = False
+            klass = TagToken
+
+        r = klass(name, text)
+        r.selfClosing = selfClosing
+        r.values = values
+        return r
+
+
+compat_scan = _compat_scanner()
+
+# from plexscanner import _BaseTagToken, TagToken, EndTagToken
+
+class _BaseTagToken(object):
+    def __eq__(self, other):
+        if isinstance(other, basestring):
+            return self.t == other
+        if isinstance(other, self.__class__):
+            return self.t == other.t
+        return False
+
+    def __ne__(self, other):
+        return not(self==other)
+
+    def __hash__(self):
+        return hash(self.t)
+
+class TagToken(_BaseTagToken):
+    values = {}
+    selfClosing=False
+
+    def __init__(self, t, text=''):
+        self.t = t
+        self.text = text
+
+    def __repr__(self):
+        return "" % (self.t, self.text)
+
+class EndTagToken(_BaseTagToken):
+    def __init__(self, t, text=''):
+        self.t = t
+        self.text = text
+
+    def __repr__(self):
+        return "" % self.t
+
+def tokenize(input, name="unknown"):
+    assert input is not None, "must specify input argument in tokenize"
+    return compat_scan(input)