Web   ·   Wiki   ·   Activities   ·   Blog   ·   Lists   ·   Chat   ·   Meeting   ·   Bugs   ·   Git   ·   Translate   ·   Archive   ·   People   ·   Donate
summary · refs · log · tree · commit · diff · stats
path: root/mwlib/_mwscan.re
diff options
context:
space:
mode:
Diffstat (limited to 'mwlib/_mwscan.re')
-rw-r--r-- mwlib/_mwscan.re 327
1 files changed, 327 insertions, 0 deletions
diff --git a/mwlib/_mwscan.re b/mwlib/_mwscan.re
new file mode 100644
index 0000000..99ffe95
--- /dev/null
+++ b/mwlib/_mwscan.re
@@ -0,0 +1,327 @@
+// -*- mode: c++ -*-
+// Copyright (c) 2007-2008 PediaPress GmbH
+// See README.txt for additional licensing information.
+
+#include <Python.h>
+
+#include <iostream>
+#include <assert.h>
+#include <vector>
+using namespace std;
+
+// Record the token that was just matched (through found()) and return its
+// type from the enclosing scan() call.
+// The do/while(0) wrapper makes `RET(x);` behave as exactly one statement, so
+// it stays correct in an unbraced if/else as well.
+#define RET(x) do { found(x); return (x); } while (0)
+
+// Token types produced by Scanner::scan().
+// The numeric values are handed to Python as plain ints by py_scan(), so
+// they are spelled out explicitly and must keep their current numbering.
+enum mwtok {
+    t_end          = 0,   // \000 sentinel reached; never stored as a token
+    t_text         = 1,   // plain text (adjacent runs are merged in found())
+    t_entity       = 2,   // &name;  &#123;  &#x1f;
+    t_special      = 3,   // one of  :  |  [  ]
+    t_magicword    = 4,   // __TOC__, __NOTOC__, ...
+    t_comment      = 5,   // <!-- ... -->
+    t_2box_open    = 6,   // [[
+    t_2box_close   = 7,   // ]]
+    t_http_url     = 8,   // bare http(s)://, ftp:// or mailto: URL
+    t_break        = 9,   // two or more consecutive newlines
+    t_begin_table  = 10,  // {| at the beginning of a line
+    t_end_table    = 11,  // |} at the beginning of a line
+    t_html_tag     = 12,  // <tag ...>, </tag>
+    t_style        = 13,  // ''  '''  '''''
+    t_pre          = 14,  // single leading space at the beginning of a line
+    t_section      = 15,  // run of "=" at the beginning of a line
+    t_section_end  = 16,  // run of "=" closing a line that began a section
+    t_item         = 17,  // list markers (# and *) at the beginning of a line
+    t_colon        = 18,  // run of ":" at the beginning of a line
+    t_semicolon    = 19,  // run of ";" at the beginning of a line
+    t_hrule        = 20,  // four or more "-" at the beginning of a line
+    t_newline      = 21,  // single newline
+    t_column       = 22,  // table cell separator (only while tablemode is set)
+    t_row          = 23,  // |- row separator (only while tablemode is set)
+    t_tablecaption = 24,  // |+ caption marker (only while tablemode is set)
+    t_urllink      = 25   // "[" immediately followed by a URL
+};
+
+// One scanned token. Positions are expressed in Py_UNICODE units relative
+// to the beginning of the scanned buffer.
+struct Token {
+    int type;   // an mwtok value
+    int start;  // offset of the first unit of the token
+    int len;    // number of units covered by the token
+};
+
+// Tokenizer state for one input buffer. scan() is called repeatedly and
+// appends the recognised tokens to `tokens`.
+class Scanner
+{
+public:
+
+    // Prepare scanning of the buffer beginning at _start; _end is stored as
+    // the buffer limit. scan() stops at the first 0 unit it encounters.
+    Scanner(Py_UNICODE *_start, Py_UNICODE *_end)
+        : source(_start),
+          start(_start),
+          cursor(_start),
+          end(_end),
+          line_startswith_section(-1),
+          tablemode(0)
+    {
+    }
+
+    // Record the span [start, cursor) as a token of type `val` and return
+    // the index of the token holding it. A t_text span that directly
+    // follows another t_text token extends that token instead of adding a
+    // new one.
+    int found(mwtok val) {
+        if (val == t_text && !tokens.empty() && tokens.back().type == val) {
+            tokens.back().len += cursor - start;
+        } else {
+            Token fresh;
+            fresh.type = val;
+            fresh.start = (start - source);
+            fresh.len = cursor - start;
+            tokens.push_back(fresh);
+        }
+        return tokens.size() - 1;
+    }
+
+    // True when the current token begins at the start of the buffer or
+    // right after a newline.
+    bool bol() const {
+        if (start == source) {
+            return true;
+        }
+        return start[-1] == '\n';
+    }
+
+    // True when the cursor sits on a newline or on the terminating 0.
+    bool eol() const {
+        const Py_UNICODE next = *cursor;
+        return next == '\n' || next == 0;
+    }
+
+    // Called when a line ends: a section opener that was never closed on
+    // its line is turned back into ordinary text.
+    void newline() {
+        const int pending = line_startswith_section;
+        line_startswith_section = -1;
+        if (pending >= 0) {
+            tokens[pending].type = t_text;
+        }
+    }
+
+    inline int scan();
+
+    Py_UNICODE *source;   // first unit of the buffer
+
+    Py_UNICODE *start;    // first unit of the token being scanned
+    Py_UNICODE *cursor;   // current read position
+    Py_UNICODE *end;      // buffer limit as passed to the constructor
+    std::vector<Token> tokens;
+
+    int line_startswith_section;  // index of this line's t_section token, or -1
+    int tablemode;                // changed by "{|" and "|}" rules in scan()
+};
+
+
+// Scan one token beginning at `cursor`.
+//
+// The input is matched against the re2c rules below. The matched span is
+// recorded through found() (via RET, or directly for t_section) and the
+// token type is returned. When the \000 sentinel is reached, newline() is
+// called, nothing is recorded and t_end (0) is returned, which ends a
+// `while (scanner.scan())` loop.
+//
+// If the token begins a line (bol()), the line-start rules are tried first;
+// when only their catch-all matches, the cursor is rewound and the general
+// rules after `not_bol` are applied instead.
+int Scanner::scan()
+{
+ start=cursor;
+
+ Py_UNICODE *marker=cursor;
+
+ // Remembered so that the cursor can be rewound at not_bol.
+ Py_UNICODE *save_cursor = cursor;
+
+
+#define YYCTYPE Py_UNICODE
+#define YYCURSOR cursor
+#define YYMARKER marker
+#define YYLIMIT (end)
+// #define YYFILL(n) return 0;
+
+/*!re2c
+re2c:yyfill:enable = 0 ;
+*/
+
+/*
+ the re2c manpage says:
+ "The user must arrange for a sentinel token to appear at the end of input"
+ \000 is our sentinel token.
+*/
+
+/*!re2c
+ any = [^\000];
+ ftp = "ftp://" [-a-zA-Z0-9_+${}~?=/@#&*(),:.]+ ;
+ mailto = "mailto:" [-a-zA-Z0-9_!#$%*./?|^{}`~&'+=]+ "@" [-a-zA-Z0-9_.]+ ;
+ url = "http" "s"? "://" [-\xe4\xc4\xf6\xd6\xfc\xdca-zA-Z_0-9./?=&:%:~()#+,]+ ;
+ entity_name = "&" [a-zA-Z0-9]+ ";";
+ entity_hex = "&#" 'x' [a-fA-F0-9]+ ";";
+ entity_dec = "&#" [0-9]+ ";";
+
+ entity = (entity_name | entity_hex | entity_dec);
+
+
+ magicword = ( "__TOC__"
+ | "__NOTOC__"
+ | "__FORCETOC__"
+ | "__NOEDITSECTION__"
+ | "__NEWSECTIONLINK__"
+ | "__NOCONTENTCONVERT__"
+ | "__NOCC__"
+ | "__NOGALLERY__"
+ | "__NOTITLECONVERT__"
+ | "__NOTC__"
+ | "__END__"
+ | "__START__"
+ );
+*/
+ // Line-start rules follow. tablemode counts the "{|" openers that have
+ // not been closed yet. It is only decremented while positive, so a stray
+ // "|}" outside a table cannot drive it negative and make the
+ // `if (tablemode)` tests misfire for the rest of the document.
+ // Outside a table, a table marker preceded by a space yields t_pre for
+ // that single space (the cursor is moved back to start+1).
+ if (!bol()) {
+ goto not_bol;
+ }
+/*!re2c
+ " "* "{|" {++tablemode; RET(t_begin_table);}
+ " "* "|}" {if (tablemode > 0) --tablemode; RET(t_end_table);}
+
+ " "* "|" "-"+
+ {
+ if (tablemode)
+ RET(t_row);
+ if (*start==' ') {
+ cursor = start+1;
+ RET(t_pre);
+ }
+ RET(t_text);
+ }
+
+ " "* ("|" | "!")
+ {
+ if (tablemode)
+ RET(t_column);
+
+ if (*start==' ') {
+ cursor = start+1;
+ RET(t_pre);
+ }
+ RET(t_text);
+ }
+
+ " "* "|" "+"+
+ {
+ if (tablemode)
+ RET(t_tablecaption);
+ if (*start==' ') {
+ cursor = start+1;
+ RET(t_pre);
+ }
+ RET(t_text);
+ }
+
+ " " {RET(t_pre);}
+ "="+ [ \t]* {
+ line_startswith_section = found(t_section);
+ return t_section;
+ }
+ ":"* [#*]+ {RET(t_item);}
+ ":"+ {RET(t_colon);}
+ ";"+ {RET(t_semicolon);}
+ "-"{4,} {RET(t_hrule);}
+
+ [^] {goto not_bol;}
+ */
+
+
+ // General rules. Undo whatever the line-start matcher consumed first.
+ // A run of "=" only becomes t_section_end when it ends the line and the
+ // line began with a t_section token; otherwise it is plain text.
+ // Outside a table, "||", "|!", "!!" and "|+" are cut back to their first
+ // character, which is reported as t_special.
+not_bol:
+ cursor = save_cursor;
+ marker = cursor;
+
+/*!re2c
+ "[" mailto {RET(t_urllink);}
+ mailto {RET(t_http_url);}
+ "[" ftp {RET(t_urllink);}
+ ftp {RET(t_http_url);}
+ "[" url {RET(t_urllink);}
+ url {RET(t_http_url);}
+ magicword {RET(t_magicword);}
+ [a-zA-Z0-9_]+ {RET(t_text);}
+ "[[" {RET(t_2box_open);}
+ "]]" {RET(t_2box_close);}
+ "="+ [ \t]* {
+ if (eol()) {
+ if (line_startswith_section>=0) {
+ line_startswith_section=-1;
+ RET(t_section_end);
+ } else {
+ RET(t_text);
+ }
+ } else {
+ RET(t_text);
+ }
+ }
+ "\n"{2,} {newline(); RET(t_break);}
+ "\n" {newline(); RET(t_newline);}
+ "||" | "|!" | "!!"
+ {
+ if (tablemode)
+ RET(t_column);
+ cursor = start+1;
+ RET(t_special);
+ }
+ "|+"
+ {
+ if (tablemode)
+ RET(t_tablecaption);
+ cursor = start+1;
+ RET(t_special);
+ }
+ [:|\[\]] {RET(t_special);}
+ "'''''" | "'''" | "''" {RET(t_style);}
+ "<" "/"? [a-zA-Z]+ [^\000<>]* "/"? ">"
+ {RET(t_html_tag);}
+
+ "<!--"[^\000<>]*"-->"
+ {RET(t_comment);}
+ entity {RET(t_entity);}
+
+ "\000" {newline(); return t_end;}
+ . {RET(t_text);}
+*/
+}
+
+
+// Python entry point: scan(text) -> list of (type, start, len) int tuples.
+//
+// `text` is converted with PyUnicode_FromObject and tokenized with the GIL
+// released. `type` is an mwtok value; `start` and `len` are offsets/lengths
+// in Py_UNICODE units. Scanning stops at the first 0 unit, so input that
+// contains an embedded U+0000 is only tokenized up to that point.
+//
+// Returns a new list, or NULL with a Python exception set when the
+// arguments cannot be parsed, the argument cannot be converted to unicode,
+// or an allocation fails.
+PyObject *py_scan(PyObject *self, PyObject *args)
+{
+    PyObject *arg1;
+    if (!PyArg_ParseTuple(args, "O:mwscan.scan", &arg1)) {
+        return 0;
+    }
+    PyObject *unistr = PyUnicode_FromObject(arg1);
+    if (unistr == NULL) {
+        PyErr_SetString(PyExc_TypeError,
+                        "parameter cannot be converted to unicode in mwscan.scan");
+        return 0;
+    }
+
+    // The scanner relies on a 0 unit terminating the buffer.
+    // NOTE(review): assumes the unicode object's buffer is 0-terminated at
+    // `end` - confirm for the targeted Python version.
+    Py_UNICODE *start = PyUnicode_AS_UNICODE(unistr);
+    Py_UNICODE *end = start + PyUnicode_GET_SIZE(unistr);
+
+    Scanner scanner (start, end);
+    Py_BEGIN_ALLOW_THREADS
+    while (scanner.scan()) {
+    }
+    Py_END_ALLOW_THREADS
+    // Tokens hold offsets only, so the buffer is no longer needed.
+    Py_DECREF(unistr);
+
+    // return PyList_New(0); // uncomment to see timings for scanning
+
+    int size = scanner.tokens.size();
+    PyObject *result = PyList_New(size);
+    if (!result) {
+        return 0;
+    }
+
+    for (int i=0; i<size; i++) {
+        const Token &t = scanner.tokens[i];
+        PyObject *item = Py_BuildValue("iii", t.type, t.start, t.len);
+        if (!item) {
+            // Do not hand a list with NULL slots back to Python; dropping
+            // the list also releases the items stored so far.
+            Py_DECREF(result);
+            return 0;
+        }
+        PyList_SET_ITEM(result, i, item);  // steals the reference to item
+    }
+
+    return result;
+}
+
+
+
+// Method table of the _mwscan module: a single function, scan(text).
+static PyMethodDef module_functions[] = {
+    {"scan", (PyCFunction) py_scan, METH_VARARGS, "scan(text)"},
+    {NULL, NULL, 0, NULL},  // end-of-table marker
+};
+
+
+
+// Module initialisation function: registers module_functions under the
+// module name "_mwscan". Declared with C linkage so the symbol name is not
+// mangled.
+extern "C" {
+
+DL_EXPORT(void) init_mwscan()
+{
+    // The returned module object is not needed here.
+    (void) Py_InitModule("_mwscan", module_functions);
+}
+
+}