diff options
Diffstat (limited to 'mwlib/_mwscan.cc')
-rw-r--r-- | mwlib/_mwscan.cc | 1699 |
1 files changed, 1699 insertions, 0 deletions
diff --git a/mwlib/_mwscan.cc b/mwlib/_mwscan.cc new file mode 100644 index 0000000..f673880 --- /dev/null +++ b/mwlib/_mwscan.cc @@ -0,0 +1,1699 @@ +/* Generated by re2c 0.13.4 */ +#line 1 "_mwscan.re" +// -*- mode: c++ -*- +// Copyright (c) 2007-2008 PediaPress GmbH +// See README.txt for additional licensing information. + +#include <Python.h> + +#include <iostream> +#include <assert.h> +#include <vector> +using namespace std; + +#define RET(x) {found(x); return x;} + +typedef enum { + t_end, + t_text, + t_entity, + t_special, + t_magicword, + t_comment, + t_2box_open, // [[ + t_2box_close, // ]] + t_http_url, + t_break, + t_begin_table, + t_end_table, + t_html_tag, + t_style, + t_pre, + t_section, + t_section_end, + t_item, + t_colon, + t_semicolon, + t_hrule, + t_newline, + t_column, + t_row, + t_tablecaption, + t_urllink, +} mwtok; + +struct Token +{ + int type; + int start; + int len; +}; + +class Scanner +{ +public: + + Scanner(Py_UNICODE *_start, Py_UNICODE *_end) { + source = start = _start; + end = _end; + cursor = start; + line_startswith_section = -1; + tablemode=0; + } + + int found(mwtok val) { + if (val==t_text && tokens.size()) { + Token &previous_token (tokens[tokens.size()-1]); + if (previous_token.type==val) { + previous_token.len += cursor-start; + return tokens.size()-1; + } + } + Token t; + t.type = val; + t.start = (start-source); + t.len = cursor-start; + tokens.push_back(t); + return tokens.size()-1; + } + + bool bol() const { + return (start==source) || (start[-1]=='\n'); + } + + bool eol() const { + return *cursor=='\n' || *cursor==0; + } + + void newline() { + if (line_startswith_section>=0) { + tokens[line_startswith_section].type = t_text; + } + line_startswith_section = -1; + } + + inline int scan(); + + Py_UNICODE *source; + + Py_UNICODE *start; + Py_UNICODE *cursor; + Py_UNICODE *end; + vector<Token> tokens; + + int line_startswith_section; + int tablemode; +}; + + +int Scanner::scan() +{ + start=cursor; + + Py_UNICODE *marker=cursor; + + Py_UNICODE *save_cursor = cursor; + + +#define YYCTYPE Py_UNICODE +#define YYCURSOR cursor +#define YYMARKER marker +#define YYLIMIT (end) +// #define YYFILL(n) return 0; + +#line 124 "_mwscan.re" + + +/* + the re2c manpage says: + "The user must arrange for a sentinel token to appear at the end of input" + \000 is our sentinel token. +*/ + +#line 157 "_mwscan.re" + + if (!bol()) { + goto not_bol; + } + +#line 140 "_mwscan.cc" +{ + YYCTYPE yych; + unsigned int yyaccept = 0; + + yych = *YYCURSOR; + if (yych <= '-') { + if (yych <= '"') { + if (yych <= 0x001F) goto yy18; + if (yych <= ' ') goto yy2; + if (yych <= '!') goto yy8; + goto yy18; + } else { + if (yych <= ')') { + if (yych <= '#') goto yy13; + goto yy18; + } else { + if (yych <= '*') goto yy13; + if (yych <= ',') goto yy18; + goto yy17; + } + } + } else { + if (yych <= '<') { + if (yych <= '9') goto yy18; + if (yych <= ':') goto yy11; + if (yych <= ';') goto yy15; + goto yy18; + } else { + if (yych <= 'z') { + if (yych <= '=') goto yy9; + goto yy18; + } else { + if (yych <= '{') goto yy4; + if (yych <= '|') goto yy6; + goto yy18; + } + } + } +yy2: + yyaccept = 0; + yych = *(YYMARKER = ++YYCURSOR); + if (yych <= 0x001F) goto yy3; + if (yych <= '!') goto yy47; + if (yych <= 'z') goto yy3; + if (yych <= '|') goto yy47; +yy3: +#line 199 "_mwscan.re" + {RET(t_pre);} +#line 189 "_mwscan.cc" +yy4: + ++YYCURSOR; + if ((yych = *YYCURSOR) == '|') goto yy43; +yy5: +#line 209 "_mwscan.re" + {goto not_bol;} +#line 196 "_mwscan.cc" +yy6: + ++YYCURSOR; + if ((yych = *YYCURSOR) <= ',') { + if (yych == '+') goto yy35; + } else { + if (yych <= '-') goto yy38; + if (yych == '}') goto yy41; + } +yy7: +#line 177 "_mwscan.re" + { + if (tablemode) + RET(t_column); + + if (*start==' ') { + cursor = start+1; + RET(t_pre); + } + RET(t_text); + } +#line 217 "_mwscan.cc" +yy8: + yych = *++YYCURSOR; + goto yy7; +yy9: + ++YYCURSOR; + if ((yych = *YYCURSOR) == '=') goto yy33; + goto yy32; +yy10: +#line 200 "_mwscan.re" + { + line_startswith_section = found(t_section); + return t_section; + } +#line 231 "_mwscan.cc" +yy11: + ++YYCURSOR; + if ((yych = *YYCURSOR) <= ')') { + if (yych == '#') goto yy27; + } else { + if (yych <= '*') goto yy27; + if (yych == ':') goto yy29; + } +yy12: +#line 205 "_mwscan.re" + {RET(t_colon);} +#line 243 "_mwscan.cc" +yy13: + ++YYCURSOR; + yych = *YYCURSOR; + goto yy28; +yy14: +#line 204 "_mwscan.re" + {RET(t_item);} +#line 251 "_mwscan.cc" +yy15: + ++YYCURSOR; + yych = *YYCURSOR; + goto yy26; +yy16: +#line 206 "_mwscan.re" + {RET(t_semicolon);} +#line 259 "_mwscan.cc" +yy17: + yyaccept = 1; + yych = *(YYMARKER = ++YYCURSOR); + if (yych == '-') goto yy19; + goto yy5; +yy18: + yych = *++YYCURSOR; + goto yy5; +yy19: + yych = *++YYCURSOR; + if (yych == '-') goto yy21; +yy20: + YYCURSOR = YYMARKER; + if (yyaccept <= 0) { + goto yy3; + } else { + goto yy5; + } +yy21: + yych = *++YYCURSOR; + if (yych != '-') goto yy20; +yy22: + ++YYCURSOR; + yych = *YYCURSOR; + if (yych == '-') goto yy22; +#line 207 "_mwscan.re" + {RET(t_hrule);} +#line 287 "_mwscan.cc" +yy25: + ++YYCURSOR; + yych = *YYCURSOR; +yy26: + if (yych == ';') goto yy25; + goto yy16; +yy27: + ++YYCURSOR; + yych = *YYCURSOR; +yy28: + if (yych == '#') goto yy27; + if (yych == '*') goto yy27; + goto yy14; +yy29: + ++YYCURSOR; + yych = *YYCURSOR; + if (yych <= ')') { + if (yych == '#') goto yy27; + goto yy12; + } else { + if (yych <= '*') goto yy27; + if (yych == ':') goto yy29; + goto yy12; + } +yy31: + ++YYCURSOR; + yych = *YYCURSOR; +yy32: + if (yych == '\t') goto yy31; + if (yych == ' ') goto yy31; + goto yy10; +yy33: + ++YYCURSOR; + yych = *YYCURSOR; + if (yych <= 0x001F) { + if (yych == '\t') goto yy31; + goto yy10; + } else { + if (yych <= ' ') goto yy31; + if (yych == '=') goto yy33; + goto yy10; + } +yy35: + ++YYCURSOR; + yych = *YYCURSOR; + if (yych == '+') goto yy35; +#line 189 "_mwscan.re" + { + if (tablemode) + RET(t_tablecaption); + if (*start==' ') { + cursor = start+1; + RET(t_pre); + } + RET(t_text); + } +#line 344 "_mwscan.cc" +yy38: + ++YYCURSOR; + yych = *YYCURSOR; + if (yych == '-') goto yy38; +#line 166 "_mwscan.re" + { + if (tablemode) + RET(t_row); + if (*start==' ') { + cursor = start+1; + RET(t_pre); + } + RET(t_text); + } +#line 359 "_mwscan.cc" +yy41: + ++YYCURSOR; +#line 163 "_mwscan.re" + {--tablemode; RET(t_end_table);} +#line 364 "_mwscan.cc" +yy43: + ++YYCURSOR; +#line 162 "_mwscan.re" + {++tablemode; RET(t_begin_table);} +#line 369 "_mwscan.cc" +yy45: + yych = *++YYCURSOR; + if (yych <= ',') { + if (yych == '+') goto yy35; + goto yy7; + } else { + if (yych <= '-') goto yy38; + if (yych == '}') goto yy41; + goto yy7; + } +yy46: + ++YYCURSOR; + yych = *YYCURSOR; +yy47: + if (yych <= '!') { + if (yych <= 0x001F) goto yy20; + if (yych <= ' ') goto yy46; + } else { + if (yych <= 'z') goto yy20; + if (yych <= '{') goto yy49; + if (yych <= '|') goto yy45; + goto yy20; + } + yych = *++YYCURSOR; + goto yy7; +yy49: + ++YYCURSOR; + if ((yych = *YYCURSOR) == '|') goto yy43; + goto yy20; +} +#line 210 "_mwscan.re" + + + +not_bol: + cursor = save_cursor; + marker = cursor; + + +#line 409 "_mwscan.cc" +{ + YYCTYPE yych; + unsigned int yyaccept = 0; + yych = *YYCURSOR; + if (yych <= 'Z') { + if (yych <= '\'') { + if (yych <= ' ') { + if (yych <= 0x0000) goto yy72; + if (yych == '\n') goto yy63; + goto yy74; + } else { + if (yych <= '!') goto yy66; + if (yych <= '%') goto yy74; + if (yych <= '&') goto yy71; + goto yy69; + } + } else { + if (yych <= ';') { + if (yych <= '/') goto yy74; + if (yych <= '9') goto yy59; + if (yych <= ':') goto yy68; + goto yy74; + } else { + if (yych <= '<') goto yy70; + if (yych <= '=') goto yy61; + if (yych <= '@') goto yy74; + goto yy59; + } + } + } else { + if (yych <= 'f') { + if (yych <= '^') { + if (yych <= '[') goto yy52; + if (yych == ']') goto yy60; + goto yy74; + } else { + if (yych <= '_') goto yy58; + if (yych <= '`') goto yy74; + if (yych <= 'e') goto yy59; + goto yy56; + } + } else { + if (yych <= 'm') { + if (yych == 'h') goto yy57; + if (yych <= 'l') goto yy59; + goto yy54; + } else { + if (yych <= 'z') goto yy59; + if (yych == '|') goto yy65; + goto yy74; + } + } + } +yy52: + yyaccept = 0; + yych = *(YYMARKER = ++YYCURSOR); + switch (yych) { + case '[': goto yy249; + case 'f': goto yy252; + case 'h': goto yy251; + case 'm': goto yy253; + default: goto yy53; + } +yy53: +#line 256 "_mwscan.re" + {RET(t_special);} +#line 476 "_mwscan.cc" +yy54: + ++YYCURSOR; + if ((yych = *YYCURSOR) == 'a') goto yy237; + goto yy121; +yy55: +#line 225 "_mwscan.re" + {RET(t_text);} +#line 484 "_mwscan.cc" +yy56: + yych = *++YYCURSOR; + if (yych == 't') goto yy229; + goto yy121; +yy57: + yych = *++YYCURSOR; + if (yych == 't') goto yy219; + goto yy121; +yy58: + yych = *++YYCURSOR; + if (yych == '_') goto yy122; + goto yy121; +yy59: + yych = *++YYCURSOR; + goto yy121; +yy60: + yych = *++YYCURSOR; + if (yych == ']') goto yy118; + goto yy53; +yy61: + ++YYCURSOR; + if ((yych = *YYCURSOR) == '=') goto yy116; + goto yy115; +yy62: +#line 228 "_mwscan.re" + { + if (eol()) { + if (line_startswith_section>=0) { + line_startswith_section=-1; + RET(t_section_end); + } else { + RET(t_text); + } + } else { + RET(t_text); + } + } +#line 522 "_mwscan.cc" +yy63: + ++YYCURSOR; + if ((yych = *YYCURSOR) == '\n') goto yy111; +#line 241 "_mwscan.re" + {newline(); RET(t_newline);} +#line 528 "_mwscan.cc" +yy65: + yych = *++YYCURSOR; + if (yych <= '*') { + if (yych == '!') goto yy107; + goto yy53; + } else { + if (yych <= '+') goto yy109; + if (yych == '|') goto yy107; + goto yy53; + } +yy66: + ++YYCURSOR; + if ((yych = *YYCURSOR) == '!') goto yy107; +yy67: +#line 266 "_mwscan.re" + {RET(t_text);} +#line 545 "_mwscan.cc" +yy68: + yych = *++YYCURSOR; + goto yy53; +yy69: + yych = *++YYCURSOR; + if (yych == '\'') goto yy102; + goto yy67; +yy70: + yyaccept = 1; + yych = *(YYMARKER = ++YYCURSOR); + if (yych <= '/') { + if (yych == '!') goto yy86; + if (yych <= '.') goto yy67; + goto yy87; + } else { + if (yych <= 'Z') { + if (yych <= '@') goto yy67; + goto yy88; + } else { + if (yych <= '`') goto yy67; + if (yych <= 'z') goto yy88; + goto yy67; + } + } +yy71: + yyaccept = 1; + yych = *(YYMARKER = ++YYCURSOR); + if (yych <= '9') { + if (yych == '#') goto yy75; + if (yych <= '/') goto yy67; + goto yy77; + } else { + if (yych <= 'Z') { + if (yych <= '@') goto yy67; + goto yy77; + } else { + if (yych <= '`') goto yy67; + if (yych <= 'z') goto yy77; + goto yy67; + } + } +yy72: + ++YYCURSOR; +#line 265 "_mwscan.re" + {newline(); return t_end;} +#line 591 "_mwscan.cc" +yy74: + yych = *++YYCURSOR; + goto yy67; +yy75: + yych = *++YYCURSOR; + if (yych <= 'W') { + if (yych <= '/') goto yy76; + if (yych <= '9') goto yy82; + } else { + if (yych <= 'X') goto yy81; + if (yych == 'x') goto yy81; + } +yy76: + YYCURSOR = YYMARKER; + if (yyaccept <= 1) { + if (yyaccept <= 0) { + goto yy53; + } else { + goto yy67; + } + } else { + if (yyaccept <= 2) { + goto yy103; + } else { + goto yy55; + } + } +yy77: + ++YYCURSOR; + yych = *YYCURSOR; + if (yych <= ';') { + if (yych <= '/') goto yy76; + if (yych <= '9') goto yy77; + if (yych <= ':') goto yy76; + } else { + if (yych <= 'Z') { + if (yych <= '@') goto yy76; + goto yy77; + } else { + if (yych <= '`') goto yy76; + if (yych <= 'z') goto yy77; + goto yy76; + } + } +yy79: + ++YYCURSOR; +#line 263 "_mwscan.re" + {RET(t_entity);} +#line 640 "_mwscan.cc" +yy81: + yych = *++YYCURSOR; + if (yych == ';') goto yy76; + goto yy85; +yy82: + ++YYCURSOR; + yych = *YYCURSOR; + if (yych <= '/') goto yy76; + if (yych <= '9') goto yy82; + if (yych == ';') goto yy79; + goto yy76; +yy84: + ++YYCURSOR; + yych = *YYCURSOR; +yy85: + if (yych <= ';') { + if (yych <= '/') goto yy76; + if (yych <= '9') goto yy84; + if (yych <= ':') goto yy76; + goto yy79; + } else { + if (yych <= 'F') { + if (yych <= '@') goto yy76; + goto yy84; + } else { + if (yych <= '`') goto yy76; + if (yych <= 'f') goto yy84; + goto yy76; + } + } +yy86: + yych = *++YYCURSOR; + if (yych == '-') goto yy94; + goto yy76; +yy87: + yych = *++YYCURSOR; + if (yych <= '@') goto yy76; + if (yych <= 'Z') goto yy88; + if (yych <= '`') goto yy76; + if (yych >= '{') goto yy76; +yy88: + ++YYCURSOR; + yych = *YYCURSOR; + if (yych <= '>') { + if (yych <= ';') { + if (yych <= 0x0000) goto yy76; + } else { + if (yych <= '<') goto yy76; + if (yych >= '>') goto yy92; + } + } else { + if (yych <= 'Z') { + if (yych >= 'A') goto yy88; + } else { + if (yych <= '`') goto yy90; + if (yych <= 'z') goto yy88; + } + } +yy90: + ++YYCURSOR; + yych = *YYCURSOR; + if (yych <= '<') { + if (yych <= 0x0000) goto yy76; + if (yych <= ';') goto yy90; + goto yy76; + } else { + if (yych != '>') goto yy90; + } +yy92: + ++YYCURSOR; +#line 259 "_mwscan.re" + {RET(t_html_tag);} +#line 713 "_mwscan.cc" +yy94: + yych = *++YYCURSOR; + if (yych != '-') goto yy76; +yy95: + ++YYCURSOR; + yych = *YYCURSOR; + if (yych <= ';') { + if (yych <= 0x0000) goto yy76; + if (yych != '-') goto yy95; + } else { + if (yych == '=') goto yy95; + if (yych <= '>') goto yy76; + goto yy95; + } + ++YYCURSOR; + yych = *YYCURSOR; + if (yych <= ';') { + if (yych <= 0x0000) goto yy76; + if (yych != '-') goto yy95; + } else { + if (yych == '=') goto yy95; + if (yych <= '>') goto yy76; + goto yy95; + } +yy98: + ++YYCURSOR; + yych = *YYCURSOR; + if (yych <= ';') { + if (yych <= 0x0000) goto yy76; + if (yych == '-') goto yy98; + goto yy95; + } else { + if (yych <= '<') goto yy76; + if (yych != '>') goto yy95; + } + ++YYCURSOR; +#line 262 "_mwscan.re" + {RET(t_comment);} +#line 752 "_mwscan.cc" +yy102: + ++YYCURSOR; + if ((yych = *YYCURSOR) == '\'') goto yy104; +yy103: +#line 257 "_mwscan.re" + {RET(t_style);} +#line 759 "_mwscan.cc" +yy104: + yyaccept = 2; + yych = *(YYMARKER = ++YYCURSOR); + if (yych != '\'') goto yy103; + yych = *++YYCURSOR; + if (yych != '\'') goto yy76; + yych = *++YYCURSOR; + goto yy103; +yy107: + ++YYCURSOR; +#line 243 "_mwscan.re" + { + if (tablemode) + RET(t_column); + cursor = start+1; + RET(t_special); + } +#line 777 "_mwscan.cc" +yy109: + ++YYCURSOR; +#line 250 "_mwscan.re" + { + if (tablemode) + RET(t_tablecaption); + cursor = start+1; + RET(t_special); + } +#line 787 "_mwscan.cc" +yy111: + ++YYCURSOR; + yych = *YYCURSOR; + if (yych == '\n') goto yy111; +#line 240 "_mwscan.re" + {newline(); RET(t_break);} +#line 794 "_mwscan.cc" +yy114: + ++YYCURSOR; + yych = *YYCURSOR; +yy115: + if (yych == '\t') goto yy114; + if (yych == ' ') goto yy114; + goto yy62; +yy116: + ++YYCURSOR; + yych = *YYCURSOR; + if (yych <= 0x001F) { + if (yych == '\t') goto yy114; + goto yy62; + } else { + if (yych <= ' ') goto yy114; + if (yych == '=') goto yy116; + goto yy62; + } +yy118: + ++YYCURSOR; +#line 227 "_mwscan.re" + {RET(t_2box_close);} +#line 817 "_mwscan.cc" +yy120: + ++YYCURSOR; + yych = *YYCURSOR; +yy121: + if (yych <= 'Z') { + if (yych <= '/') goto yy55; + if (yych <= '9') goto yy120; + if (yych <= '@') goto yy55; + goto yy120; + } else { + if (yych <= '_') { + if (yych <= '^') goto yy55; + goto yy120; + } else { + if (yych <= '`') goto yy55; + if (yych <= 'z') goto yy120; + goto yy55; + } + } +yy122: + yych = *++YYCURSOR; + switch (yych) { + case 'E': goto yy126; + case 'F': goto yy125; + case 'N': goto yy124; + case 'S': goto yy127; + case 'T': goto yy123; + default: goto yy121; + } +yy123: + yych = *++YYCURSOR; + if (yych == 'O') goto yy216; + goto yy121; +yy124: + yych = *++YYCURSOR; + if (yych == 'E') goto yy146; + if (yych == 'O') goto yy147; + goto yy121; +yy125: + yych = *++YYCURSOR; + if (yych == 'O') goto yy138; + goto yy121; +yy126: + yych = *++YYCURSOR; + if (yych == 'N') goto yy135; + goto yy121; +yy127: + yych = *++YYCURSOR; + if (yych != 'T') goto yy121; + yych = *++YYCURSOR; + if (yych != 'A') goto yy121; + yych = *++YYCURSOR; + if (yych != 'R') goto yy121; + yych = *++YYCURSOR; + if (yych != 'T') goto yy121; + yych = *++YYCURSOR; + if (yych != '_') goto yy121; + yych = *++YYCURSOR; + if (yych != '_') goto yy121; +yy133: + ++YYCURSOR; + if ((yych = *YYCURSOR) <= 'Z') { + if (yych <= '/') goto yy134; + if (yych <= '9') goto yy120; + if (yych >= 'A') goto yy120; + } else { + if (yych <= '_') { + if (yych >= '_') goto yy120; + } else { + if (yych <= '`') goto yy134; + if (yych <= 'z') goto yy120; + } + } +yy134: +#line 224 "_mwscan.re" + {RET(t_magicword);} +#line 894 "_mwscan.cc" +yy135: + yych = *++YYCURSOR; + if (yych != 'D') goto yy121; + yych = *++YYCURSOR; + if (yych != '_') goto yy121; + yych = *++YYCURSOR; + if (yych == '_') goto yy133; + goto yy121; +yy138: + yych = *++YYCURSOR; + if (yych != 'R') goto yy121; + yych = *++YYCURSOR; + if (yych != 'C') goto yy121; + yych = *++YYCURSOR; + if (yych != 'E') goto yy121; + yych = *++YYCURSOR; + if (yych != 'T') goto yy121; + yych = *++YYCURSOR; + if (yych != 'O') goto yy121; + yych = *++YYCURSOR; + if (yych != 'C') goto yy121; + yych = *++YYCURSOR; + if (yych != '_') goto yy121; + yych = *++YYCURSOR; + if (yych == '_') goto yy133; + goto yy121; +yy146: + yych = *++YYCURSOR; + if (yych == 'W') goto yy203; + goto yy121; +yy147: + yych = *++YYCURSOR; + switch (yych) { + case 'C': goto yy150; + case 'E': goto yy149; + case 'G': goto yy151; + case 'T': goto yy148; + default: goto yy121; + } +yy148: + yych = *++YYCURSOR; + if (yych <= 'H') { + if (yych == 'C') goto yy186; + goto yy121; + } else { + if (yych <= 'I') goto yy187; + if (yych == 'O') goto yy188; + goto yy121; + } +yy149: + yych = *++YYCURSOR; + if (yych == 'D') goto yy175; + goto yy121; +yy150: + yych = *++YYCURSOR; + if (yych == 'C') goto yy159; + if (yych == 'O') goto yy160; + goto yy121; +yy151: + yych = *++YYCURSOR; + if (yych != 'A') goto yy121; + yych = *++YYCURSOR; + if (yych != 'L') goto yy121; + yych = *++YYCURSOR; + if (yych != 'L') goto yy121; + yych = *++YYCURSOR; + if (yych != 'E') goto yy121; + yych = *++YYCURSOR; + if (yych != 'R') goto yy121; + yych = *++YYCURSOR; + if (yych != 'Y') goto yy121; + yych = *++YYCURSOR; + if (yych != '_') goto yy121; + yych = *++YYCURSOR; + if (yych == '_') goto yy133; + goto yy121; +yy159: + yych = *++YYCURSOR; + if (yych == '_') goto yy174; + goto yy121; +yy160: + yych = *++YYCURSOR; + if (yych != 'N') goto yy121; + yych = *++YYCURSOR; + if (yych != 'T') goto yy121; + yych = *++YYCURSOR; + if (yych != 'E') goto yy121; + yych = *++YYCURSOR; + if (yych != 'N') goto yy121; + yych = *++YYCURSOR; + if (yych != 'T') goto yy121; + yych = *++YYCURSOR; + if (yych != 'C') goto yy121; + yych = *++YYCURSOR; + if (yych != 'O') goto yy121; + yych = *++YYCURSOR; + if (yych != 'N') goto yy121; + yych = *++YYCURSOR; + if (yych != 'V') goto yy121; + yych = *++YYCURSOR; + if (yych != 'E') goto yy121; + yych = *++YYCURSOR; + if (yych != 'R') goto yy121; + yych = *++YYCURSOR; + if (yych != 'T') goto yy121; + yych = *++YYCURSOR; + if (yych != '_') goto yy121; + yych = *++YYCURSOR; + if (yych == '_') goto yy133; + goto yy121; +yy174: + yych = *++YYCURSOR; + if (yych == '_') goto yy133; + goto yy121; +yy175: + yych = *++YYCURSOR; + if (yych != 'I') goto yy121; + yych = *++YYCURSOR; + if (yych != 'T') goto yy121; + yych = *++YYCURSOR; + if (yych != 'S') goto yy121; + yych = *++YYCURSOR; + if (yych != 'E') goto yy121; + yych = *++YYCURSOR; + if (yych != 'C') goto yy121; + yych = *++YYCURSOR; + if (yych != 'T') goto yy121; + yych = *++YYCURSOR; + if (yych != 'I') goto yy121; + yych = *++YYCURSOR; + if (yych != 'O') goto yy121; + yych = *++YYCURSOR; + if (yych != 'N') goto yy121; + yych = *++YYCURSOR; + if (yych != '_') goto yy121; + yych = *++YYCURSOR; + if (yych == '_') goto yy133; + goto yy121; +yy186: + yych = *++YYCURSOR; + if (yych == '_') goto yy202; + goto yy121; +yy187: + yych = *++YYCURSOR; + if (yych == 'T') goto yy191; + goto yy121; +yy188: + yych = *++YYCURSOR; + if (yych != 'C') goto yy121; + yych = *++YYCURSOR; + if (yych != '_') goto yy121; + yych = *++YYCURSOR; + if (yych == '_') goto yy133; + goto yy121; +yy191: + yych = *++YYCURSOR; + if (yych != 'L') goto yy121; + yych = *++YYCURSOR; + if (yych != 'E') goto yy121; + yych = *++YYCURSOR; + if (yych != 'C') goto yy121; + yych = *++YYCURSOR; + if (yych != 'O') goto yy121; + yych = *++YYCURSOR; + if (yych != 'N') goto yy121; + yych = *++YYCURSOR; + if (yych != 'V') goto yy121; + yych = *++YYCURSOR; + if (yych != 'E') goto yy121; + yych = *++YYCURSOR; + if (yych != 'R') goto yy121; + yych = *++YYCURSOR; + if (yych != 'T') goto yy121; + yych = *++YYCURSOR; + if (yych != '_') goto yy121; + yych = *++YYCURSOR; + if (yych == '_') goto yy133; + goto yy121; +yy202: + yych = *++YYCURSOR; + if (yych == '_') goto yy133; + goto yy121; +yy203: + yych = *++YYCURSOR; + if (yych != 'S') goto yy121; + yych = *++YYCURSOR; + if (yych != 'E') goto yy121; + yych = *++YYCURSOR; + if (yych != 'C') goto yy121; + yych = *++YYCURSOR; + if (yych != 'T') goto yy121; + yych = *++YYCURSOR; + if (yych != 'I') goto yy121; + yych = *++YYCURSOR; + if (yych != 'O') goto yy121; + yych = *++YYCURSOR; + if (yych != 'N') goto yy121; + yych = *++YYCURSOR; + if (yych != 'L') goto yy121; + yych = *++YYCURSOR; + if (yych != 'I') goto yy121; + yych = *++YYCURSOR; + if (yych != 'N') goto yy121; + yych = *++YYCURSOR; + if (yych != 'K') goto yy121; + yych = *++YYCURSOR; + if (yych != '_') goto yy121; + yych = *++YYCURSOR; + if (yych == '_') goto yy133; + goto yy121; +yy216: + yych = *++YYCURSOR; + if (yych != 'C') goto yy121; + yych = *++YYCURSOR; + if (yych != '_') goto yy121; + yych = *++YYCURSOR; + if (yych == '_') goto yy133; + goto yy121; +yy219: + yych = *++YYCURSOR; + if (yych != 't') goto yy121; + yych = *++YYCURSOR; + if (yych != 'p') goto yy121; + yyaccept = 3; + yych = *(YYMARKER = ++YYCURSOR); + if (yych == ':') goto yy223; + if (yych != 's') goto yy121; + yyaccept = 3; + yych = *(YYMARKER = ++YYCURSOR); + if (yych != ':') goto yy121; +yy223: + yych = *++YYCURSOR; + if (yych != '/') goto yy76; + yych = *++YYCURSOR; + if (yych != '/') goto yy76; + yych = *++YYCURSOR; + if (yych <= '_') { + if (yych <= ':') { + if (yych <= '&') { + if (yych == '#') goto yy226; + if (yych <= '$') goto yy76; + } else { + if (yych <= '\'') goto yy76; + if (yych == '*') goto yy76; + } + } else { + if (yych <= '?') { + if (yych == '=') goto yy226; + if (yych <= '>') goto yy76; + } else { + if (yych <= '@') goto yy76; + if (yych <= 'Z') goto yy226; + if (yych <= '^') goto yy76; + } + } + } else { + if (yych <= 0x00D6) { + if (yych <= '~') { + if (yych <= '`') goto yy76; + if (yych <= 'z') goto yy226; + if (yych <= '}') goto yy76; + } else { + if (yych == 0x00C4) goto yy226; + if (yych <= 0x00D5) goto yy76; + } + } else { + if (yych <= 0x00E4) { + if (yych == 0x00DC) goto yy226; + if (yych <= 0x00E3) goto yy76; + } else { + if (yych <= 0x00F6) { + if (yych <= 0x00F5) goto yy76; + } else { + if (yych != 0x00FC) goto yy76; + } + } + } + } +yy226: + ++YYCURSOR; + yych = *YYCURSOR; + if (yych <= '_') { + if (yych <= ':') { + if (yych <= '&') { + if (yych == '#') goto yy226; + if (yych >= '%') goto yy226; + } else { + if (yych <= '\'') goto yy228; + if (yych != '*') goto yy226; + } + } else { + if (yych <= '?') { + if (yych == '=') goto yy226; + if (yych >= '?') goto yy226; + } else { + if (yych <= '@') goto yy228; + if (yych <= 'Z') goto yy226; + if (yych >= '_') goto yy226; + } + } + } else { + if (yych <= 0x00D6) { + if (yych <= '~') { + if (yych <= '`') goto yy228; + if (yych <= 'z') goto yy226; + if (yych >= '~') goto yy226; + } else { + if (yych == 0x00C4) goto yy226; + if (yych >= 0x00D6) goto yy226; + } + } else { + if (yych <= 0x00E4) { + if (yych == 0x00DC) goto yy226; + if (yych >= 0x00E4) goto yy226; + } else { + if (yych <= 0x00F6) { + if (yych >= 0x00F6) goto yy226; + } else { + if (yych == 0x00FC) goto yy226; + } + } + } + } +yy228: +#line 223 "_mwscan.re" + {RET(t_http_url);} +#line 1221 "_mwscan.cc" +yy229: + yych = *++YYCURSOR; + if (yych != 'p') goto yy121; + yyaccept = 3; + yych = *(YYMARKER = ++YYCURSOR); + if (yych != ':') goto yy121; + yych = *++YYCURSOR; + if (yych != '/') goto yy76; + yych = *++YYCURSOR; + if (yych != '/') goto yy76; + yych = *++YYCURSOR; + if (yych <= '=') { + if (yych <= '&') { + if (yych <= '"') goto yy76; + if (yych == '%') goto yy76; + } else { + if (yych <= '\'') goto yy76; + if (yych <= ':') goto yy234; + if (yych <= '<') goto yy76; + } + } else { + if (yych <= '_') { + if (yych <= '>') goto yy76; + if (yych <= 'Z') goto yy234; + if (yych <= '^') goto yy76; + } else { + if (yych <= '{') { + if (yych <= '`') goto yy76; + } else { + if (yych <= '|') goto yy76; + if (yych >= 0x007F) goto yy76; + } + } + } +yy234: + ++YYCURSOR; + yych = *YYCURSOR; + if (yych <= '=') { + if (yych <= '&') { + if (yych <= '"') goto yy236; + if (yych != '%') goto yy234; + } else { + if (yych <= '\'') goto yy236; + if (yych <= ':') goto yy234; + if (yych >= '=') goto yy234; + } + } else { + if (yych <= '_') { + if (yych <= '>') goto yy236; + if (yych <= 'Z') goto yy234; + if (yych >= '_') goto yy234; + } else { + if (yych <= '{') { + if (yych >= 'a') goto yy234; + } else { + if (yych <= '|') goto yy236; + if (yych <= '~') goto yy234; + } + } + } +yy236: +#line 221 "_mwscan.re" + {RET(t_http_url);} +#line 1285 "_mwscan.cc" +yy237: + yych = *++YYCURSOR; + if (yych != 'i') goto yy121; + yych = *++YYCURSOR; + if (yych != 'l') goto yy121; + yych = *++YYCURSOR; + if (yych != 't') goto yy121; + yych = *++YYCURSOR; + if (yych != 'o') goto yy121; + yyaccept = 3; + yych = *(YYMARKER = ++YYCURSOR); + if (yych != ':') goto yy121; + yych = *++YYCURSOR; + if (yych == '@') goto yy76; + goto yy244; +yy243: + ++YYCURSOR; + yych = *YYCURSOR; +yy244: + if (yych <= '9') { + if (yych <= '\'') { + if (yych == '!') goto yy243; + if (yych <= '"') goto yy76; + goto yy243; + } else { + if (yych <= ')') goto yy76; + if (yych == ',') goto yy76; + goto yy243; + } + } else { + if (yych <= '?') { + if (yych == '=') goto yy243; + if (yych <= '>') goto yy76; + goto yy243; + } else { + if (yych <= 'Z') { + if (yych >= 'A') goto yy243; + } else { + if (yych <= ']') goto yy76; + if (yych <= '~') goto yy243; + goto yy76; + } + } + } + yych = *++YYCURSOR; + if (yych <= '@') { + if (yych <= '.') { + if (yych <= ',') goto yy76; + } else { + if (yych <= '/') goto yy76; + if (yych >= ':') goto yy76; + } + } else { + if (yych <= '_') { + if (yych <= 'Z') goto yy246; + if (yych <= '^') goto yy76; + } else { + if (yych <= '`') goto yy76; + if (yych >= '{') goto yy76; + } + } +yy246: + ++YYCURSOR; + yych = *YYCURSOR; + if (yych <= '@') { + if (yych <= '.') { + if (yych >= '-') goto yy246; + } else { + if (yych <= '/') goto yy248; + if (yych <= '9') goto yy246; + } + } else { + if (yych <= '_') { + if (yych <= 'Z') goto yy246; + if (yych >= '_') goto yy246; + } else { + if (yych <= '`') goto yy248; + if (yych <= 'z') goto yy246; + } + } +yy248: +#line 219 "_mwscan.re" + {RET(t_http_url);} +#line 1369 "_mwscan.cc" +yy249: + ++YYCURSOR; +#line 226 "_mwscan.re" + {RET(t_2box_open);} +#line 1374 "_mwscan.cc" +yy251: + yych = *++YYCURSOR; + if (yych == 't') goto yy274; + goto yy76; +yy252: + yych = *++YYCURSOR; + if (yych == 't') goto yy266; + goto yy76; +yy253: + yych = *++YYCURSOR; + if (yych != 'a') goto yy76; + yych = *++YYCURSOR; + if (yych != 'i') goto yy76; + yych = *++YYCURSOR; + if (yych != 'l') goto yy76; + yych = *++YYCURSOR; + if (yych != 't') goto yy76; + yych = *++YYCURSOR; + if (yych != 'o') goto yy76; + yych = *++YYCURSOR; + if (yych != ':') goto yy76; + yych = *++YYCURSOR; + if (yych == '@') goto yy76; + goto yy261; +yy260: + ++YYCURSOR; + yych = *YYCURSOR; +yy261: + if (yych <= '9') { + if (yych <= '\'') { + if (yych == '!') goto yy260; + if (yych <= '"') goto yy76; + goto yy260; + } else { + if (yych <= ')') goto yy76; + if (yych == ',') goto yy76; + goto yy260; + } + } else { + if (yych <= '?') { + if (yych == '=') goto yy260; + if (yych <= '>') goto yy76; + goto yy260; + } else { + if (yych <= 'Z') { + if (yych >= 'A') goto yy260; + } else { + if (yych <= ']') goto yy76; + if (yych <= '~') goto yy260; + goto yy76; + } + } + } + yych = *++YYCURSOR; + if (yych <= '@') { + if (yych <= '.') { + if (yych <= ',') goto yy76; + } else { + if (yych <= '/') goto yy76; + if (yych >= ':') goto yy76; + } + } else { + if (yych <= '_') { + if (yych <= 'Z') goto yy263; + if (yych <= '^') goto yy76; + } else { + if (yych <= '`') goto yy76; + if (yych >= '{') goto yy76; + } + } +yy263: + ++YYCURSOR; + yych = *YYCURSOR; + if (yych <= '@') { + if (yych <= '.') { + if (yych >= '-') goto yy263; + } else { + if (yych <= '/') goto yy265; + if (yych <= '9') goto yy263; + } + } else { + if (yych <= '_') { + if (yych <= 'Z') goto yy263; + if (yych >= '_') goto yy263; + } else { + if (yych <= '`') goto yy265; + if (yych <= 'z') goto yy263; + } + } +yy265: +#line 218 "_mwscan.re" + {RET(t_urllink);} +#line 1467 "_mwscan.cc" +yy266: + yych = *++YYCURSOR; + if (yych != 'p') goto yy76; + yych = *++YYCURSOR; + if (yych != ':') goto yy76; + yych = *++YYCURSOR; + if (yych != '/') goto yy76; + yych = *++YYCURSOR; + if (yych != '/') goto yy76; + yych = *++YYCURSOR; + if (yych <= '=') { + if (yych <= '&') { + if (yych <= '"') goto yy76; + if (yych == '%') goto yy76; + } else { + if (yych <= '\'') goto yy76; + if (yych <= ':') goto yy271; + if (yych <= '<') goto yy76; + } + } else { + if (yych <= '_') { + if (yych <= '>') goto yy76; + if (yych <= 'Z') goto yy271; + if (yych <= '^') goto yy76; + } else { + if (yych <= '{') { + if (yych <= '`') goto yy76; + } else { + if (yych <= '|') goto yy76; + if (yych >= 0x007F) goto yy76; + } + } + } +yy271: + ++YYCURSOR; + yych = *YYCURSOR; + if (yych <= '=') { + if (yych <= '&') { + if (yych <= '"') goto yy273; + if (yych != '%') goto yy271; + } else { + if (yych <= '\'') goto yy273; + if (yych <= ':') goto yy271; + if (yych >= '=') goto yy271; + } + } else { + if (yych <= '_') { + if (yych <= '>') goto yy273; + if (yych <= 'Z') goto yy271; + if (yych >= '_') goto yy271; + } else { + if (yych <= '{') { + if (yych >= 'a') goto yy271; + } else { + if (yych <= '|') goto yy273; + if (yych <= '~') goto yy271; + } + } + } +yy273: +#line 220 "_mwscan.re" + {RET(t_urllink);} +#line 1530 "_mwscan.cc" +yy274: + yych = *++YYCURSOR; + if (yych != 't') goto yy76; + yych = *++YYCURSOR; + if (yych != 'p') goto yy76; + yych = *++YYCURSOR; + if (yych == ':') goto yy278; + if (yych != 's') goto yy76; + yych = *++YYCURSOR; + if (yych != ':') goto yy76; +yy278: + yych = *++YYCURSOR; + if (yych != '/') goto yy76; + yych = *++YYCURSOR; + if (yych != '/') goto yy76; + yych = *++YYCURSOR; + if (yych <= '_') { + if (yych <= ':') { + if (yych <= '&') { + if (yych == '#') goto yy281; + if (yych <= '$') goto yy76; + } else { + if (yych <= '\'') goto yy76; + if (yych == '*') goto yy76; + } + } else { + if (yych <= '?') { + if (yych == '=') goto yy281; + if (yych <= '>') goto yy76; + } else { + if (yych <= '@') goto yy76; + if (yych <= 'Z') goto yy281; + if (yych <= '^') goto yy76; + } + } + } else { + if (yych <= 0x00D6) { + if (yych <= '~') { + if (yych <= '`') goto yy76; + if (yych <= 'z') goto yy281; + if (yych <= '}') goto yy76; + } else { + if (yych == 0x00C4) goto yy281; + if (yych <= 0x00D5) goto yy76; + } + } else { + if (yych <= 0x00E4) { + if (yych == 0x00DC) goto yy281; + if (yych <= 0x00E3) goto yy76; + } else { + if (yych <= 0x00F6) { + if (yych <= 0x00F5) goto yy76; + } else { + if (yych != 0x00FC) goto yy76; + } + } + } + } +yy281: + ++YYCURSOR; + yych = *YYCURSOR; + if (yych <= '_') { + if (yych <= ':') { + if (yych <= '&') { + if (yych == '#') goto yy281; + if (yych >= '%') goto yy281; + } else { + if (yych <= '\'') goto yy283; + if (yych != '*') goto yy281; + } + } else { + if (yych <= '?') { + if (yych == '=') goto yy281; + if (yych >= '?') goto yy281; + } else { + if (yych <= '@') goto yy283; + if (yych <= 'Z') goto yy281; + if (yych >= '_') goto yy281; + } + } + } else { + if (yych <= 0x00D6) { + if (yych <= '~') { + if (yych <= '`') goto yy283; + if (yych <= 'z') goto yy281; + if (yych >= '~') goto yy281; + } else { + if (yych == 0x00C4) goto yy281; + if (yych >= 0x00D6) goto yy281; + } + } else { + if (yych <= 0x00E4) { + if (yych == 0x00DC) goto yy281; + if (yych >= 0x00E4) goto yy281; + } else { + if (yych <= 0x00F6) { + if (yych >= 0x00F6) goto yy281; + } else { + if (yych == 0x00FC) goto yy281; + } + } + } + } +yy283: +#line 222 "_mwscan.re" + {RET(t_urllink);} +#line 1637 "_mwscan.cc" +} +#line 267 "_mwscan.re" + +} + + +PyObject *py_scan(PyObject *self, PyObject *args) +{ + PyObject *arg1; + if (!PyArg_ParseTuple(args, "O:mwscan.scan", &arg1)) { + return 0; + } + PyUnicodeObject *unistr = (PyUnicodeObject*)PyUnicode_FromObject(arg1); + if (unistr == NULL) { + PyErr_SetString(PyExc_TypeError, + "parameter cannot be converted to unicode in mwscan.scan"); + return 0; + } + + Py_UNICODE *start = unistr->str; + Py_UNICODE *end = start+unistr->length; + + + Scanner scanner (start, end); + Py_BEGIN_ALLOW_THREADS + while (scanner.scan()) { + } + Py_END_ALLOW_THREADS + Py_XDECREF(unistr); + + // return PyList_New(0); // uncomment to see timings for scanning + + int size = scanner.tokens.size(); + PyObject *result = PyList_New(size); + if (!result) { + return 0; + } + + for (int i=0; i<size; i++) { + Token t = scanner.tokens[i]; + PyList_SET_ITEM(result, i, Py_BuildValue("iii", t.type, t.start, t.len)); + } + + return result; +} + + + +static PyMethodDef module_functions[] = { + {"scan", (PyCFunction)py_scan, METH_VARARGS, "scan(text)"}, + {0, 0}, +}; + + + +extern "C" { + DL_EXPORT(void) init_mwscan(); +} + +DL_EXPORT(void) init_mwscan() +{ + /*PyObject *m =*/ Py_InitModule("_mwscan", module_functions); +} |