1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
|
#! /usr/bin/env python
# Copyright (c) 2007-2008 PediaPress GmbH
# See README.txt for additional licensing information.
"""usable/user parser"""
from mwlib import parser, scanner, expander
def simplify(node):
    """Concatenate runs of adjacent Text nodes in place to reduce the
    number of objects in the tree.

    Whenever two or more Text children appear in a row, the captions of
    the later ones are folded into the first and the redundant nodes are
    removed.  Non-Text children are recursed into and terminate a run.
    """
    Text = parser.Text
    last = None      # first Text node of the current run, or None
    toremove = []    # indices of Text nodes already folded into `last`
    for i, c in enumerate(node.children):
        if c.__class__ == Text:  # exact class match on purpose (would isinstance be safe?)
            # `is not None` rather than truthiness: a Text node with an
            # empty caption must still continue the merge run.
            if last is not None:
                last.caption += c.caption
                toremove.append(i)
            else:
                last = c
        else:
            simplify(c)
            last = None
    # `toremove` is ascending; deleting from the end keeps the remaining
    # indices valid without any offset arithmetic.
    for i in reversed(toremove):
        del node.children[i]
def fixlitags(node):
    """Wrap stray list Item nodes in a proper ItemList, in place.

    Items that are direct children of anything other than an ItemList are
    collected: the first Item of a run is replaced by a fresh ItemList
    containing it, and following consecutive Items (and the bare newline
    Text nodes between them) are moved into that same list.  The whole
    tree is then processed recursively.
    """
    Text = parser.Text
    if not isinstance(node, parser.ItemList):
        idx = 0
        while idx < len(node.children):
            if isinstance(node.children[idx], parser.Item):
                # start a new list at the position of the first stray Item
                lst = parser.ItemList()
                lst.append(node.children[idx])
                node.children[idx] = lst
                idx += 1
                # absorb the following Items / newline separators;
                # deleting keeps `idx` pointing at the next candidate
                while idx < len(node.children):
                    if isinstance(node.children[idx], parser.Item):
                        lst.append(node.children[idx])
                        del node.children[idx]
                    elif node.children[idx] == Text("\n"):
                        # bare newline between items: drop it
                        del node.children[idx]
                    else:
                        # run ended; outer loop re-examines this child
                        break
            else:
                idx += 1
    for x in node.children:
        fixlitags(x)
def removeBoilerplate(node):
    """Remove <div> tag nodes whose class contains 'boilerplate', in place,
    then recurse into the remaining children."""
    pos = 0
    while pos < len(node.children):
        child = node.children[pos]
        if isinstance(child, parser.TagNode) and child.caption == 'div':
            try:
                cls = child.values.get('class', '')
            except AttributeError:
                # values may not be a dict-like object on this node
                cls = ''
            if 'boilerplate' in cls:
                del node.children[pos]
                continue  # same position now holds the next child
        pos += 1
    for child in node.children:
        removeBoilerplate(child)
# tree transformations applied, in this order, to every article parsed
# via parseString()
postprocessors = [removeBoilerplate, simplify, fixlitags]
def parseString(title=None, raw=None, wikidb=None, revision=None):
    """Parse an article with the given title from raw mediawiki text.

    @param title: article title (required); becomes the caption of the result
    @param raw: raw mediawiki markup; fetched from wikidb when None
    @param wikidb: database object used to fetch the article and to expand
        templates (template expansion is skipped when wikidb is falsy)
    @param revision: specific revision to fetch when raw is None
    @return: root parser node with all postprocessors applied
    """
    assert title is not None
    if raw is None:
        raw = wikidb.getRawArticle(title, revision=revision)
        assert raw is not None, "cannot get article %r" % (title,)
    if wikidb:
        te = expander.Expander(raw, pagename=title, wikidb=wikidb)
        text = te.expandTemplates()
    else:
        # renamed from `input` to avoid shadowing the builtin
        text = raw
    tokens = scanner.tokenize(text, title)
    a = parser.Parser(tokens, title).parse()
    a.caption = title
    for x in postprocessors:
        x(a)
    return a
def simpleparse(raw):  # !!! USE FOR DEBUGGING ONLY !!! does not use post processors
    """Parse raw markup without template expansion or postprocessing,
    dump the resulting tree to stdout and return its root node."""
    import sys
    # removed: unused DummyDB instance and its dummydb import (dead code)
    tokens = scanner.tokenize(raw)
    r = parser.Parser(tokens, "unknown").parse()
    parser.show(sys.stdout, r, 0)
    return r
def main():
    """Parse each file named on the command line as a UTF-8 wiki article,
    using the file's basename as the article title."""
    from mwlib.dummydb import DummyDB
    import os
    import sys
    db = DummyDB()
    for path in sys.argv[1:]:
        # original leaked the file handle (open(...).read()); close it
        # explicitly -- try/finally rather than `with` to stay compatible
        # with the old Python 2 this file targets (`unicode` builtin)
        f = open(path)
        try:
            raw = unicode(f.read(), 'utf8')
        finally:
            f.close()
        title = unicode(os.path.basename(path))
        parseString(title, raw, db)
# script entry point
if __name__ == "__main__":
    main()
|