1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
|
#! /usr/bin/env python
#! -*- coding:utf-8 -*-
import re
import simplejson
"""
See METABOOK.txt for a description of the Metabook data.
"""
class MetaBook(object):
    """Encapsulate meta information about an article collection.

    A metabook carries a C{title}, a C{subtitle} and an ordered list of
    C{items}. Each item is a dict whose C{'type'} is either C{'article'} or
    C{'chapter'}; a chapter holds its own articles under the key C{'items'}.
    """

    # Class-level defaults. They only become instance attributes (and thus
    # part of what dumpJson() serializes via vars()) once assigned on an
    # instance, e.g. by loadCollectionPage() or loadJson().
    title = u""
    subtitle = u""

    def __init__(self):
        self.type = 'collection'
        self.version = 1
        self.items = []

    def addArticles(self, articleTitles, chapterTitle=None, contentType='text/x-wiki'):
        """Append articles to this collection, optionally grouped in a chapter.

        @param articleTitles: sequence of article titles or dicts containing
            article title (value for key 'title') and optionally display title
            (value for key 'displaytitle').
        @type articleTitles: [unicode|{str: unicode}]
        @param chapterTitle: if given and non-empty, the articles are wrapped
            in a new chapter item with this title; otherwise they are appended
            directly to the top-level item list.
        @type chapterTitle: unicode
        @param contentType: value stored under 'content-type' of each article
        @type contentType: str
        """
        articles = []
        for title in articleTitles:
            article = {
                'type': 'article',
                'content-type': contentType,
            }
            if isinstance(title, dict):
                # A dict may override/extend the defaults set above.
                article.update(title)
            else:
                article['title'] = title
            articles.append(article)

        if chapterTitle:
            self.items.append({
                'type': 'chapter',
                'title': chapterTitle,
                'items': articles,
            })
        else:
            self.items.extend(articles)

    def dumpJson(self):
        """Return the instance attributes of this metabook as a JSON string.

        Only attributes stored on the instance are serialized (vars(self));
        the class-level defaults for title/subtitle are not included unless
        they have been assigned on the instance.
        """
        return simplejson.dumps(vars(self))

    def loadJson(self, jsonStr):
        """Set one attribute on this instance per top-level key of a JSON object.

        @param jsonStr: JSON document whose top-level value is an object
        @type jsonStr: str
        """
        for (var, value) in simplejson.loads(jsonStr).items():
            setattr(self, var, value)

    def readJsonFile(self, filename):
        """Load metabook attributes from the JSON file at C{filename}.

        @param filename: path of the JSON file to read
        @type filename: str
        """
        # Use a context manager so the file handle is closed deterministically
        # instead of being left for the garbage collector.
        with open(filename, 'rb') as jsonFile:
            self.loadJson(jsonFile.read())

    def loadCollectionPage(self, mwcollection):
        """Parse wikitext of a MediaWiki collection page

        Recognized lines (after stripping surrounding whitespace)::

            == Title ==
            === Subtitle ===
            ;Chapter title
            :[[Article]]  or  :[[Article|Display title]]

        All other lines are ignored. Articles that appear before the first
        chapter line are added as top-level items.

        @param mwcollection: wikitext of a MediaWiki collection page as created by
            the Collection extension for MediaWiki
        @type mwcollection: unicode
        """
        # Raw strings: the patterns contain backslash escapes (\s, \[, \|)
        # that are meant for the regex engine, not for Python.
        titleRe = r'^==\s+(?P<title>.*?)\s+==$'
        subtitleRe = r'^===\s+(?P<subtitle>.*?)\s+===$'
        chapterRe = r'^;(?P<chapter>.*?)$'
        articleRe = r'^:\[\[:?(?P<article>.*?)(?:\|(?P<displaytitle>.*?))?\]\]$'
        alltogetherRe = re.compile("(%s)|(%s)|(%s)|(%s)" % (titleRe, subtitleRe, chapterRe, articleRe))

        chapter = ''
        articles = []
        for line in mwcollection.splitlines():
            res = alltogetherRe.search(line.strip())
            if not res:
                continue
            if res.group('title'):
                self.title = res.group('title')
            elif res.group('subtitle'):
                self.subtitle = res.group('subtitle')
            elif res.group('chapter'):
                # A new chapter starts: flush what was collected so far under
                # the previous chapter (or top-level if there was none yet).
                self.addArticles(articles, chapter)
                articles = []
                chapter = res.group('chapter')
            elif res.group('article'):
                d = {'title': res.group('article')}
                if res.group('displaytitle'):
                    d['displaytitle'] = res.group('displaytitle')
                articles.append(d)

        # Flush the articles collected after the last chapter line.
        if articles:
            self.addArticles(articles, chapter)

    def getArticles(self):
        """Generator that produces a sequence of (title, revision) pairs for
        each article contained in this collection. If no revision is specified,
        None is returned for the revision item.
        """
        for item in self.getItems():
            if item['type'] == 'article':
                yield item['title'], item.get('revision', None)

    def getItems(self):
        """Generator that produces a flattened list of chapters and articles
        in this collection.

        A chapter item is yielded first, followed by the entries of its
        'items' list. Top-level items of any other type are skipped.
        """
        for item in self.items:
            if item['type'] == 'article':
                yield item
            elif item['type'] == 'chapter':
                yield item
                for article in item.get('items', []):
                    yield article
|