1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
|
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2007, One Laptop Per Child
#
# License: GPLv2
#
# Usage:
# ./tools2/expandtemplates.py directory 2>expand.log
# Ex:
# ./tools2/expandtemplates.py es_lat
import sys
reload(sys)
# Important! We'll be using stdout and stderr with
# UTF-8 chars. Without this, errors galore.
sys.setdefaultencoding('utf-8')
sys.path.append('.')
import os
import re
import codecs
from server import WPWikiDB
from make_selection import FileListReader
# Single control characters used as record delimiters in the generated
# output: each one is written on its own line before the title, before the
# article text and after the article text respectively.
START_HEADING = '\x01'
START_TEXT = '\x02'
END_TEXT = '\x03'
import config
# __main__
# Command line handling.  The first argument is the working directory; the
# optional flags --only=TITLE, --start_at=TITLE and --stdout are recognised
# anywhere on the command line.
only_page = None
start_at = None
stdout = False

if len(sys.argv) < 2:
    # A parenthesised single-argument print behaves the same as the
    # Python 2 print statement.
    print("Use expandtemplates.py directory")
    # A missing directory is a usage error: leave with a non-zero status
    # through sys.exit instead of the site-provided exit() helper.
    sys.exit(1)

directory = sys.argv[1]

# argv[1] is scanned as well, exactly as the directory argument always was.
# The status messages below go to stdout, so with --stdout they precede the
# article records in the stream.
for arg in sys.argv[1:]:
    if arg.startswith('--only='):
        only_page = arg[len('--only='):]
        print("Processing only article '%s'" % only_page)
    if arg.startswith('--start_at='):
        start_at = arg[len('--start_at='):]
        print("Starting to process at article '%s'" % start_at)
    if arg == '--stdout':
        stdout = True
        print("Writing output to stdout")
# Build the path of the XML dump inside the working directory.  When the
# configured name contains a '/', everything up to and including the first
# one is dropped; a name without '/' is used unchanged.
xml_file_name = config.input_xml_file_name.split('/', 1)[-1]
path = os.path.join(directory, xml_file_name)
# Decide which article titles to process: either the single title given with
# --only, or the list read from the "<path>.pages_selected-level-1" file.
if only_page is None:
    articles_list = FileListReader('%s.pages_selected-level-1' % path).list
else:
    articles_list = [unicode(only_page)]

# With --start_at, keep the matching title and everything after it.  If the
# title never appears, the resulting list is empty.
if start_at is not None:
    remaining_titles = []
    reached_start = False
    for candidate in articles_list:
        if not reached_start and candidate == start_at:
            reached_start = True
        if reached_start:
            remaining_titles.append(candidate)
    articles_list = remaining_titles
# The value passed to WPWikiDB as the language is the first two characters
# of the dump's file name.
lang = os.path.basename(path)[:2]
templateprefix = config.TEMPLATE_NAMESPACES[0]

# load blacklist only once
# The optional 'template_blacklist' file sits next to the dump and is read
# one entry per line, decoded from UTF-8 with trailing whitespace stripped.
templateblacklist = set()
templateblacklistpath = os.path.join(os.path.dirname(path),
                                     'template_blacklist')
if os.path.exists(templateblacklistpath):
    with open(templateblacklistpath, 'r') as blacklist_file:
        templateblacklist.update(
            entry.rstrip().decode('utf8') for entry in blacklist_file)

wikidb = WPWikiDB(path, lang, templateprefix, templateblacklist)

# Matches titles that begin with the template prefix or with "Wikipedia:".
rx = re.compile('(%s|Wikipedia:)' % templateprefix)
# Choose the destination: sys.stdout with --stdout, otherwise the
# "<path>.processed_expanded" file, which is appended to when it already
# exists and created otherwise.
if stdout:
    _output = sys.stdout
else:
    expanded_path = '%s.processed_expanded' % path
    file_mode = 'a' if os.path.exists(expanded_path) else 'w'
    _output = codecs.open(expanded_path, encoding='utf-8', mode=file_mode)

# Each article is written as one record:
#   START_HEADING, title, length of the text, START_TEXT, text, END_TEXT
# with every element on its own line.
try:
    for title in articles_list:
        fragment_pos = title.find('#')
        if fragment_pos == 0:
            # Nothing precedes the '#', so there is no page name to expand.
            continue
        if fragment_pos > 0:
            # Keep only the part of the title before the '#'.
            title = title[:fragment_pos]

        if rx.match(title):
            sys.stderr.write('SKIPPING: ' + title + "\n")
            continue

        sys.stderr.write('PROCESSING: ' + title + "\n")

        article_text = wikidb.getExpandedArticle(title)
        if article_text is None:
            sys.stderr.write('ERROR - SKIPPING: ' + title + "\n")
            continue

        _output.write(START_HEADING + '\n')
        _output.write(title + '\n')
        # NOTE(review): in Python 2, len() counts bytes for a byte `str`
        # but code units for a `unicode` object. Which one is written here
        # depends on what getExpandedArticle returns - confirm against
        # whatever reads this field, and revisit before porting to Python 3.
        _output.write("%s\n" % len(article_text))
        _output.write(START_TEXT + '\n')
        _output.write(article_text + '\n')
        _output.write(END_TEXT + '\n')
finally:
    # Release the output even when an article fails part-way through.
    # sys.stdout belongs to the interpreter, so it is only flushed.
    if stdout:
        _output.flush()
    else:
        _output.close()
|