tools2/expandtemplates.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2007, One Laptop Per Child
#
# License: GPLv2
#
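# Usage:
# ./tools2/expandtemplates.py directory [options] 2>expand.log
# Options:
#   --only=TITLE      process only the article TITLE
#   --start_at=TITLE  skip the selected articles that come before TITLE
#   --stdout          write the expanded articles to stdout instead of
#                     the .processed_expanded file
# Ex:
# ./tools2/expandtemplates.py es_lat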

import sys
# Python 2 only: reload() makes sys.setdefaultencoding() reachable
# again so that the default encoding can be changed below.
reload(sys)
# Important! We'll be using stdout and stderr with
# UTF-8 chars. Without this, errors galore.
sys.setdefaultencoding('utf-8')

# Add the current working directory to the module search path;
# presumably so that the project modules imported below (server,
# make_selection, config) resolve when the script is started as
# ./tools2/expandtemplates.py -- TODO confirm.
sys.path.append('.')

import os
import re
import codecs
from server import WPWikiDB
from make_selection import FileListReader

# Single control characters used as record delimiters in the output:
# each one is written on a line of its own around the title, length
# and text of every article.
START_HEADING = '\x01'  # ASCII SOH
START_TEXT = '\x02'     # ASCII STX
END_TEXT = '\x03'       # ASCII ETX

import config

# __main__

# Command-line handling.
#
# Results (module-level names read by the rest of the script):
#   directory  -- first command-line argument
#   only_page  -- value of --only=..., or None
#   start_at   -- value of --start_at=..., or None
#   stdout     -- True when --stdout was given
only_page = None
start_at = None
stdout = False

if len(sys.argv) <= 1:
    print("Use expandtemplates.py directory")
    # sys.exit() rather than the bare exit(): the latter is a helper the
    # site module installs for interactive sessions and is not
    # guaranteed to exist (e.g. under "python -S").
    sys.exit()

directory = sys.argv[1]

# Every argument is scanned for options, the first one included, so an
# option given in the directory position is still recognised.
for arg in sys.argv[1:]:
    if arg.startswith('--only='):
        only_page = arg[len('--only='):]
        print("Processing only article '%s'" % only_page)
    elif arg.startswith('--start_at='):
        start_at = arg[len('--start_at='):]
        print("Starting to process at article '%s'" % start_at)
    elif arg == '--stdout':
        stdout = True
        print("Writing output to stdout")


# Base path of the dump inside `directory`. Whatever precedes the first
# '/' of the configured file name is dropped before joining.
xml_file_name = config.input_xml_file_name
if '/' in xml_file_name:
    xml_file_name = xml_file_name.split('/', 1)[1]
path = os.path.join(directory, xml_file_name)

# Decide which article titles will be processed.
if only_page is None:
    # Titles are read from '<path>.pages_selected-level-1'.
    articles_reader = FileListReader('%s.pages_selected-level-1' % path)
    articles_list = articles_reader.list

    if start_at is not None:
        # Keep the first title equal to start_at and everything after
        # it; if no title matches, nothing is left to process.
        remaining = []
        reached_start = False
        for candidate in articles_list:
            if candidate == start_at:
                reached_start = True
            if reached_start:
                remaining.append(candidate)
        articles_list = remaining
else:
    # --only=... bypasses the selection file entirely.
    articles_list = [unicode(only_page)]

# First two characters of the dump file name; handed to WPWikiDB as
# its `lang` argument.
lang = os.path.basename(path)[:2]

# Only the first configured template namespace is used.
templateprefix = config.TEMPLATE_NAMESPACES[0]

# The optional 'template_blacklist' file lives in the same directory as
# the dump and holds one UTF-8 encoded entry per line. It is read a
# single time, up front.
templateblacklistpath = os.path.join(os.path.dirname(path),
                                     'template_blacklist')
templateblacklist = set()
if os.path.exists(templateblacklistpath):
    with open(templateblacklistpath, 'r') as blacklist_file:
        templateblacklist.update(
            entry.rstrip().decode('utf8') for entry in blacklist_file)

wikidb = WPWikiDB(path, lang, templateprefix, templateblacklist)

# Pattern for titles that must not be expanded: those starting with the
# template namespace prefix or with 'Wikipedia:'. The prefix comes from
# configuration and is meant literally, so it is escaped to keep any
# regex metacharacter in it from changing what the pattern matches.
rx = re.compile('(' + re.escape(templateprefix) + '|Wikipedia:)')

# Choose where the expanded articles are written.
if stdout:
    _output = sys.stdout
else:
    # Append when a previous run already left a '.processed_expanded'
    # file behind, otherwise start a new one.
    expanded_path = '%s.processed_expanded' % path
    file_mode = 'a' if os.path.exists(expanded_path) else 'w'
    _output = codecs.open(expanded_path, encoding='utf-8', mode=file_mode)

# Expand every selected article and append it to the output as one
# record:
#
#   START_HEADING
#   <title>
#   <len(article_text)>
#   START_TEXT
#   <article_text>
#   END_TEXT
#
# Progress and skipped titles are reported on stderr. The try/finally
# makes sure the output is flushed/closed even when an article raises.
try:
    for title in articles_list:
        # Cut a '#...' suffix off the title. A title that starts with
        # '#' has nothing in front of it and is ignored.
        anchor_pos = title.find('#')
        if anchor_pos == 0:
            continue
        if anchor_pos > 0:
            title = title[:anchor_pos]

        # Titles beginning with the template prefix or 'Wikipedia:'
        # are not expanded.
        if rx.match(title):
            sys.stderr.write('SKIPPING: ' + title + "\n")
            continue

        sys.stderr.write('PROCESSING: ' + title + "\n")

        article_text = wikidb.getExpandedArticle(title)
        if article_text is None:
            sys.stderr.write('ERROR - SKIPPING: ' + title + "\n")
            continue

        _output.write(START_HEADING + '\n')
        _output.write(title + '\n')
        # NOTE(review): the previous comment said len() of a unicode
        # string is a byte count. In Python 2, len() of a unicode object
        # counts code units, not encoded bytes; only for a byte str is
        # it a byte count. The value written is unchanged -- confirm
        # what the reader of this file expects.
        _output.write("%s\n" % len(article_text))
        _output.write(START_TEXT + '\n')
        _output.write(article_text + '\n')
        _output.write(END_TEXT + '\n')
finally:
    # The interpreter's own stdout is only flushed, never closed.
    if _output is sys.stdout:
        _output.flush()
    else:
        _output.close()