blob: d58327de7aa416ef1b2bc4db9b5c3e2fd59698a8 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
|
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# This utility takes a already processed articles
# and create a new file with part of these articles based in
# ignoring the articles in a list
import codecs
import os
import config
from make_selection import FileListReader, RedirectParser, RedirectsUsedWriter
from make_selection import CountedTemplatesReader
if __name__ == '__main__':
input_xml_file_name = config.input_xml_file_name
print "Loading low rank pages"
ignored_pages_reader = FileListReader('%s.low_rank_pages' %
input_xml_file_name)
processed_file = open("%s.processed" % input_xml_file_name, mode='r')
output_file = open("%s.processed_filtered" % input_xml_file_name, mode='w')
data_line = processed_file.readline()
while data_line:
#print data_line
if len(data_line) == 2:
if ord(data_line[0]) == 1:
title = processed_file.readline()
# read article size
# size
size_line = processed_file.readline()
# \02
data_line = processed_file.readline()
title = title[0:-1].strip().capitalize()
if title not in ignored_pages_reader.list:
# \01
output_file.write('\01\n')
output_file.write('%s\n' % title)
# size
output_file.write('%s\n' % size_line)
# \02
output_file.write('\02\n')
finish = False
while not finish:
line = processed_file.readline()
output_file.write('%s\n' % line)
if len(line) == 2:
if ord(line[0]) == 3:
output_file.write('\03\n')
finish = True
break
else:
print "* Ignored %s " % title
data_line = processed_file.readline()
output_file.close()
# clean redirects used
print "Loading redirects used "
redirect_checker = RedirectParser(input_xml_file_name,
postfix='redirects_used')
print "Loading selected pages"
selected_pages_reader = FileListReader('%s.pages_selected-level-1' %
input_xml_file_name)
print "Cleaning selected pages list"
# clean selected_pages_reader list
filtered_list = []
for article in selected_pages_reader.list:
if article not in ignored_pages_reader.list:
filtered_list.append(article)
print "Loading templates used"
templates_used_reader = CountedTemplatesReader(input_xml_file_name)
print "Writing redirects used filtered"
redirect_writer = RedirectsUsedWriter(input_xml_file_name, filtered_list,
templates_used_reader.templates,
redirect_checker, postfix='redirects_used_filtered')
|