Web   ·   Wiki   ·   Activities   ·   Blog   ·   Lists   ·   Chat   ·   Meeting   ·   Bugs   ·   Git   ·   Translate   ·   Archive   ·   People   ·   Donate
summaryrefslogtreecommitdiffstats
path: root/tools2/make_selection.py
diff options
context:
space:
mode:
Diffstat (limited to 'tools2/make_selection.py')
-rwxr-xr-xtools2/make_selection.py522
1 file changed, 522 insertions, 0 deletions
diff --git a/tools2/make_selection.py b/tools2/make_selection.py
new file mode 100755
index 0000000..4dd3a63
--- /dev/null
+++ b/tools2/make_selection.py
@@ -0,0 +1,522 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# take a list of pages
+# select a level default = 1
+# prepare a list of links in the pages from the original list
+# create a file with the titles of all the selected pages
+# create a file with the content of all the selected pages
+
+import codecs
+import re
+from xml.sax import make_parser, handler
+import os
+import sys
+from operator import itemgetter
+import config
+
+try:
+ from hashlib import md5
+except ImportError:
+ from md5 import md5
+
+
def normalize_title(title):
    """Return the canonical form of a wiki title.

    The title is stripped of surrounding whitespace, spaces become
    underscores, and str.capitalize() upper-cases the first character
    while lower-casing the rest.
    """
    cleaned = title.strip().replace(' ', '_')
    return cleaned.capitalize()
+
+
class FileListReader():
    """Load a text file with one page title per line.

    The normalized titles are exposed in ``self.list``, in file order.
    """

    def __init__(self, file_name):
        self.list = []
        _file = codecs.open(file_name,
                            encoding='utf-8', mode='r')
        # Fix: the original never closed the handle; close it even if
        # normalize_title raises mid-file.
        try:
            for line in _file:
                self.list.append(normalize_title(line))
        finally:
            _file.close()
+
+
class RedirectParser:
    """Parse a ``<file_name>.<postfix>`` redirects file.

    Each line is expected to contain two ``[[...]]`` wiki links: the
    redirect origin and its destination.  Builds:

    * ``self.redirects`` -- normalized origin -> normalized destination
    * ``self.reversed_index`` -- raw destination -> list of raw origins
    """

    def __init__(self, file_name, postfix='redirects'):
        # Raw string fixes the invalid '\[' escape in the original pattern.
        self.link_re = re.compile(r'\[\[.*?\]\]')
        input_redirects = codecs.open('%s.%s' % (file_name, postfix),
                                      encoding='utf-8', mode='r')
        self.redirects = {}
        self.reversed_index = {}
        try:
            for line in input_redirects:
                # codecs.open already yields decoded text, so the original's
                # unicode(line) (and the duplicated "links = links =") was
                # redundant.
                links = self.link_re.findall(line)
                if len(links) != 2:
                    # line without exactly origin + destination: skip it
                    continue
                origin = links[0][2:-2]        # strip [[ and ]]
                destination = links[1][2:-2]
                self.redirects[normalize_title(origin)] = \
                    normalize_title(destination)
                # reversed index keeps the *raw* (non-normalized) titles,
                # exactly as the original did
                self.reversed_index.setdefault(destination, []).append(origin)
        finally:
            input_redirects.close()

    def get_redirected(self, article_title):
        """Return the redirect destination for *article_title*, or None.

        The lookup key is capitalized to match normalize_title()'s output.
        """
        try:
            return self.redirects[article_title.capitalize()]
        except KeyError:
            # narrowed from the original bare except: only a missing key
            # means "no redirect"
            return None
+
+
class PagesLinksFilter():
    """Collect the distinct page titles listed in a ``.links`` file.

    Each input line is ``<page> [link ...]``; only the first word is
    used.  Titles are redirect-resolved through *redirects_checker* and
    deduplicated preserving first-seen order in ``self.pages``.
    """

    def __init__(self, file_name, redirects_checker):
        self.pages = []
        # O(1) membership test; the original scanned self.pages (a list)
        # for every line, which is quadratic on big dumps.
        seen = set()
        input_links = codecs.open('%s.links' % file_name,
                                  encoding='utf-8', mode='r')
        try:
            for line in input_links:
                words = line.split()
                if not words:
                    continue
                page = words[0]
                print("Adding page %s" % page)
                redirected = redirects_checker.get_redirected(page)
                if redirected is not None:
                    page = redirected
                if page not in seen:
                    seen.add(page)
                    self.pages.append(page)
        finally:
            input_links.close()
+
+
class LinksFilter():
    """Collect the links that appear in *favorites* pages.

    Reads ``<file_name>.links`` where each line is ``<page> <link> ...``.
    For every page present in *favorites*, its links are normalized,
    redirect-resolved and accumulated in ``self.links``, excluding links
    that are already favorites and duplicates.
    """

    def __init__(self, file_name, redirects_checker, favorites):
        self.links = []
        seen = set()                     # mirrors self.links, O(1) lookup
        # favorites is only read here, so a set snapshot is safe and
        # avoids an O(n) list scan per link.
        favorites_set = set(favorites)
        input_links = codecs.open('%s.links' % file_name,
                                  encoding='utf-8', mode='r')
        try:
            for line in input_links:
                words = line.split()
                if not words:
                    continue
                page = words[0]
                if page not in favorites_set:
                    continue
                print("Adding page %s" % page)
                # NOTE(review): range(1, len(words) - 1) skips the LAST
                # word of each line (same in TemplatesCounter) -- kept
                # as-is; confirm against the .links file format.
                for n in range(1, len(words) - 1):
                    link = normalize_title(words[n])

                    anchor = link.find('#')
                    if anchor == 0:
                        # link to an anchor inside the same page: ignore
                        continue
                    elif anchor > 0:
                        # keep only the article part of the link
                        link = link[:anchor]

                    # resolve redirects before deduplicating
                    redirected = redirects_checker.get_redirected(link)
                    if redirected is not None:
                        link = redirected

                    if link not in seen and link not in favorites_set:
                        seen.add(link)
                        self.links.append(link)
        finally:
            input_links.close()
+
+
+class PagesProcessor(handler.ContentHandler):
+
+ def __init__(self, file_name, selected_pages_list, pages_blacklist):
+ handler.ContentHandler.__init__(self)
+ self._page_counter = 0
+ self._page = None
+ self._output = codecs.open('%s.processed' % file_name,
+ encoding='utf-8', mode='w')
+ self._output_page_images = codecs.open('%s.page_images' % file_name,
+ encoding='utf-8', mode='w')
+
+ self.image_re = re.compile('\[\[%s.*?\]\]' % config.FILE_TAG)
+ self._selected_pages_list = selected_pages_list
+ self._pages_blacklist = pages_blacklist
+
+ def startElement(self, name, attrs):
+ if name == "page":
+ self._page = {}
+ self._page_counter += 1
+ self._text = ""
+
+ def characters(self, content):
+ self._text = self._text + content
+
+ def _register_page(self, register, title, content):
+ register.write('\01\n')
+ register.write('%s\n' % normalize_title(title))
+ register.write('%d\n' % len(content))
+ register.write('\02\n')
+ register.write('%s\n' % content)
+ register.write('\03\n')
+
+ def _hashpath(self, name):
+ name = name.replace(' ', '_')
+ name = name[:1].upper() + name[1:]
+ d = md5(name.encode('utf-8')).hexdigest()
+ return "/".join([d[0], d[:2], name])
+
+ def _get_url_image(self, image_wiki):
+ """
+ [[Archivo:Johann Sebastian Bach.jpg|thumb|200px|right|[[J. S. Bach]]
+ """
+ # remove [[ and ]]
+ image_wiki = image_wiki[2:-2]
+ parts = image_wiki.split('|')
+
+ name = parts[0]
+ name = name[len(config.FILE_TAG):]
+
+ image_size = config.MAX_IMAGE_SIZE
+ # check if there are a size defined
+ for part in parts:
+ # this image sizes are copied from server.py
+ if part.strip() == 'thumb':
+ image_size = 180
+ break
+
+ if part.find('px') > -1:
+ try:
+ image_size = int(part[:part.find('px')])
+ except:
+ pass
+
+ hashed_name = unicode(self._hashpath(name)) # .encode('utf8')
+ url = 'http://upload.wikimedia.org/wikipedia/commons/thumb/' \
+ + hashed_name + ('/%dpx-' % image_size) + name.replace(' ', '_')
+ # the svg files are requested as png
+ if re.match(r'.*\.svg$', url, re.IGNORECASE):
+ url = url + '.png'
+ return url
+
+ def get_images(self, title):
+ # find images used in the pages
+ images = self.image_re.findall(unicode(self._page))
+ images_list = []
+ for image in images:
+ url = self._get_url_image(image)
+ # only add one time by page
+ if not url in images_list:
+ images_list.append(url)
+
+ if len(images_list) > 0:
+ self._output_page_images.write('%s ' % title)
+ for image in images_list:
+ self._output_page_images.write('%s ' % image)
+ self._output_page_images.write('\n')
+
+ def endElement(self, name):
+ if name == "title":
+ self._title = self._text
+ elif name == "text":
+ self._page = self._text
+ elif name == "page":
+
+ for namespace in config.BLACKLISTED_NAMESPACES:
+ if unicode(self._title).startswith(namespace):
+ return
+
+ title = normalize_title(self._title)
+
+ for namespace in config.TEMPLATE_NAMESPACES:
+ if unicode(self._title).startswith(namespace):
+ self.get_images(title)
+ return
+
+ for tag in config.REDIRECT_TAGS:
+ if unicode(self._page).startswith(tag):
+ return
+
+ if (title not in self._pages_blacklist) and \
+ (title in self._selected_pages_list):
+ print "%d Page '%s', length %d \r" % \
+ (self._page_counter, title, len(self._page)),
+ # processed
+ self._register_page(self._output, title, self._page)
+ self.get_images(title)
+
+ elif name == "mediawiki":
+ self._output.close()
+ self._output_page_images.close()
+ print "Processed %d pages." % self._page_counter
+
+
class TemplatesCounter:
    """Count how many times each template is used by the selected pages.

    Reads ``<file_name>.page_templates`` (lines of ``<page> <template>
    ...``) and fills ``self.templates_to_counter`` (template -> count),
    then folds the counts of redirected template names into their
    destinations (the redirected source keeps a count of 0).
    """

    def __init__(self, file_name, pages_selected, redirect_checker):
        self.templates_to_counter = {}
        input_links = codecs.open('%s.page_templates' % file_name,
                                  encoding='utf-8', mode='r')
        try:
            for line in input_links:
                words = line.split()
                # Fix: the original indexed words[0] unconditionally and
                # raised IndexError on blank lines.
                if not words:
                    continue
                page = words[0]
                if page not in pages_selected:
                    continue
                sys.stdout.write("Processing page %s \r" % page)
                # NOTE(review): the last word of each line is skipped
                # (range stops at len(words) - 1), same as LinksFilter --
                # kept as-is; confirm against the file format.
                for n in range(1, len(words) - 1):
                    template = words[n]
                    # dict.get replaces the original try/except KeyError
                    self.templates_to_counter[template] = \
                        self.templates_to_counter.get(template, 0) + 1
        finally:
            input_links.close()

        # Fold redirected templates into their destinations.  list(...)
        # snapshots the keys (matching Python 2 keys() semantics) so the
        # dict can be safely grown during iteration.
        print("Verifying redirects")
        for template in list(self.templates_to_counter.keys()):
            redirected = redirect_checker.get_redirected(template)
            if redirected is not None:
                self.templates_to_counter[redirected] = \
                    self.templates_to_counter.get(redirected, 0) + \
                    self.templates_to_counter[template]
                self.templates_to_counter[template] = 0
+
+
class CountedTemplatesReader():
    """Read ``<file_name>.templates_counted`` into ``self.templates``.

    Each line is ``<template_name> <count>``; the result maps the
    normalized template name to ``{'cant': count}``.
    """

    def __init__(self, file_name):
        self.templates = {}
        _file = codecs.open('%s.templates_counted' % file_name,
                            encoding='utf-8', mode='r')
        # Fix: close the handle (the original leaked it) and skip
        # malformed lines instead of raising IndexError.
        try:
            for line in _file:
                words = line.split()
                if len(words) < 2:
                    continue
                template_name = words[0]
                cant_used = int(words[1])
                self.templates[normalize_title(template_name)] = \
                    {'cant': cant_used}
        finally:
            _file.close()
+
+
class TemplatesLoader():
    """Append the used templates to the ``.processed`` file.

    Reads ``<file_name>.templates``, a stream of records delimited by
    marker lines: \\x01 (start, followed by title and size lines),
    \\x02 (content start) and \\x03 (content end).  A template is copied
    when its normalized name is a key of *templates_used*, or always
    when *select_all* is true.
    """

    def __init__(self, file_name, templates_used, select_all=False):
        _file = codecs.open('%s.templates' % file_name,
                            encoding='utf-8', mode='r')
        self._output = codecs.open('%s.processed' % file_name,
                                   encoding='utf-8', mode='a')
        # Fix: both handles are now closed; the original leaked them and
        # the append-mode output relied on GC for flushing.
        try:
            line = _file.readline()
            while line:
                # a record starts with a line holding only \x01
                if len(line) == 2 and ord(line[0]) == 1:
                    title = _file.readline()
                    size = _file.readline()   # recorded length, unused
                    _file.readline()          # skip the \x02 separator
                    template_content = ''
                    while True:
                        line = _file.readline()
                        if not line:
                            # Fix: a truncated file made the original
                            # loop forever on EOF; stop instead.
                            break
                        # content ends at a line holding only \x03
                        if len(line) == 2 and ord(line[0]) == 3:
                            break
                        template_content += line

                    template_namespace = title[:title.find(':')]
                    template_name = title[title.find(':') + 1:]
                    template_name = normalize_title(template_name)

                    # 'in templates_used' is O(1); the original built
                    # keys() for every record
                    if select_all or template_name in templates_used:
                        title = template_namespace + ":" + template_name
                        self._register_page(title, template_content.strip())

                line = _file.readline()
        finally:
            _file.close()
            self._output.close()

    def _register_page(self, title, content):
        """Append one \\x01...\\x03 record to the .processed file."""
        self._output.write('\01\n')
        self._output.write('%s\n' % normalize_title(title))
        self._output.write('%d\n' % len(content))
        self._output.write('\02\n')
        self._output.write('%s\n' % content)
        self._output.write('\03\n')
+
+
class RedirectsUsedWriter():
    """Write the redirects that point at selected pages or used templates.

    Output file ``<file_name>.<postfix>`` gets one
    ``[[origin]]\\t[[destination]]`` line per redirect.
    """

    def __init__(self, file_name, selected_pages_list, templates_used,
                 redirect_checker, postfix='redirects_used'):
        _output_redirects = codecs.open('%s.%s' % (file_name, postfix),
                                        encoding='utf-8', mode='w')

        # redirects pointing at selected pages
        counter = 0
        for title in selected_pages_list:
            title = normalize_title(title)
            if title in redirect_checker.reversed_index:
                for origin in redirect_checker.reversed_index[title]:
                    _output_redirects.write('[[%s]]\t[[%s]]\n' %
                                            (origin, title))
                    counter += 1
        print("Found %d redirected pages" % counter)

        # redirects pointing at used templates (the original's unused
        # templates_redirects dict and copy-pasted comment are removed)
        counter = 0
        for title in templates_used.keys():
            title = normalize_title(title)
            if title in redirect_checker.reversed_index:
                for origin in redirect_checker.reversed_index[title]:
                    _output_redirects.write('[[%s]]\t[[%s]]\n' %
                                            (origin, title))
                    counter += 1
        print("Found %d redirected templates" % counter)

        _output_redirects.close()
+
+
if __name__ == '__main__':

    # Command line: the only recognized flag is --all, which selects
    # every page of the dump instead of favorites plus their links.
    select_all = False
    if len(sys.argv) > 1:
        for argn in range(1, len(sys.argv)):
            arg = sys.argv[argn]
            if arg == '--all':
                select_all = True
                print "Selecting all the pages"

    # How many levels of outgoing links from the favorite pages to follow.
    MAX_LEVELS = 1

    if not select_all:
        fav_reader = FileListReader(config.favorites_file_name)
        print "Loaded %d favorite pages" % len(fav_reader.list)

    # Blacklisted pages are excluded later by PagesProcessor.
    if os.path.exists(config.blacklist_file_name):
        pages_blacklisted_reader = FileListReader(config.blacklist_file_name)
        pages_blacklist = pages_blacklisted_reader.list
        print "Loaded %d blacklisted pages" % len(pages_blacklist)
    else:
        pages_blacklist = []

    input_xml_file_name = config.input_xml_file_name

    print "Init redirects checker"
    redirect_checker = RedirectParser(input_xml_file_name)

    level = 1

    # The selection file name encodes the link depth so runs with
    # different levels do not clash.
    if not select_all:
        selected_pages_file_name = '%s.pages_selected-level-%d' % \
            (input_xml_file_name, MAX_LEVELS)
    else:
        selected_pages_file_name = '%s.pages_selected' % input_xml_file_name

    # Build the selection only if a previous run did not already write it.
    if not os.path.exists(selected_pages_file_name):
        if not select_all:
            # Breadth-first expansion: each pass collects the links found
            # in the pages gathered so far and appends them to the list.
            while level <= MAX_LEVELS:
                print "Processing links level %d" % level
                links_filter = LinksFilter(input_xml_file_name,
                                           redirect_checker, fav_reader.list)
                fav_reader.list.extend(links_filter.links)
                level += 1

            print "Writing pages_selected-level-%d file" % MAX_LEVELS
            output_file = codecs.open(selected_pages_file_name,
                                      encoding='utf-8', mode='w')
            for page in fav_reader.list:
                output_file.write('%s\n' % page)
            output_file.close()
            selected_pages_list = fav_reader.list
        else:
            # --all: take every page mentioned in the .links file.
            print "Processing links"
            links_filter = PagesLinksFilter(input_xml_file_name,
                                            redirect_checker)

            print "Writing pages_selected file %d pages" % \
                len(links_filter.pages)
            output_file = codecs.open(selected_pages_file_name,
                                      encoding='utf-8', mode='w')
            for page in links_filter.pages:
                output_file.write('%s\n' % page)
            output_file.close()
            selected_pages_list = links_filter.pages

    else:
        # Reuse the selection produced by a previous run.
        print "Loading selected pages"
        pages_selected_reader = FileListReader(selected_pages_file_name)
        selected_pages_list = pages_selected_reader.list

    # Extract the selected pages from the XML dump (SAX, streaming).
    if not os.path.exists('%s.processed' % input_xml_file_name):
        print "Writing .processed file"
        parser = make_parser()
        parser.setContentHandler(PagesProcessor(input_xml_file_name,
                                 selected_pages_list, pages_blacklist))
        parser.parse(input_xml_file_name)

        # if a .templates_counted file exists it must be removed,
        # because it needs to be recalculated for the new selection
        if os.path.exists('%s.templates_counted' % input_xml_file_name):
            os.remove('%s.templates_counted' % input_xml_file_name)

    templates_used_reader = None
    if not os.path.exists('%s.templates_counted' % input_xml_file_name):
        if select_all:
            # copy every template; no counting needed
            templates_loader = TemplatesLoader(input_xml_file_name, [], True)
        else:
            print "Processing templates"
            templates_counter = TemplatesCounter(input_xml_file_name,
                                                 selected_pages_list,
                                                 redirect_checker)

            # NOTE(review): items()/.sort() relies on Python 2 dict.items()
            # returning a list; under Python 3 this would need sorted().
            print "Sorting counted templates"
            items = templates_counter.templates_to_counter.items()
            items.sort(key=itemgetter(1), reverse=True)

            # Persist only templates actually used (count > 0).
            print "Writing templates_counted file"
            output_file = codecs.open('%s.templates_counted' % \
                                      input_xml_file_name,
                                      encoding='utf-8', mode='w')
            for n in range(len(items)):
                if int(items[n][1]) > 0:
                    output_file.write('%s %d\n' % (items[n][0], items[n][1]))
            output_file.close()

            print "Loading templates used"
            templates_used_reader = CountedTemplatesReader(input_xml_file_name)
            print "Readed %d templates used" % len(
                templates_used_reader.templates)

            print "Adding used templates to .processed file"
            templates_loader = TemplatesLoader(input_xml_file_name,
                                               templates_used_reader.templates)

    if not os.path.exists('%s.redirects_used' % input_xml_file_name):
        if select_all:
            # with --all every redirect is "used": hard-link the file
            os.link('%s.redirects' % input_xml_file_name,
                    '%s.redirects_used' % input_xml_file_name)
        else:
            # reuse the reader if the counting step above already built it
            if templates_used_reader is None:
                print "Loading templates used"
                templates_used_reader = \
                    CountedTemplatesReader(input_xml_file_name)
                print "Readed %d templates used" % \
                    len(templates_used_reader.templates)

            redirects_used_writer = RedirectsUsedWriter(input_xml_file_name,
                selected_pages_list, templates_used_reader.templates,
                redirect_checker)