#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Take a list of pages, select links up to a level (default = 1),
# prepare a list of links found in the pages of the original list,
# create a file with the titles of all the selected pages, and
# create a file with the content of all the selected pages.

import codecs
import re
from xml.sax import make_parser, handler
import os
import sys
from operator import itemgetter

import config

try:
    from hashlib import md5
except ImportError:
    from md5 import md5


def normalize_title(title):
    return title.strip().replace(' ', '_').capitalize()


class FileListReader():
    """Read a list of page titles, one per line, from a UTF-8 file."""

    def __init__(self, file_name):
        _file = codecs.open(file_name, encoding='utf-8', mode='r')
        self.list = []
        line = _file.readline()
        while line:
            self.list.append(normalize_title(line))
            line = _file.readline()
        _file.close()


class RedirectParser:
    """Load the <file_name>.redirects file into a dictionary, plus a
    reversed index mapping every destination to its origins."""

    def __init__(self, file_name, postfix='redirects'):
        self.link_re = re.compile(r'\[\[.*?\]\]')
        # Load redirects
        input_redirects = codecs.open('%s.%s' % (file_name, postfix),
                                      encoding='utf-8', mode='r')
        self.redirects = {}
        self.reversed_index = {}
        count = 0
        for line in input_redirects.readlines():
            links = self.link_re.findall(unicode(line))
            if len(links) == 2:
                origin = normalize_title(links[0][2:-2])
                destination = normalize_title(links[1][2:-2])
                self.redirects[origin] = destination
                # add to the reversed index (normalized, because the
                # lookups in RedirectsUsedWriter use normalized titles)
                if destination in self.reversed_index:
                    self.reversed_index[destination].append(origin)
                else:
                    self.reversed_index[destination] = [origin]
                count += 1
                #print "Processing %s" % origin
        input_redirects.close()

    def get_redirected(self, article_title):
        try:
            return self.redirects[article_title.capitalize()]
        except KeyError:
            return None


class PagesLinksFilter():
    """Read the list of pages from the .links file, resolving redirects
    and dropping duplicates."""

    def __init__(self, file_name, redirects_checker):
        self.pages = []
        input_links = codecs.open('%s.links' % file_name,
                                  encoding='utf-8', mode='r')
        line = input_links.readline()
        while line:
            words = line.split()
            if len(words) > 0:
                page = words[0]
                print "Adding page %s" % page
                redirected = redirects_checker.get_redirected(page)
                if redirected is not None:
                    page = redirected
                if page not in self.pages:
                    self.pages.append(page)
            line = input_links.readline()
        input_links.close()


class LinksFilter():
    """Collect the links found in the favorite pages, resolving
    redirects and skipping anchors inside the same page."""

    def __init__(self, file_name, redirects_checker, favorites):
        self.links = []
        input_links = codecs.open('%s.links' % file_name,
                                  encoding='utf-8', mode='r')
        line = input_links.readline()
        while line:
            words = line.split()
            if len(words) > 0:
                page = words[0]
                #print "Processing page %s \r" % page,
                if page in favorites:
                    print "Adding page %s" % page
                    for n in range(1, len(words) - 1):
                        link = normalize_title(words[n])
                        if link.find('#') > -1:
                            # don't count links to anchors in the same page
                            if link.find('#') == 0:
                                continue
                            else:
                                # use only the article part of the link
                                link = link[:link.find('#')]
                        # check if it is a redirect
                        redirected = redirects_checker.get_redirected(link)
                        if redirected is not None:
                            link = redirected
                        if link not in self.links and \
                                link not in favorites:
                            self.links.append(link)
            line = input_links.readline()
        input_links.close()
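
# Line format assumed by PagesLinksFilter and LinksFilter above for the
# <input>.links file (the example line is illustrative, not real data):
#
#   Johann_sebastian_bach Leipzig Weimar Fugue#History ...
#
# words[0] is the page title and the remaining whitespace-separated
# words are link targets; a '#fragment' suffix is stripped and links
# that start with '#' (anchors inside the same page) are ignored.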

class PagesProcessor(handler.ContentHandler):
    """SAX handler: write the selected pages to a .processed file and
    record the images used by every page in a .page_images file."""

    def __init__(self, file_name, selected_pages_list, pages_blacklist):
        handler.ContentHandler.__init__(self)
        self._page_counter = 0
        self._page = None
        self._output = codecs.open('%s.processed' % file_name,
                                   encoding='utf-8', mode='w')
        self._output_page_images = codecs.open('%s.page_images' % file_name,
                                               encoding='utf-8', mode='w')
        self.image_re = re.compile(r'\[\[%s.*?\]\]' % config.FILE_TAG)
        self._selected_pages_list = selected_pages_list
        self._pages_blacklist = pages_blacklist

    def startElement(self, name, attrs):
        if name == "page":
            self._page = {}
            self._page_counter += 1
        self._text = ""

    def characters(self, content):
        self._text = self._text + content

    def _register_page(self, register, title, content):
        # records are delimited by the control characters \x01 (start),
        # \x02 (title/content separator) and \x03 (end)
        register.write('\01\n')
        register.write('%s\n' % normalize_title(title))
        register.write('%d\n' % len(content))
        register.write('\02\n')
        register.write('%s\n' % content)
        register.write('\03\n')

    def _hashpath(self, name):
        name = name.replace(' ', '_')
        name = name[:1].upper() + name[1:]
        d = md5(name.encode('utf-8')).hexdigest()
        return "/".join([d[0], d[:2], name])

    def _get_url_image(self, image_wiki):
        """ [[Archivo:Johann Sebastian Bach.jpg|thumb|200px|right|[[J. S. Bach]] """
        # remove [[ and ]]
        image_wiki = image_wiki[2:-2]
        parts = image_wiki.split('|')
        name = parts[0]
        name = name[len(config.FILE_TAG):]
        image_size = config.MAX_IMAGE_SIZE
        # check if a size is defined
        for part in parts:
            # these image sizes are copied from server.py
            if part.strip() == 'thumb':
                image_size = 180
                break
            if part.find('px') > -1:
                try:
                    image_size = int(part[:part.find('px')])
                except ValueError:
                    pass
        hashed_name = unicode(self._hashpath(name))
        url = 'http://upload.wikimedia.org/wikipedia/commons/thumb/' \
            + hashed_name + ('/%dpx-' % image_size) + name.replace(' ', '_')
        # the svg files are requested as png
        if re.match(r'.*\.svg$', url, re.IGNORECASE):
            url = url + '.png'
        return url

    def get_images(self, title):
        # find images used in the page
        images = self.image_re.findall(unicode(self._page))
        images_list = []
        for image in images:
            url = self._get_url_image(image)
            # only add every image once per page
            if url not in images_list:
                images_list.append(url)
        if len(images_list) > 0:
            self._output_page_images.write('%s ' % title)
            for image in images_list:
                self._output_page_images.write('%s ' % image)
            self._output_page_images.write('\n')

    def endElement(self, name):
        if name == "title":
            self._title = self._text
        elif name == "text":
            self._page = self._text
        elif name == "page":
            for namespace in config.BLACKLISTED_NAMESPACES:
                if unicode(self._title).startswith(namespace):
                    return
            title = normalize_title(self._title)
            for namespace in config.TEMPLATE_NAMESPACES:
                if unicode(self._title).startswith(namespace):
                    self.get_images(title)
                    return
            for tag in config.REDIRECT_TAGS:
                if unicode(self._page).startswith(tag):
                    return
            if (title not in self._pages_blacklist) and \
                    (title in self._selected_pages_list):
                print "%d Page '%s', length %d \r" % \
                    (self._page_counter, title, len(self._page)),
                self._register_page(self._output, title, self._page)
                self.get_images(title)
        elif name == "mediawiki":
            self._output.close()
            self._output_page_images.close()
            print "Processed %d pages." % self._page_counter
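
# Standalone sketch, not used by the pipeline: this mirrors what
# PagesProcessor._hashpath() and _get_url_image() above do for the
# docstring example, assuming config.FILE_TAG == 'Archivo:' and an
# explicit 200px size.  The function name is hypothetical, for
# illustration only.
def _demo_image_url(image_wiki=u'[[Archivo:Johann Sebastian Bach.jpg|200px]]'):
    parts = image_wiki[2:-2].split('|')
    name = parts[0][len('Archivo:'):].replace(' ', '_')
    d = md5(name.encode('utf-8')).hexdigest()
    # e.g. .../thumb/<d>/<dd>/Johann_Sebastian_Bach.jpg/200px-Johann_...
    return ('http://upload.wikimedia.org/wikipedia/commons/thumb/'
            + '/'.join([d[0], d[:2], name]) + '/200px-' + name)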

class TemplatesCounter:
    """Count how many times every template is used by the selected
    pages, merging counts through template redirects."""

    def __init__(self, file_name, pages_selected, redirect_checker):
        self.templates_to_counter = {}
        input_links = codecs.open('%s.page_templates' % file_name,
                                  encoding='utf-8', mode='r')
        line = input_links.readline()
        while line:
            words = line.split()
            if len(words) > 0:
                page = words[0]
                if page in pages_selected:
                    print "Processing page %s \r" % page,
                    for n in range(1, len(words) - 1):
                        template = words[n]
                        self.templates_to_counter[template] = \
                            self.templates_to_counter.get(template, 0) + 1
            line = input_links.readline()
        input_links.close()

        # Verify redirects
        print "Verifying redirects"
        for template in self.templates_to_counter.keys():
            redirected = redirect_checker.get_redirected(template)
            if redirected is not None:
                if redirected in self.templates_to_counter:
                    self.templates_to_counter[redirected] += \
                        self.templates_to_counter[template]
                else:
                    self.templates_to_counter[redirected] = \
                        self.templates_to_counter[template]
                self.templates_to_counter[template] = 0


class CountedTemplatesReader():
    """Read the <file_name>.templates_counted file: one
    'template_name count' pair per line."""

    def __init__(self, file_name):
        _file = codecs.open('%s.templates_counted' % file_name,
                            encoding='utf-8', mode='r')
        self.templates = {}
        line = _file.readline()
        while line:
            words = line.split()
            template_name = words[0]
            cant_used = int(words[1])
            self.templates[normalize_title(template_name)] = \
                {'cant': cant_used}
            line = _file.readline()
        _file.close()


class TemplatesLoader():
    """Append the used templates, read from the <file_name>.templates
    file, to the .processed file."""

    def __init__(self, file_name, templates_used, select_all=False):
        _file = codecs.open('%s.templates' % file_name,
                            encoding='utf-8', mode='r')
        self._output = codecs.open('%s.processed' % file_name,
                                   encoding='utf-8', mode='a')
        line = _file.readline()
        while line:
            if len(line) == 2 and ord(line[0]) == 1:
                title = _file.readline()
                size = _file.readline()        # declared length, unused
                separator = _file.readline()   # \x02
                finish = False
                template_content = ''
                while not finish:
                    line = _file.readline()
                    #print line
                    if len(line) == 2 and ord(line[0]) == 3:
                        finish = True
                        break
                    template_content += line
                template_namespace = title[:title.find(':')]
                template_name = title[title.find(':') + 1:]
                template_name = normalize_title(template_name)
                #print "checking", template_name,
                if select_all or template_name in templates_used:
                    #print "Adding", template_name,
                    title = template_namespace + ":" + template_name
                    self._register_page(title, template_content.strip())
            line = _file.readline()
        _file.close()
        self._output.close()

    def _register_page(self, title, content):
        self._output.write('\01\n')
        self._output.write('%s\n' % normalize_title(title))
        self._output.write('%d\n' % len(content))
        self._output.write('\02\n')
        self._output.write('%s\n' % content)
        self._output.write('\03\n')


class RedirectsUsedWriter():
    """Write the redirects that point to selected pages or to used
    templates into a <file_name>.redirects_used file."""

    def __init__(self, file_name, selected_pages_list, templates_used,
                 redirect_checker, postfix='redirects_used'):
        _output_redirects = codecs.open('%s.%s' % (file_name, postfix),
                                        encoding='utf-8', mode='w')

        # check the selected pages in the redirects index
        counter = 0
        for title in selected_pages_list:
            title = normalize_title(title)
            if title in redirect_checker.reversed_index:
                for origin in redirect_checker.reversed_index[title]:
                    _output_redirects.write('[[%s]]\t[[%s]]\n' %
                                            (origin, title))
                    counter += 1
        print "Found %d redirected pages" % counter

        # check the used templates in the redirects index
        counter = 0
        for title in templates_used.keys():
            title = normalize_title(title)
            if title in redirect_checker.reversed_index:
                for origin in redirect_checker.reversed_index[title]:
                    _output_redirects.write('[[%s]]\t[[%s]]\n' %
                                            (origin, title))
                    counter += 1
        print "Found %d redirected templates" % counter
        _output_redirects.close()
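
# Illustrative sketch, not called anywhere in this script: both
# _register_page() methods above emit records delimited by the control
# characters \x01 (record start), \x02 (title/content separator) and
# \x03 (record end).  A minimal reader for a .processed file written in
# that format could look like this (the function name is hypothetical):
def iter_processed_records(file_name):
    """Yield (title, content) pairs from a <file_name>.processed file."""
    _file = codecs.open('%s.processed' % file_name,
                        encoding='utf-8', mode='r')
    line = _file.readline()
    while line:
        if len(line) == 2 and ord(line[0]) == 1:
            title = _file.readline().strip()
            _file.readline()    # declared content length, unused here
            _file.readline()    # \x02 separator
            content = ''
            line = _file.readline()
            while line and not (len(line) == 2 and ord(line[0]) == 3):
                content += line
                line = _file.readline()
            yield title, content.rstrip('\n')
        line = _file.readline()
    _file.close()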

if __name__ == '__main__':
    select_all = False
    for arg in sys.argv[1:]:
        if arg == '--all':
            select_all = True
            print "Selecting all the pages"

    MAX_LEVELS = 1

    if not select_all:
        fav_reader = FileListReader(config.favorites_file_name)
        print "Loaded %d favorite pages" % len(fav_reader.list)

    if os.path.exists(config.blacklist_file_name):
        pages_blacklisted_reader = FileListReader(config.blacklist_file_name)
        pages_blacklist = pages_blacklisted_reader.list
        print "Loaded %d blacklisted pages" % len(pages_blacklist)
    else:
        pages_blacklist = []

    input_xml_file_name = config.input_xml_file_name

    print "Init redirects checker"
    redirect_checker = RedirectParser(input_xml_file_name)

    level = 1
    if not select_all:
        selected_pages_file_name = '%s.pages_selected-level-%d' % \
            (input_xml_file_name, MAX_LEVELS)
    else:
        selected_pages_file_name = '%s.pages_selected' % input_xml_file_name

    if not os.path.exists(selected_pages_file_name):
        if not select_all:
            while level <= MAX_LEVELS:
                print "Processing links level %d" % level
                links_filter = LinksFilter(input_xml_file_name,
                                           redirect_checker, fav_reader.list)
                fav_reader.list.extend(links_filter.links)
                level += 1

            print "Writing pages_selected-level-%d file" % MAX_LEVELS
            output_file = codecs.open(selected_pages_file_name,
                                      encoding='utf-8', mode='w')
            for page in fav_reader.list:
                output_file.write('%s\n' % page)
            output_file.close()
            selected_pages_list = fav_reader.list
        else:
            print "Processing links"
            links_filter = PagesLinksFilter(input_xml_file_name,
                                            redirect_checker)
            print "Writing pages_selected file, %d pages" % \
                len(links_filter.pages)
            output_file = codecs.open(selected_pages_file_name,
                                      encoding='utf-8', mode='w')
            for page in links_filter.pages:
                output_file.write('%s\n' % page)
            output_file.close()
            selected_pages_list = links_filter.pages
    else:
        print "Loading selected pages"
        pages_selected_reader = FileListReader(selected_pages_file_name)
        selected_pages_list = pages_selected_reader.list

    if not os.path.exists('%s.processed' % input_xml_file_name):
        print "Writing .processed file"
        parser = make_parser()
        parser.setContentHandler(PagesProcessor(input_xml_file_name,
                                                selected_pages_list,
                                                pages_blacklist))
        parser.parse(input_xml_file_name)
        # if there is a .templates_counted file it should be removed,
        # because we need to recalculate it
        if os.path.exists('%s.templates_counted' % input_xml_file_name):
            os.remove('%s.templates_counted' % input_xml_file_name)

    templates_used_reader = None
    if not os.path.exists('%s.templates_counted' % input_xml_file_name):
        if select_all:
            templates_loader = TemplatesLoader(input_xml_file_name, [], True)
        else:
            print "Processing templates"
            templates_counter = TemplatesCounter(input_xml_file_name,
                                                 selected_pages_list,
                                                 redirect_checker)
            print "Sorting counted templates"
            items = templates_counter.templates_to_counter.items()
            items.sort(key=itemgetter(1), reverse=True)

            print "Writing templates_counted file"
            output_file = codecs.open('%s.templates_counted' %
                                      input_xml_file_name,
                                      encoding='utf-8', mode='w')
            for name, cant in items:
                if cant > 0:
                    output_file.write('%s %d\n' % (name, cant))
            output_file.close()

            print "Loading templates used"
            templates_used_reader = CountedTemplatesReader(input_xml_file_name)
            print "Read %d templates used" % len(
                templates_used_reader.templates)

            print "Adding used templates to .processed file"
            templates_loader = TemplatesLoader(
                input_xml_file_name, templates_used_reader.templates)
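
    # Finally write the .redirects_used file.  With --all every page is
    # selected, so the full .redirects file already is the set of used
    # redirects and a hard link avoids copying it; otherwise only the
    # redirects that point to selected pages or used templates are kept.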
    if not os.path.exists('%s.redirects_used' % input_xml_file_name):
        if select_all:
            os.link('%s.redirects' % input_xml_file_name,
                    '%s.redirects_used' % input_xml_file_name)
        else:
            if templates_used_reader is None:
                print "Loading templates used"
                templates_used_reader = \
                    CountedTemplatesReader(input_xml_file_name)
                print "Read %d templates used" % \
                    len(templates_used_reader.templates)
            redirects_used_writer = RedirectsUsedWriter(
                input_xml_file_name, selected_pages_list,
                templates_used_reader.templates, redirect_checker)
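
# Typical invocation (illustrative; every input file name comes from
# config.py, so no path arguments are needed):
#
#   python <this_script>.py          # favorites plus MAX_LEVELS levels of links
#   python <this_script>.py --all    # select every page of the dump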