diff options
Diffstat (limited to 'tools2/make_ranking.py')
-rwxr-xr-x | tools2/make_ranking.py | 62 |
1 files changed, 62 insertions, 0 deletions
diff --git a/tools2/make_ranking.py b/tools2/make_ranking.py new file mode 100755 index 0000000..ad67a92 --- /dev/null +++ b/tools2/make_ranking.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Create a list of pages with a nuber of how many links are directed to them. + +import codecs +import re +from operator import itemgetter +from make_selection import RedirectParser, FileListReader +import config + +class LinksCounter: + + def __init__(self, file_name, redirects, selected_pages): + self.links_to_counter = {} + input_links = codecs.open('%s.links' % file_name, + encoding='utf-8', mode='r') + line = input_links.readline() + while line: + words = line.split() + if len(words) > 0: + page = words[0] + if page in selected_pages: + print "Processing page %s \r" % page, + for n in range(1, len(words) - 1): + link = words[n] + # check if is a redirect + try: + link = redirects[link] + except: + pass + if link in selected_pages: + try: + self.links_to_counter[link] += 1 + except: + self.links_to_counter[link] = 0 + line = input_links.readline() + input_links.close() + + +input_xml_file_name = config.input_xml_file_name +print "Loading redirects" +redirect_parser = RedirectParser(input_xml_file_name) +print "Processed %d redirects" % len(redirect_parser.redirects) + +print "Loading selected pages" +selected_pages_reader = FileListReader('%s.pages_selected-level-1' % + input_xml_file_name) + +print "Processing links" +links_counter = LinksCounter(input_xml_file_name, redirect_parser.redirects, + selected_pages_reader.list) + +print "Sorting counted links" +items = links_counter.links_to_counter.items() +items.sort(key = itemgetter(1), reverse=True) + +print "Writing links_counted file" +output_file = codecs.open('%s.links_counted' % input_xml_file_name, + encoding='utf-8', mode='w') +for n in range(len(items)): + output_file.write('%s %d\n' % (items[n][0], items[n][1])) +output_file.close() |