Web   ·   Wiki   ·   Activities   ·   Blog   ·   Lists   ·   Chat   ·   Meeting   ·   Bugs   ·   Git   ·   Translate   ·   Archive   ·   People   ·   Donate
summaryrefslogtreecommitdiffstats
path: root/tools2/make_ranking.py
diff options
context:
space:
mode:
Diffstat (limited to 'tools2/make_ranking.py')
-rwxr-xr-xtools2/make_ranking.py62
1 files changed, 62 insertions, 0 deletions
diff --git a/tools2/make_ranking.py b/tools2/make_ranking.py
new file mode 100755
index 0000000..ad67a92
--- /dev/null
+++ b/tools2/make_ranking.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Create a list of pages with a nuber of how many links are directed to them.
+
+import codecs
+import re
+from operator import itemgetter
+from make_selection import RedirectParser, FileListReader
+import config
+
+class LinksCounter:
+
+ def __init__(self, file_name, redirects, selected_pages):
+ self.links_to_counter = {}
+ input_links = codecs.open('%s.links' % file_name,
+ encoding='utf-8', mode='r')
+ line = input_links.readline()
+ while line:
+ words = line.split()
+ if len(words) > 0:
+ page = words[0]
+ if page in selected_pages:
+ print "Processing page %s \r" % page,
+ for n in range(1, len(words) - 1):
+ link = words[n]
+ # check if is a redirect
+ try:
+ link = redirects[link]
+ except:
+ pass
+ if link in selected_pages:
+ try:
+ self.links_to_counter[link] += 1
+ except:
+ self.links_to_counter[link] = 0
+ line = input_links.readline()
+ input_links.close()
+
+
+input_xml_file_name = config.input_xml_file_name
+print "Loading redirects"
+redirect_parser = RedirectParser(input_xml_file_name)
+print "Processed %d redirects" % len(redirect_parser.redirects)
+
+print "Loading selected pages"
+selected_pages_reader = FileListReader('%s.pages_selected-level-1' %
+ input_xml_file_name)
+
+print "Processing links"
+links_counter = LinksCounter(input_xml_file_name, redirect_parser.redirects,
+ selected_pages_reader.list)
+
+print "Sorting counted links"
+items = links_counter.links_to_counter.items()
+items.sort(key = itemgetter(1), reverse=True)
+
+print "Writing links_counted file"
+output_file = codecs.open('%s.links_counted' % input_xml_file_name,
+ encoding='utf-8', mode='w')
+for n in range(len(items)):
+ output_file.write('%s %d\n' % (items[n][0], items[n][1]))
+output_file.close()