Web   ·   Wiki   ·   Activities   ·   Blog   ·   Lists   ·   Chat   ·   Meeting   ·   Bugs   ·   Git   ·   Translate   ·   Archive   ·   People   ·   Donate
summaryrefslogtreecommitdiffstats
path: root/tools2/create_low_rank_pages.py
diff options
context:
space:
mode:
Diffstat (limited to 'tools2/create_low_rank_pages.py')
-rwxr-xr-xtools2/create_low_rank_pages.py44
1 files changed, 44 insertions, 0 deletions
diff --git a/tools2/create_low_rank_pages.py b/tools2/create_low_rank_pages.py
new file mode 100755
index 0000000..2227e99
--- /dev/null
+++ b/tools2/create_low_rank_pages.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Create a file with a list of pages with less than N links pointing to them.
+# By default N=5 but can be configured with the param --min_cant_links
+
+import codecs
+import sys
+from make_selection import FileListReader
+import config
+
+if __name__ == '__main__':
+
+ input_xml_file_name = config.input_xml_file_name
+
+ min_cant_links = 5
+ if len(sys.argv) > 1:
+ if sys.argv[1] == '--min_cant_links':
+ min_cant_links = int(sys.argv[2])
+
+ print "Adding articles with less than %d links" % min_cant_links
+
+ # Read favorites list
+ favorites_reader = FileListReader(config.favorites_file_name)
+
+ ranking_file = codecs.open('%s.links_counted' % input_xml_file_name,
+ encoding='utf-8', mode='r')
+
+ print "Writing low_rank_pages file"
+ output_file = codecs.open('%s.low_rank_pages' % input_xml_file_name,
+ encoding='utf-8', mode='w')
+
+ line = ranking_file.readline()
+ while line:
+ parts = line.split()
+ article = parts[0]
+ cant_links = int(parts[1])
+ if cant_links < min_cant_links and \
+ article not in favorites_reader.list:
+ output_file.write('%s\n' % article)
+
+ line = ranking_file.readline()
+
+ output_file.close()
+ ranking_file.close()