*.pl update from Mad.

author: Chris Ball <cjb@laptop.org> 2008-06-11 02:39:22 (GMT)
committer: Chris Ball <cjb@laptop.org> 2008-06-11 02:39:22 (GMT)
commit: 5ec1d3d4717116db66c3252548a907676533fecb (patch)
tree: 56da1931a109c39cca49b03f4eee64e181de57dd
parent: 1a66b3ac45dda8fd036968518bbc20241eed9933 (diff)
4 files changed, 188 insertions, 34 deletions
diff --git a/tools/GetPages.pl b/tools/GetPages.pl
index e1ac721..39f9c65 100755
--- a/tools/GetPages.pl
+++ b/tools/GetPages.pl
@@ -1,4 +1,10 @@
 #!/usr/bin/perl
+# ./GetPages.pl
+# Create list of pages and redirects for a wikipedia snapshot.
+# Usage: Takes in a wikipedia snapshot as STDIN. First and second
+# arguments give the "page list" and "redirect list" output locations.
+# For example:
+# bzcat ./eswiki-20080416-pages-articles.xml.bz2 | ./GetPages all_pages all_redirects
 
 use strict;
 use warnings;
@@ -6,20 +12,26 @@ use warnings;
 my $pages = $ARGV[0];
 my $redirects = $ARGV[1];
 
+# open output files
 open(PAGES,">$pages") or die;
 open(REDIR,">$redirects") or die;
 
-my %pagerank = ();
+# Keep list of redirects as key, value hash pairs
 my %redirects = ();
-my $currpage = "";
+my $currpage = "";   # will store current page title
+my $inpage = 0;
 while (<STDIN>) {
-  if (/<title>(.*?)<\/title>/) {
+  if (/<title>(.*?)<\/title>/) {   # start of new page!
     my $pagetitle = $1;
-    $pagerank{$1} = 0;
     $currpage = $1;
-    print PAGES "$currpage\n";
+    print PAGES "$currpage\n";          # print page name to page list file
+    $inpage = 0;                        # this should have been zeroed anyway...
   }
-  if (/#REDIRECT:?\s*\[\[\s*(.*?)\s*\]\]/i) {
+  if (/.*?<text xml.*?>(.*)/) {
+    s/.*?<text xml.*?>//;
+    $inpage = 1;        # keep track of when we've entered page text
+  }
+  if ((/#REDIRECT:?\s*\[\[\s*(.*?)\s*\]\]/i) and $inpage) {  # this is a redirect
     my $bracketed = $1;
     if ($bracketed =~ /:Categ/i) {
       $bracketed =~ s/:(Categ)/$1/i;
@@ -28,23 +40,30 @@ while (<STDIN>) {
     if ($bracketed =~ /(.*?)\s*[#\|]/) {
       $redirectpage = $1;
     }
-    $redirectpage = capitalize($redirectpage);
-    $redirectpage =~ s/_/ /g;
+    $redirectpage = capitalize($redirectpage);  # first of page names needs to be capitalized
+    $redirectpage =~ s/_/ /g;                   # don't use underscores
     unless (exists $redirects{$currpage}) {
+      # it turns out some pages have multiple redirects on them;
+      # in wikipedia and in this program the first one listed "wins".
       $redirects{$currpage} = $redirectpage;
+      print REDIR "[[$currpage]]\t[[$redirectpage]]\n";  # print redirect to redirect list file
     }
-    print REDIR "[[$currpage]]\t[[$redirectpage]]\n";
   } 
+  if (/<\/text>/) {                             # out of text
+    $inpage = 0;
+  }
 }
 
+# capitalize: takes in a string, capitalizes the first letter
 sub capitalize {
   my $word = shift;
-  #print "$word to ";
   my $firstletter = substr($word,0,1);
   if ($firstletter =~ /[a-z]/) {
     my $newletter = uc($firstletter);
     substr($word,0,1) = $newletter;
   }
+  # This is kind of a hack. I need to capitalize
+  # the first letters when they're accented. - mad
   unless ($firstletter =~ /[a-zA-Z]/) {
     $firstletter = substr($word,0,2);
     if ($firstletter eq "á") {
diff --git a/tools/PageLinks.pl b/tools/PageLinks.pl
index de430ad..b8df811 100755
--- a/tools/PageLinks.pl
+++ b/tools/PageLinks.pl
@@ -1,14 +1,21 @@
 #!/usr/bin/perl
+# create a file recording the outgoing links for each page
+# each line of the file represents a page and it's links,
+# the first is the page name, all following are articles it links to
+# inputs are page list and redirects list, and wikipedia snapshot as stdin, eg.:
+# bzcat ../../eswiki-20080416-pages-articles.xml.bz2 |  ./PageLinks.pl top70k_pages all_redirects >| top70k_links
+# NOTE - This program is slow to run. It's a little faster if you run with
+# a smaller page list, so you don't bother assessing links for pages already excluded
 
 use strict;
 use warnings;
 
-#my $input = $ARGV[0];
+# input
 my $pages = $ARGV[0];
 my $redirects = $ARGV[1];
 
 my %pagerank = ();
-# read in pages
+# read in pages as keys in a hash
 open (PAGE,$pages);
 while (<PAGE>) {
   chomp;
@@ -17,7 +24,7 @@ while (<PAGE>) {
 close(PAGE);
 
 my %redirects = ();
-# read in redirects
+# read in redirects as key, value pairs in a hash
 open (REDIR,$redirects);
 while (<REDIR>) {
   if (/\[\[(.*?)\]\]\s*\[\[(.*?)\]\]/) {
@@ -32,14 +39,16 @@ my $currpage = "";
 my $intext = 0;
 my $counter = 0;
 while (<STDIN>) {
-  if (/<title>(.*?)<\/title>/) {
+  if (/<title>(.*?)<\/title>/) {        # new page
     $counter++;
     $currpage = $1;
     if (exists $redirects{$currpage}) {
       next;
-    } else {
+    } elsif (exists $pagerank{$currpage}) {
       $currpage =~ s/ /_/g;
       print "$currpage ";
+    } else {
+      next;
     }
     #if (0 == $counter % 100) {
     #  print "Working on ${counter}th page...\n";
@@ -47,7 +56,7 @@ while (<STDIN>) {
     my $intext = 0;
     my $readmore = 1;
     my $pagetext = "";
-    while ($readmore) {
+    while ($readmore) {         # read in all page text at once
       if (/<text xml:space="preserve">(.*)/) {
         $intext = 1;
         $_ = $1;
@@ -58,41 +67,36 @@ while (<STDIN>) {
         $pagetext = $pagetext . $1;
       } elsif ($intext) {
         $pagetext = $pagetext . $_;
-        if (length($pagetext) > 500000) {
-print "FAILPAGE! ";
+        if (length($pagetext) > 300000) {       # hack to hanging on ridiculously large pages
+          #print "FAILPAGE! ";
           last;
         }
       }
       $_ = <STDIN>;
     }
-    # now $pagetext contains all of this page
-    # process pagetext...
-#print "pagetext:\n$pagetext\n";
-    $pagetext =~ s/\n/ <newline> /g;
+    $pagetext =~ s/\n/ <newline> /g;            # to make sure we don't miss multi-line stuff
     $pagetext =~ s/(?<!=\[)\[([^\[\]]*?)\s*\](?!\])//g;  # remove all singleton brackets "links" so I can ignore them
-    $pagetext =~ s/([^\[])\[([^\[])/$1$2/g;  # remove stranded left brackets
-    $pagetext =~ s/([^\]])\]([^\]])/$1$2/g;  # remove stranded right brackets
-#print "processed pagetext:\n$pagetext\n";
+    $pagetext =~ s/([^\[])\[([^\[])/$1$2/g;     # remove stranded left brackets
+    $pagetext =~ s/([^\]])\]([^\]])/$1$2/g;     # remove stranded right brackets
 
+    # now pull out each double bracketed object without brackets inside
     while ($pagetext =~ /^.*?\[\[\s*([^\[\]]*?)\s*\]\]/s) {
       my $match = $1;
-      #print "MATCH $match\n";
       my $link = $match;
-      if (not ($match =~ /^imagen?\W*:/i)) {  # check that it's not an image
-        if ($match =~ /^:Categ/i) {              # remove preceding colon for category links
+      if (not ($match =~ /^imagen?\W*:/i)) {    # check that it's not an image
+        if ($match =~ /^:Categ/i) {             # remove preceding colon for category links
           $match =~ s/^:(Categ)/$1/;
         }
-        if ($match =~ /^(.*?)\s*[#\|]/) {        # check for pipes or subsections
+        if ($match =~ /^(.*?)\s*[#\|]/) {        # check for pipes or subsections and ignore
           $link = $1;
         }
         my $link = capitalize($link);
-        if (exists $pagerank{$link}) {
+        if (exists $pagerank{$link}) {          # only print if it's a known page
           $link =~ s/ /_/g;
           print " $link";
         }
       }
       $pagetext =~ s/^(.*?)\[\[([^\[\]]*?)\s*\]\]/$1/s;  # remove this match from pagetext
-#      print "remaining pagetext:\n$pagetext\n";
     }
     print "\n";
   }
@@ -100,10 +104,29 @@ print "FAILPAGE! ";
 
 sub capitalize {
   my $word = shift;
-  #print "$word to ";
   my $firstletter = substr($word,0,1);
-  my $newletter = uc($firstletter);
-  substr($word,0,1) = $newletter;
+  if ($firstletter =~ /[a-z]/) {
+    my $newletter = uc($firstletter);
+    substr($word,0,1) = $newletter;
+  }
+  # This is kind of a hack. I need to capitalize
+  # the first letters when they're accented. - mad
+  unless ($firstletter =~ /[a-zA-Z]/) {
+    $firstletter = substr($word,0,2);
+    if ($firstletter eq "á") {
+      my $newletter = "Á";
+      substr($word,0,2) = $newletter;
+    } elsif ($firstletter eq "ñ") {
+      my $newletter = "Ñ";
+      substr($word,0,2) = $newletter;
+    } elsif ($firstletter eq "é") {
+      my $newletter = "É";
+      substr($word,0,2) = $newletter;
+    } elsif ($firstletter eq "ó") {
+      my $newletter = "Ó";
+      substr($word,0,2) = $newletter;
+    }
+  }
   #print "$word\n";
   return $word;
 }
diff --git a/tools/RemoveBlacklist.pl b/tools/RemoveBlacklist.pl
new file mode 100755
index 0000000..2c969b6
--- /dev/null
+++ b/tools/RemoveBlacklist.pl
@@ -0,0 +1,40 @@
+#!/usr/bin/perl
+
+my $blacklist = $ARGV[0];
+my $redirects = $ARGV[1];
+my $pages = $ARGV[2];
+
+open(BLACK,$blacklist) or die;
+my %blacklist = ();
+while (<BLACK>) {
+  chomp;
+  $blacklist{$_} = 0;
+}
+close(BLACK);
+
+my %redirects = ();
+open(REDIR,$redirects) or die;
+while (<REDIR>) {
+  if (/\[\[(.*)\]\].*\[\[(.*)\]\]/) {
+    $redirects{$1} = $2;
+  }
+}
+close(REDIR);
+
+open(PAGE,$pages) or die;
+while (<PAGE>) {
+  chomp;
+  my $inblacklist = 0;
+  if (exists $blacklist{$_}) {
+    $inblacklist = 1;
+  }
+  if (exists $redirects{$_}) {
+    if (exists $blacklist{$redirects{$_}}) {
+      $inblacklist = 1;
+    }
+  }
+  unless ($inblacklist) {
+    print "$_\n";
+  }
+}
+
diff --git a/tools/RemoveUnlinked.pl b/tools/RemoveUnlinked.pl
new file mode 100755
index 0000000..2b642bf
--- /dev/null
+++ b/tools/RemoveUnlinked.pl
@@ -0,0 +1,72 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+# remove unlinked pages from a list of pages
+# also remove any matching certain keywords
+# input: page list, redirect list (from GetPages.pl)
+# and page links (from PageLinks.pl)
+
+my $pagelist = $ARGV[0];
+my $redirectlist = $ARGV[1];
+my $pagelinks = $ARGV[2];
+
+# remove pages starting with these keywords
+my @removepages = ("Wikipedia:","Ayuda:","Wikiproyecto:","MediaWiki:","Plantilla:","WP:","Portal:", "Categoría");
+
+my %pagecounts = ();
+open(PAGE,$pagelist) or die;
+while(<PAGE>) {
+  chomp;
+  $pagecounts{$_} = 0;
+}
+close(PAGE);
+
+my %redirects = ();
+open(REDIR,$redirectlist) or die;
+while (<REDIR>) {
+  if (/\[\[(.*?)\]\]\s*\[\[(.*?)\]\]/) {
+    $redirects{$1} = $2;
+  } else {
+    die "redirects line is weird:\n$_\n";
+  }
+}
+close(REDIR);
+
+open(LINKS,$pagelinks);
+while (<LINKS>) {
+  my @data = split;
+  my $currpage = shift(@data);
+  $currpage =~ s/_/ /g;
+  unless (exists ($pagecounts{$currpage})) {
+    next;
+  }
+  foreach my $link (@data) {
+    $link =~ s/_/ /g;
+    if (exists ($redirects{$link})) {
+      if (exists ($pagecounts{$redirects{$link}})) {
+        $pagecounts{$redirects{$link}}++;
+      } else {
+#print "Weird: $link redirects to $redirects{$link}, but this one isn't on the pagelist?\n";
+      }
+    }
+    if (exists $pagecounts{$link}) {
+      $pagecounts{$link}++;
+    } else {
+#print "$link does not exist on pagelist\n";
+    }
+  }
+}
+
+foreach my $page (keys %pagecounts) {
+  my $good = 1;
+  foreach my $remove (@removepages) {
+    if ($page =~ /^$remove/) {
+      $good = 0;
+    }
+  }
+  if (($pagecounts{$page} >= 1) and $good) {
+    print "$page\n";
+  }
+}
author	Chris Ball <cjb@laptop.org>	2008-06-11 02:39:22 (GMT)
committer	Chris Ball <cjb@laptop.org>	2008-06-11 02:39:22 (GMT)
commit	5ec1d3d4717116db66c3252548a907676533fecb (patch)
tree	56da1931a109c39cca49b03f4eee64e181de57dd
parent	1a66b3ac45dda8fd036968518bbc20241eed9933 (diff)