Web   ·   Wiki   ·   Activities   ·   Blog   ·   Lists   ·   Chat   ·   Meeting   ·   Bugs   ·   Git   ·   Translate   ·   Archive   ·   People   ·   Donate
summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorChris Ball <cjb@laptop.org>2008-06-11 02:39:22 (GMT)
committer Chris Ball <cjb@laptop.org>2008-06-11 02:39:22 (GMT)
commit5ec1d3d4717116db66c3252548a907676533fecb (patch)
tree56da1931a109c39cca49b03f4eee64e181de57dd
parent1a66b3ac45dda8fd036968518bbc20241eed9933 (diff)
*.pl update from Mad.
-rwxr-xr-xtools/GetPages.pl39
-rwxr-xr-xtools/PageLinks.pl71
-rwxr-xr-xtools/RemoveBlacklist.pl40
-rwxr-xr-xtools/RemoveUnlinked.pl72
4 files changed, 188 insertions, 34 deletions
diff --git a/tools/GetPages.pl b/tools/GetPages.pl
index e1ac721..39f9c65 100755
--- a/tools/GetPages.pl
+++ b/tools/GetPages.pl
@@ -1,4 +1,10 @@
#!/usr/bin/perl
+# ./GetPages.pl
+# Create list of pages and redirects for a wikipedia snapshot.
+# Usage: Takes in a wikipedia snapshot as STDIN. First and second
+# arguments give the "page list" and "redirect list" output locations.
+# For example:
+# bzcat ./eswiki-20080416-pages-articles.xml.bz2 | ./GetPages all_pages all_redirects
use strict;
use warnings;
@@ -6,20 +12,26 @@ use warnings;
my $pages = $ARGV[0];
my $redirects = $ARGV[1];
+# open output files
open(PAGES,">$pages") or die;
open(REDIR,">$redirects") or die;
-my %pagerank = ();
+# Keep list of redirects as key, value hash pairs
my %redirects = ();
-my $currpage = "";
+my $currpage = ""; # will store current page title
+my $inpage = 0;
while (<STDIN>) {
- if (/<title>(.*?)<\/title>/) {
+ if (/<title>(.*?)<\/title>/) { # start of new page!
my $pagetitle = $1;
- $pagerank{$1} = 0;
$currpage = $1;
- print PAGES "$currpage\n";
+ print PAGES "$currpage\n"; # print page name to page list file
+ $inpage = 0; # this should have been zeroed anyway...
}
- if (/#REDIRECT:?\s*\[\[\s*(.*?)\s*\]\]/i) {
+ if (/.*?<text xml.*?>(.*)/) {
+ s/.*?<text xml.*?>//;
+ $inpage = 1; # keep track of when we've entered page text
+ }
+ if ((/#REDIRECT:?\s*\[\[\s*(.*?)\s*\]\]/i) and $inpage) { # this is a redirect
my $bracketed = $1;
if ($bracketed =~ /:Categ/i) {
$bracketed =~ s/:(Categ)/$1/i;
@@ -28,23 +40,30 @@ while (<STDIN>) {
if ($bracketed =~ /(.*?)\s*[#\|]/) {
$redirectpage = $1;
}
- $redirectpage = capitalize($redirectpage);
- $redirectpage =~ s/_/ /g;
+ $redirectpage = capitalize($redirectpage); # first of page names needs to be capitalized
+ $redirectpage =~ s/_/ /g; # don't use underscores
unless (exists $redirects{$currpage}) {
+ # it turns out some pages have multiple redirects on them;
+ # in wikipedia and in this program the first one listed "wins".
$redirects{$currpage} = $redirectpage;
+ print REDIR "[[$currpage]]\t[[$redirectpage]]\n"; # print redirect to redirect list file
}
- print REDIR "[[$currpage]]\t[[$redirectpage]]\n";
}
+ if (/<\/text>/) { # out of text
+ $inpage = 0;
+ }
}
+# capitalize: takes in a string, capitalizes the first letter
sub capitalize {
my $word = shift;
- #print "$word to ";
my $firstletter = substr($word,0,1);
if ($firstletter =~ /[a-z]/) {
my $newletter = uc($firstletter);
substr($word,0,1) = $newletter;
}
+ # This is kind of a hack. I need to capitalize
+ # the first letters when they're accented. - mad
unless ($firstletter =~ /[a-zA-Z]/) {
$firstletter = substr($word,0,2);
if ($firstletter eq "á") {
diff --git a/tools/PageLinks.pl b/tools/PageLinks.pl
index de430ad..b8df811 100755
--- a/tools/PageLinks.pl
+++ b/tools/PageLinks.pl
@@ -1,14 +1,21 @@
#!/usr/bin/perl
+# create a file recording the outgoing links for each page
+# each line of the file represents a page and it's links,
+# the first is the page name, all following are articles it links to
+# inputs are page list and redirects list, and wikipedia snapshot as stdin, eg.:
+# bzcat ../../eswiki-20080416-pages-articles.xml.bz2 | ./PageLinks.pl top70k_pages all_redirects >| top70k_links
+# NOTE - This program is slow to run. It's a little faster if you run with
+# a smaller page list, so you don't bother assessing links for pages already excluded
use strict;
use warnings;
-#my $input = $ARGV[0];
+# input
my $pages = $ARGV[0];
my $redirects = $ARGV[1];
my %pagerank = ();
-# read in pages
+# read in pages as keys in a hash
open (PAGE,$pages);
while (<PAGE>) {
chomp;
@@ -17,7 +24,7 @@ while (<PAGE>) {
close(PAGE);
my %redirects = ();
-# read in redirects
+# read in redirects as key, value pairs in a hash
open (REDIR,$redirects);
while (<REDIR>) {
if (/\[\[(.*?)\]\]\s*\[\[(.*?)\]\]/) {
@@ -32,14 +39,16 @@ my $currpage = "";
my $intext = 0;
my $counter = 0;
while (<STDIN>) {
- if (/<title>(.*?)<\/title>/) {
+ if (/<title>(.*?)<\/title>/) { # new page
$counter++;
$currpage = $1;
if (exists $redirects{$currpage}) {
next;
- } else {
+ } elsif (exists $pagerank{$currpage}) {
$currpage =~ s/ /_/g;
print "$currpage ";
+ } else {
+ next;
}
#if (0 == $counter % 100) {
# print "Working on ${counter}th page...\n";
@@ -47,7 +56,7 @@ while (<STDIN>) {
my $intext = 0;
my $readmore = 1;
my $pagetext = "";
- while ($readmore) {
+ while ($readmore) { # read in all page text at once
if (/<text xml:space="preserve">(.*)/) {
$intext = 1;
$_ = $1;
@@ -58,41 +67,36 @@ while (<STDIN>) {
$pagetext = $pagetext . $1;
} elsif ($intext) {
$pagetext = $pagetext . $_;
- if (length($pagetext) > 500000) {
-print "FAILPAGE! ";
+ if (length($pagetext) > 300000) { # hack to hanging on ridiculously large pages
+ #print "FAILPAGE! ";
last;
}
}
$_ = <STDIN>;
}
- # now $pagetext contains all of this page
- # process pagetext...
-#print "pagetext:\n$pagetext\n";
- $pagetext =~ s/\n/ <newline> /g;
+ $pagetext =~ s/\n/ <newline> /g; # to make sure we don't miss multi-line stuff
$pagetext =~ s/(?<!=\[)\[([^\[\]]*?)\s*\](?!\])//g; # remove all singleton brackets "links" so I can ignore them
- $pagetext =~ s/([^\[])\[([^\[])/$1$2/g; # remove stranded left brackets
- $pagetext =~ s/([^\]])\]([^\]])/$1$2/g; # remove stranded right brackets
-#print "processed pagetext:\n$pagetext\n";
+ $pagetext =~ s/([^\[])\[([^\[])/$1$2/g; # remove stranded left brackets
+ $pagetext =~ s/([^\]])\]([^\]])/$1$2/g; # remove stranded right brackets
+ # now pull out each double bracketed object without brackets inside
while ($pagetext =~ /^.*?\[\[\s*([^\[\]]*?)\s*\]\]/s) {
my $match = $1;
- #print "MATCH $match\n";
my $link = $match;
- if (not ($match =~ /^imagen?\W*:/i)) { # check that it's not an image
- if ($match =~ /^:Categ/i) { # remove preceding colon for category links
+ if (not ($match =~ /^imagen?\W*:/i)) { # check that it's not an image
+ if ($match =~ /^:Categ/i) { # remove preceding colon for category links
$match =~ s/^:(Categ)/$1/;
}
- if ($match =~ /^(.*?)\s*[#\|]/) { # check for pipes or subsections
+ if ($match =~ /^(.*?)\s*[#\|]/) { # check for pipes or subsections and ignore
$link = $1;
}
my $link = capitalize($link);
- if (exists $pagerank{$link}) {
+ if (exists $pagerank{$link}) { # only print if it's a known page
$link =~ s/ /_/g;
print " $link";
}
}
$pagetext =~ s/^(.*?)\[\[([^\[\]]*?)\s*\]\]/$1/s; # remove this match from pagetext
-# print "remaining pagetext:\n$pagetext\n";
}
print "\n";
}
@@ -100,10 +104,29 @@ print "FAILPAGE! ";
sub capitalize {
my $word = shift;
- #print "$word to ";
my $firstletter = substr($word,0,1);
- my $newletter = uc($firstletter);
- substr($word,0,1) = $newletter;
+ if ($firstletter =~ /[a-z]/) {
+ my $newletter = uc($firstletter);
+ substr($word,0,1) = $newletter;
+ }
+ # This is kind of a hack. I need to capitalize
+ # the first letters when they're accented. - mad
+ unless ($firstletter =~ /[a-zA-Z]/) {
+ $firstletter = substr($word,0,2);
+ if ($firstletter eq "á") {
+ my $newletter = "Á";
+ substr($word,0,2) = $newletter;
+ } elsif ($firstletter eq "ñ") {
+ my $newletter = "Ñ";
+ substr($word,0,2) = $newletter;
+ } elsif ($firstletter eq "é") {
+ my $newletter = "É";
+ substr($word,0,2) = $newletter;
+ } elsif ($firstletter eq "ó") {
+ my $newletter = "Ó";
+ substr($word,0,2) = $newletter;
+ }
+ }
#print "$word\n";
return $word;
}
diff --git a/tools/RemoveBlacklist.pl b/tools/RemoveBlacklist.pl
new file mode 100755
index 0000000..2c969b6
--- /dev/null
+++ b/tools/RemoveBlacklist.pl
@@ -0,0 +1,40 @@
+#!/usr/bin/perl
+
+my $blacklist = $ARGV[0];
+my $redirects = $ARGV[1];
+my $pages = $ARGV[2];
+
+open(BLACK,$blacklist) or die;
+my %blacklist = ();
+while (<BLACK>) {
+ chomp;
+ $blacklist{$_} = 0;
+}
+close(BLACK);
+
+my %redirects = ();
+open(REDIR,$redirects) or die;
+while (<REDIR>) {
+ if (/\[\[(.*)\]\].*\[\[(.*)\]\]/) {
+ $redirects{$1} = $2;
+ }
+}
+close(REDIR);
+
+open(PAGE,$pages) or die;
+while (<PAGE>) {
+ chomp;
+ my $inblacklist = 0;
+ if (exists $blacklist{$_}) {
+ $inblacklist = 1;
+ }
+ if (exists $redirects{$_}) {
+ if (exists $blacklist{$redirects{$_}}) {
+ $inblacklist = 1;
+ }
+ }
+ unless ($inblacklist) {
+ print "$_\n";
+ }
+}
+
diff --git a/tools/RemoveUnlinked.pl b/tools/RemoveUnlinked.pl
new file mode 100755
index 0000000..2b642bf
--- /dev/null
+++ b/tools/RemoveUnlinked.pl
@@ -0,0 +1,72 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+# remove unlinked pages from a list of pages
+# also remove any matching certain keywords
+# input: page list, redirect list (from GetPages.pl)
+# and page links (from PageLinks.pl)
+
+my $pagelist = $ARGV[0];
+my $redirectlist = $ARGV[1];
+my $pagelinks = $ARGV[2];
+
+# remove pages starting with these keywords
+my @removepages = ("Wikipedia:","Ayuda:","Wikiproyecto:","MediaWiki:","Plantilla:","WP:","Portal:", "Categoría");
+
+my %pagecounts = ();
+open(PAGE,$pagelist) or die;
+while(<PAGE>) {
+ chomp;
+ $pagecounts{$_} = 0;
+}
+close(PAGE);
+
+my %redirects = ();
+open(REDIR,$redirectlist) or die;
+while (<REDIR>) {
+ if (/\[\[(.*?)\]\]\s*\[\[(.*?)\]\]/) {
+ $redirects{$1} = $2;
+ } else {
+ die "redirects line is weird:\n$_\n";
+ }
+}
+close(REDIR);
+
+open(LINKS,$pagelinks);
+while (<LINKS>) {
+ my @data = split;
+ my $currpage = shift(@data);
+ $currpage =~ s/_/ /g;
+ unless (exists ($pagecounts{$currpage})) {
+ next;
+ }
+ foreach my $link (@data) {
+ $link =~ s/_/ /g;
+ if (exists ($redirects{$link})) {
+ if (exists ($pagecounts{$redirects{$link}})) {
+ $pagecounts{$redirects{$link}}++;
+ } else {
+#print "Weird: $link redirects to $redirects{$link}, but this one isn't on the pagelist?\n";
+ }
+ }
+ if (exists $pagecounts{$link}) {
+ $pagecounts{$link}++;
+ } else {
+#print "$link does not exist on pagelist\n";
+ }
+ }
+}
+
+foreach my $page (keys %pagecounts) {
+ my $good = 1;
+ foreach my $remove (@removepages) {
+ if ($page =~ /^$remove/) {
+ $good = 0;
+ }
+ }
+ if (($pagecounts{$page} >= 1) and $good) {
+ print "$page\n";
+ }
+}