diff options
author | Chris Ball <cjb@laptop.org> | 2008-06-11 02:39:22 (GMT) |
---|---|---|
committer | Chris Ball <cjb@laptop.org> | 2008-06-11 02:39:22 (GMT) |
commit | 5ec1d3d4717116db66c3252548a907676533fecb (patch) | |
tree | 56da1931a109c39cca49b03f4eee64e181de57dd | |
parent | 1a66b3ac45dda8fd036968518bbc20241eed9933 (diff) |
*.pl update from Mad.
-rwxr-xr-x | tools/GetPages.pl | 39 | ||||
-rwxr-xr-x | tools/PageLinks.pl | 71 | ||||
-rwxr-xr-x | tools/RemoveBlacklist.pl | 40 | ||||
-rwxr-xr-x | tools/RemoveUnlinked.pl | 72 |
4 files changed, 188 insertions, 34 deletions
diff --git a/tools/GetPages.pl b/tools/GetPages.pl index e1ac721..39f9c65 100755 --- a/tools/GetPages.pl +++ b/tools/GetPages.pl @@ -1,4 +1,10 @@ #!/usr/bin/perl +# ./GetPages.pl +# Create list of pages and redirects for a wikipedia snapshot. +# Usage: Takes in a wikipedia snapshot as STDIN. First and second +# arguments give the "page list" and "redirect list" output locations. +# For example: +# bzcat ./eswiki-20080416-pages-articles.xml.bz2 | ./GetPages all_pages all_redirects use strict; use warnings; @@ -6,20 +12,26 @@ use warnings; my $pages = $ARGV[0]; my $redirects = $ARGV[1]; +# open output files open(PAGES,">$pages") or die; open(REDIR,">$redirects") or die; -my %pagerank = (); +# Keep list of redirects as key, value hash pairs my %redirects = (); -my $currpage = ""; +my $currpage = ""; # will store current page title +my $inpage = 0; while (<STDIN>) { - if (/<title>(.*?)<\/title>/) { + if (/<title>(.*?)<\/title>/) { # start of new page! my $pagetitle = $1; - $pagerank{$1} = 0; $currpage = $1; - print PAGES "$currpage\n"; + print PAGES "$currpage\n"; # print page name to page list file + $inpage = 0; # this should have been zeroed anyway... } - if (/#REDIRECT:?\s*\[\[\s*(.*?)\s*\]\]/i) { + if (/.*?<text xml.*?>(.*)/) { + s/.*?<text xml.*?>//; + $inpage = 1; # keep track of when we've entered page text + } + if ((/#REDIRECT:?\s*\[\[\s*(.*?)\s*\]\]/i) and $inpage) { # this is a redirect my $bracketed = $1; if ($bracketed =~ /:Categ/i) { $bracketed =~ s/:(Categ)/$1/i; @@ -28,23 +40,30 @@ while (<STDIN>) { if ($bracketed =~ /(.*?)\s*[#\|]/) { $redirectpage = $1; } - $redirectpage = capitalize($redirectpage); - $redirectpage =~ s/_/ /g; + $redirectpage = capitalize($redirectpage); # first of page names needs to be capitalized + $redirectpage =~ s/_/ /g; # don't use underscores unless (exists $redirects{$currpage}) { + # it turns out some pages have multiple redirects on them; + # in wikipedia and in this program the first one listed "wins". $redirects{$currpage} = $redirectpage; + print REDIR "[[$currpage]]\t[[$redirectpage]]\n"; # print redirect to redirect list file } - print REDIR "[[$currpage]]\t[[$redirectpage]]\n"; } + if (/<\/text>/) { # out of text + $inpage = 0; + } } +# capitalize: takes in a string, capitalizes the first letter sub capitalize { my $word = shift; - #print "$word to "; my $firstletter = substr($word,0,1); if ($firstletter =~ /[a-z]/) { my $newletter = uc($firstletter); substr($word,0,1) = $newletter; } + # This is kind of a hack. I need to capitalize + # the first letters when they're accented. - mad unless ($firstletter =~ /[a-zA-Z]/) { $firstletter = substr($word,0,2); if ($firstletter eq "á") { diff --git a/tools/PageLinks.pl b/tools/PageLinks.pl index de430ad..b8df811 100755 --- a/tools/PageLinks.pl +++ b/tools/PageLinks.pl @@ -1,14 +1,21 @@ #!/usr/bin/perl +# create a file recording the outgoing links for each page +# each line of the file represents a page and it's links, +# the first is the page name, all following are articles it links to +# inputs are page list and redirects list, and wikipedia snapshot as stdin, eg.: +# bzcat ../../eswiki-20080416-pages-articles.xml.bz2 | ./PageLinks.pl top70k_pages all_redirects >| top70k_links +# NOTE - This program is slow to run. It's a little faster if you run with +# a smaller page list, so you don't bother assessing links for pages already excluded use strict; use warnings; -#my $input = $ARGV[0]; +# input my $pages = $ARGV[0]; my $redirects = $ARGV[1]; my %pagerank = (); -# read in pages +# read in pages as keys in a hash open (PAGE,$pages); while (<PAGE>) { chomp; @@ -17,7 +24,7 @@ while (<PAGE>) { close(PAGE); my %redirects = (); -# read in redirects +# read in redirects as key, value pairs in a hash open (REDIR,$redirects); while (<REDIR>) { if (/\[\[(.*?)\]\]\s*\[\[(.*?)\]\]/) { @@ -32,14 +39,16 @@ my $currpage = ""; my $intext = 0; my $counter = 0; while (<STDIN>) { - if (/<title>(.*?)<\/title>/) { + if (/<title>(.*?)<\/title>/) { # new page $counter++; $currpage = $1; if (exists $redirects{$currpage}) { next; - } else { + } elsif (exists $pagerank{$currpage}) { $currpage =~ s/ /_/g; print "$currpage "; + } else { + next; } #if (0 == $counter % 100) { # print "Working on ${counter}th page...\n"; @@ -47,7 +56,7 @@ while (<STDIN>) { my $intext = 0; my $readmore = 1; my $pagetext = ""; - while ($readmore) { + while ($readmore) { # read in all page text at once if (/<text xml:space="preserve">(.*)/) { $intext = 1; $_ = $1; @@ -58,41 +67,36 @@ while (<STDIN>) { $pagetext = $pagetext . $1; } elsif ($intext) { $pagetext = $pagetext . $_; - if (length($pagetext) > 500000) { -print "FAILPAGE! "; + if (length($pagetext) > 300000) { # hack to hanging on ridiculously large pages + #print "FAILPAGE! "; last; } } $_ = <STDIN>; } - # now $pagetext contains all of this page - # process pagetext... -#print "pagetext:\n$pagetext\n"; - $pagetext =~ s/\n/ <newline> /g; + $pagetext =~ s/\n/ <newline> /g; # to make sure we don't miss multi-line stuff $pagetext =~ s/(?<!=\[)\[([^\[\]]*?)\s*\](?!\])//g; # remove all singleton brackets "links" so I can ignore them - $pagetext =~ s/([^\[])\[([^\[])/$1$2/g; # remove stranded left brackets - $pagetext =~ s/([^\]])\]([^\]])/$1$2/g; # remove stranded right brackets -#print "processed pagetext:\n$pagetext\n"; + $pagetext =~ s/([^\[])\[([^\[])/$1$2/g; # remove stranded left brackets + $pagetext =~ s/([^\]])\]([^\]])/$1$2/g; # remove stranded right brackets + # now pull out each double bracketed object without brackets inside while ($pagetext =~ /^.*?\[\[\s*([^\[\]]*?)\s*\]\]/s) { my $match = $1; - #print "MATCH $match\n"; my $link = $match; - if (not ($match =~ /^imagen?\W*:/i)) { # check that it's not an image - if ($match =~ /^:Categ/i) { # remove preceding colon for category links + if (not ($match =~ /^imagen?\W*:/i)) { # check that it's not an image + if ($match =~ /^:Categ/i) { # remove preceding colon for category links $match =~ s/^:(Categ)/$1/; } - if ($match =~ /^(.*?)\s*[#\|]/) { # check for pipes or subsections + if ($match =~ /^(.*?)\s*[#\|]/) { # check for pipes or subsections and ignore $link = $1; } my $link = capitalize($link); - if (exists $pagerank{$link}) { + if (exists $pagerank{$link}) { # only print if it's a known page $link =~ s/ /_/g; print " $link"; } } $pagetext =~ s/^(.*?)\[\[([^\[\]]*?)\s*\]\]/$1/s; # remove this match from pagetext -# print "remaining pagetext:\n$pagetext\n"; } print "\n"; } @@ -100,10 +104,29 @@ print "FAILPAGE! "; sub capitalize { my $word = shift; - #print "$word to "; my $firstletter = substr($word,0,1); - my $newletter = uc($firstletter); - substr($word,0,1) = $newletter; + if ($firstletter =~ /[a-z]/) { + my $newletter = uc($firstletter); + substr($word,0,1) = $newletter; + } + # This is kind of a hack. I need to capitalize + # the first letters when they're accented. - mad + unless ($firstletter =~ /[a-zA-Z]/) { + $firstletter = substr($word,0,2); + if ($firstletter eq "á") { + my $newletter = "Á"; + substr($word,0,2) = $newletter; + } elsif ($firstletter eq "ñ") { + my $newletter = "Ñ"; + substr($word,0,2) = $newletter; + } elsif ($firstletter eq "é") { + my $newletter = "É"; + substr($word,0,2) = $newletter; + } elsif ($firstletter eq "ó") { + my $newletter = "Ó"; + substr($word,0,2) = $newletter; + } + } #print "$word\n"; return $word; } diff --git a/tools/RemoveBlacklist.pl b/tools/RemoveBlacklist.pl new file mode 100755 index 0000000..2c969b6 --- /dev/null +++ b/tools/RemoveBlacklist.pl @@ -0,0 +1,40 @@ +#!/usr/bin/perl + +my $blacklist = $ARGV[0]; +my $redirects = $ARGV[1]; +my $pages = $ARGV[2]; + +open(BLACK,$blacklist) or die; +my %blacklist = (); +while (<BLACK>) { + chomp; + $blacklist{$_} = 0; +} +close(BLACK); + +my %redirects = (); +open(REDIR,$redirects) or die; +while (<REDIR>) { + if (/\[\[(.*)\]\].*\[\[(.*)\]\]/) { + $redirects{$1} = $2; + } +} +close(REDIR); + +open(PAGE,$pages) or die; +while (<PAGE>) { + chomp; + my $inblacklist = 0; + if (exists $blacklist{$_}) { + $inblacklist = 1; + } + if (exists $redirects{$_}) { + if (exists $blacklist{$redirects{$_}}) { + $inblacklist = 1; + } + } + unless ($inblacklist) { + print "$_\n"; + } +} + diff --git a/tools/RemoveUnlinked.pl b/tools/RemoveUnlinked.pl new file mode 100755 index 0000000..2b642bf --- /dev/null +++ b/tools/RemoveUnlinked.pl @@ -0,0 +1,72 @@ +#!/usr/bin/perl + +use strict; +use warnings; + +# remove unlinked pages from a list of pages +# also remove any matching certain keywords +# input: page list, redirect list (from GetPages.pl) +# and page links (from PageLinks.pl) + +my $pagelist = $ARGV[0]; +my $redirectlist = $ARGV[1]; +my $pagelinks = $ARGV[2]; + +# remove pages starting with these keywords +my @removepages = ("Wikipedia:","Ayuda:","Wikiproyecto:","MediaWiki:","Plantilla:","WP:","Portal:", "Categoría"); + +my %pagecounts = (); +open(PAGE,$pagelist) or die; +while(<PAGE>) { + chomp; + $pagecounts{$_} = 0; +} +close(PAGE); + +my %redirects = (); +open(REDIR,$redirectlist) or die; +while (<REDIR>) { + if (/\[\[(.*?)\]\]\s*\[\[(.*?)\]\]/) { + $redirects{$1} = $2; + } else { + die "redirects line is weird:\n$_\n"; + } +} +close(REDIR); + +open(LINKS,$pagelinks); +while (<LINKS>) { + my @data = split; + my $currpage = shift(@data); + $currpage =~ s/_/ /g; + unless (exists ($pagecounts{$currpage})) { + next; + } + foreach my $link (@data) { + $link =~ s/_/ /g; + if (exists ($redirects{$link})) { + if (exists ($pagecounts{$redirects{$link}})) { + $pagecounts{$redirects{$link}}++; + } else { +#print "Weird: $link redirects to $redirects{$link}, but this one isn't on the pagelist?\n"; + } + } + if (exists $pagecounts{$link}) { + $pagecounts{$link}++; + } else { +#print "$link does not exist on pagelist\n"; + } + } +} + +foreach my $page (keys %pagecounts) { + my $good = 1; + foreach my $remove (@removepages) { + if ($page =~ /^$remove/) { + $good = 0; + } + } + if (($pagecounts{$page} >= 1) and $good) { + print "$page\n"; + } +} |