Web   ·   Wiki   ·   Activities   ·   Blog   ·   Lists   ·   Chat   ·   Meeting   ·   Bugs   ·   Git   ·   Translate   ·   Archive   ·   People   ·   Donate
summaryrefslogtreecommitdiffstats
path: root/tools/GetPageCounts.pl
blob: 5b29f9e3a75101b3994cf7f08f46e5940286cadb (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#!/usr/bin/perl

use URI::Escape;
use warnings;
use strict;

# using the traffic stats, get pagecounts for articles
# count pagecounts of redirect pages towards the main page,
# report counts for redirect as the counts of their main page

# Command-line arguments (all three are required):
#   1. page list     - generated from GetPages.pl
#   2. redirect list - generated from GetPages.pl
#   3. traffic stats - from file provided by Henrik
# Fail early with a usage message instead of letting a later open() die
# without explanation on an undefined file name.
die "Usage: $0 <pagelist> <redirectlist> <trafficlist>\n" if @ARGV < 3;
my ($pagelist, $redirectlist, $trafficlist) = @ARGV;

# Read in the page list and store every line (minus its newline) as a key in
# the "page counts" hash.  Values are initially zero; traffic numbers are
# added to them further down.
# Three-argument open with a lexical handle keeps characters such as '<',
# '>' or '|' in the file name from being interpreted as an open mode, and
# the die message reports which file failed and why.
open(my $pages_fh, '<', $pagelist)
  or die "Cannot open page list '$pagelist': $!\n";
my %pagecounts = ();
while (my $title = <$pages_fh>) {
  chomp $title;
  $pagecounts{$title} = 0;
}
close($pages_fh);

# Read in the redirect list and store it as redirect => target pairs.
# A line is used only if it contains two bracketed titles, e.g.
#   [[Redirect title]]   [[Target title]]
# Lines that do not match are ignored.
# Three-argument open with a lexical handle keeps special characters in the
# file name from being interpreted as an open mode.
open(my $redirects_fh, '<', $redirectlist)
  or die "Cannot open redirect list '$redirectlist': $!\n";
my %redirects = ();
while (my $line = <$redirects_fh>) {
  if ($line =~ /\[\[(.*)\]\]\s*\[\[(.*)\]\]/) {
    $redirects{$1} = $2;
  }
}
close($redirects_fh);

# Read the traffic stats.  Each line is split on whitespace; field 1 is used
# as the URI-escaped page title and field 2 as the count to add.
open(my $traffic_fh, '<', $trafficlist)
  or die "Cannot open traffic stats '$trafficlist': $!\n";
while (my $line = <$traffic_fh>) {
  my @data = split ' ', $line;
  # Blank or truncated lines have no title/count to use; skipping them
  # avoids feeding undef into uri_unescape and the additions below.
  next if @data < 3;
  my $count = $data[2];
  my $page = uri_unescape($data[1]);    # pages need to be unescaped
  $page =~ s/_/ /g;                     # and underscores converted to spaces
  # If the page is a redirect, also add the count towards its main page
  # (only when that main page is on the page list).
  if (exists $redirects{$page}) {
    my $target = $redirects{$page};
    $pagecounts{$target} += $count if exists $pagecounts{$target};
  }
  # Add the count to the page itself; titles that are not on the page list
  # are silently ignored.
  $pagecounts{$page} += $count if exists $pagecounts{$page};
}
close($traffic_fh);

# Print one "<count>\t[[<title>]]" line per page on the page list, in
# whatever order the hash yields its keys.
for my $title (keys %pagecounts) {
  my $counted_as = $title;
  if (exists $redirects{$title}) {
    # A redirect reports the traffic score of its target page; it is left
    # out of the output when the target is not on the page list.
    $counted_as = $redirects{$title};
    next unless exists $pagecounts{$counted_as};
  }
  print "$pagecounts{$counted_as}\t[[$title]]\n";
}