path: root/crawl_mediawiki.py
#!/usr/bin/env python3

import os
import urllib.parse
import xml.etree.ElementTree as ET
from tempfile import mktemp

import mwapi

host = "http://pe.sugarlabs.org/" # can be any mediawiki,
api_path = "wiki/api.php"         # just point to its API
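# e.g., for English Wikipedia these would presumably be (untested here):
#   host = "https://en.wikipedia.org/"
#   api_path = "w/api.php"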
real_users = ["Sebastian", "Jclema", "Raul_Hugo", "Kikomayorga",
                "Kaametza", "Kokecontreras", "Bernie", "Tuukah", "Acaire",
                "Kaisi", "Ignacio_Rodríguez", "Laura_Vargas", "Lwong",
                "Michael", "Raulhugo", "Cjl"]

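# page titles that crawl_wikipage() must never fetch or save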
blacklist = [ "Wiki Clean Up",
              "Wiki Clean Up/Batch 01",
              "Wiki Clean Up/Batch 02",
              "Wiki Clean Up/Batch 03",
              "Wiki Clean Up/Batch 04",
              "Wiki Clean Up/Batch 05",
              "Wiki Clean Up/Batch 06",
              "Wiki Clean Up/Batch 07",
              "Wiki Clean Up/Batch 08",
              "Wiki Clean Up/Batch 09",
              "Wiki Clean Up/Batch 10",
              "Wiki Clean Up/Batch 11",
              "Wiki Clean Up/Batch 12",
              "Wiki Clean Up/Batch 13",
              "Wiki Clean Up/Batch 14",
              "Wiki Clean Up/Batch 15",
              "Wiki Clean Up/Batch 16",
              "Wiki Clean Up/Batch 17",
              "Wiki Clean Up/Batch 18",
              "Wiki Clean Up/Batch 19",
              "Wiki Clean Up/Batch 20" ]


pages = mwapi.MWApi(host, api_path)
already_crawled = []

def crawl_wikipage(title):
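    """Save a page's wikitext, then recursively follow its outgoing links.

    Already-crawled and blacklisted titles are skipped, so the recursion
    stops once everything reachable has been visited.
    """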

    if title in already_crawled or title in blacklist:
        return

    download_wikitext(title)
    print("\tFound %s" % title)
    already_crawled.append(title)

    result = pages.get(action="query",
                       titles=title,
                       prop="links",
                       pllimit=500)
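    # result['query']['pages'] is keyed by pageid; only one title was queried,
    # so popitem() below yields that single page, whose 'links' entries (if
    # any) each carry the linked page's 'title'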

    pageid, page = result['query']['pages'].popitem()

    if 'links' in page:
        for link in page['links']:
            crawl_wikipage(link['title'])

def crawl_wiki_contributions(user):
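    """Start a crawl from every main-namespace page the user has edited."""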
    print ("Crawling %s's contributions:" % user)

    result = pages.get ( action="query",
                list="usercontribs",
                ucuser=user,
                uclimit=500)

    for usercontrib in result['query']['usercontribs']:
        if usercontrib['ns']==0:
            crawl_wikipage(usercontrib['title'])


def download_wikitext(title):
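    """Fetch a page via the export API and write its wikitext to disk.

    The file path mirrors the page title: subpage titles such as "Foo/Bar"
    become nested directories, and a page whose title is also a directory
    is stored inside it as "Index".
    """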
    print ("Fetching "+title)
    result = pages.get ( action="query",
                titles=title,
                export=True)
    
    wikipage = ET.fromstring(result['query']['export']['*'])
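    # the export payload is XML shaped like <mediawiki><page><revision><text>;
    # the nested loop below walks three levels down to the (namespaced) <text>
    # element and keeps its content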

    wikitext = ''
    for page in wikipage:
        for revision in page:
            for field in revision:
                if 'text' in field.tag:
                    wikitext = field.text

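    # percent-encode characters that are awkward in file names, but keep "/"
    # (the subpage separator) and spaces so the on-disk tree mirrors the wiki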
    title = urllib.parse.quote(title, safe="/ ")
    dirname = os.path.dirname(title)
    if dirname:
        if os.path.isfile(dirname):
            # a page with the directory's name was already saved as a plain
            # file: move it aside, create the directory, then keep the old
            # page inside it as "Index"
            parent = os.path.dirname(dirname)
            temp_name = os.path.basename(mktemp())  # used only as a unique name
            os.rename(dirname, os.path.join(parent, temp_name))
            os.makedirs(dirname, exist_ok=True)
            os.rename(os.path.join(parent, temp_name),
                      os.path.join(dirname, "Index"))

        os.makedirs(dirname, exist_ok=True)
    
    if os.path.isdir(title):
        # a file can't share its name with an existing directory,
        # so store this page as that directory's "Index"
        title = os.path.join(title, "Index")

    with open(title, "w+") as f:
        f.write(wikitext)

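# entry point: crawl each listed user's contributions and every page
# reachable from them by following links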
for user in real_users:
    crawl_wiki_contributions(user)