#!/usr/bin/env python3
import mwapi
from tempfile import mktemp
import xml.etree.ElementTree as ET
import os
import urllib.parse
host = "http://pe.sugarlabs.org/" # can be any mediawiki,
api_path = "wiki/api.php" # just point to its API
real_users = ["Sebastian", "Jclema", "Raul_Hugo", "Kikomayorga",
              "Kaametza", "Kokecontreras", "Bernie", "Tuukah", "Acaire",
              "Kaisi", "Ignacio_Rodríguez", "Laura_Vargas", "Lwong",
              "Michael", "Raulhugo", "Cjl"]
blacklist = (["Wiki Clean Up"]
             + ["Wiki Clean Up/Batch %02d" % i for i in range(1, 21)])
pages = mwapi.MWApi(host, api_path)
already_crawled = []

def crawl_wikipage(title):
    # Download this page once, then recurse into every page it links to.
    if (title not in already_crawled) and (title not in blacklist):
        download_wikitext(title)
        print("\tFound %s" % title)
        already_crawled.append(title)
    else:
        return
    result = pages.get(action="query",
                       titles=title,
                       prop="links",
                       pllimit=500)
    pageid, result = result['query']['pages'].popitem()
    if 'links' in result:
        for link in result['links']:
            crawl_wikipage(link['title'])

def crawl_wiki_contributions(user):
    # Seed the crawl with every main-namespace (ns 0) page the user has edited.
    print("Crawling %s's contributions:" % user)
    result = pages.get(action="query",
                       list="usercontribs",
                       ucuser=user,
                       uclimit=500)
    for usercontrib in result['query']['usercontribs']:
        if usercontrib['ns'] == 0:
            crawl_wikipage(usercontrib['title'])

def download_wikitext(title):
    # Export the page through the API and pull the <text> element out of
    # the MediaWiki XML dump it returns.
    print("Fetching " + title)
    result = pages.get(action="query",
                       titles=title,
                       export=True)
    wikipage = ET.fromstring(result['query']['export']['*'])
    wikitext = ''
    for i in wikipage:
        for j in i:
            for k in j:
                if 'text' in k.tag:
                    wikitext = k.text or ''  # empty pages export an empty <text>
    # Save the wikitext under a path that mirrors the page title, so that
    # subpages ("Foo/Bar") become files inside directories.
    title = urllib.parse.quote(title, safe="/ ")
    dirname = os.path.dirname(title)
    if dirname:
        if os.path.isfile(dirname):
            # A page was already saved where a directory is now needed:
            # move the file aside, create the directory, then keep the
            # old page as the directory's "Index".
            parent = os.path.dirname(dirname)
            temp_name = os.path.basename(mktemp())
            os.rename(dirname, os.path.join(parent, temp_name))
            os.makedirs(dirname, exist_ok=True)
            os.rename(os.path.join(parent, temp_name),
                      os.path.join(dirname, "Index"))
        os.makedirs(dirname, exist_ok=True)
    if os.path.isdir(title):
        # can't have files named like dirs
        title = os.path.join(title, "Index")
    with open(title, "w+") as f:
        f.write(wikitext)

for user in real_users:
    crawl_wiki_contributions(user)