author     Sebastian Silva <sebastian@somosazucar.org>  2013-08-25 05:04:06 (GMT)
committer  Sebastian Silva <sebastian@somosazucar.org>  2013-08-25 05:04:06 (GMT)
commit     f2edbb89e24bf542178fa830728d141f1565c27d (patch)
tree       cf05d98ed67d40e5416f3f67e70393d02e080639
parent     56e8aee3ceff6460995cd165c91af3cc0cdf0707 (diff)
Used for migrating pe.sugarlabs.org/go to pe.sugarlabs.org/ir (HEAD, master)
-rw-r--r--  crawl_mediawiki.py  111
1 file changed, 111 insertions, 0 deletions
diff --git a/crawl_mediawiki.py b/crawl_mediawiki.py
new file mode 100644
index 0000000..987f753
--- /dev/null
+++ b/crawl_mediawiki.py
@@ -0,0 +1,111 @@
+#!/usr/bin/env python3
+
+import mwapi
+from tempfile import mktemp
+import xml.etree.ElementTree as ET
+import os
+import urllib.parse
+
+host = "http://pe.sugarlabs.org/"   # can be any MediaWiki,
+api_path = "wiki/api.php"           # just point to its API
+real_users = ["Sebastian", "Jclema", "Raul_Hugo", "Kikomayorga",
+              "Kaametza", "Kokecontreras", "Bernie", "Tuukah", "Acaire",
+              "Kaisi", "Ignacio_Rodríguez", "Laura_Vargas", "Lwong",
+              "Michael", "Raulhugo", "Cjl"]
+
+blacklist = ["Wiki Clean Up",
+             "Wiki Clean Up/Batch 01",
+             "Wiki Clean Up/Batch 02",
+             "Wiki Clean Up/Batch 03",
+             "Wiki Clean Up/Batch 04",
+             "Wiki Clean Up/Batch 05",
+             "Wiki Clean Up/Batch 06",
+             "Wiki Clean Up/Batch 07",
+             "Wiki Clean Up/Batch 08",
+             "Wiki Clean Up/Batch 09",
+             "Wiki Clean Up/Batch 10",
+             "Wiki Clean Up/Batch 11",
+             "Wiki Clean Up/Batch 12",
+             "Wiki Clean Up/Batch 13",
+             "Wiki Clean Up/Batch 14",
+             "Wiki Clean Up/Batch 15",
+             "Wiki Clean Up/Batch 16",
+             "Wiki Clean Up/Batch 17",
+             "Wiki Clean Up/Batch 18",
+             "Wiki Clean Up/Batch 19",
+             "Wiki Clean Up/Batch 20"]
+
+
+pages = mwapi.MWApi(host, api_path)  # API client for the target wiki
+already_crawled = []                 # page titles fetched so far
+
+def crawl_wikipage(title):
+    """Save this page's wikitext, then follow its internal links."""
+    if (title not in already_crawled) and (title not in blacklist):
+        download_wikitext(title)
+        print("\tFound %s" % title)
+        already_crawled.append(title)
+    else:
+        return
+
+    result = pages.get(action="query",
+                       titles=title,
+                       prop="links",
+                       pllimit=500)
+
+    pageid, result = result['query']['pages'].popitem()  # only one page was queried
+
+    if 'links' in result:
+        for link in result['links']:
+            crawl_wikipage(link['title'])
+
+def crawl_wiki_contributions(user):
+    print("Crawling %s's contributions:" % user)
+
+    result = pages.get(action="query",
+                       list="usercontribs",
+                       ucuser=user,
+                       uclimit=500)
+
+    for usercontrib in result['query']['usercontribs']:
+        if usercontrib['ns'] == 0:  # main (article) namespace only
+            crawl_wikipage(usercontrib['title'])
+
+
+def download_wikitext(title):
+    print("Fetching " + title)
+    result = pages.get(action="query",
+                       titles=title,
+                       export=True)
+
+    wikipage = ET.fromstring(result['query']['export']['*'])
+    # Walk <mediawiki>/<page>/<revision> looking for the <text> element.
+    wikitext = ''
+    for i in wikipage:
+        for j in i:
+            for k in j:
+                if 'text' in k.tag:
+                    wikitext = k.text or ''
+
+    title = urllib.parse.quote(title, safe="/ ")  # make the title safe as a file path
+    dirname = os.path.dirname(title)
+    if dirname:  # subpages ("Foo/Bar") become nested directories
+        if os.path.isfile(dirname):  # an existing page file blocks the directory
+            parent = os.path.dirname(dirname)
+            temp_name = os.path.basename(mktemp())  # mktemp() only generates a name
+            os.rename(dirname, os.path.join(parent, temp_name))
+            os.makedirs(dirname, exist_ok=True)
+            os.rename(os.path.join(parent, temp_name),
+                      os.path.join(dirname, "Index"))
+
+        os.makedirs(dirname, exist_ok=True)
+
+    if os.path.isdir(title):
+        # can't have files named like dirs
+        title = os.path.join(title, "Index")
+
+    with open(title, "w+", encoding="utf-8") as f:
+        f.write(wikitext)
+
+for user in real_users:
+    crawl_wiki_contributions(user)
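
Note: mwapi's pages.get() calls above are thin wrappers around plain HTTP requests to MediaWiki's api.php. The sketch below (not part of the commit) shows the equivalent prop=links query using only the Python standard library; it assumes the endpoint built from the script's host and api_path values, and the page title at the bottom is a made-up example for illustration.

#!/usr/bin/env python3
# Illustrative only: the raw API request behind
# pages.get(action="query", titles=..., prop="links", pllimit=500).
import json
import urllib.parse
import urllib.request

API = "http://pe.sugarlabs.org/wiki/api.php"   # host + api_path from the script

def list_links(title):
    params = urllib.parse.urlencode({
        "action": "query",
        "format": "json",
        "titles": title,
        "prop": "links",
        "pllimit": 500,
    })
    with urllib.request.urlopen(API + "?" + params) as response:
        data = json.loads(response.read().decode("utf-8"))
    # The API keys results by page id; each page may carry a "links" list.
    for page in data["query"]["pages"].values():
        for link in page.get("links", []):
            print(link["title"])

if __name__ == "__main__":
    list_links("Azucar")   # hypothetical page title, for illustration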