author     Sebastian Silva <sebastian@somosazucar.org>  2013-08-25 05:04:06 (GMT)
committer  Sebastian Silva <sebastian@somosazucar.org>  2013-08-25 05:04:06 (GMT)
commit     f2edbb89e24bf542178fa830728d141f1565c27d (patch)
tree       cf05d98ed67d40e5416f3f67e70393d02e080639
parent     56e8aee3ceff6460995cd165c91af3cc0cdf0707 (diff)
Used for migrating pe.sugarlabs.org/go to pe.sugarlabs.org/ir (HEAD, master)
-rw-r--r--  crawl_mediawiki.py  111
1 file changed, 111 insertions, 0 deletions
diff --git a/crawl_mediawiki.py b/crawl_mediawiki.py
new file mode 100644
index 0000000..987f753
--- /dev/null
+++ b/crawl_mediawiki.py
@@ -0,0 +1,111 @@
+#!/usr/bin/env python3
+
+import mwapi
+from tempfile import mktemp
+import xml.etree.ElementTree as ET
+import os
+import urllib.parse
+
+host = "http://pe.sugarlabs.org/"   # can be any MediaWiki,
+api_path = "wiki/api.php"           # just point to its API
+real_users = ["Sebastian", "Jclema", "Raul_Hugo", "Kikomayorga",
+              "Kaametza", "Kokecontreras", "Bernie", "Tuukah", "Acaire",
+              "Kaisi", "Ignacio_Rodríguez", "Laura_Vargas", "Lwong",
+              "Michael", "Raulhugo", "Cjl"]
+
+blacklist = ["Wiki Clean Up",
+             "Wiki Clean Up/Batch 01",
+             "Wiki Clean Up/Batch 02",
+             "Wiki Clean Up/Batch 03",
+             "Wiki Clean Up/Batch 04",
+             "Wiki Clean Up/Batch 05",
+             "Wiki Clean Up/Batch 06",
+             "Wiki Clean Up/Batch 07",
+             "Wiki Clean Up/Batch 08",
+             "Wiki Clean Up/Batch 09",
+             "Wiki Clean Up/Batch 10",
+             "Wiki Clean Up/Batch 11",
+             "Wiki Clean Up/Batch 12",
+             "Wiki Clean Up/Batch 13",
+             "Wiki Clean Up/Batch 14",
+             "Wiki Clean Up/Batch 15",
+             "Wiki Clean Up/Batch 16",
+             "Wiki Clean Up/Batch 17",
+             "Wiki Clean Up/Batch 18",
+             "Wiki Clean Up/Batch 19",
+             "Wiki Clean Up/Batch 20"]
+
+
+pages = mwapi.MWApi(host, api_path)  # API client for the target wiki
+already_crawled = []                 # page titles fetched so far
+
+def crawl_wikipage(title):
+    """Save this page's wikitext, then follow its internal links."""
+    if (title not in already_crawled) and (title not in blacklist):
+        download_wikitext(title)
+        print("\tFound %s" % title)
+        already_crawled.append(title)
+    else:
+        return
+
+    result = pages.get(action="query",
+                       titles=title,
+                       prop="links",
+                       pllimit=500)
+
+    pageid, result = result['query']['pages'].popitem()  # only one page was queried
+
+    if 'links' in result:
+        for link in result['links']:
+            crawl_wikipage(link['title'])
+
+def crawl_wiki_contributions(user):
+    print("Crawling %s's contributions:" % user)
+
+    result = pages.get(action="query",
+                       list="usercontribs",
+                       ucuser=user,
+                       uclimit=500)
+
+    for usercontrib in result['query']['usercontribs']:
+        if usercontrib['ns'] == 0:  # main (article) namespace only
+            crawl_wikipage(usercontrib['title'])
+
+
+def download_wikitext(title):
+    print("Fetching " + title)
+    result = pages.get(action="query",
+                       titles=title,
+                       export=True)
+
+    wikipage = ET.fromstring(result['query']['export']['*'])
+    # Walk <mediawiki>/<page>/<revision> looking for the <text> element.
+    wikitext = ''
+    for i in wikipage:
+        for j in i:
+            for k in j:
+                if 'text' in k.tag:
+                    wikitext = k.text or ''
+
+    title = urllib.parse.quote(title, safe="/ ")  # make the title safe as a file path
+    dirname = os.path.dirname(title)
+    if dirname:  # subpages ("Foo/Bar") become nested directories
+        if os.path.isfile(dirname):  # an existing page file blocks the directory
+            parent = os.path.dirname(dirname)
+            temp_name = os.path.basename(mktemp())  # mktemp() only generates a name
+            os.rename(dirname, os.path.join(parent, temp_name))
+            os.makedirs(dirname, exist_ok=True)
+            os.rename(os.path.join(parent, temp_name),
+                      os.path.join(dirname, "Index"))
+
+        os.makedirs(dirname, exist_ok=True)
+
+    if os.path.isdir(title):
+        # can't have files named like dirs
+        title = os.path.join(title, "Index")
+
+    with open(title, "w+", encoding="utf-8") as f:
+        f.write(wikitext)
+
+for user in real_users:
+    crawl_wiki_contributions(user)
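
Note: mwapi's pages.get() calls above are thin wrappers around plain HTTP requests to MediaWiki's api.php. The sketch below (not part of the commit) shows the equivalent prop=links query using only the Python standard library; it assumes the endpoint built from the script's host and api_path values, and the page title at the bottom is a made-up example for illustration.

#!/usr/bin/env python3
# Illustrative only: the raw API request behind
# pages.get(action="query", titles=..., prop="links", pllimit=500).
import json
import urllib.parse
import urllib.request

API = "http://pe.sugarlabs.org/wiki/api.php"   # host + api_path from the script

def list_links(title):
    params = urllib.parse.urlencode({
        "action": "query",
        "format": "json",
        "titles": title,
        "prop": "links",
        "pllimit": 500,
    })
    with urllib.request.urlopen(API + "?" + params) as response:
        data = json.loads(response.read().decode("utf-8"))
    # The API keys results by page id; each page may carry a "links" list.
    for page in data["query"]["pages"].values():
        for link in page.get("links", []):
            print(link["title"])

if __name__ == "__main__":
    list_links("Azucar")   # hypothetical page title, for illustration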