From f2edbb89e24bf542178fa830728d141f1565c27d Mon Sep 17 00:00:00 2001
From: Sebastian Silva
Date: Sun, 25 Aug 2013 05:04:06 +0000
Subject: Used for migrating pe.sugarlabs.org/go to pe.sugarlabs.org/ir

---
diff --git a/crawl_mediawiki.py b/crawl_mediawiki.py
new file mode 100644
index 0000000..987f753
--- /dev/null
+++ b/crawl_mediawiki.py
@@ -0,0 +1,111 @@
+#!/usr/bin/env python3
+
+import mwapi
+from tempfile import mktemp
+import xml.etree.ElementTree as ET
+import os
+import urllib.parse
+
+host = "http://pe.sugarlabs.org/"  # can be any mediawiki,
+api_path = "wiki/api.php"          # just point to its API
+real_users = ["Sebastian", "Jclema", "Raul_Hugo", "Kikomayorga",
+              "Kaametza", "Kokecontreras", "Bernie", "Tuukah", "Acaire",
+              "Kaisi", "Ignacio_Rodríguez", "Laura_Vargas", "Lwong",
+              "Michael", "Raulhugo", "Cjl"]
+
+blacklist = ["Wiki Clean Up",
+             "Wiki Clean Up/Batch 01",
+             "Wiki Clean Up/Batch 02",
+             "Wiki Clean Up/Batch 03",
+             "Wiki Clean Up/Batch 04",
+             "Wiki Clean Up/Batch 05",
+             "Wiki Clean Up/Batch 06",
+             "Wiki Clean Up/Batch 07",
+             "Wiki Clean Up/Batch 08",
+             "Wiki Clean Up/Batch 09",
+             "Wiki Clean Up/Batch 10",
+             "Wiki Clean Up/Batch 11",
+             "Wiki Clean Up/Batch 12",
+             "Wiki Clean Up/Batch 13",
+             "Wiki Clean Up/Batch 14",
+             "Wiki Clean Up/Batch 15",
+             "Wiki Clean Up/Batch 16",
+             "Wiki Clean Up/Batch 17",
+             "Wiki Clean Up/Batch 18",
+             "Wiki Clean Up/Batch 19",
+             "Wiki Clean Up/Batch 20"]
+
+
+pages = mwapi.MWApi(host, api_path)
+already_crawled = []
+
+def crawl_wikipage(title):
+    # Download the page, then recurse into every page it links to.
+    if (title not in already_crawled) and (title not in blacklist):
+        download_wikitext(title)
+        print("\tFound %s" % title)
+        already_crawled.append(title)
+    else:
+        return
+
+    result = pages.get(action="query",
+                       titles=title,
+                       prop="links",
+                       pllimit=500)
+
+    pageid, result = result['query']['pages'].popitem()
+
+    if 'links' in result:
+        for link in result['links']:
+            crawl_wikipage(link['title'])
+
+def crawl_wiki_contributions(user):
+    print("Crawling %s's contributions:" % user)
+
+    result = pages.get(action="query",
+                       list="usercontribs",
+                       ucuser=user,
+                       uclimit=500)
+
+    for usercontrib in result['query']['usercontribs']:
+        if usercontrib['ns'] == 0:  # main namespace only
+            crawl_wikipage(usercontrib['title'])
+
+
+def download_wikitext(title):
+    print("Fetching " + title)
+    result = pages.get(action="query",
+                       titles=title,
+                       export=True)
+
+    wikipage = ET.fromstring(result['query']['export']['*'])
+
+    wikitext = ''
+    for i in wikipage:  # dig the <text> node out of the export XML
+        for j in i:
+            for k in j:
+                if 'text' in k.tag:
+                    wikitext = k.text or ''
+
+    title = urllib.parse.quote(title, safe="/ ")
+    dirname = os.path.dirname(title)
+    if dirname:
+        if os.path.isfile(dirname):  # a page already sits where a dir is needed
+            parent = os.path.dirname(dirname)
+            temp_name = os.path.basename(mktemp())
+            os.rename(dirname, os.path.join(parent, temp_name))
+            os.makedirs(dirname, exist_ok=True)
+            os.rename(os.path.join(parent, temp_name),
+                      os.path.join(dirname, "Index"))
+
+        os.makedirs(dirname, exist_ok=True)
+
+    if os.path.isdir(title):
+        # can't have files named like dirs
+        title = os.path.join(title, "Index")
+
+    with open(title, "w+") as f:
+        f.write(wikitext)
+
+for user in real_users:
+    crawl_wiki_contributions(user)
--
cgit v0.9.1
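
Note: the pages.get(...) calls in the patch are thin wrappers around plain MediaWiki API HTTP requests. As a rough sketch of what crawl_wiki_contributions() asks the server for, the same usercontribs query can be issued with only the standard library; the endpoint below is just host + api_path from the script, and the fetch_contribs helper name is illustrative, not part of the patch.

    import json
    import urllib.parse
    import urllib.request

    API = "http://pe.sugarlabs.org/wiki/api.php"  # host + api_path from the script

    def fetch_contribs(user, limit=500):
        # Same query crawl_wiki_contributions() sends through mwapi.
        params = urllib.parse.urlencode({
            "action": "query",
            "list": "usercontribs",
            "ucuser": user,
            "uclimit": limit,
            "format": "json",
        })
        with urllib.request.urlopen(API + "?" + params) as resp:
            data = json.loads(resp.read().decode("utf-8"))
        # Keep main-namespace pages only, mirroring the ns == 0 check above.
        return [c["title"] for c in data["query"]["usercontribs"] if c["ns"] == 0]

    if __name__ == "__main__":
        for title in fetch_contribs("Sebastian"):
            print(title)

Each title returned this way is what the script then feeds to crawl_wikipage() for export.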