1 files changed, 263 insertions, 0 deletions
diff --git a/Processing/MediaWiki_Helper.py b/Processing/MediaWiki_Helper.py
new file mode 100644
index 0000000..3a328f3
--- /dev/null
+++ b/Processing/MediaWiki_Helper.py
@@ -0,0 +1,263 @@
+# Copyright (C) IBM Corporation 2008
+
+import urllib
+import IO_Manager
+from xml.dom import minidom
+
+"""
+Extend urllib class to spoof user-agent
+"""
+class NewURLopener(urllib.FancyURLopener):
+    version = "Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11"
+
+class PageNotFoundError(Exception):
+    def __init__(self, value):
+        self.parameter = value
+    def __str__(self):
+        return repr(self.parameter)
+
+class NoResultsError(Exception):
+    def __init__(self, value):
+        self.parameter = value
+    def __str__(self):
+        return repr(self.parameter)
+
+"""
+Default media wikihost 
+"""
+defaultWiki = "en.wikipedia.org"
+
+
+"""
+This class handles interaction with Media Wiki. Getting 
+content based on a number of parameters such as URL, Title, Revision.
+"""
+class MediaWiki_Helper:
+    
+    def __init__(self):
+        self.proxies = IO_Manager.IO_Manager().proxies
+
+    def resolveTitle(self, title, wiki=defaultWiki):
+        """Check if a wiki article exists using the mediawiki api. Follow redirects.
+        
+        @param title: article title to resolve
+        @param wiki: optional. Defaults to default wiki
+        @return: validated article title
+        @rtype: string
+        @raise PageNotFoundError: if page not found"""
+        #replace spaces with underscores
+        title = title.replace(" ", "_")
+        #create the API request string
+        path = "http://%s/w/api.php?action=query&titles=%s&redirects&format=xml" % (wiki, title)
+        #parse the xml
+        xmldoc = minidom.parseString(self.getDoc(path))
+        #check page exists, return None if it doesn't
+        page = xmldoc.getElementsByTagName("page")
+        if (page != []):
+            if ("missing" in page[0].attributes.keys()):
+                raise PageNotFoundError("The article with title '%s' could not be found on wiki '%s'" % (title, wiki))
+        #check if there are any redirection tags defined
+        redirectList = xmldoc.getElementsByTagName("r")
+        #if the redirect list is empty, return the title
+        if redirectList == []:
+            return title
+        #if there is a redirect, recursively follow the chain
+        else:
+            return self.resolveTitle(redirectList[0].attributes["to"].value)
+    
+    def resolveRevision(self, revision, wiki=defaultWiki):
+        """ get an article by revision number.
+        
+         @param revision: revision number to resolve
+         @param wiki: optional. Defaults to default wiki
+         @return: revision number if valid
+         @rtype: string
+         @raise PageNotFoundError: if page not found"""
+        path = "http://%s/w/api.php?action=query&format=xml&revids=%s" % (wiki, revision)
+        if ("page" in self.getDoc(path)):
+            return revision
+        else:
+            raise PageNotFoundError("The article with revision id '%s' could not be found on wiki '%s'" % (revision, wiki))
+        
+    def getArticleAsWikiTextByTitle(self, title, wiki=defaultWiki):
+        """Gets the wiki markup of an article by its title from the wiki specified.
+        
+        @param title: title of article to retrieve
+        @param wiki: optional. Defaults to default wiki
+        @return: article content in wiki markup
+        @rtype: string"""
+        #resolve the article title 
+        title = self.resolveTitle(title)
+        #create the API request string
+        path = "http://%s/w/api.php?action=query&prop=revisions&titles=%s&rvprop=content&format=xml" % (wiki, title)
+        #remove xml tags around article
+        return self.stripTags(getDoc(path), "rev")
+        
+    def getArticleAsWikiTextByURL(self, url):
+        """Gets the wiki markup of an article by its title from the wiki specified.
+        
+        @param url: url of article to retrieve
+        @param wiki: optional. Defaults to default wiki
+        @return: article content in wiki markup
+        @rtype: string"""
+        args = self.breakdownURL(url)
+        if len(args) == 3:
+            return self.getArticleAsWikiTextByRevision(args[2], args[0])
+        else:
+            return self.getArticleAsWikiTextByTitle(args[1], args[0])
+        
+    def getArticleAsWikiTextByRevision(self, revision, wiki=defaultWiki):
+        """Gets the wiki markup of an article by its revision id from the wiki specified.
+        
+        @param revision: revision id of article to retrieve
+        @param wiki: optional. Defaults to default wiki
+        @return: article content in wiki markup
+        @rtype: string"""
+        self.resolveRevision(revision, wiki)
+        path = "http://%s/w/api.php?action=query&prop=revisions&revids=%s&rvprop=content&format=xml" % (wiki, revision)
+        return self.stripTags(getDoc(path), "rev")
+        
+    def getArticleAsHTMLByTitle(self, title, wiki=defaultWiki):
+        """Gets the HTML markup of an article by its title from the wiki specified.
+        
+        @param title: title of article to retrieve
+        @param wiki: optional. Defaults to default wiki
+        @return: article content in HTML markup
+        @rtype: string"""
+        #resolve article title
+        title = self.resolveTitle(title, wiki)
+        #create the API request string
+        path = "http://%s/w/api.php?action=parse&page=%s&format=xml" % (wiki,title)
+        #remove xml tags around article and fix HTML tags and quotes
+        #return fixHTML(stripTags(getDoc(path), "text"))
+        return self.fixHTML(self.getDoc(path)), path
+        
+    def getArticleAsHTMLByURL(self, url):
+        """Gets the HTML markup of an article by its title from the wiki specified.
+        
+        @param url: url of article to retrieve
+        @param wiki: optional. Defaults to default wiki
+        @return: article content in HTML markup
+        @rtype: string"""
+        args = self.breakdownURL(url)
+        if len(args) == 3:
+            return self.getArticleAsHTMLByRevision(args[2], args[0])
+        else:
+            return self.getArticleAsHTMLByTitle(args[1], args[0])
+    
+    def getArticleAsHTMLByRevision(self, revision, wiki=defaultWiki):
+        """Gets the HTML markup of an article by its revision id from the wiki specified.
+        
+        @param revision: revision id of article to retrieve
+        @param wiki: optional. Defaults to default wiki
+        @return: article content in HTML markup
+        @rtype: string"""
+        self.resolveRevision(revision, wiki)
+        path = "http://%s/w/api.php?action=parse&oldid=%s&format=xml" % (wiki,revision)
+        #remove xml tags around article and fix HTML tags and quotes
+        return self.fixHTML(stripTags(getDoc(path), "text"))
+    
+    def breakdownURL(self, url):
+        """pulls out wiki address, title and revision id from a wiki URL
+        
+        @param url: url to process
+        @return: information from url
+        @rtype: list"""
+        outputlist = []
+        url = url.replace("http://", "")
+        outputlist.append(url.split("/")[0])
+        if ("title=" in url):
+            outputlist.append(url.split("title=")[-1].split("&")[0])
+        if ("oldid=" in url):
+            outputlist.append(url.split("oldid=")[-1].split("&")[0])
+        else:
+            outputlist.append(url.split("/")[-1])
+        return outputlist
+        
+    def getDoc(self, path):
+        """opens a remote file by http and retrieves data
+        
+        @param path: location of remote file 
+        @return: page contents
+        @rtype: string"""
+        urllib._urlopener = NewURLopener()
+        print "opening " + path
+        print "proxies: " + str(self.proxies)
+        doc = urllib.urlopen(path, proxies=self.proxies)
+        output = doc.read()
+        doc.close()
+        print "url opened successfully"
+        return output
+    
+    def stripTags(self, input, tag):
+        """removes specified tag
+    
+        @param input: string to work on
+        @param tag: tag to remove
+        @return: original string with specified tag removed
+        @rtype: string"""
+        return input.split("<%s>" % (tag), 1)[1].split("</%s>" % (tag), 1)[0]
+    
+    def fixHTML(self, input):
+        """fixes <, > and " characters in HTML
+        
+        @param input: input string to work on
+        @return: modified version of input
+        @rtype: string"""
+        return input.replace("&lt;", "<").replace("&gt;", ">").replace("&quot;",'"')
+    
+    def getImageURLs(self, title, wiki=defaultWiki, revision=None):
+        """returns a list of the URLs of every image on the specified page on the (optional) specified wiki
+        @deprecated: This task is now performed at the parsing stage
+        """
+        #check article title is valid, follow redirects
+        title = self.resolveTitle(title, wiki)
+        #proceed if title is valid
+        if (title != None):
+            #create the API request string
+            path = "http://%s/w/api.php?action=query&prop=images&titles=%s&format=xml" % (wiki, title)
+            xmldoc = minidom.parseString(self.getDoc(path))
+            imglist = xmldoc.getElementsByTagName("im")
+            outputlist = []
+            for i in xrange(len(imglist)):
+                #create the API request string
+                path = "http://%s/w/api.php?action=query&titles=%s&prop=imageinfo&iiprop=url&format=xml" % (wiki, imglist[i].attributes["title"].value.replace(" ","_"))
+                xmldoc2 = minidom.parseString(self.getDoc(path))
+                #append image url to output
+                outputlist.append(xmldoc2.getElementsByTagName("ii")[0].attributes["url"].value)
+            #return outputlist
+            return []
+        
+    def getImages(self, title, wiki=defaultWiki):
+        """returns a list of the URLs of every image on the specified page on the (optional) specified wiki
+        @deprecated: This task is now performed at the saving stage
+        """
+        imglist = getImageURLs(title, wiki)
+        outputlist = []  
+        if imglist !=[]:
+            for i in imglist:
+                outputlist.append(getDoc(i))
+        return outputlist
+    
+    def searchWiki(self, search, wiki=defaultWiki):
+        """Search a wiki using the openSearch protocol.
+        
+        @param search: string to search for
+        @param wiki: optional. Defaults to default wiki
+        @return: search results and description pairs
+        @rtype: list"""
+        path = "http://%s/w/api.php?action=opensearch&search=%s&format=xml" % (wiki, search)
+        output = minidom.parseString(self.getDoc(path))
+        results = []
+        for item in output.getElementsByTagName("Item"):
+            results.append((item.getElementsByTagName("Text")[0].firstChild.data, item.getElementsByTagName("Description")[0].firstChild.data))
+        return results
+        
+    # TODO: make this work with new searchWiki method
+    """def getFirstSearchResult(search, wiki=defaultWiki):
+        xmldoc = minidom.parseString(searchWiki(search, wiki))
+        resultList = xmldoc.getElementsByTagName("Item")
+        if (len(resultList) > 0):
+            return stripTags(resultList[0].getElementsByTagName("Text")[0].toxml(), "Text")
+        else:
+            raise noResultsError("No results found for '%s' on wiki: %s" % (search, wiki))"""
+\ No newline at end of file