Web   ·   Wiki   ·   Activities   ·   Blog   ·   Lists   ·   Chat   ·   Meeting   ·   Bugs   ·   Git   ·   Translate   ·   Archive   ·   People   ·   Donate
summaryrefslogtreecommitdiffstats
path: root/Processing/MediaWiki_Helper.py
blob: 3a328f38115ec1fce7009b769fef36e4a699425e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
# Copyright (C) IBM Corporation 2008

import urllib
import IO_Manager
from xml.dom import minidom

"""
Extend urllib class to spoof user-agent
"""
class NewURLopener(urllib.FancyURLopener):
    version = "Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11"

class PageNotFoundError(Exception):
    """Raised when a requested article or revision cannot be found.

    The offending value is stored on the ``parameter`` attribute.
    """

    def __init__(self, value):
        self.parameter = value

    def __str__(self):
        return "%r" % (self.parameter,)

class NoResultsError(Exception):
    """Raised when a wiki search yields no results.

    The offending value is stored on the ``parameter`` attribute.
    """

    def __init__(self, value):
        self.parameter = value

    def __str__(self):
        return "%r" % (self.parameter,)

"""
Default media wikihost 
"""
defaultWiki = "en.wikipedia.org"


"""
This class handles interaction with Media Wiki. Getting 
content based on a number of parameters such as URL, Title, Revision.
"""
class MediaWiki_Helper:
    
    def __init__(self):
        self.proxies = IO_Manager.IO_Manager().proxies

    def resolveTitle(self, title, wiki=defaultWiki):
        """Check if a wiki article exists using the mediawiki api. Follow redirects.
        
        @param title: article title to resolve
        @param wiki: optional. Defaults to default wiki
        @return: validated article title
        @rtype: string
        @raise PageNotFoundError: if page not found"""
        #replace spaces with underscores
        title = title.replace(" ", "_")
        #create the API request string
        path = "http://%s/w/api.php?action=query&titles=%s&redirects&format=xml" % (wiki, title)
        #parse the xml
        xmldoc = minidom.parseString(self.getDoc(path))
        #check page exists, return None if it doesn't
        page = xmldoc.getElementsByTagName("page")
        if (page != []):
            if ("missing" in page[0].attributes.keys()):
                raise PageNotFoundError("The article with title '%s' could not be found on wiki '%s'" % (title, wiki))
        #check if there are any redirection tags defined
        redirectList = xmldoc.getElementsByTagName("r")
        #if the redirect list is empty, return the title
        if redirectList == []:
            return title
        #if there is a redirect, recursively follow the chain
        else:
            return self.resolveTitle(redirectList[0].attributes["to"].value)
    
    def resolveRevision(self, revision, wiki=defaultWiki):
        """ get an article by revision number.
        
         @param revision: revision number to resolve
         @param wiki: optional. Defaults to default wiki
         @return: revision number if valid
         @rtype: string
         @raise PageNotFoundError: if page not found"""
        path = "http://%s/w/api.php?action=query&format=xml&revids=%s" % (wiki, revision)
        if ("page" in self.getDoc(path)):
            return revision
        else:
            raise PageNotFoundError("The article with revision id '%s' could not be found on wiki '%s'" % (revision, wiki))
        
    def getArticleAsWikiTextByTitle(self, title, wiki=defaultWiki):
        """Gets the wiki markup of an article by its title from the wiki specified.
        
        @param title: title of article to retrieve
        @param wiki: optional. Defaults to default wiki
        @return: article content in wiki markup
        @rtype: string"""
        #resolve the article title 
        title = self.resolveTitle(title)
        #create the API request string
        path = "http://%s/w/api.php?action=query&prop=revisions&titles=%s&rvprop=content&format=xml" % (wiki, title)
        #remove xml tags around article
        return self.stripTags(getDoc(path), "rev")
        
    def getArticleAsWikiTextByURL(self, url):
        """Gets the wiki markup of an article by its title from the wiki specified.
        
        @param url: url of article to retrieve
        @param wiki: optional. Defaults to default wiki
        @return: article content in wiki markup
        @rtype: string"""
        args = self.breakdownURL(url)
        if len(args) == 3:
            return self.getArticleAsWikiTextByRevision(args[2], args[0])
        else:
            return self.getArticleAsWikiTextByTitle(args[1], args[0])
        
    def getArticleAsWikiTextByRevision(self, revision, wiki=defaultWiki):
        """Gets the wiki markup of an article by its revision id from the wiki specified.
        
        @param revision: revision id of article to retrieve
        @param wiki: optional. Defaults to default wiki
        @return: article content in wiki markup
        @rtype: string"""
        self.resolveRevision(revision, wiki)
        path = "http://%s/w/api.php?action=query&prop=revisions&revids=%s&rvprop=content&format=xml" % (wiki, revision)
        return self.stripTags(getDoc(path), "rev")
        
    def getArticleAsHTMLByTitle(self, title, wiki=defaultWiki):
        """Gets the HTML markup of an article by its title from the wiki specified.
        
        @param title: title of article to retrieve
        @param wiki: optional. Defaults to default wiki
        @return: article content in HTML markup
        @rtype: string"""
        #resolve article title
        title = self.resolveTitle(title, wiki)
        #create the API request string
        path = "http://%s/w/api.php?action=parse&page=%s&format=xml" % (wiki,title)
        #remove xml tags around article and fix HTML tags and quotes
        #return fixHTML(stripTags(getDoc(path), "text"))
        return self.fixHTML(self.getDoc(path)), path
        
    def getArticleAsHTMLByURL(self, url):
        """Gets the HTML markup of an article by its title from the wiki specified.
        
        @param url: url of article to retrieve
        @param wiki: optional. Defaults to default wiki
        @return: article content in HTML markup
        @rtype: string"""
        args = self.breakdownURL(url)
        if len(args) == 3:
            return self.getArticleAsHTMLByRevision(args[2], args[0])
        else:
            return self.getArticleAsHTMLByTitle(args[1], args[0])
    
    def getArticleAsHTMLByRevision(self, revision, wiki=defaultWiki):
        """Gets the HTML markup of an article by its revision id from the wiki specified.
        
        @param revision: revision id of article to retrieve
        @param wiki: optional. Defaults to default wiki
        @return: article content in HTML markup
        @rtype: string"""
        self.resolveRevision(revision, wiki)
        path = "http://%s/w/api.php?action=parse&oldid=%s&format=xml" % (wiki,revision)
        #remove xml tags around article and fix HTML tags and quotes
        return self.fixHTML(stripTags(getDoc(path), "text"))
    
    def breakdownURL(self, url):
        """pulls out wiki address, title and revision id from a wiki URL
        
        @param url: url to process
        @return: information from url
        @rtype: list"""
        outputlist = []
        url = url.replace("http://", "")
        outputlist.append(url.split("/")[0])
        if ("title=" in url):
            outputlist.append(url.split("title=")[-1].split("&")[0])
        if ("oldid=" in url):
            outputlist.append(url.split("oldid=")[-1].split("&")[0])
        else:
            outputlist.append(url.split("/")[-1])
        return outputlist
        
    def getDoc(self, path):
        """opens a remote file by http and retrieves data
        
        @param path: location of remote file 
        @return: page contents
        @rtype: string"""
        urllib._urlopener = NewURLopener()
        print "opening " + path
        print "proxies: " + str(self.proxies)
        doc = urllib.urlopen(path, proxies=self.proxies)
        output = doc.read()
        doc.close()
        print "url opened successfully"
        return output
    
    def stripTags(self, input, tag):
        """removes specified tag
    
        @param input: string to work on
        @param tag: tag to remove
        @return: original string with specified tag removed
        @rtype: string"""
        return input.split("<%s>" % (tag), 1)[1].split("</%s>" % (tag), 1)[0]
    
    def fixHTML(self, input):
        """fixes <, > and " characters in HTML
        
        @param input: input string to work on
        @return: modified version of input
        @rtype: string"""
        return input.replace("&lt;", "<").replace("&gt;", ">").replace("&quot;",'"')
    
    def getImageURLs(self, title, wiki=defaultWiki, revision=None):
        """returns a list of the URLs of every image on the specified page on the (optional) specified wiki
        @deprecated: This task is now performed at the parsing stage
        """
        #check article title is valid, follow redirects
        title = self.resolveTitle(title, wiki)
        #proceed if title is valid
        if (title != None):
            #create the API request string
            path = "http://%s/w/api.php?action=query&prop=images&titles=%s&format=xml" % (wiki, title)
            xmldoc = minidom.parseString(self.getDoc(path))
            imglist = xmldoc.getElementsByTagName("im")
            outputlist = []
            for i in xrange(len(imglist)):
                #create the API request string
                path = "http://%s/w/api.php?action=query&titles=%s&prop=imageinfo&iiprop=url&format=xml" % (wiki, imglist[i].attributes["title"].value.replace(" ","_"))
                xmldoc2 = minidom.parseString(self.getDoc(path))
                #append image url to output
                outputlist.append(xmldoc2.getElementsByTagName("ii")[0].attributes["url"].value)
            #return outputlist
            return []
        
    def getImages(self, title, wiki=defaultWiki):
        """returns a list of the URLs of every image on the specified page on the (optional) specified wiki
        @deprecated: This task is now performed at the saving stage
        """
        imglist = getImageURLs(title, wiki)
        outputlist = []  
        if imglist !=[]:
            for i in imglist:
                outputlist.append(getDoc(i))
        return outputlist
    
    def searchWiki(self, search, wiki=defaultWiki):
        """Search a wiki using the openSearch protocol.
        
        @param search: string to search for
        @param wiki: optional. Defaults to default wiki
        @return: search results and description pairs
        @rtype: list"""
        path = "http://%s/w/api.php?action=opensearch&search=%s&format=xml" % (wiki, search)
        output = minidom.parseString(self.getDoc(path))
        results = []
        for item in output.getElementsByTagName("Item"):
            results.append((item.getElementsByTagName("Text")[0].firstChild.data, item.getElementsByTagName("Description")[0].firstChild.data))
        return results
        
    # TODO: make this work with new searchWiki method
    """def getFirstSearchResult(search, wiki=defaultWiki):
        xmldoc = minidom.parseString(searchWiki(search, wiki))
        resultList = xmldoc.getElementsByTagName("Item")
        if (len(resultList) > 0):
            return stripTags(resultList[0].getElementsByTagName("Text")[0].toxml(), "Text")
        else:
            raise noResultsError("No results found for '%s' on wiki: %s" % (search, wiki))"""