# Processing/MediaWiki_Parser.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79

# Copyright (C) IBM Corporation 2008

from HTML_Parser import HTML_Parser
import re

class MediaWiki_Parser(HTML_Parser):
    """
        HTML_Parser specialisation for documents returned by the MediaWiki API.

        The constructor expects the XML wrapper the wiki API returns: a
        <parse revid="..."> element (used to build a permanent link to the
        exact revision) and a <text>...</text> element holding the rendered
        article HTML. Only the HTML inside <text> is handed to HTML_Parser.
    """

    #Overwriting the regexp so that various non-data content (see also, table of contents etc.) is removed
    remove_classes_regexp = re.compile("toc|noprint|metadata|sisterproject|boilerplate|reference(?!s)|thumb|navbox|editsection")

    #Matches the revision id in the xml the wiki API returns (compiled once, raw string)
    _revid_regexp = re.compile(r'<parse revid="(?P<rid>[0-9]*)">')

    def __init__(self, document_to_parse, title, source_url):
        """
            Unwraps the API response and initialises the underlying HTML_Parser.

            document_to_parse -- XML string returned by the wiki API
            title             -- passed through unchanged to HTML_Parser.__init__
            source_url        -- URL of the article; only its scheme and host
                                 are used to build the permanent revision link

            Raises NoDocException when document_to_parse is None, and
            IndexError when the document has no <text> element or no
            <parse revid="..."> element.
        """
        #the original tested the builtin `input` here, so this guard never fired
        #NOTE(review): NoDocException is not imported in this file's visible imports -
        #presumably it lives alongside HTML_Parser; confirm it is in scope
        if document_to_parse is None:
            raise NoDocException("No content to parse - supply document to __init__")
        #find the revision id in the xml the wiki API returns
        revision_ids = self._revid_regexp.findall(document_to_parse)
        #remove the xml padding to parse html inside
        html_fragment = document_to_parse.split("<text>")[1].split("</text>")[0]
        #call the normal constructor
        HTML_Parser.__init__(self, "<body>" + html_fragment + "</body>", title, source_url)
        #overwrite the source variable with a link to this exact revision
        self.source = self._site_root(source_url) + "/w/index.php?oldid=%s" % revision_ids[0]

    @staticmethod
    def _site_root(source_url):
        """
            Returns "<scheme>://<host>" for source_url, defaulting to http
            when the URL carries no scheme (so https:// URLs keep working).
        """
        scheme, separator, remainder = source_url.partition("://")
        if not separator:
            #no scheme present - the whole string starts with the host
            scheme, remainder = "http", source_url
        return scheme + "://" + remainder.split("/")[0]

    @staticmethod
    def _is_infobox(table):
        """
            True when the table's class attribute mentions "infobox" anywhere.
        """
        if table is None:
            return False
        class_value = table.get("class")
        if class_value is None:
            return False
        #some BeautifulSoup versions return multi-valued attributes such as class as a list
        if isinstance(class_value, (list, tuple)):
            class_value = " ".join(class_value)
        return "infobox" in class_value

    @staticmethod
    def _outer_title(inner_table):
        """
            Finds the title of an infobox whose data sits in an inner table:
            the <th> of the row preceding the row that contains inner_table.
            Returns None (instead of raising) when any step cannot be found.
        """
        containing_row = inner_table.findParent("tr")
        if containing_row is None:
            return None
        title_row = containing_row.findPreviousSibling("tr")
        if title_row is None:
            return None
        title_cell = title_row.findChild("th")
        if title_cell is None:
            return None
        return title_cell.renderContents()

    def specialise(self):
        """
            Uses DITA_Parser class's specialise() call to find the infobox in a wiki article

            When the first table of self.input is an infobox, it is converted
            into a <section id="infobox"> holding a <title> and a <properties>
            list, appended to self.output_soup.refbody, and the table is then
            removed from the input so it isn't parsed twice. Otherwise nothing
            is changed.
        """
        #infobox should be first table
        first_table = self.input.find("table")
        #the word "infobox" should be in the class name somewhere
        if not self._is_infobox(first_table):
            return
        #make a new output tag to work with
        infobox_tag = self.tag_generator("section", attrs=[("id", "infobox")])
        #sometimes infobox data is in an inner table
        data_table = first_table.table
        if data_table is None:
            #sometimes it isn't :-( - work on the outer table instead
            data_table = first_table
            # the title _should_ be in a "colspan == 2" tag
            title_cell = first_table.find(attrs={ "colspan" : "2"})
            infobox_title = None
            #don't break if title can't be found
            if title_cell is not None:
                #get the title
                infobox_title = title_cell.renderContents()
                #remove the title so it isn't processed twice
                title_cell.extract()
        else:
            # if there is an inner table, the title will be in the containing table - hunt it down.
            infobox_title = self._outer_title(data_table)
        #finally append the title to the tag
        infobox_tag.append(self.tag_generator("title", infobox_title))
        #generate the properties list
        properties_tag = self.tag_generator("properties")
        infobox_tag.append(properties_tag)
        cell_names = re.compile("th|td")
        #each property is a row in the table
        for row in data_table.findAll("tr"):
            #make sure the row isn't empty
            if row.findChild() is None:
                continue
            #table cells are either th or td
            cells = row.findAll(cell_names)
            if not cells:
                #a row without cells carries no data - don't emit an empty <property>
                continue
            #make a new <property> tag
            property_tag = self.tag_generator("property")
            if len(cells) == 1:
                #if there's only one cell on the row, make it a value
                property_tag.append(self.tag_generator("propvalue", cells[0].renderContents()))
            else:
                #with two (or more) cells, the first is the property type, the second is the value
                property_tag.append(self.tag_generator("proptype", cells[0].renderContents().replace(":", "")))
                property_tag.append(self.tag_generator("propvalue", cells[1].renderContents()))
            #add the property to the <properties> tag
            properties_tag.append(property_tag)
        #add the infobox to the output
        self.output_soup.refbody.append(infobox_tag)
        #remove the first table to avoid parsing twice
        first_table.extract()