# Processing/Article_Builder.py

# Copyright (C) IBM Corporation 2008

from BeautifulSoup import Tag
from NewtifulSoup import NewtifulStoneSoup as BeautifulStoneSoup
from Processing.Article.Article_Data import *
import re
import IO_Manager

class Article_Builder:
    """
    Converts articles between their DITA (XML) representation and the
    article_data object model (Article_Data, Section_Data, Paragraph_Data,
    Sentence_Data and Picture_Data).

    Created by Christopher Leonard.

    ID descriptions:
    0 - picture
    1 - heading
    > 1 - anything

    Badly in need of refactoring!
    """

    def get_article_from_dita(self, dita):
        """
        Parse an article in DITA format and return the corresponding
        Article_Data object.

        dita -- the DITA markup; it is handed straight to BeautifulStoneSoup.

        The returned Article_Data holds, in document order:
          * an optional leading "shortdesc" section,
          * one Section_Data per <refbody> (id "r" + reference id, or
            "r90000" when the enclosing <reference> has no id),
          * one Section_Data per <section> (id "s" + section id),
          * a heading section (id 1, one paragraph, one sentence) directly
            before every section whose tag has a direct <title> child,
        plus the (href, caption) pairs found under the element whose id is
        "imagelist".

        Side effect: resets self.sentences to an empty list.
        """
        workingDir = IO_Manager.IO_Manager().workingDir
        self.sentences = []
        soup = BeautifulStoneSoup(dita)
        article_id = soup.resourceid['id']

        sections = []
        # Sections are inserted at the front of the list while walking the
        # document backwards; the shortdesc section (if any) must stay first,
        # so everything else is inserted right after it.
        insert_at = 0

        shortdesc_tag = soup.find("shortdesc")
        if shortdesc_tag is not None:
            shortdesc_sentences = []
            for ph in shortdesc_tag.findAll("ph"):
                sentence_id = ph['id']
                text = self._clean_text(ph) + " "
                shortdesc_sentences.append(
                    Sentence_Data(sentence_id, article_id, "shortdesc",
                                  "shortdesc", sentence_id, text))
            shortdesc_paragraph = Paragraph_Data(
                "shortdesc", article_id, "shortdesc", "shortdesc",
                shortdesc_sentences)
            sections.append(
                Section_Data("shortdesc", article_id, "shortdesc",
                             [shortdesc_paragraph]))
            # Remove the shortdesc so its <ph> tags are not picked up again
            # by the main traversal below.
            shortdesc_tag.extract()
            insert_at = 1

        current_section_id = ""
        current_p_id = ""
        sentences = []
        paragraphs = []

        # NOTE(review): the pattern is unanchored, so it may match more tag
        # names than the five listed; anything else is ignored because the
        # branches below compare tag.name exactly.
        taglist = soup.findAll(re.compile("refbody|section|p|ph|image"))

        # The tags are walked in reverse so that children are seen before
        # their parent: sentences accumulate until their <p> is reached,
        # paragraphs accumulate until their <section>/<refbody> is reached.
        #
        # NOTE(review): because of this order, current_p_id and
        # current_section_id hold the id of the container processed most
        # recently, not the container that actually encloses the tag being
        # handled. The source_* ids recorded below are kept exactly as the
        # original code computed them; confirm with the consumers of
        # Sentence_Data / Paragraph_Data before changing this.
        for tag in reversed(taglist):
            if tag.name == "ph":
                sentence_id = tag['id']
                text = self._clean_text(tag) + " "
                sentences.insert(
                    0,
                    Sentence_Data(sentence_id, article_id, current_section_id,
                                  current_p_id, sentence_id, text))

            elif tag.name == "p":
                if tag.has_key("id"):
                    paragraph_id = tag['id']
                else:
                    # Paragraphs without an id are given the placeholder -1.
                    paragraph_id = -1
                paragraphs.insert(
                    0,
                    Paragraph_Data(paragraph_id, article_id,
                                   current_section_id, paragraph_id,
                                   sentences))
                sentences = []
                current_p_id = paragraph_id

            elif tag.name == "refbody":
                reference_tag = tag.findParent("reference")
                if reference_tag.has_key("id"):
                    section_id = "r" + reference_tag['id']
                else:
                    section_id = "r90000"
                self._insert_section(tag, section_id, article_id, paragraphs,
                                     sections, insert_at)
                paragraphs = []
                # NOTE(review): this yields a doubly prefixed id ("rr...")
                # that differs from the Section_Data id above; preserved
                # from the original implementation.
                current_section_id = tag.name[0] + section_id

            elif tag.name == "section":
                section_id = "s" + tag['id']
                self._insert_section(tag, section_id, article_id, paragraphs,
                                     sections, insert_at)
                paragraphs = []
                current_section_id = section_id

            elif tag.name == "image":
                # Only images placed inside a paragraph become part of the
                # article body; the href is rewritten to point into the
                # working directory.
                if tag.parent.name == "p":
                    picture_path = tag['href'].replace("..", workingDir)
                    sentences.insert(0, Picture_Data(article_id, picture_path))

        article_title = soup.find("title").renderContents().replace("\n", "").strip()

        image_list = []
        imglist_tag = soup.find(True, attrs={"id": "imagelist"})
        if imglist_tag is not None:
            for img in imglist_tag.findAll("image"):
                alt_tag = img.findChild("alt")
                if alt_tag is not None:
                    caption = alt_tag.renderContents().replace("\n", "").strip()
                else:
                    caption = ""
                image_list.append((img['href'], caption))

        return Article_Data(article_id, article_id, article_title, "theme",
                            sections, image_list)

    def get_dita_from_article(self, article):
        """
        Build the DITA representation of an article.

        article -- an object whose getData() method returns the
                   article_data representation of the article.

        Returns the prettified DITA markup.

        Mapping used:
          * a heading section (exactly one paragraph holding exactly one
            sentence whose id is 1) becomes the <title> of the section that
            follows it;
          * a section whose id starts with "r" becomes a nested
            <reference><refbody>, and later <section> tags are appended to
            that refbody;
          * any other section becomes a <section>;
          * the image list becomes <reference id="imagelist">.
        """
        workingDir = IO_Manager.IO_Manager().workingDir
        article_data = article.getData()
        template = (
            "<?xml version='1.0' encoding='utf-8'?>"
            "<!DOCTYPE reference PUBLIC \"-//IBM//DTD DITA IBM Reference//EN\" \"ibm-reference.dtd\">"
            "<reference><title>%s</title><prolog></prolog></reference>"
        )
        output = BeautifulStoneSoup(template % article_data.article_title)
        current_ref = output.reference
        pending_title = None

        for section in article_data.sections_data:
            if self._is_heading_section(section):
                # A heading is held back so it can become the <title> of the
                # next section. If another heading is already waiting there
                # is no section for it to title, so emit it on its own
                # rather than dropping it.
                if pending_title is not None:
                    current_ref.append(
                        self._orphan_heading_section(output, pending_title))
                pending_title = section.paragraphs_data[0].sentences_data[0].text
                continue

            section_id = str(section.id)
            if section_id.startswith("r"):
                # Strip only the "r" prefix added when the DITA was parsed;
                # removing every "r" would corrupt ids that contain one.
                reference_tag = self.tag_generator(
                    output, "reference", attrs=[("id", section_id[1:])])
                if pending_title is not None:
                    reference_tag.append(
                        self.tag_generator(output, "title",
                                           contents=pending_title))
                    pending_title = None
                refbody_tag = self.tag_generator(output, "refbody")
                reference_tag.append(refbody_tag)
                for paragraph in section.paragraphs_data:
                    if paragraph.id == "shortdesc":
                        paragraph_tag = self.tag_generator(output, "shortdesc")
                    else:
                        paragraph_tag = self.tag_generator(
                            output, "p", attrs=[("id", str(paragraph.id))])
                    for sentence in paragraph.sentences_data:
                        paragraph_tag.append(
                            self.tag_generator(
                                output, "ph",
                                attrs=[("id", str(sentence.id))],
                                contents=sentence.text))
                    refbody_tag.append(paragraph_tag)
                output.reference.append(reference_tag)
                # Sections that follow belong to this reference's body.
                current_ref = refbody_tag
            else:
                if section_id == "shortdesc":
                    dita_id = "shortdesc"
                elif section_id.startswith("s"):
                    # Strip only the "s" prefix added when the DITA was
                    # parsed, not every "s" in the id.
                    dita_id = section_id[1:]
                else:
                    dita_id = section_id
                section_tag = self.tag_generator(
                    output, "section", attrs=[("id", dita_id)])
                if pending_title is not None:
                    section_tag.append(
                        self.tag_generator(output, "title",
                                           contents=pending_title))
                    pending_title = None
                for paragraph in section.paragraphs_data:
                    paragraph_tag = self.tag_generator(
                        output, "p", attrs=[("id", str(paragraph.id))])
                    for sentence in paragraph.sentences_data:
                        if sentence.type == "sentence":
                            paragraph_tag.append(
                                self.tag_generator(
                                    output, "ph",
                                    attrs=[("id", str(sentence.id))],
                                    contents=sentence.text))
                        elif sentence.type == "picture":
                            # switch image to relative path
                            paragraph_tag.append(
                                self.tag_generator(
                                    output, "image",
                                    attrs=[("href", sentence.text.replace(workingDir, ".."))]))
                        else:
                            # Unknown sentence type: report it and skip it.
                            print(sentence.type)
                    section_tag.append(paragraph_tag)
                current_ref.append(section_tag)

        # A heading at the very end of the article has no section to title.
        if pending_title is not None:
            current_ref.append(
                self._orphan_heading_section(output, pending_title))

        if article_data.image_list:
            # Drop any existing image list before writing the current one.
            for stale_tag in output.findAll(True, attrs={"id": "imagelist"}):
                stale_tag.extract()
            image_list_tag = self.tag_generator(
                output, "reference", [("id", "imagelist")])
            output.reference.append(image_list_tag)
            image_list_body = self.tag_generator(output, "refbody")
            image_list_tag.append(image_list_body)
            for image in article_data.image_list:
                # Each entry is read as (href, ..., caption). The caption is
                # added as a real <alt> tag so that it survives output
                # escaping and can be found again when the DITA is parsed.
                alt_tag = self.tag_generator(output, "alt", contents=image[-1])
                image_list_body.append(
                    self.tag_generator(output, "image",
                                       [("href", image[0])], alt_tag))

        return output.prettify()

    def tag_generator(self, soup, name, attrs=None, contents=None):
        """
        Create a new tag belonging to the given soup.

        soup     -- the parser object the tag is created for
        name     -- tag name
        attrs    -- optional list of (attribute, value) pairs
        contents -- optional string or tag inserted as the first child

        Returns the new Tag; it is not attached to the tree.
        """
        if attrs:
            new_tag = Tag(soup, name, attrs)
        else:
            new_tag = Tag(soup, name)
        if contents is not None:
            new_tag.insert(0, contents)
        return new_tag

    def _clean_text(self, tag):
        """Return the tag's rendered contents without newlines, escaped non-breaking spaces or surrounding whitespace."""
        return tag.renderContents().replace("\n", "").replace("&amp;#160;", "").strip()

    def _insert_section(self, tag, section_id, article_id, paragraphs, sections, insert_at):
        """Insert a Section_Data for tag at insert_at, preceded by a heading section when tag has a direct <title> child."""
        sections.insert(
            insert_at,
            Section_Data(section_id, article_id, section_id, paragraphs))
        title_tag = tag.findChild("title", recursive=False)
        if title_tag is not None:
            heading = self._clean_text(title_tag)
            sentence = Sentence_Data(1, article_id, section_id, 1, 1, heading)
            paragraph = Paragraph_Data(1, article_id, section_id, 1, [sentence])
            # Inserted at the same index, so the heading ends up directly
            # before the section it belongs to.
            sections.insert(
                insert_at,
                Section_Data(1, article_id, section_id, [paragraph]))

    def _is_heading_section(self, section):
        """Return True when section holds exactly one paragraph with exactly one sentence whose id is 1."""
        paragraphs = section.paragraphs_data
        if len(paragraphs) != 1:
            return False
        sentences = paragraphs[0].sentences_data
        return len(sentences) == 1 and sentences[0].id == 1

    def _orphan_heading_section(self, soup, title):
        """Wrap a heading that has no section to title in a stand-alone section (fixed placeholder ids)."""
        section_tag = self.tag_generator(soup, "section", attrs=[("id", "56756757")])
        paragraph_tag = self.tag_generator(soup, "p", attrs=[("id", "6875534")])
        paragraph_tag.append(
            self.tag_generator(soup, "ph", attrs=[("id", "65657657")],
                               contents=title))
        section_tag.append(paragraph_tag)
        return section_tag