1 files changed, 234 insertions, 0 deletions
diff --git a/Processing/Article_Builder.py b/Processing/Article_Builder.py
new file mode 100644
index 0000000..2b83c18
--- /dev/null
+++ b/Processing/Article_Builder.py
@@ -0,0 +1,234 @@
+# Copyright (C) IBM Corporation 2008
+
+from BeautifulSoup import Tag
+from NewtifulSoup import NewtifulStoneSoup as BeautifulStoneSoup
+from Processing.Article.Article_Data import *
+import re
+import IO_Manager
+
+class Article_Builder:
+    """
+    Created by Christopher Leonard.
+    
+    ID descriptions:
+    0 - picture
+    1 - heading
+    > 1 - anything
+    
+    This class converts between the DITA and article_data representations of articles. Badly in need of refactoring!
+    """
+    
+    
+
+    def get_article_from_dita(self, dita):
+        """
+        This method takes an article in DITA format as input, parses the DITA, and outputs the corresponding article_data object
+        """
+        workingDir = IO_Manager.IO_Manager().workingDir
+        self.sentences = []
+        has_shortdesc = False
+        input = BeautifulStoneSoup(dita)
+        article_id = input.resourceid['id']
+        current_section_id = ""
+        current_p_id = ""
+        sentence_data_list = []
+        paragraph_data_list = []
+        section_data_list = []
+        if input.find("shortdesc") != None:
+            paragraph_data=[]
+            for ph in input.shortdesc.findAll("ph"):
+                id = ph['id']
+                source_sentence_id = id
+                source_paragraph_id = "shortdesc"
+                source_section_id = "shortdesc"
+                source_article_id = article_id
+                text = ph.renderContents().replace("\n", "").replace("&amp;#160;", "").strip() + " "
+                if text[0:5] == "Satur":
+                    print unicode(text)
+                sentence_data = Sentence_Data(id, source_article_id, source_section_id, source_paragraph_id, source_sentence_id, text)
+                sentence_data_list.append(sentence_data)
+            paragraph_data.append(Paragraph_Data("shortdesc", article_id, "shortdesc", "shortdesc", sentence_data_list))
+            section_data = Section_Data("shortdesc", article_id, "shortdesc", paragraph_data)
+            section_data_list.append(section_data)
+            sentence_data_list = []
+            input.shortdesc.extract()
+            has_shortdesc = True
+        taglist = input.findAll(re.compile("refbody|section|p|ph|image"))
+        for i in xrange(len(taglist)):
+            tag = taglist[len(taglist) - i - 1]
+            if tag.name == "ph":
+                id = tag['id']
+                source_sentence_id = id
+                source_paragraph_id = current_p_id
+                source_section_id = current_section_id
+                source_article_id = article_id
+                text = tag.renderContents().replace("\n", "").replace("&amp;#160;", "").strip() + " "
+                sentence_data = Sentence_Data(id, source_article_id, source_section_id, source_paragraph_id, source_sentence_id, text)
+                sentence_data_list.insert(0, sentence_data)
+            elif tag.name == "p":
+                if not tag.has_key("id"):
+                    id = -1
+                else:
+                    id = tag['id']
+                source_paragraph_id = id
+                source_section_id = current_section_id
+                source_article_id = article_id
+                paragraph_data = Paragraph_Data(id, source_article_id, source_section_id, source_paragraph_id, sentence_data_list)
+                paragraph_data_list.insert(0, paragraph_data)
+                sentence_data_list = []
+                current_p_id = id
+            elif tag.name == "refbody" :
+                if tag.findParent("reference").has_key("id"):
+                    id = "r" + tag.findParent("reference")['id']
+                else:
+                    id = "r90000"
+                source_section_id = id
+                source_article_id = article_id
+                section_data = Section_Data(id, source_article_id, source_section_id, paragraph_data_list)
+                if has_shortdesc:
+                    section_data_list.insert(1,section_data)
+                else:
+                    section_data_list.insert(0,section_data)
+                if tag.findChild("title", recursive=False) != None:
+                    heading = tag.findChild('title').renderContents().replace("\n", "").replace("&amp;#160;", "").strip()
+                    sen = Sentence_Data(1, source_article_id, source_section_id, 1, 1, heading)
+                    par = Paragraph_Data(1, source_article_id, source_section_id, 1, [sen])                    
+                    headingdata = Section_Data(1, source_article_id, source_section_id, [par])
+                    
+                    if has_shortdesc:
+                        section_data_list.insert(1,headingdata)
+                    else:
+                        section_data_list.insert(0,headingdata)                    
+                paragraph_data_list = []
+                current_section_id = tag.name[0] + id
+                
+            elif tag.name == "section":
+                id = "s" + tag['id']
+                source_section_id = id
+                source_article_id = article_id
+
+                section_data = Section_Data(id, source_article_id, source_section_id, paragraph_data_list)
+                if has_shortdesc:
+                    section_data_list.insert(1,section_data)
+                else:
+                    section_data_list.insert(0,section_data)
+                if tag.findChild("title", recursive=False) != None:
+                    heading = tag.findChild('title').renderContents().replace("\n", "").replace("&amp;#160;", "").strip()
+                    sen = Sentence_Data(1, source_article_id, source_section_id, 1, 1, heading)
+                    par = Paragraph_Data(1, source_article_id, source_section_id, 1, [sen])                    
+                    headingdata = Section_Data(1, source_article_id, source_section_id, [par])
+                    
+                    if has_shortdesc:
+                        section_data_list.insert(1,headingdata)
+                    else:
+                        section_data_list.insert(0,headingdata)
+                paragraph_data_list = []
+                current_section_id = id
+                
+            elif tag.name == "image":
+                
+                if tag.parent.name == "p":
+                    source_article_id = article_id
+                    text = tag['href'].replace("..", workingDir)
+                    picture_data = Picture_Data(source_article_id, text)
+                    sentence_data_list.insert(0, picture_data)
+                
+        article_title = input.find("title").renderContents().replace("\n", "").strip()
+        
+        image_list = []
+        imglist_tag = input.find(True, attrs={"id" : "imagelist"})
+        if imglist_tag != None:
+            for img in imglist_tag.findAll("image"):
+                caption = img.findChild("alt")
+                if caption != None:
+                    caption = caption.renderContents().replace("\n", "").strip()
+                else:
+                    caption = ""
+                image_list.append((img['href'], caption))
+        
+        data = Article_Data(article_id, article_id, article_title, "theme", section_data_list, image_list)                   
+        
+        return data
+    
+
+    def get_dita_from_article(self, article):
+        """
+        This method takes as input an instance of the Article class.
+        It calls the getData method of the article class to get the article_data representation of the article.
+        It then constructs the corresponding DITA representation of the article.
+        """
+        workingDir = IO_Manager.IO_Manager().workingDir
+        article_data = article.getData()
+        output = BeautifulStoneSoup("<?xml version='1.0' encoding='utf-8'?><!DOCTYPE reference PUBLIC \"-//IBM//DTD DITA IBM Reference//EN\" \"ibm-reference.dtd\"><reference><title>%s</title><prolog></prolog></reference>" % article_data.article_title)
+        current_ref = output.reference            
+        current_title = None
+        for section in article_data.sections_data:
+            #headings check
+            if len(section.paragraphs_data) == 1 and len(section.paragraphs_data[0].sentences_data) == 1 and section.paragraphs_data[0].sentences_data[0].id == 1:
+                paragraph = section.paragraphs_data[0]
+                current_title = paragraph.sentences_data[0].text
+            elif str(section.id).startswith("r"):
+                reference_tag = self.tag_generator(output, "reference", attrs=[("id", section.id.replace("r", ""))])
+                if current_title != None:
+                    reference_tag.append(self.tag_generator(output, "title", contents=current_title))
+                    current_title = None
+                reference_tag.append(self.tag_generator(output, "refbody"))
+                for paragraph in section.paragraphs_data:
+                    if paragraph.id == "shortdesc":
+                        paragraph_tag = self.tag_generator(output, "shortdesc")
+                    else:
+                        paragraph_tag = self.tag_generator(output, "p", attrs=[("id", str(paragraph.id))])
+                    for sentence in paragraph.sentences_data:
+                        ph_tag = self.tag_generator(output, "ph", attrs=[("id", str(sentence.id))], contents = sentence.text)
+                        paragraph_tag.append(ph_tag)
+                    reference_tag.refbody.append(paragraph_tag) 
+                output.reference.append(reference_tag)
+                current_ref = reference_tag.refbody
+            else:
+                if section.id == "shortdesc":
+                    section_tag = self.tag_generator(output, "section", attrs=[("id", "shortdesc")])
+                else:
+                    section_tag = self.tag_generator(output, "section", attrs=[("id", str(section.id).replace("s", ""))])
+                if current_title != None:
+                    section_tag.append(self.tag_generator(output, "title", contents=current_title))
+                    current_title = None
+                for paragraph in section.paragraphs_data:
+                    paragraph_tag = self.tag_generator(output, "p", attrs=[("id", str(paragraph.id))])
+                    for sentence in paragraph.sentences_data:
+                        if sentence.type == "sentence":
+                            ph_tag = self.tag_generator(output, "ph", attrs=[("id", str(sentence.id))], contents = sentence.text)
+                            paragraph_tag.append(ph_tag)
+                        elif sentence.type == "picture":
+                            # switch image to relative path
+                            image_tag = self.tag_generator(output, "image", attrs=[("href", sentence.text.replace(workingDir, ".."))])
+                            paragraph_tag.append(image_tag)
+                        else:
+                            print sentence.type
+                            
+                    section_tag.append(paragraph_tag)
+                current_ref.append(section_tag)
+            if current_title != None:
+                current_ref.append('<section id="56756757"><p id="6875534"><ph id="65657657">%s</ph></p></section>' % current_title)
+                current_title = None
+        if article_data.image_list != []:
+            for unnecessary_tag in output.findAll(True, attrs={"id" : "imagelist"}):
+                unnecessary_tag.extract()
+            image_list = self.tag_generator(output, "reference", [("id", "imagelist")])
+            output.reference.append(image_list)
+            image_list_body = self.tag_generator(output, "refbody")
+            image_list.append(image_list_body)
+            for image in article_data.image_list:
+                image_tag = self.tag_generator(output, "image", [("href", image[0])], "<alt>" + image[-1] + "</alt>")
+                image_list_body.append(image_tag)
+        dita = output.prettify()
+        return dita
+                
+    def tag_generator(self, soup, name, attrs=[], contents=None):
+        if attrs != []:
+            new_tag = Tag(soup, name, attrs)
+        else:
+            new_tag = Tag(soup, name)
+        if contents != None:
+            new_tag.insert(0, contents)
+        return new_tag
+        
+\ No newline at end of file