Add ability to import text/html files from the Journal, which can then be

used as article resources.
author: Aneesh Dogra <lionaneesh@gmail.com> 2012-12-25 18:32:03 (GMT)
committer: Aneesh Dogra <lionaneesh@gmail.com> 2012-12-25 18:32:03 (GMT)
commit: 4b5c3cf1afc2a447d8302e2d9bbdf592e50f2cc8 (patch)
tree: 9be4b25ea6d1d6041b79d427b6cea25d5cddb4f4
parent: cd0167548b70263901e25b3b0fc4e715a7c0944b (diff)
3 files changed, 113 insertions, 1 deletions
diff --git a/edit.py b/edit.py
index e48509e..90715f1 100644
--- a/edit.py
+++ b/edit.py
@@ -15,6 +15,7 @@
 from gi.repository import Gtk
 from gi.repository import GObject
 from gettext import gettext as _
+from sugar3.graphics.objectchooser import ObjectChooser
 
 from sugar3.graphics.toolbutton import ToolButton
 from sugar3.graphics.toggletoolbutton import ToggleToolButton
@@ -23,6 +24,8 @@ from infoslicer.widgets.Edit_Pane import Edit_Pane
 from infoslicer.widgets.Format_Pane import Format_Pane
 from infoslicer.widgets.Image_Pane import Image_Pane
 import book
+from infoslicer.processing.HTML_strip import dehtml
+from infoslicer.processing.Article import Article
 
 TABS = (Edit_Pane(),
         Image_Pane(),
@@ -54,6 +57,7 @@ class ToolbarBuilder():
 
         self.txt_toggle = ToggleToolButton('ascii')
         self.img_toggle = ToggleToolButton('image')
+        self.journal_chooser = ToolButton('ascii')
 
         self.txt_toggle.set_tooltip(_('Text'))
         self.txt_toggle.connect('toggled', self._toggle_cb,
@@ -65,19 +69,41 @@ class ToolbarBuilder():
             [self.txt_toggle, self.img_toggle])
         toolbar.insert(self.img_toggle, -1)
 
+        self.journal_chooser.set_tooltip(_('Choose Journal Images'))
+        self.journal_chooser.connect('clicked', self._journal_chooser_cb)
+        toolbar.insert(self.journal_chooser, -1)
+
         for tab in TABS:
             for i in tab.toolitems:
                 toolbar.insert(i, -1)
 
         self.txt_toggle.set_active(True)
 
+    def _journal_chooser_cb(self, widget):
+        """Load an HTML journal entry picked by the user as source article."""
+        chooser = ObjectChooser(what_filter='text/html')
+        try:
+            if chooser.run() != Gtk.ResponseType.ACCEPT:
+                return
+            jobject = chooser.get_selected_object()
+            if not (jobject and jobject.file_path):
+                return
+            title = str(jobject.metadata['title'])
+            with open(str(jobject.file_path), 'r') as fp:
+                text = fp.read()
+        finally:
+            chooser.destroy()  # release the dialog whatever the outcome
+        TABS[0].set_source_article(Article(dehtml(text, title)))
+
     def sensitize_all(self):
         self.txt_toggle.set_sensitive(True)
         self.img_toggle.set_sensitive(True)
+        self.journal_chooser.set_sensitive(True)
 
     def unsensitize_all(self):
         self.txt_toggle.set_sensitive(False)
         self.img_toggle.set_sensitive(False)
+        self.journal_chooser.set_sensitive(False)
 
     def _toggle_cb(self, widget, toggles):
         for tab in TABS:
diff --git a/infoslicer/processing/Article_Builder.py b/infoslicer/processing/Article_Builder.py
index b6bd750..80c76b3 100644
--- a/infoslicer/processing/Article_Builder.py
+++ b/infoslicer/processing/Article_Builder.py
@@ -150,7 +150,7 @@ def get_article_from_dita(image_path, dita):
             else:
                 image_list.append((img['href'], caption, img['orig_href']))
     
-    data = Article_Data(article_id, article_id, article_title, "theme", section_data_list, image_list)                   
+    data = Article_Data(article_id, article_id, article_title, "theme", section_data_list, image_list)
     
     return data
 
diff --git a/infoslicer/processing/HTML_strip.py b/infoslicer/processing/HTML_strip.py
new file mode 100644
index 0000000..e41ce72
--- /dev/null
+++ b/infoslicer/processing/HTML_strip.py
@@ -0,0 +1,86 @@
+# Copyright (C) 2012 Aneesh Dogra <lionaneesh@gmail.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+from HTMLParser import HTMLParser
+from re import sub
+from infoslicer.processing.Article_Data import Sentence_Data,  \
+                                               Paragraph_Data, \
+                                               Section_Data, \
+                                               Article_Data
+import string
+
+class HTML_Strip(HTMLParser):
+    """HTMLParser that keeps text and marks div/p/br tags with tokens."""
+    def __init__(self):
+        HTMLParser.__init__(self)
+        self.__text = []  # text chunks and structure tokens, in order
+
+    def handle_data(self, data):
+        """Append `data` without non-printables, whitespace collapsed."""
+        text = ''.join(c for c in data if c in string.printable)
+        # A whitespace run (line breaks included) becomes ONE space, so
+        # words wrapped across source lines are not glued together.
+        text = sub(r'\s+', ' ', text).strip()
+        if text:
+            self.__text.append(text)
+
+    def handle_starttag(self, tag, attrs):
+        """Emit the structure token for a div, p or br start tag."""
+        token = {'div': '<SECTION>', 'p': '<PARAGRAPH>',
+                 'br': '<SENTENCE>'}.get(tag)
+        if token:
+            self.__text.append(token)
+
+    def text(self):
+        return ''.join(self.__text).strip()
+
+
+def dehtml(text, title):
+    """Convert the HTML document `text` into an Article_Data titled `title`.
+
+    <div>, <p> and <br> tags delimit sections, paragraphs and sentences;
+    if the markup cannot be parsed the raw text is used unchanged.
+    """
+    try:
+        parser = HTML_Strip()
+        parser.feed(text)
+        parser.close()
+        text_stripped = parser.text()
+    except Exception:
+        # Best effort: fall back to the unparsed input on malformed HTML.
+        text_stripped = text
+
+    # Convert the marker-delimited text into an Article_Data object.
+    section_objs = []
+    for section in text_stripped.split('<SECTION>'):
+        if not section.strip():
+            continue
+        p_objs = []
+        # Split only this section; splitting the whole document here
+        # would copy every paragraph into every section.
+        for para in section.split('<PARAGRAPH>'):
+            if not para.strip():
+                continue
+            s_objs = []
+            for sentence in para.split('<SENTENCE>'):
+                sentence = sentence.strip()
+                if sentence:
+                    # Each sentence is followed by a newline sentence.
+                    s_objs += [Sentence_Data(text=sentence),
+                               Sentence_Data(text='\n')]
+            p_objs += [Paragraph_Data(sentences_data=s_objs)]
+        section_objs += [Section_Data(paragraphs_data=p_objs)]
+    return Article_Data(article_title=title, sections_data=section_objs)
author	Aneesh Dogra <lionaneesh@gmail.com>	2012-12-25 18:32:03 (GMT)
committer	Aneesh Dogra <lionaneesh@gmail.com>	2012-12-25 18:32:03 (GMT)
commit	4b5c3cf1afc2a447d8302e2d9bbdf592e50f2cc8 (patch)
tree	9be4b25ea6d1d6041b79d427b6cea25d5cddb4f4
parent	cd0167548b70263901e25b3b0fc4e715a7c0944b (diff)