From 4b5c3cf1afc2a447d8302e2d9bbdf592e50f2cc8 Mon Sep 17 00:00:00 2001 From: Aneesh Dogra Date: Tue, 25 Dec 2012 18:32:03 +0000 Subject: Add ability to import text/html files from journal, which can then be used to serve as article resources. --- diff --git a/edit.py b/edit.py index e48509e..90715f1 100644 --- a/edit.py +++ b/edit.py @@ -15,6 +15,7 @@ from gi.repository import Gtk from gi.repository import GObject from gettext import gettext as _ +from sugar3.graphics.objectchooser import ObjectChooser from sugar3.graphics.toolbutton import ToolButton from sugar3.graphics.toggletoolbutton import ToggleToolButton @@ -23,6 +24,8 @@ from infoslicer.widgets.Edit_Pane import Edit_Pane from infoslicer.widgets.Format_Pane import Format_Pane from infoslicer.widgets.Image_Pane import Image_Pane import book +from infoslicer.processing.HTML_strip import dehtml +from infoslicer.processing.Article import Article TABS = (Edit_Pane(), Image_Pane(), @@ -54,6 +57,7 @@ class ToolbarBuilder(): self.txt_toggle = ToggleToolButton('ascii') self.img_toggle = ToggleToolButton('image') + self.journal_chooser = ToolButton('ascii') self.txt_toggle.set_tooltip(_('Text')) self.txt_toggle.connect('toggled', self._toggle_cb, @@ -65,19 +69,41 @@ class ToolbarBuilder(): [self.txt_toggle, self.img_toggle]) toolbar.insert(self.img_toggle, -1) + self.journal_chooser.set_tooltip(_('Choose Journal Images')) + self.journal_chooser.connect('clicked', self._journal_chooser_cb) + toolbar.insert(self.journal_chooser, -1) + for tab in TABS: for i in tab.toolitems: toolbar.insert(i, -1) self.txt_toggle.set_active(True) + def _journal_chooser_cb(self, widget): + chooser = ObjectChooser(what_filter='text/html') + result = chooser.run() + if result == Gtk.ResponseType.ACCEPT: + jobject = chooser.get_selected_object() + if jobject and jobject.file_path: + title = str(jobject.metadata['title']) + path = str(jobject.file_path) + fp = open(path, 'r') + text = fp.read() + fp.close() + article_data = dehtml(text, 
title) + print Article(article_data), book.wiki.article + TABS[0].set_source_article(Article(article_data)) + + def sensitize_all(self): self.txt_toggle.set_sensitive(True) self.img_toggle.set_sensitive(True) + self.journal_chooser.set_sensitive(True) def unsensitize_all(self): self.txt_toggle.set_sensitive(False) self.img_toggle.set_sensitive(False) + self.journal_chooser.set_sensitive(False) def _toggle_cb(self, widget, toggles): for tab in TABS: diff --git a/infoslicer/processing/Article_Builder.py b/infoslicer/processing/Article_Builder.py index b6bd750..80c76b3 100644 --- a/infoslicer/processing/Article_Builder.py +++ b/infoslicer/processing/Article_Builder.py @@ -150,7 +150,7 @@ def get_article_from_dita(image_path, dita): else: image_list.append((img['href'], caption, img['orig_href'])) - data = Article_Data(article_id, article_id, article_title, "theme", section_data_list, image_list) + data = Article_Data(article_id, article_id, article_title, "theme", section_data_list, image_list) return data diff --git a/infoslicer/processing/HTML_strip.py b/infoslicer/processing/HTML_strip.py new file mode 100644 index 0000000..e41ce72 --- /dev/null +++ b/infoslicer/processing/HTML_strip.py @@ -0,0 +1,86 @@ +# Copyright (C) 2012 Aneesh Dogra +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA

import logging
import string
from HTMLParser import HTMLParser
from re import sub

from infoslicer.processing.Article_Data import Sentence_Data, \
                                               Paragraph_Data, \
                                               Section_Data, \
                                               Article_Data

_logger = logging.getLogger(__name__)

# Markers that HTML_Strip.text() inserts and dehtml() splits on.
# NOTE(review): the delimiter literals did not survive in the copy of the
# patch under review (they read as empty strings and a bare line break, and
# str.split('') raises ValueError).  The values below are replacements:
# control characters that are neither whitespace nor in string.printable, so
# handle_data() can never let them through as part of the document text.
# Confirm against the upstream file before merging.
_SECTION_MARK = '\x01'
_PARAGRAPH_MARK = '\x02'
_SENTENCE_MARK = '\x03'


class HTML_Strip(HTMLParser):
    """HTML parser that keeps a document's text and marks its structure.

    Text chunks and a few start tags are recorded in document order.
    text() returns them joined into one string in which _SENTENCE_MARK
    follows every text chunk, _PARAGRAPH_MARK stands for each <p> or <br>
    start tag and _SECTION_MARK for each <div> start tag.  No other
    parser callback is overridden.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.__text = []

    def handle_data(self, data):
        """Record one chunk of text, normalised and sentence-terminated."""
        # Drop characters outside string.printable first so that removing
        # them cannot leave a run of spaces behind.
        text = ''.join(ch for ch in data if ch in string.printable)
        # Tabs and line breaks inside HTML text are ordinary whitespace:
        # fold each whitespace run into a single space instead of deleting
        # it, otherwise words on adjacent source lines get glued together.
        text = sub(r'\s+', ' ', text).strip()
        if text:
            self.__text.append(text + _SENTENCE_MARK)

    def handle_starttag(self, tag, attrs):
        """Record a structural marker for <p>, <br> and <div>."""
        if tag in ('p', 'br'):
            self.__text.append(_PARAGRAPH_MARK)
        elif tag == 'div':
            self.__text.append(_SECTION_MARK)

    def text(self):
        """Return everything recorded so far as one marked-up string."""
        return ''.join(self.__text).strip()


def dehtml(text, title):
    """Convert an HTML document into an Article_Data object.

    text  -- the HTML source as a string.
    title -- passed through as the article_title of the result.

    The document is run through HTML_Strip and the marked-up result is
    split into sections, paragraphs and sentences.  Empty sentences,
    paragraphs without sentences and sections without paragraphs are
    skipped.  Every paragraph ends with an extra Sentence_Data whose text
    is a newline.

    If the parser raises, the raw text is used instead of the stripped
    text (it then normally forms a single section, paragraph and sentence).

    Returns Article_Data(article_title=title, sections_data=[...]).
    """
    try:
        parser = HTML_Strip()
        parser.feed(text)
        parser.close()
        text_stripped = parser.text()
    except Exception:
        # Best effort: an unparsable document is still imported, unparsed.
        # "except Exception" (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        _logger.warning('Could not parse HTML for %r; importing raw text',
                        title, exc_info=True)
        text_stripped = text

    section_objs = []
    for section in text_stripped.split(_SECTION_MARK):
        p_objs = []
        # Split the current section, not the whole document: otherwise
        # every section would contain every paragraph of the article.
        for para in section.split(_PARAGRAPH_MARK):
            pieces = (piece.strip() for piece in para.split(_SENTENCE_MARK))
            s_objs = [Sentence_Data(text=piece) for piece in pieces if piece]
            if not s_objs:
                continue
            # A trailing newline sentence closes each paragraph.
            s_objs.append(Sentence_Data(text='\n'))
            p_objs.append(Paragraph_Data(sentences_data=s_objs))
        if p_objs:
            section_objs.append(Section_Data(paragraphs_data=p_objs))
    return Article_Data(article_title=title, sections_data=section_objs)