# Copyright (C) 2011, Gonzalo Odiard
#
# Provenance: commit 8ba2a7015f057a4206df0d6e6ed4b56e201e0b2c, "Initial push",
# by Gonzalo Odiard, Sat, 07 Jul 2012 22:13:32 +0000 (Signed-off-by: Gonzalo
# Odiard).  The commit added epubfactory.py (this module) and the notes file
# epubfactory.txt, whose content is kept in the module docstring below.
"""Build EPUB 2 books from local HTML files and the resources they reference.

Typical use::

    factory = EpubFactory('Title', 'Author', 'en')
    factory.make_epub(['page1.html', 'page2.html'])
    factory.create_archive('/tmp/book')      # writes /tmp/book.epub

Development notes (originally epubfactory.txt)
----------------------------------------------
Creating EPUB files, URLs:

    http://www.ibm.com/developerworks/xml/tutorials/x-epubtut/section3.html
    http://www.manuel-strehl.de/dev/simple_epub_ebooks_with_python
    http://pypi.python.org/pypi/epubC/0.1.1
    http://www.epubbooks.com/blog/183/creating-an-epub-document
    http://threepress.org/document/epub-validate/

Structure::

    mimetype
    META-INF/
        container.xml
    OEBPS/
        content.opf
        title.html
        content.html
        stylesheet.css
        toc.ncx
        images/
            cover.png

mimetype::

    application/epub+zip

META-INF/container.xml (markup reconstructed; it was lost in the copy of
the notes this module was restored from)::

    <?xml version="1.0"?>
    <container version="1.0"
            xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
      <rootfiles>
        <rootfile full-path="OEBPS/content.opf"
                media-type="application/oebps-package+xml"/>
      </rootfiles>
    </container>

content.opf (markup reconstructed as above)::

    <?xml version="1.0" encoding="UTF-8"?>
    <package xmlns="http://www.idpf.org/2007/opf"
            xmlns:dc="http://purl.org/dc/elements/1.1/"
            unique-identifier="bookid" version="2.0">
      <metadata>
        <dc:title>Hello World: My First EPUB</dc:title>
        <dc:creator>My Name</dc:creator>
        <dc:identifier id="bookid">
            urn:uuid:0cc33cbd-94e2-49c1-909a-72ae16bc2658</dc:identifier>
        <dc:language>en-US</dc:language>
        <meta name="cover" content="cover-image"/>
      </metadata>
      <manifest>
        <item id="ncx" href="toc.ncx"
                media-type="application/x-dtbncx+xml"/>
        <item id="cover" href="title.html"
                media-type="application/xhtml+xml"/>
        <item id="content" href="content.html"
                media-type="application/xhtml+xml"/>
        <item id="cover-image" href="images/cover.png"
                media-type="image/png"/>
        <item id="css" href="stylesheet.css" media-type="text/css"/>
      </manifest>
      <spine toc="ncx">
        <itemref idref="cover" linear="no"/>
        <itemref idref="content"/>
      </spine>
      <guide>
        <reference href="title.html" type="cover" title="Cover"/>
      </guide>
    </package>
"""

import os
import shutil
import tempfile
import uuid
import zipfile
import re
from xml.sax.saxutils import escape
from xml.sax.saxutils import quoteattr

try:
    import BeautifulSoup
except ImportError:
    # Only clean_html_file() needs BeautifulSoup.  Keeping the module
    # importable without it lets books made purely of non-HTML files, and
    # the archive step, work; clean_html_file() raises a clear error instead.
    BeautifulSoup = None


# Manifest media types, keyed by lower-cased file extension.
_MEDIA_TYPES = {
    '.html': 'application/xhtml+xml',
    '.htm': 'application/xhtml+xml',
    '.css': 'text/css',
    '.png': 'image/png',
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.gif': 'image/gif',
}

# Used for any extension that is not listed in _MEDIA_TYPES.
_DEFAULT_MEDIA_TYPE = 'application/octet-stream'

_HTML_EXTENSIONS = ('.html', '.htm')

# Matches references that are not plain local paths: anything with a URI
# scheme (http:, https:, data:, ...) or a protocol-relative "//host/..." URL.
_REMOTE_URL = re.compile(r'^(?:[a-zA-Z][a-zA-Z0-9+.\-]*:|//)')


def _is_html(file_name):
    """Return True when *file_name* has an HTML extension (any case)."""
    return file_name.lower().endswith(_HTML_EXTENSIONS)


def _media_type(file_name):
    """Return the manifest media type for *file_name*, by its extension."""
    extension = os.path.splitext(file_name)[1].lower()
    return _MEDIA_TYPES.get(extension, _DEFAULT_MEDIA_TYPE)


def _xml_text(value):
    """Return *value* formatted with %s and escaped for XML element text."""
    return escape('%s' % (value,))


def _write_lines(file_name, lines):
    """Write *lines* to *file_name*, each terminated by a newline."""
    with open(file_name, 'w') as fd:
        fd.write('\n'.join(lines))
        fd.write('\n')


class EpubFactory(object):
    """Assemble an EPUB 2 directory tree and zip it into an .epub file.

    Call make_epub() first to build the tree under a fresh temporary
    directory (available afterwards as ``root_directory``), then
    create_archive() to produce the final archive.
    """

    def __init__(self, title, creator, language):
        """Store the book metadata.

        title, creator and language end up in the dc:title, dc:creator and
        dc:language elements of content.opf; title is also the docTitle of
        toc.ncx.
        """
        self._title = title
        self._creator = creator
        # The id is written as "urn:uuid:<id>" in content.opf and toc.ncx,
        # so it has to be a real, per-book UUID.
        self._id = str(uuid.uuid4())
        self._language = language
        # Nothing in this module sets a cover yet; the cover branches below
        # stay dormant while this is None.
        self._cover_image = None
        self.images = []
        self.css = []

    def make_epub(self, file_list):
        """Build the unpacked EPUB tree for the files in *file_list*.

        HTML files (.html/.htm) are cleaned with clean_html_file(); every
        other file is copied unchanged.  All files land flat in OEBPS/ under
        their base name; local images and stylesheets referenced by the HTML
        files are copied to OEBPS/images and OEBPS/css.

        Sets ``root_directory`` (a new temporary directory on every call),
        ``mimetype_file``, ``images`` and ``css``.  Raises IOError/OSError
        when a listed or referenced file cannot be copied.
        """
        self._tmp_directory = tempfile.gettempdir()
        self._list_files = file_list

        # mkdtemp gives a unique, private directory, so repeated calls in
        # one process (or a stale directory from a reused pid) cannot clash.
        self.root_directory = tempfile.mkdtemp(
            prefix='epub', suffix='dir', dir=self._tmp_directory)

        self.mimetype_file = self.create_mimetype_file()

        metainf_dir = os.path.join(self.root_directory, 'META-INF')
        os.mkdir(metainf_dir)
        self.create_container_file(metainf_dir)

        oebps_dir = os.path.join(self.root_directory, 'OEBPS')
        os.mkdir(oebps_dir)

        self.create_toc_file(oebps_dir, file_list)

        self.images = []
        self.css = []
        for file_name in file_list:
            if _is_html(file_name):
                # Fills self.images / self.css as a side effect.
                self.clean_html_file(file_name, oebps_dir)
            else:
                shutil.copyfile(
                    file_name,
                    os.path.join(oebps_dir, os.path.basename(file_name)))

        content_file_list = [os.path.basename(name) for name in file_list]
        content_file_list.extend(
            self._copy_resources(self.images, oebps_dir, 'images'))
        content_file_list.extend(
            self._copy_resources(self.css, oebps_dir, 'css'))

        self.create_content_file(oebps_dir, content_file_list)

    def _copy_resources(self, sources, oebps_dir, subdir):
        """Copy *sources* into OEBPS/<subdir>; return their manifest hrefs.

        The directory is only created when there is something to copy.
        Hrefs always use '/' and are listed once, even when several pages
        reference the same file.
        """
        if not sources:
            return []

        target_dir = os.path.join(oebps_dir, subdir)
        os.mkdir(target_dir)

        hrefs = []
        copied = set()
        for source in sources:
            if source in copied:
                continue
            copied.add(source)
            base_name = os.path.basename(source)
            shutil.copyfile(source, os.path.join(target_dir, base_name))
            # The tree is flattened, so two sources with the same base name
            # collide; the later copy wins and the href is listed once.
            href = '%s/%s' % (subdir, base_name)
            if href not in hrefs:
                hrefs.append(href)
        return hrefs

    def create_mimetype_file(self):
        """Write the 'mimetype' file at the tree root and return its path."""
        file_name = os.path.join(self.root_directory, 'mimetype')
        with open(file_name, 'w') as fd:
            # No trailing newline: the file holds exactly the media type.
            fd.write('application/epub+zip')
        return file_name

    def create_container_file(self, metainf_dir):
        """Write container.xml in *metainf_dir*, pointing at content.opf."""
        _write_lines(os.path.join(metainf_dir, 'container.xml'), [
            '<?xml version="1.0"?>',
            '<container version="1.0" '
            'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">',
            '  <rootfiles>',
            '    <rootfile full-path="OEBPS/content.opf" '
            'media-type="application/oebps-package+xml"/>',
            '  </rootfiles>',
            '</container>',
        ])

    def create_content_file(self, oebps_dir, file_list):
        """Write content.opf (metadata, manifest, spine, guide).

        *file_list* holds hrefs relative to OEBPS/.  Items get the ids
        'content', 'content1', 'content2', ... in list order; every HTML
        item is added to the spine in the same order.
        """
        has_cover = self._cover_image is not None

        lines = [
            '<?xml version="1.0" encoding="UTF-8"?>',
            '<package xmlns="http://www.idpf.org/2007/opf" '
            'xmlns:dc="http://purl.org/dc/elements/1.1/" '
            'unique-identifier="bookid" version="2.0">',
        ]

        # metadata
        lines.append('  <metadata>')
        lines.append('    <dc:title>%s</dc:title>' % _xml_text(self._title))
        lines.append('    <dc:creator>%s</dc:creator>' %
                     _xml_text(self._creator))
        lines.append('    <dc:identifier id="bookid">urn:uuid:%s'
                     '</dc:identifier>' % _xml_text(self._id))
        lines.append('    <dc:language>%s</dc:language>' %
                     _xml_text(self._language))
        if has_cover:
            # The meta value is the id of the cover manifest item.
            lines.append('    <meta name="cover" content="cover-image"/>')
        lines.append('  </metadata>')

        # manifest
        lines.append('  <manifest>')
        lines.append('    <item id="ncx" href="toc.ncx" '
                     'media-type="application/x-dtbncx+xml"/>')
        if has_cover:
            # NOTE(review): title.html is not generated by this module;
            # whoever adds cover support has to provide it.
            lines.append('    <item id="cover" href="title.html" '
                         'media-type="application/xhtml+xml"/>')

        spine_ids = []
        for count, href in enumerate(file_list):
            content_id = 'content' if count == 0 else 'content%d' % count
            lines.append('    <item id=%s href=%s media-type=%s/>' % (
                quoteattr(content_id), quoteattr(href),
                quoteattr(_media_type(href))))
            if _is_html(href):
                spine_ids.append(content_id)

        if has_cover:
            lines.append('    <item id="cover-image" href=%s media-type=%s/>'
                         % (quoteattr(self._cover_image),
                            quoteattr(_media_type(self._cover_image))))
        lines.append('  </manifest>')

        # spine: the reading order, every HTML document in manifest order
        lines.append('  <spine toc="ncx">')
        if has_cover:
            lines.append('    <itemref idref="cover" linear="no"/>')
        for content_id in spine_ids:
            lines.append('    <itemref idref=%s/>' % quoteattr(content_id))
        lines.append('  </spine>')

        # guide
        lines.append('  <guide>')
        if has_cover:
            lines.append('    <reference href="title.html" type="cover" '
                         'title="Cover"/>')
        lines.append('  </guide>')
        lines.append('</package>')

        _write_lines(os.path.join(oebps_dir, 'content.opf'), lines)

    def create_toc_file(self, oebps_dir, file_list):
        """Write toc.ncx with one navPoint per entry of *file_list*.

        Entries are referenced by base name and all labelled 'Contents';
        play order follows the list order (after the cover, when set).
        """
        lines = [
            '<?xml version="1.0" encoding="UTF-8"?>',
            '<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN" '
            '"http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">',
            '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" '
            'version="2005-1">',
            '  <head>',
            # Must match the dc:identifier written to content.opf.
            '    <meta name="dtb:uid" content=%s/>' %
            quoteattr('urn:uuid:%s' % self._id),
            '    <meta name="dtb:depth" content="1"/>',
            '    <meta name="dtb:totalPageCount" content="0"/>',
            '    <meta name="dtb:maxPageNumber" content="0"/>',
            '  </head>',
            '  <docTitle>',
            '    <text>%s</text>' % _xml_text(self._title),
            '  </docTitle>',
            '  <navMap>',
        ]

        nav_points = []
        if self._cover_image is not None:
            nav_points.append(('Book cover', 'title.html'))
        for file_name in file_list:
            nav_points.append(('Contents', os.path.basename(file_name)))

        for index, (label, src) in enumerate(nav_points):
            order = index + 1
            lines.extend([
                '    <navPoint id="navpoint-%d" playOrder="%d">' %
                (order, order),
                '      <navLabel>',
                '        <text>%s</text>' % _xml_text(label),
                '      </navLabel>',
                '      <content src=%s/>' % quoteattr(src),
                '    </navPoint>',
            ])

        lines.append('  </navMap>')
        lines.append('</ncx>')

        _write_lines(os.path.join(oebps_dir, 'toc.ncx'), lines)

    def create_archive(self, epub_file_name):
        """Create the ZIP archive '<epub_file_name>.epub' and return its path.

        The mimetype must be the first file in the archive
        and it must not be compressed.

        A relative *epub_file_name* is resolved against the caller's current
        directory.  The process working directory is left untouched.
        """
        epub_name = os.path.abspath('%s.epub' % epub_file_name)

        epub = zipfile.ZipFile(epub_name, 'w')
        try:
            # Add the mimetype file first and set it to be uncompressed
            epub.write(os.path.join(self.root_directory, 'mimetype'),
                       'mimetype', compress_type=zipfile.ZIP_STORED)

            # For the remaining paths in the EPUB, add all of their files
            # using normal ZIP compression
            self._scan_dir(self.root_directory, epub)
        finally:
            epub.close()
        return epub_name

    def _scan_dir(self, path, epub_file):
        """Recursively add the files below *path* to *epub_file*, deflated.

        Archive names are relative to ``root_directory`` so META-INF and
        OEBPS sit at the archive root.  The root 'mimetype' file is skipped
        because create_archive() already stored it as the first entry.
        """
        archive_path = None
        if epub_file.filename:
            archive_path = os.path.abspath(epub_file.filename)

        # Sorted so the archive layout does not depend on directory order.
        for entry in sorted(os.listdir(path)):
            full_path = os.path.join(path, entry)
            if os.path.isdir(full_path):
                self._scan_dir(full_path, epub_file)
                continue

            arc_name = os.path.relpath(full_path, self.root_directory)
            if arc_name == 'mimetype':
                continue
            if os.path.abspath(full_path) == archive_path:
                # Never add the archive to itself when it is being written
                # inside the tree.
                continue
            epub_file.write(full_path, arc_name,
                            compress_type=zipfile.ZIP_DEFLATED)

    def clean_html_file(self, file_name, dest_directory):
        """Write a cleaned copy of *file_name* into *dest_directory*.

        Local image and stylesheet references are rewritten to images/ and
        css/ and their source paths are appended to ``self.images`` and
        ``self.css``.  Scripts, forms, comments, javascript: links, 'name'
        attributes of anchors, style attributes mentioning 'clear', and the
        html lang / body onload attributes are removed.

        Raises ImportError when BeautifulSoup is not installed.
        """
        if BeautifulSoup is None:
            raise ImportError('BeautifulSoup is required to clean HTML files')

        with open(file_name) as source:
            soup = BeautifulSoup.BeautifulSoup(source.read())
        source_dir = os.path.dirname(file_name)

        # change src in images and add to the image list
        for img in soup.findAll('img'):
            del img['border']

            src = img.get('src')
            if not src or _REMOTE_URL.match(src):
                # Remote images are left untouched; fetching them is not
                # implemented yet.
                continue
            # Same problem again: We flatten layers, so this won't work
            # properly in the wild
            self.images.append(os.path.join(source_dir, src))
            img['src'] = 'images/%s' % os.path.basename(src)

        # change href in css links and add to the css list
        for link in soup.findAll('link'):
            rel = link.get('rel') or ''
            href = link.get('href')
            if rel.lower() != 'stylesheet':
                continue
            if not href or _REMOTE_URL.match(href):
                continue
            self.css.append(os.path.join(source_dir, href))
            link['href'] = 'css/%s' % os.path.basename(href)

        # remove all the script and form nodes
        for tag_name in ('script', 'form'):
            for node in soup.findAll(tag_name):
                node.extract()

        # remove all the comments
        comments = soup.findAll(
            text=lambda text: isinstance(text, BeautifulSoup.Comment))
        for comment in comments:
            comment.extract()

        # remove links that execute javascript
        for anchor in soup.findAll('a'):
            href = anchor.get('href') or ''
            if href.strip().lower().startswith('javascript:'):
                anchor.extract()
            del anchor['name']

        # remove style attributes that use 'clear'; findAll(True) visits
        # every tag in the document, not just the first one's children
        for element in soup.findAll(True):
            style = element.get('style')
            if style and 'clear' in style:
                del element['style']

        # remove lang property in html node
        for html in soup.findAll('html'):
            del html['lang']

        # remove onload property in body node
        for body in soup.findAll('body'):
            del body['onload']

        destination = os.path.join(dest_directory,
                                   os.path.basename(file_name))
        with open(destination, 'w') as fd:
            fd.write(str(soup))


if __name__ == '__main__':
    epf = EpubFactory('Historia de la Argentina', 'Gonzalo', 'es_ES')
    epf.make_epub(['datos/NewToolbar.html',
                   'datos/essential.shtml.html',
                   'datos/essential1.shtml.html'])
    epf.create_archive('/tmp/test-f1')