diff options
-rw-r--r-- | epubfactory.py | 315 | ||||
-rw-r--r-- | epubfactory.txt | 66 |
2 files changed, 381 insertions, 0 deletions
diff --git a/epubfactory.py b/epubfactory.py new file mode 100644 index 0000000..ef519e0 --- /dev/null +++ b/epubfactory.py @@ -0,0 +1,315 @@ +# Copyright (C) 2011, Gonzalo Odiard <gonzalo@laptop.org> + +import os +import shutil +import zipfile +import BeautifulSoup +import re + + +class EpubFactory(): + + def __init__(self, title, creator, language): + self._title = title + self._creator = creator + # TODO create unique id + self._id = 'asdfasdfvsadfgsdfhfghfghdfhdfghf' + self._language = language + self._cover_image = None + + def make_epub(self, file_list): + self._tmp_directory = '/tmp' + self._list_files = file_list + + self.root_directory = self._tmp_directory + "/epub%udir" % os.getpid() + os.mkdir(self.root_directory) + + self.mimetype_file = self.create_mimetype_file() + + metainf_dir = self.root_directory + '/META-INF' + os.mkdir(metainf_dir) + self.create_container_file(metainf_dir) + + oebps_dir = self.root_directory + '/OEBPS' + os.mkdir(oebps_dir) + + self.create_toc_file(oebps_dir, file_list) + + self.images = [] + self.css = [] + for file_name in file_list: + if file_name.endswith('.html') or file_name.endswith('.htm'): + self.clean_html_file(file_name, + os.path.join(self.root_directory, 'OEBPS')) + else: + shutil.copyfile(file_name, + os.path.join(self.root_directory, 'OEBPS', + os.path.basename(file_name))) + + if len(self.images) > 0: + os.mkdir(os.path.join(oebps_dir, 'images')) + if len(self.css) > 0: + os.mkdir(os.path.join(oebps_dir, 'css')) + + content_file_list = [] + for file_name in file_list: + content_file_list.append(os.path.basename(file_name)) + + for img_name in self.images: + shutil.copyfile(img_name, + os.path.join(self.root_directory, 'OEBPS', 'images', + os.path.basename(img_name))) + content_file_list.append(os.path.join('images', + os.path.basename(img_name))) + + for css_name in self.css: + shutil.copyfile(css_name, + os.path.join(self.root_directory, 'OEBPS', 'css', + os.path.basename(css_name))) + content_file_list.append(os.path.join('css', + os.path.basename(css_name))) + + self.create_content_file(oebps_dir, content_file_list) + + def create_mimetype_file(self): + file_name = self.root_directory + "/mimetype" + fd = open(file_name, 'w') + fd.write('application/epub+zip') + fd.close() + return file_name + + def create_container_file(self, metainf_dir): + fd = open(metainf_dir + "/container.xml", 'w') + fd.write('<?xml version="1.0"?>\n') + fd.write('<container version="1.0" ') + fd.write('xmlns="urn:oasis:names:tc:opendocument:xmlns:container">\n') + fd.write('<rootfiles>\n') + fd.write('<rootfile full-path="OEBPS/content.opf" ') + fd.write('media-type="application/oebps-package+xml" />\n') + fd.write('</rootfiles>\n') + fd.write('</container>') + fd.close() + + def create_content_file(self, oebps_dir, file_list): + fd = open(oebps_dir + "/content.opf", 'w') + + fd.write('<?xml version="1.0" encoding="utf-8"?>\n') + fd.write('<package xmlns="http://www.idpf.org/2007/opf" ') + fd.write('xmlns:dc="http://purl.org/dc/elements/1.1/" ') + fd.write('unique-identifier="bookid" version="2.0">\n') + + # metadata + fd.write('<metadata>\n') + fd.write('<dc:title>%s</dc:title>\n' % self._title) + fd.write('<dc:creator>%s</dc:creator>\n' % self._creator) + fd.write('<dc:identifier id="bookid">' + + 'urn:uuid:%s</dc:identifier>\n' % self._id) + fd.write('<dc:language>%s</dc:language>\n' % self._language) + fd.write('<meta name="cover" content="%s"/>\n' % self._cover_image) + fd.write('</metadata>\n') + + # manifest + fd.write('<manifest>\n') + fd.write('<item id="ncx" href="toc.ncx" ' + + 'media-type="application/x-dtbncx+xml"/>\n') + + if self._cover_image != None: + fd.write('<item id="cover" href="title.html" ' + + 'media-type="application/xhtml+xml"/>\n') + + count = 0 + for file_name in file_list: + if file_name.endswith('.html') or file_name.endswith('.htm'): + mime = 'application/xhtml+xml' + elif file_name.endswith('.css'): + mime = 'text/css' + elif file_name.endswith('.png'): + mime = 'image/png' + elif file_name.endswith('.jpg') or file_name.endswith('.jpeg'): + mime = 'image/jpeg' + elif file_name.endswith('.gif'): + mime = 'image/gif' + + content_id = 'content' + if count > 0: + content_id = 'content%d' % count + + fd.write('<item id="%s" href="%s" ' % (content_id, file_name) + + 'media-type="%s"/>\n' % mime) + count = count + 1 + + if self._cover_image != None: + fd.write('<item id="cover-image" href="images/cover.png" ' + + 'media-type="image/png"/>\n') + fd.write('</manifest>\n') + + # spine + fd.write('<spine toc="ncx">\n') + if self._cover_image != None: + fd.write('<itemref idref="cover" linear="no"/>\n') + fd.write('<itemref idref="content"/>\n') + fd.write('</spine>\n') + + # guide + fd.write('<guide>\n') + if self._cover_image != None: + fd.write('<reference href="title.html" type="cover" ' + + 'title="Cover"/>\n') + fd.write('</guide>\n') + fd.write('</package>\n') + fd.close() + + def create_toc_file(self, oebps_dir, file_list): + fd = open(oebps_dir + "/toc.ncx", 'w') + fd.write('<?xml version="1.0" encoding="utf-8"?>\n') + fd.write('<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN"\n') + fd.write('"http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">\n') + fd.write('<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" ' + + 'version="2005-1">\n') + + fd.write('<head>\n') + fd.write('<meta name="dtb:uid" ' + + 'content="urn:uuid:%s"/>\n' % self._id) + fd.write('<meta name="dtb:depth" content="1"/>\n') + fd.write('<meta name="dtb:totalPageCount" content="0"/>\n') + fd.write('<meta name="dtb:maxPageNumber" content="0"/>\n') + fd.write('</head>\n') + + fd.write('<docTitle>\n') + fd.write('<text>%s</text>\n' % self._title) + fd.write('</docTitle>\n') + + fd.write('<navMap>\n') + np = 1 + if self._cover_image != None: + fd.write('<navPoint id="navpoint-1" playOrder="1">\n') + fd.write('<navLabel>\n') + fd.write('<text>Book cover</text>\n') + fd.write('</navLabel>\n') + fd.write('<content src="title.html"/>\n') + fd.write('</navPoint>\n') + np = np + 1 + + for file_name in file_list: + fd.write('<navPoint id="navpoint-%d" playOrder="%d">\n' % (np, np)) + fd.write('<navLabel>\n') + fd.write('<text>Contents</text>\n') + fd.write('</navLabel>\n') + fd.write('<content src="%s"/>\n' % os.path.basename(file_name)) + fd.write('</navPoint>\n') + np = np + 1 + + fd.write('</navMap>\n') + fd.write('</ncx>\n') + fd.close() + + def create_archive(self, epub_file_name): + '''Create the ZIP archive. + The mimetype must be the first file in the archive + and it must not be compressed.''' + + epub_name = '%s.epub' % epub_file_name + + # The EPUB must contain the META-INF and mimetype files at the root, so + # we'll create the archive in the working directory first + # and move it later + os.chdir(self.root_directory) + + # Open a new zipfile for writing + epub = zipfile.ZipFile(epub_name, 'w') + + # Add the mimetype file first and set it to be uncompressed + epub.write('mimetype', compress_type=zipfile.ZIP_STORED) + + # For the remaining paths in the EPUB, add all of their files + # using normal ZIP compression + self._scan_dir('.', epub) + epub.close() + + def _scan_dir(self, path, epub_file): + for p in os.listdir(path): + print "P", p + if os.path.isdir(os.path.join(path, p)): + self._scan_dir(os.path.join(path, p), epub_file) + else: + if p != 'mimetype': + epub_file.write(os.path.join(path, p), + compress_type=zipfile.ZIP_DEFLATED) + + def clean_html_file(self, file_name, dest_directory): + file_content = open(file_name).read() + soup = BeautifulSoup.BeautifulSoup(file_content) + + # change src in images and add to the image list + for img in soup.findAll('img'): + del(img['border']) + + if not img['src'].startswith('http://'): + # Same problem again: We flatten layers, so this won't work + # properly in the wild + self.images.append(os.path.join(os.path.dirname(file_name), + img['src'])) + img['src'] = os.path.join('images', + os.path.basename(img['src'])) + else: + # we need implement this + pass + + # change href in css links and add to the css list + for css in soup.findAll('link'): + if css['rel'] == 'stylesheet': + if not css['href'].startswith('http://'): + self.css.append(os.path.join( + os.path.dirname(file_name), css['href'])) + css['href'] = os.path.join('css', + os.path.basename(css['href'])) + + # remove all the script nodes + [item.extract() for item in soup.findAll('script')] + + # remove all the form nodes + [item.extract() for item in soup.findAll('form')] + + # remove all the cooments + comments = soup.findAll(text=lambda text: isinstance(text, + BeautifulSoup.Comment)) + [comment.extract() for comment in comments] + + # remove links who execute javascript (TODO is not working) + for link in soup.findAll('a'): + try: + if link['href'].startswith('javascript:'): + link.extract() + except: + pass + del(link['name']) + + # remove clear in style attribute (TODO is not working) + for element in soup.find(True): + try: + print element + if element['style'].find('clear') > -1: + del(element['style']) + except: + pass + + # remove lang property in html node + for html in soup.findAll('html'): + del(html['lang']) + + # remove onload property in body node + for body in soup.findAll('body'): + del(body['onload']) + + fd = open(os.path.join(dest_directory, + os.path.basename(file_name)), 'w') + fd.write(str(soup)) + fd.close() + + +if __name__ == '__main__': + epf = EpubFactory('Historia de la Argentina', 'Gonzalo', 'es_ES') + epf.make_epub(['datos/NewToolbar.html', + 'datos/essential.shtml.html', + 'datos/essential1.shtml.html']) + epf.create_archive('/tmp/test-f1') diff --git a/epubfactory.txt b/epubfactory.txt new file mode 100644 index 0000000..4d97fb8 --- /dev/null +++ b/epubfactory.txt @@ -0,0 +1,66 @@ +Creating EPUB files: +URLS: + http://www.ibm.com/developerworks/xml/tutorials/x-epubtut/section3.html + http://www.manuel-strehl.de/dev/simple_epub_ebooks_with_python + http://pypi.python.org/pypi/epubC/0.1.1 + http://www.epubbooks.com/blog/183/creating-an-epub-document + http://threepress.org/document/epub-validate/ + +Structure + +mimetype +META-INF/ + container.xml +OEBPS/ + content.opf + title.html + content.html + stylesheet.css + toc.ncx + images/ + cover.png + +mimetype: +application/epub+zip + +META-INF/container.xml: + +<?xml version="1.0"?> +<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container"> + <rootfiles> + <rootfile full-path="OEBPS/content.opf" + media-type="application/oebps-package+xml" /> + </rootfiles> +</container> + +content.opf: + +<?xml version='1.0' encoding='utf-8'?> +<package xmlns="http://www.idpf.org/2007/opf" + xmlns:dc="http://purl.org/dc/elements/1.1/" + unique-identifier="bookid" version="2.0"> + <metadata> + <dc:title>Hello World: My First EPUB</dc:title> + <dc:creator>My Name</dc:creator> + <dc:identifier +id="bookid">urn:uuid:0cc33cbd-94e2-49c1-909a-72ae16bc2658</dc:identifier> + <dc:language>en-US</dc:language> + <meta name="cover" content="cover-image" /> + </metadata> + <manifest> + <item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/> + <item id="cover" href="title.html" media-type="application/xhtml+xml"/> + <item id="content" href="content.html" +media-type="application/xhtml+xml"/> + <item id="cover-image" href="images/cover.png" media-type="image/png"/> + <item id="css" href="stylesheet.css" media-type="text/css"/> + </manifest> + <spine toc="ncx"> + <itemref idref="cover" linear="no"/> + <itemref idref="content"/> + </spine> + <guide> + <reference href="title.html" type="cover" title="Cover"/> + </guide> +</package> + |