Web   ·   Wiki   ·   Activities   ·   Blog   ·   Lists   ·   Chat   ·   Meeting   ·   Bugs   ·   Git   ·   Translate   ·   Archive   ·   People   ·   Donate
summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--epubfactory.py315
-rw-r--r--epubfactory.txt66
2 files changed, 381 insertions, 0 deletions
diff --git a/epubfactory.py b/epubfactory.py
new file mode 100644
index 0000000..ef519e0
--- /dev/null
+++ b/epubfactory.py
@@ -0,0 +1,315 @@
+# Copyright (C) 2011, Gonzalo Odiard <gonzalo@laptop.org>
+
+import os
+import shutil
+import zipfile
+import BeautifulSoup
+import re
+
+
class EpubFactory(object):
    '''Assemble an EPUB 2 package from local HTML files.

    Construct with the book metadata, call make_epub() with the list of
    source files to build the package tree in a temporary directory,
    then call create_archive() to zip that tree into "<name>.epub".
    '''

    # Manifest media types keyed by lower-case file extension.
    _MEDIA_TYPES = {
        '.html': 'application/xhtml+xml',
        '.htm': 'application/xhtml+xml',
        '.css': 'text/css',
        '.png': 'image/png',
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.gif': 'image/gif',
    }
    # Used for any extension not listed above, so that every manifest
    # item always carries a media type.
    _DEFAULT_MEDIA_TYPE = 'application/octet-stream'

    # Matches references that do not point to a local file: anything
    # with a URL scheme ("http:", "https:", "data:", ...) or a
    # protocol-relative "//host/..." reference.
    _REMOTE_URL = re.compile(r'^(?:[a-zA-Z][a-zA-Z0-9+.-]*:|//)')

    def __init__(self, title, creator, language):
        '''Store the book metadata and generate a unique book id.

        title, creator and language are written to the dc:title,
        dc:creator and dc:language elements of content.opf.
        '''
        import uuid

        self._title = title
        self._creator = creator
        # Random UUID, written as "urn:uuid:<id>" in the OPF and NCX.
        self._id = str(uuid.uuid4())
        self._language = language
        self._cover_image = None
        # Filled by clean_html_file(); reset by every make_epub() call.
        self.images = []
        self.css = []

    def make_epub(self, file_list):
        '''Build the unzipped EPUB tree for the files in file_list.

        HTML files (.html/.htm) are cleaned with clean_html_file();
        every other file is copied unchanged into OEBPS/.  Images and
        stylesheets referenced by the HTML files are copied into
        OEBPS/images and OEBPS/css.  The tree is created in a fresh
        temporary directory whose path is stored in self.root_directory,
        so the method can be called more than once per process.
        '''
        import tempfile

        self._tmp_directory = tempfile.gettempdir()
        self._list_files = file_list

        self.root_directory = tempfile.mkdtemp(prefix='epub',
                                               dir=self._tmp_directory)
        self.mimetype_file = self.create_mimetype_file()

        metainf_dir = os.path.join(self.root_directory, 'META-INF')
        os.mkdir(metainf_dir)
        self.create_container_file(metainf_dir)

        oebps_dir = os.path.join(self.root_directory, 'OEBPS')
        os.mkdir(oebps_dir)
        self.create_toc_file(oebps_dir, file_list)

        self.images = []
        self.css = []
        for file_name in file_list:
            if self._is_html(file_name):
                self.clean_html_file(file_name, oebps_dir)
            else:
                shutil.copyfile(file_name,
                                os.path.join(oebps_dir,
                                             os.path.basename(file_name)))

        content_file_list = [os.path.basename(name) for name in file_list]
        content_file_list += self._copy_resources(self.images, oebps_dir,
                                                  'images')
        content_file_list += self._copy_resources(self.css, oebps_dir,
                                                  'css')
        self.create_content_file(oebps_dir, content_file_list)

    def create_mimetype_file(self):
        '''Write the "mimetype" file at the package root; return its path.

        The content is written without a trailing newline.
        '''
        file_name = os.path.join(self.root_directory, 'mimetype')
        self._write_text(file_name, 'application/epub+zip')
        return file_name

    def create_container_file(self, metainf_dir):
        '''Write container.xml, which points at OEBPS/content.opf.'''
        lines = [
            '<?xml version="1.0"?>',
            '<container version="1.0" '
            'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">',
            '<rootfiles>',
            '<rootfile full-path="OEBPS/content.opf" '
            'media-type="application/oebps-package+xml" />',
            '</rootfiles>',
            '</container>',
        ]
        self._write_text(os.path.join(metainf_dir, 'container.xml'),
                         '\n'.join(lines))

    def create_content_file(self, oebps_dir, file_list):
        '''Write content.opf (metadata, manifest, spine and guide).

        file_list holds hrefs relative to oebps_dir.  Every entry gets
        a manifest item with id "content", "content1", "content2", ...;
        every HTML entry is also added to the spine, in list order.
        Cover entries are only written when self._cover_image is set.
        '''
        esc = self._esc
        has_cover = self._cover_image is not None

        lines = [
            '<?xml version="1.0" encoding="utf-8"?>',
            '<package xmlns="http://www.idpf.org/2007/opf" '
            'xmlns:dc="http://purl.org/dc/elements/1.1/" '
            'unique-identifier="bookid" version="2.0">',
            # metadata
            '<metadata>',
            '<dc:title>%s</dc:title>' % esc(self._title),
            '<dc:creator>%s</dc:creator>' % esc(self._creator),
            '<dc:identifier id="bookid">'
            'urn:uuid:%s</dc:identifier>' % esc(self._id),
            '<dc:language>%s</dc:language>' % esc(self._language),
        ]
        if has_cover:
            # The cover meta refers to the manifest id of the cover
            # image item written below.
            lines.append('<meta name="cover" content="cover-image"/>')
        lines.append('</metadata>')

        # manifest
        lines.append('<manifest>')
        lines.append('<item id="ncx" href="toc.ncx" '
                     'media-type="application/x-dtbncx+xml"/>')
        if has_cover:
            lines.append('<item id="cover" href="title.html" '
                         'media-type="application/xhtml+xml"/>')

        spine_ids = []
        for count, file_name in enumerate(file_list):
            content_id = 'content%d' % count if count else 'content'
            lines.append('<item id="%s" href="%s" media-type="%s"/>' %
                         (content_id, esc(file_name),
                          self._media_type(file_name)))
            if self._is_html(file_name):
                spine_ids.append(content_id)

        if has_cover:
            lines.append('<item id="cover-image" href="images/cover.png" '
                         'media-type="image/png"/>')
        lines.append('</manifest>')

        # spine
        lines.append('<spine toc="ncx">')
        if has_cover:
            lines.append('<itemref idref="cover" linear="no"/>')
        for content_id in spine_ids:
            lines.append('<itemref idref="%s"/>' % content_id)
        lines.append('</spine>')

        # guide
        lines.append('<guide>')
        if has_cover:
            lines.append('<reference href="title.html" type="cover" '
                         'title="Cover"/>')
        lines.append('</guide>')
        lines.append('</package>')

        self._write_text(os.path.join(oebps_dir, 'content.opf'),
                         '\n'.join(lines) + '\n')

    def create_toc_file(self, oebps_dir, file_list):
        '''Write toc.ncx with one navPoint per entry of file_list.

        When a cover is set, a "Book cover" navPoint comes first.
        '''
        esc = self._esc
        lines = [
            '<?xml version="1.0" encoding="utf-8"?>',
            '<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN"',
            '"http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">',
            '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" '
            'version="2005-1">',
            '<head>',
            '<meta name="dtb:uid" '
            'content="urn:uuid:%s"/>' % esc(self._id),
            '<meta name="dtb:depth" content="1"/>',
            '<meta name="dtb:totalPageCount" content="0"/>',
            '<meta name="dtb:maxPageNumber" content="0"/>',
            '</head>',
            '<docTitle>',
            '<text>%s</text>' % esc(self._title),
            '</docTitle>',
            '<navMap>',
        ]

        entries = []
        if self._cover_image is not None:
            entries.append(('Book cover', 'title.html'))
        for file_name in file_list:
            entries.append(('Contents', os.path.basename(file_name)))

        for order, (label, src) in enumerate(entries, 1):
            lines.extend([
                '<navPoint id="navpoint-%d" playOrder="%d">' % (order,
                                                                  order),
                '<navLabel>',
                '<text>%s</text>' % label,
                '</navLabel>',
                '<content src="%s"/>' % esc(src),
                '</navPoint>',
            ])

        lines.append('</navMap>')
        lines.append('</ncx>')
        self._write_text(os.path.join(oebps_dir, 'toc.ncx'),
                         '\n'.join(lines) + '\n')

    def create_archive(self, epub_file_name):
        '''Create the ZIP archive "<epub_file_name>.epub".

        The mimetype must be the first file in the archive
        and it must not be compressed.

        A relative epub_file_name is resolved against the caller's
        current directory; the process working directory is left
        unchanged.  Returns the absolute path of the archive.
        '''
        epub_name = os.path.abspath('%s.epub' % epub_file_name)

        epub = zipfile.ZipFile(epub_name, 'w')
        try:
            # Add the mimetype file first and set it to be uncompressed
            epub.write(os.path.join(self.root_directory, 'mimetype'),
                       'mimetype', compress_type=zipfile.ZIP_STORED)
            # For the remaining paths in the EPUB, add all of their
            # files using normal ZIP compression
            self._scan_dir(self.root_directory, epub)
        finally:
            epub.close()
        return epub_name

    def _scan_dir(self, path, epub_file):
        '''Add every file below path to epub_file, deflated.

        Archive names are taken relative to self.root_directory.  The
        root-level mimetype file (added separately, uncompressed) and
        the archive being written are skipped.
        '''
        archive_path = None
        if epub_file.filename:
            archive_path = os.path.abspath(epub_file.filename)

        for dir_path, dir_names, file_names in os.walk(path):
            # Sort in place so the walk, and thus the archive order,
            # is deterministic.
            dir_names.sort()
            for name in sorted(file_names):
                full_path = os.path.join(dir_path, name)
                arc_name = os.path.relpath(full_path, self.root_directory)
                arc_name = arc_name.replace(os.sep, '/')
                if arc_name == 'mimetype':
                    continue
                if os.path.abspath(full_path) == archive_path:
                    continue
                epub_file.write(full_path, arc_name,
                                compress_type=zipfile.ZIP_DEFLATED)

    def clean_html_file(self, file_name, dest_directory):
        '''Write a cleaned copy of file_name into dest_directory.

        Local image and stylesheet references are rewritten to the
        images/ and css/ subdirectories and their source paths are
        appended to self.images and self.css.  Script and form nodes,
        comments, javascript: links, "name" attributes of anchors,
        style attributes containing "clear", the "lang" attribute of
        html and the "onload" attribute of body are removed.
        '''
        with open(file_name) as source:
            soup = BeautifulSoup.BeautifulSoup(source.read())
        base_dir = os.path.dirname(file_name)

        # change src in images and add to the image list
        for img in soup.findAll('img'):
            del img['border']
            src = img.get('src')
            if not src or self._is_remote(src):
                # Remote images are not downloaded yet; leave them as is.
                continue
            # Directory layers are flattened: only the base name is kept.
            self.images.append(os.path.join(base_dir, src))
            img['src'] = 'images/' + os.path.basename(src)

        # change href in css links and add to the css list
        for link in soup.findAll('link'):
            rel = link.get('rel') or ''
            if not isinstance(rel, (list, tuple)):
                rel = rel.split()
            if 'stylesheet' not in [token.lower() for token in rel]:
                continue
            href = link.get('href')
            if not href or self._is_remote(href):
                continue
            self.css.append(os.path.join(base_dir, href))
            link['href'] = 'css/' + os.path.basename(href)

        # remove all the script and form nodes
        for tag_name in ('script', 'form'):
            for node in soup.findAll(tag_name):
                node.extract()

        # remove all the comments
        for comment in soup.findAll(
                text=lambda text: isinstance(text, BeautifulSoup.Comment)):
            comment.extract()

        # remove links that execute javascript
        for link in soup.findAll('a'):
            href = link.get('href') or ''
            if href.strip().lower().startswith('javascript:'):
                link.extract()
            del link['name']

        # remove style attributes that contain "clear"; findAll(True)
        # visits every tag in the document
        for element in soup.findAll(True):
            style = element.get('style')
            if style and 'clear' in style:
                del element['style']

        # remove lang property in html node
        for html in soup.findAll('html'):
            del html['lang']

        # remove onload property in body node
        for body in soup.findAll('body'):
            del body['onload']

        self._write_text(os.path.join(dest_directory,
                                      os.path.basename(file_name)),
                         str(soup))

    def _copy_resources(self, sources, oebps_dir, subdir):
        '''Copy sources into oebps_dir/subdir; return their hrefs.

        A base name that occurs more than once is copied and listed
        only once, so the manifest never contains a duplicate href.
        The subdirectory is created only when there is something to
        copy.
        '''
        hrefs = []
        seen = set()
        target_dir = os.path.join(oebps_dir, subdir)
        for source in sources:
            name = os.path.basename(source)
            if name in seen:
                continue
            if not seen:
                os.mkdir(target_dir)
            seen.add(name)
            shutil.copyfile(source, os.path.join(target_dir, name))
            hrefs.append('%s/%s' % (subdir, name))
        return hrefs

    @classmethod
    def _media_type(cls, file_name):
        '''Return the manifest media type for file_name's extension.'''
        extension = os.path.splitext(file_name)[1].lower()
        return cls._MEDIA_TYPES.get(extension, cls._DEFAULT_MEDIA_TYPE)

    @classmethod
    def _is_remote(cls, url):
        '''Return True when url has a scheme or starts with "//".'''
        return cls._REMOTE_URL.match(url.strip()) is not None

    @staticmethod
    def _is_html(file_name):
        '''Return True for names ending in .html or .htm (any case).'''
        return file_name.lower().endswith(('.html', '.htm'))

    @staticmethod
    def _esc(value):
        '''Escape value for XML text or a double-quoted attribute.'''
        from xml.sax.saxutils import escape

        return escape('%s' % (value,), {'"': '&quot;'})

    @staticmethod
    def _write_text(path, text):
        '''Write text to path, encoding non-byte strings as UTF-8.

        The file is closed even when the write fails.
        '''
        if not isinstance(text, bytes):
            text = text.encode('utf-8')
        with open(path, 'wb') as fd:
            fd.write(text)
+
+
if __name__ == '__main__':
    # Manual smoke test: package three sample pages from ./datos and
    # write the resulting book to /tmp/test-f1.epub.
    sample_pages = [
        'datos/NewToolbar.html',
        'datos/essential.shtml.html',
        'datos/essential1.shtml.html',
    ]
    factory = EpubFactory('Historia de la Argentina', 'Gonzalo', 'es_ES')
    factory.make_epub(sample_pages)
    factory.create_archive('/tmp/test-f1')
diff --git a/epubfactory.txt b/epubfactory.txt
new file mode 100644
index 0000000..4d97fb8
--- /dev/null
+++ b/epubfactory.txt
@@ -0,0 +1,66 @@
+Creating EPUB files:
+URLS:
+ http://www.ibm.com/developerworks/xml/tutorials/x-epubtut/section3.html
+ http://www.manuel-strehl.de/dev/simple_epub_ebooks_with_python
+ http://pypi.python.org/pypi/epubC/0.1.1
+ http://www.epubbooks.com/blog/183/creating-an-epub-document
+ http://threepress.org/document/epub-validate/
+
+Structure
+
+mimetype
+META-INF/
+ container.xml
+OEBPS/
+ content.opf
+ title.html
+ content.html
+ stylesheet.css
+ toc.ncx
+ images/
+ cover.png
+
+mimetype:
+application/epub+zip
+
+META-INF/container.xml:
+
+<?xml version="1.0"?>
+<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
+ <rootfiles>
+ <rootfile full-path="OEBPS/content.opf"
+ media-type="application/oebps-package+xml" />
+ </rootfiles>
+</container>
+
+content.opf:
+
+<?xml version='1.0' encoding='utf-8'?>
+<package xmlns="http://www.idpf.org/2007/opf"
+ xmlns:dc="http://purl.org/dc/elements/1.1/"
+ unique-identifier="bookid" version="2.0">
+ <metadata>
+ <dc:title>Hello World: My First EPUB</dc:title>
+ <dc:creator>My Name</dc:creator>
+ <dc:identifier
+id="bookid">urn:uuid:0cc33cbd-94e2-49c1-909a-72ae16bc2658</dc:identifier>
+ <dc:language>en-US</dc:language>
+ <meta name="cover" content="cover-image" />
+ </metadata>
+ <manifest>
+ <item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>
+ <item id="cover" href="title.html" media-type="application/xhtml+xml"/>
+ <item id="content" href="content.html"
+media-type="application/xhtml+xml"/>
+ <item id="cover-image" href="images/cover.png" media-type="image/png"/>
+ <item id="css" href="stylesheet.css" media-type="text/css"/>
+ </manifest>
+ <spine toc="ncx">
+ <itemref idref="cover" linear="no"/>
+ <itemref idref="content"/>
+ </spine>
+ <guide>
+ <reference href="title.html" type="cover" title="Cover"/>
+ </guide>
+</package>
+