# epubfactory.py

# Copyright (C) 2011, Gonzalo Odiard <gonzalo@laptop.org>

import os
import re
import shutil
import tempfile
import uuid
import zipfile
from xml.sax.saxutils import escape

import BeautifulSoup


class EpubFactory(object):
    """Assemble an EPUB 2 package from a list of local files.

    Usage: build the factory, call make_epub() to lay out the package tree
    (mimetype, META-INF/container.xml, OEBPS/...) in a fresh temporary
    directory, then call create_archive() to zip that tree.
    """

    # (suffix, media type) pairs used for the OPF manifest.  Suffixes are
    # matched against the lower-cased file name.
    _MEDIA_TYPES = (
        ('.html', 'application/xhtml+xml'),
        ('.htm', 'application/xhtml+xml'),
        ('.css', 'text/css'),
        ('.png', 'image/png'),
        ('.jpg', 'image/jpeg'),
        ('.jpeg', 'image/jpeg'),
        ('.gif', 'image/gif'),
    )
    # Used when no suffix matches, so the manifest never reuses the media
    # type of the previous item (or fails on the first one).
    _DEFAULT_MEDIA_TYPE = 'application/octet-stream'

    # A reference is "remote" when it starts with a URL scheme (http:,
    # https:, data:, ...) or is protocol-relative (//host/...).  The scheme
    # needs two or more characters so a one-letter drive prefix is not
    # taken for a scheme.
    _REMOTE_URL = re.compile(r'^(?:[a-zA-Z][a-zA-Z0-9+.-]+:|//)')

    def __init__(self, title, creator, language):
        """Store the book metadata written to content.opf and toc.ncx.

        title, creator and language are inserted (XML-escaped) into the
        dc:title, dc:creator and dc:language elements.
        """
        self._title = title
        self._creator = creator
        # Random identifier, written as urn:uuid:<id> in both the OPF and
        # the NCX, so two books no longer share the same id.
        self._id = str(uuid.uuid4())
        self._language = language
        self._cover_image = None
        # Filled by clean_html_file(); initialised here so that method can
        # also be called without going through make_epub().
        self.images = []
        self.css = []

    @staticmethod
    def _xml_text(value):
        """Return value formatted with %s and escaped for XML text."""
        return escape('%s' % (value,))

    @staticmethod
    def _xml_attr(value):
        """Return value escaped for a double-quoted XML attribute."""
        return escape('%s' % (value,), {'"': '&quot;'})

    @staticmethod
    def _is_html(file_name):
        """Tell whether file_name ends in .html or .htm (any case)."""
        return file_name.lower().endswith(('.html', '.htm'))

    def _is_remote(self, reference):
        """Tell whether an src/href value points outside the local files."""
        return self._REMOTE_URL.match(reference) is not None

    def _media_type(self, file_name):
        """Return the manifest media type for file_name."""
        lowered = file_name.lower()
        for suffix, media_type in self._MEDIA_TYPES:
            if lowered.endswith(suffix):
                return media_type
        return self._DEFAULT_MEDIA_TYPE

    def make_epub(self, file_list):
        """Lay out the unpacked EPUB tree for the files in file_list.

        HTML files are passed through clean_html_file(); any other file is
        copied unchanged into OEBPS.  Images and stylesheets referenced by
        the HTML files are copied into OEBPS/images and OEBPS/css.  The
        tree is created under self.root_directory.
        """
        self._tmp_directory = tempfile.gettempdir()
        self._list_files = file_list

        # mkdtemp returns a new private directory on every call, so calling
        # make_epub() twice in one process no longer fails in os.mkdir.
        self.root_directory = tempfile.mkdtemp(prefix='epub', suffix='dir',
                dir=self._tmp_directory)

        self.mimetype_file = self.create_mimetype_file()

        metainf_dir = os.path.join(self.root_directory, 'META-INF')
        os.mkdir(metainf_dir)
        self.create_container_file(metainf_dir)

        oebps_dir = os.path.join(self.root_directory, 'OEBPS')
        os.mkdir(oebps_dir)

        self.create_toc_file(oebps_dir, file_list)

        self.images = []
        self.css = []
        for file_name in file_list:
            if self._is_html(file_name):
                self.clean_html_file(file_name, oebps_dir)
            else:
                shutil.copyfile(file_name,
                    os.path.join(oebps_dir, os.path.basename(file_name)))

        content_file_list = [os.path.basename(file_name)
                for file_name in file_list]
        content_file_list.extend(
                self._copy_resources(self.images, oebps_dir, 'images'))
        content_file_list.extend(
                self._copy_resources(self.css, oebps_dir, 'css'))

        self.create_content_file(oebps_dir, content_file_list)

    def _copy_resources(self, sources, oebps_dir, subdir):
        """Copy sources into oebps_dir/subdir and return their hrefs.

        The sub-directory is only created when there is something to copy.
        A source referenced several times is copied once, and every href
        is returned once, so the manifest does not list a file twice.
        """
        hrefs = []
        if not sources:
            return hrefs

        target_dir = os.path.join(oebps_dir, subdir)
        os.mkdir(target_dir)
        copied = set()
        for source in sources:
            if source in copied:
                continue
            copied.add(source)
            base_name = os.path.basename(source)
            shutil.copyfile(source, os.path.join(target_dir, base_name))
            href = os.path.join(subdir, base_name)
            if href not in hrefs:
                hrefs.append(href)
        return hrefs

    def create_mimetype_file(self):
        """Write the 'mimetype' file in the root directory; return its path.

        The content is written without a trailing newline.
        """
        file_name = os.path.join(self.root_directory, 'mimetype')
        with open(file_name, 'w') as fd:
            fd.write('application/epub+zip')
        return file_name

    def create_container_file(self, metainf_dir):
        """Write container.xml, pointing at OEBPS/content.opf."""
        with open(os.path.join(metainf_dir, 'container.xml'), 'w') as fd:
            fd.write('<?xml version="1.0"?>\n')
            fd.write('<container version="1.0" ')
            fd.write(
                'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">\n')
            fd.write('<rootfiles>\n')
            fd.write('<rootfile full-path="OEBPS/content.opf" ')
            fd.write('media-type="application/oebps-package+xml" />\n')
            fd.write('</rootfiles>\n')
            fd.write('</container>')

    def create_content_file(self, oebps_dir, file_list):
        """Write content.opf (metadata, manifest, spine, guide).

        file_list holds hrefs relative to oebps_dir.  Items get the ids
        'content', 'content1', 'content2', ... in list order, and every
        HTML item is added to the spine in that same order.
        """
        has_cover = self._cover_image is not None

        with open(os.path.join(oebps_dir, 'content.opf'), 'w') as fd:
            fd.write('<?xml version="1.0" encoding="utf-8"?>\n')
            fd.write('<package xmlns="http://www.idpf.org/2007/opf" ')
            fd.write('xmlns:dc="http://purl.org/dc/elements/1.1/" ')
            fd.write('unique-identifier="bookid" version="2.0">\n')

            # metadata
            fd.write('<metadata>\n')
            fd.write('<dc:title>%s</dc:title>\n' %
                    self._xml_text(self._title))
            fd.write('<dc:creator>%s</dc:creator>\n' %
                    self._xml_text(self._creator))
            fd.write('<dc:identifier id="bookid">' +
                    'urn:uuid:%s</dc:identifier>\n' %
                    self._xml_text(self._id))
            fd.write('<dc:language>%s</dc:language>\n' %
                    self._xml_text(self._language))
            # Without a cover there is nothing to point at; the old code
            # wrote content="None" here.
            if has_cover:
                fd.write('<meta name="cover" content="%s"/>\n' %
                        self._xml_attr(self._cover_image))
            fd.write('</metadata>\n')

            # manifest
            fd.write('<manifest>\n')
            fd.write('<item id="ncx" href="toc.ncx" ' +
                    'media-type="application/x-dtbncx+xml"/>\n')

            if has_cover:
                fd.write('<item id="cover" href="title.html" ' +
                        'media-type="application/xhtml+xml"/>\n')

            spine_ids = []
            for count, file_name in enumerate(file_list):
                content_id = 'content'
                if count > 0:
                    content_id = 'content%d' % count
                if self._is_html(file_name):
                    spine_ids.append(content_id)

                fd.write('<item id="%s" href="%s" media-type="%s"/>\n' % (
                        content_id, self._xml_attr(file_name),
                        self._media_type(file_name)))

            if has_cover:
                fd.write('<item id="cover-image" href="images/cover.png" ' +
                        'media-type="image/png"/>\n')
            fd.write('</manifest>\n')

            # spine: every HTML document, not only the first one
            fd.write('<spine toc="ncx">\n')
            if has_cover:
                fd.write('<itemref idref="cover" linear="no"/>\n')
            for content_id in spine_ids:
                fd.write('<itemref idref="%s"/>\n' % content_id)
            fd.write('</spine>\n')

            # guide
            fd.write('<guide>\n')
            if has_cover:
                fd.write('<reference href="title.html" type="cover" ' +
                        'title="Cover"/>\n')
            fd.write('</guide>\n')
            fd.write('</package>\n')

    def create_toc_file(self, oebps_dir, file_list):
        """Write toc.ncx with one navPoint per entry of file_list.

        Each navPoint is labelled 'Contents' and points at the base name
        of the file.  When a cover is set, a 'Book cover' navPoint for
        title.html comes first.
        """
        with open(os.path.join(oebps_dir, 'toc.ncx'), 'w') as fd:
            fd.write('<?xml version="1.0" encoding="utf-8"?>\n')
            fd.write('<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN"\n')
            fd.write('"http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">\n')
            fd.write('<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" ' +
                    'version="2005-1">\n')

            fd.write('<head>\n')
            fd.write('<meta name="dtb:uid" ' +
                    'content="urn:uuid:%s"/>\n' % self._xml_attr(self._id))
            fd.write('<meta name="dtb:depth" content="1"/>\n')
            fd.write('<meta name="dtb:totalPageCount" content="0"/>\n')
            fd.write('<meta name="dtb:maxPageNumber" content="0"/>\n')
            fd.write('</head>\n')

            fd.write('<docTitle>\n')
            fd.write('<text>%s</text>\n' % self._xml_text(self._title))
            fd.write('</docTitle>\n')

            fd.write('<navMap>\n')
            play_order = 1
            if self._cover_image is not None:
                fd.write('<navPoint id="navpoint-1" playOrder="1">\n')
                fd.write('<navLabel>\n')
                fd.write('<text>Book cover</text>\n')
                fd.write('</navLabel>\n')
                fd.write('<content src="title.html"/>\n')
                fd.write('</navPoint>\n')
                play_order = play_order + 1

            for file_name in file_list:
                fd.write('<navPoint id="navpoint-%d" playOrder="%d">\n' %
                        (play_order, play_order))
                fd.write('<navLabel>\n')
                fd.write('<text>Contents</text>\n')
                fd.write('</navLabel>\n')
                fd.write('<content src="%s"/>\n' %
                        self._xml_attr(os.path.basename(file_name)))
                fd.write('</navPoint>\n')
                play_order = play_order + 1

            fd.write('</navMap>\n')
            fd.write('</ncx>\n')

    def create_archive(self, epub_file_name):
        '''Create the ZIP archive and return its absolute path.

        The mimetype must be the first file in the archive
        and it must not be compressed.

        '.epub' is appended to epub_file_name.  An absolute name is used
        as given; a relative name is resolved inside self.root_directory,
        which is where the earlier chdir-based code created it.  The
        process working directory is no longer changed.'''

        epub_name = '%s.epub' % epub_file_name
        # os.path.join leaves an absolute epub_name untouched.
        epub_path = os.path.abspath(
                os.path.join(self.root_directory, epub_name))

        # Open a new zipfile for writing
        epub = zipfile.ZipFile(epub_path, 'w')
        try:
            # Add the mimetype file first and set it to be uncompressed
            epub.write(os.path.join(self.root_directory, 'mimetype'),
                    'mimetype', compress_type=zipfile.ZIP_STORED)

            # Add every other file with normal ZIP compression.  The
            # archive itself may live inside the tree, so it is skipped.
            self._scan_dir(self.root_directory, epub, skip=epub_path)
        finally:
            epub.close()
        return epub_path

    def _scan_dir(self, path, epub_file, skip=None):
        """Add the files below path to epub_file, deflated.

        Archive names are relative to self.root_directory.  The root-level
        'mimetype' entry (already stored by create_archive) and the file
        whose absolute path equals skip are left out.  Names are visited
        in sorted order.
        """
        for name in sorted(os.listdir(path)):
            full_path = os.path.join(path, name)
            if os.path.isdir(full_path):
                self._scan_dir(full_path, epub_file, skip)
                continue

            archive_name = os.path.relpath(full_path, self.root_directory)
            if archive_name == 'mimetype':
                continue
            if skip is not None and os.path.abspath(full_path) == skip:
                continue
            epub_file.write(full_path, archive_name,
                    compress_type=zipfile.ZIP_DEFLATED)

    def clean_html_file(self, file_name, dest_directory):
        """Write a cleaned copy of file_name into dest_directory.

        Local images and stylesheets are recorded in self.images and
        self.css and their references rewritten to images/<name> and
        css/<name>.  Remote references are left as they are.  Scripts,
        forms, comments and javascript: links are removed, together with
        a few attributes (border, name, lang, onload, and any style
        attribute mentioning 'clear').
        """
        with open(file_name) as source:
            file_content = source.read()
        soup = BeautifulSoup.BeautifulSoup(file_content)
        source_dir = os.path.dirname(file_name)

        # change src in images and add to the image list
        for img in soup.findAll('img'):
            del(img['border'])

            src = img.get('src')
            if not src or self._is_remote(src):
                # Nothing to copy; fetching remote images is not
                # implemented.
                continue
            # Directory layers are flattened: only the base name survives,
            # so two images with the same name in different directories
            # end up as one file.
            self.images.append(os.path.join(source_dir, src))
            img['src'] = os.path.join('images', os.path.basename(src))

        # change href in css links and add to the css list
        for css in soup.findAll('link'):
            href = css.get('href')
            if (css.get('rel') or '').lower() != 'stylesheet' or not href:
                continue
            if self._is_remote(href):
                continue
            self.css.append(os.path.join(source_dir, href))
            css['href'] = os.path.join('css', os.path.basename(href))

        # remove all the script nodes
        for item in soup.findAll('script'):
            item.extract()

        # remove all the form nodes
        for item in soup.findAll('form'):
            item.extract()

        # remove all the comments
        comments = soup.findAll(text=lambda text: isinstance(text,
                    BeautifulSoup.Comment))
        for comment in comments:
            comment.extract()

        # remove links that execute javascript
        # NOTE(review): the previous version carried a "TODO is not
        # working" remark here; extract() removes the link together with
        # its text - confirm that this is the wanted result.
        for link in soup.findAll('a'):
            del(link['name'])
            href = (link.get('href') or '').strip().lower()
            if href.startswith('javascript:'):
                link.extract()

        # remove style attributes that mention 'clear'.  findAll(True)
        # visits every tag; the old loop only saw the children of the
        # first tag.
        for element in soup.findAll(True):
            style = element.get('style')
            if style and 'clear' in style:
                del(element['style'])

        # remove lang property in html node
        for html in soup.findAll('html'):
            del(html['lang'])

        # remove onload property in body node
        for body in soup.findAll('body'):
            del(body['onload'])

        dest_path = os.path.join(dest_directory, os.path.basename(file_name))
        with open(dest_path, 'w') as fd:
            fd.write(str(soup))


if __name__ == '__main__':
    # Manual smoke run: build a book from three sample pages under
    # 'datos/' and zip it; create_archive() appends '.epub' to the name.
    sample_pages = [
        'datos/NewToolbar.html',
        'datos/essential.shtml.html',
        'datos/essential1.shtml.html',
    ]
    factory = EpubFactory('Historia de la Argentina', 'Gonzalo', 'es_ES')
    factory.make_epub(sample_pages)
    factory.create_archive('/tmp/test-f1')