This repository has been archived by the owner on Oct 1, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
epubC.py
316 lines (258 loc) · 10.6 KB
/
epubC.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
##############################################################################
# File: epubC.py
# Description: Classes for editing an epub file.
# Authors: Awad Mackie <[email protected]>
# Date: 02-07-2010
# Copyright: (c) 2010 firesock serwalek
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
##############################################################################/
"""
This module provides classes to create an EPUB file from scratch given data.
See documentation on class Epub.
"""
import tempfile, shutil, os, zipfile, hashlib, datetime
from lxml.builder import ElementMaker
from lxml import etree
# XML namespace URIs used when building the OPF package document and the NCX
# navigation document.
DCMI = "http://purl.org/dc/elements/1.1/"
OPF = "http://www.idpf.org/2007/opf"
NCX = "http://www.daisy.org/z3986/2005/ncx/"


class Epub(object):
    """
    Create epub files, writing all necessary XML and contents pages.

    Example of use::

        import epubC
        ep = epubC.Epub("Title", "Author", "ID")

    As specified, ID should be a unique identifier such as an ISBN. The
    constructor arguments are set as attributes on the object for easy change.

    ::

        ep.add("content.html", "application/xhtml+xml", io.StringIO(html), "Chapter 1")

    The arcname is the internal name, in case any other files reference it.
    html contents must be valid XHTML, so set the media type accordingly.
    data is a stream (here made from an html string in memory) and can carry
    data of any type, even binary. The title is the Chapter/Section name.

    ::

        ep.add("test.css", "text/css", io.StringIO(css))

    If the title is omitted or None, the file is assumed not to be in the
    linear reading order and is only added to the archive.

    ::

        ep.write("test.epub")

    Creates the final file under the name specified. No extension is added,
    so specify the full path.

    ::

        ep.close()

    Frees up temporary files made in the process. Call this unless you want
    your storage littered with files.

    Context managers are also supported, so::

        with epubC.Epub("Title", "Author", "ID") as ep:
            ep.add("content.html", "application/xhtml+xml", io.StringIO(html), "Chapter 1")
            ep.write("test.epub")

    In addition to the attributes specified above, additional metadata
    attributes are supported:

    - subject, description, publisher, contributor, date, type, format,
      source, relation, coverage, rights

    Set them on the Epub object to write them as OPF metadata. Please refer to
    the spec for further details. Lists (or tuples) are written as multiple
    tags and Python datetime.date objects are converted to ISO 8601 strings.
    """

    # Value of the id attribute on the dc:identifier tag; the package element's
    # unique-identifier attribute points at it.
    _uid_id_str = "bookid"

    # Number of bytes/characters requested from the data stream per read in add().
    _read_size = 64 * 1024

    # Python attribute name -> [DCMI tag name, fixed XML attributes for that tag]
    _metadata = dict(
        title=["title", {}],
        author=["creator", {"{%s}role" % OPF: "aut"}],
        uid=["identifier", {"id": _uid_id_str}],
        language=["language", {}],
        subject=["subject", {}],
        description=["description", {}],
        publisher=["publisher", {}],
        contributor=["contributor", {}],
        date=["date", {}],
        type=["type", {}],
        format=["format", {}],
        source=["source", {}],
        relation=["relation", {}],
        coverage=["coverage", {}],
        rights=["rights", {}],
    )

    def __init__(self, title, author, uid, language="en-US"):
        """Create temporary space for the epub file and write the fixed files.

        Call write() to create final output.
        Call close() to clear temporary space on the file system.
        Attributes are set on the object using the same names, i.e. title,
        author, uid and language, and can be changed the same way.
        """
        self.title = title
        self.author = author
        self.uid = uid
        self.language = language

        # Everything the user adds, as a list of tuples:
        # (filename in OEBPS, media-type, title or None, manifest id)
        self._files = []

        self._path = os.path.abspath(tempfile.mkdtemp())
        self._closed = False

        # content.opf / toc.ncx element factories.
        # OPF is declared twice (default and "opf" prefix) to get exactly the
        # namespacing behaviour we want from lxml.
        self._DCMI = ElementMaker(namespace=DCMI)
        self._OPF = ElementMaker(nsmap={None: OPF, "opf": OPF, "dc": DCMI})
        self._NCX = ElementMaker(nsmap={None: NCX})

        try:
            self._standard_setup()
        except BaseException:
            # Do not leave the temporary directory behind when construction
            # fails: the caller never gets an object to close().
            shutil.rmtree(self._path, ignore_errors=True)
            self._closed = True
            raise

    def _dcmi_taglist(self, attr_vals, tag, attribs):
        """Internal: build the elements for a single supported DCMI tag.

        attr_vals is one value, or a list/tuple of values; one element is
        produced per value. datetime.date values are written in ISO format.
        """
        if not isinstance(attr_vals, (list, tuple)):
            attr_vals = [attr_vals]
        make_tag = getattr(self._DCMI, tag)
        taglist = []
        for value in attr_vals:
            # Just run through some common transformations
            if isinstance(value, datetime.date):
                value = value.isoformat()
            taglist.append(make_tag(value, **attribs))
        return taglist

    def _dcmi_tags(self):
        """Internal: get the list of elements for all DCMI attributes that are set."""
        tags = []
        for py_attrib, (tag, attribs) in self._metadata.items():
            attr_val = getattr(self, py_attrib, None)
            if attr_val is not None:
                tags.extend(self._dcmi_taglist(attr_val, tag, attribs))
        return tags

    def _standard_setup(self):
        """Internal: create the directories and files every epub starts with."""
        self._meta = os.path.abspath(os.path.join(self._path, "META-INF"))
        self._oebps = os.path.abspath(os.path.join(self._path, "OEBPS"))
        os.mkdir(self._meta)
        os.mkdir(self._oebps)

        with open(os.path.join(self._path, "mimetype"), "wt", encoding="utf-8") as mimefile:
            mimefile.write("application/epub+zip")

        # Contents pretty much the same always but use lxml anyway
        E = ElementMaker(nsmap={None: "urn:oasis:names:tc:opendocument:xmlns:container"})
        # Hyphenated attribute names cannot be passed as keyword arguments.
        rootfile_tag = E.rootfile()
        rootfile_tag.set("full-path", "OEBPS/content.opf")
        rootfile_tag.set("media-type", "application/oebps-package+xml")
        topelem = E.container(
            E.rootfiles(rootfile_tag),
            version="1.0",
        )
        tree = topelem.getroottree()
        tree.write(os.path.join(self._meta, "container.xml"), pretty_print=True,
                   xml_declaration=True, encoding="utf-8")

    def close(self):
        """Remove the temporary directory; the object is obsolete afterwards.

        MAKE SURE TO CALL THIS ALWAYS (or use the object as a context manager).
        Calling it again after a successful close is a no-op.
        """
        if getattr(self, "_closed", False):
            return
        shutil.rmtree(self._path)
        # Only mark as closed once removal succeeded, so a failed close can be
        # retried.
        self._closed = True

    def _flush(self):
        """Internal: write content.opf and toc.ncx into the OEBPS directory.

        Both documents are regenerated from the current metadata attributes
        and the list of added files.
        """
        opf = self._OPF
        ncx = self._NCX

        ncx_item = opf.item(id="ncx", href="toc.ncx")
        ncx_item.set("media-type", "application/x-dtbncx+xml")
        manifest = opf.manifest(ncx_item)
        spine = opf.spine(toc="ncx")
        navmap = ncx.navMap()

        # Keep track of content added for ncx play order
        play_order = 1
        for filename, media, title, uid in self._files:
            item = opf.item(id=uid, href=filename)
            item.set("media-type", media)
            manifest.append(item)

            # No title for content, it's a resource.
            # Therefore, not in spine or ncx
            if title is None:
                continue

            spine.append(opf.itemref(idref=uid))
            navmap.append(ncx.navPoint(
                ncx.navLabel(
                    ncx.text(title),
                ),
                ncx.content(src=filename),
                id="nav" + str(play_order),
                playOrder=str(play_order),
            ))
            play_order += 1

        # Generate full trees
        opf_topelem = opf.package(
            opf.metadata(
                *self._dcmi_tags()
            ),
            manifest,
            spine,
            version="2.0",
        )
        opf_topelem.set("unique-identifier", self._uid_id_str)
        opf_tree = opf_topelem.getroottree()

        ncx_topelem = ncx.ncx(
            ncx.head(
                ncx.meta(name="dtb:uid", content=self.uid),
                ncx.meta(name="dtb:depth", content="1"),
                ncx.meta(name="dtb:totalPageCount", content="0"),
                ncx.meta(name="dtb:maxPageNumber", content="0"),
            ),
            ncx.docTitle(
                ncx.text(self.title)
            ),
            navmap,
            version="2005-1",
        )
        ncx_tree = ncx_topelem.getroottree()

        opf_tree.write(os.path.join(self._oebps, "content.opf"), pretty_print=True,
                       xml_declaration=True, encoding="utf-8")

        # lxml etree.tostring supports a doctype declaration but write doesn't.
        # tostring already returns UTF-8 encoded bytes including the XML
        # declaration, so write them out unchanged in binary mode.
        ncx_bytes = etree.tostring(
            ncx_tree, pretty_print=True, xml_declaration=True,
            encoding="utf-8",
            doctype="<!DOCTYPE ncx PUBLIC '-//NISO//DTD ncx 2005-1//EN' 'http://www.daisy.org/z3986/2005/ncx-2005-1.dtd'>\n")
        with open(os.path.join(self._oebps, "toc.ncx"), "wb") as ncx_file:
            ncx_file.write(ncx_bytes)

    def write(self, file):
        """Given an absolute filename, file, write as epub file.

        No extension is added, so provide full file and absolute path.
        Can be called as many times as necessary until close() is called.
        """
        self._flush()
        with zipfile.ZipFile(file, mode="w", compression=zipfile.ZIP_DEFLATED) as epub:
            # Write things outside of OEBPS dir
            # mimetype needs to be uncompressed and first
            epub.write(os.path.join(self._path, "mimetype"), arcname="mimetype",
                       compress_type=zipfile.ZIP_STORED)
            epub.write(os.path.join(self._meta, "container.xml"),
                       arcname=os.path.join("META-INF", "container.xml"))

            # Walk the directory chain, add everything in OEBPS
            for path, dirnames, filenames in os.walk(self._oebps):
                # Sorting in place makes os.walk descend in a stable order, so
                # the archive layout does not depend on directory listing order.
                dirnames.sort()
                # Make archive directories relative to top, not absolute
                arc_dir = os.path.relpath(path, self._path)
                for filen in sorted(filenames):
                    epub.write(os.path.join(path, filen),
                               arcname=os.path.join(arc_dir, filen))

    def add(self, arcname, media_type, data, content_title=None):
        """Main method to add contents to epub file.

        arcname is the epub internal name for a file; it may contain
        sub-directories, which are created as needed.
        media_type is the MIME type for a file.
        data is a stream object that contains a file to be added. Its read()
        may return str (stored UTF-8 encoded) or bytes.
        content_title is the title of this file. e.g. Chapter.
         - With no title it is treated as a resource not in the reading
           order.

        Will overwrite previous file with same arcname; the earlier entry is
        dropped so the file is listed only once, at the position of this call.
        """
        target = os.path.join(self._oebps, arcname)
        parent = os.path.dirname(target)
        if parent:
            os.makedirs(parent, exist_ok=True)

        # Use md5 as ID, might be useful somewhere.
        md5 = hashlib.md5()
        with open(target, "wb") as arcfile:
            while True:
                block = data.read(self._read_size)
                if not block:
                    break
                if isinstance(block, str):
                    block = block.encode("utf-8")
                md5.update(block)
                arcfile.write(block)

        # Drop any stale entry for this arcname: the file on disk has just been
        # replaced, so listing it twice would duplicate manifest/spine items.
        self._files[:] = [entry for entry in self._files if entry[0] != arcname]

        # id's need to start with a letter according to epubcheck
        base_id = "h" + md5.hexdigest()
        # Files with identical content hash to the same value; ids must be
        # unique within the document, so disambiguate with a numeric suffix.
        taken = set(entry[3] for entry in self._files)
        file_id = base_id
        suffix = 1
        while file_id in taken:
            suffix += 1
            file_id = "%s-%d" % (base_id, suffix)

        self._files.append((arcname, media_type, content_title, file_id))

    # Simple context manager support
    def __enter__(self):
        """Return self to use object as context manager."""
        return self

    def __exit__(self, type, val, tb):
        """Make sure everything is closed and pass exceptions up."""
        self.close()
        return False