# pmd/04-build-ebook.py at main · DanteDT/pmd · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
"""
A few manual steps are required in the final EPUB book
- Create TOC navigation from H1 and H2 HTML headers, if nav.xhtml, below, does not work for your e-reader
- Remove IE6 fixes from the .css stylesheet
- Replace named entities like &eacute; in the text, since some e-readers do not support them
"""
import os
import shutil
import subprocess
import zipfile
import uuid

from bs4 import BeautifulSoup
from epubcheck import EpubCheck
import utils.utilities as utl
import utils.config as config

# Logging and run configuration
logger = utl.init_logger()
config_data = config.load_config()

# Execution-mode settings
_exe_mode = config_data["exe_mode"]
version = _exe_mode["version"]
book_id = _exe_mode["uuid"]
debugging = _exe_mode["debugging"]
epub_ref = _exe_mode["epub_ref"]

# Project (source) folders
_proj_dirs = config_data["proj_dirs"]
ttl_lower = _proj_dirs["ttl_lower"]
IMG_SRC = _proj_dirs["img_dir"]        # source images
CSS_SRC = _proj_dirs["custom_dir"]     # custom CSS for EPUB (same folder as CUSTOM_SRC)
XHTML_SRC = _proj_dirs["ch_xhtml"]
CUSTOM_SRC = _proj_dirs["custom_dir"]  # custom front and back matter
CUSTOM_IMG = _proj_dirs["custom_img"]  # custom images for EPUB

# EPUB structure - separate releases for Footnotes and Hyperlinks.
# The book file and staging directory names are templates filled with epub_ref.
_epub_dirs = config_data["epub_dirs"]
EPUB_BOOK = _epub_dirs["epub_book"].format(epub_ref)
EPUB_DIR = _epub_dirs["book_dir"].format(epub_ref)
MET_DIR = os.path.join(EPUB_DIR, _epub_dirs["meta_dir"])
OEB_DIR = os.path.join(EPUB_DIR, _epub_dirs["oeb_dir"])
CSS_DIR = os.path.join(OEB_DIR, _epub_dirs["css_dir"])
IMG_DIR = os.path.join(OEB_DIR, _epub_dirs["img_dir"])

# Seed entries for the TOC list and for the content.opf manifest and spine.
# Only the first string of each list is unindented: the lists are later joined
# with "\n" into templates that already indent the first line.
chapters = [
    '<li><a href="ca-001.xhtml">Cover 1851.</a></li>',
    '        <li><a href="ca-002.xhtml">Front pages 1851.</a></li>',
    '        <li><a href="ca-003.xhtml">Notes from the editor.</a></li>',
    '        <li><a href="toc.xhtml">Contents.</a></li>',
]
opf_mani = [
    '<item id="ca-001" href="ca-001.xhtml" media-type="application/xhtml+xml" properties="svg"/>',
    '    <item id="ca-002" href="ca-002.xhtml" media-type="application/xhtml+xml" properties="svg"/>',
    '    <item id="ca-003" href="ca-003.xhtml" media-type="application/xhtml+xml"/>',
    '    <item id="toc" href="toc.xhtml" media-type="application/xhtml+xml"/>',
]
opf_spin = [
    '<itemref idref="ca-001"/>',
    '    <itemref idref="ca-002"/>',
    '    <itemref idref="ca-003"/>',
    '    <itemref idref="toc"/>',
]

# 1. Fresh start: delete any previously built .epub file, then initialise the
#    staging directory tree in which the book is assembled.
if os.path.exists(EPUB_BOOK):
    os.remove(EPUB_BOOK)
    logger.info(f'Removed prior epub "{EPUB_BOOK}".')

# Parents are initialised before the directories nested inside them.
# NOTE(review): utl.init_dir presumably creates/empties each directory - confirm in utils.utilities.
for staging_dir in (EPUB_DIR, MET_DIR, OEB_DIR, CSS_DIR, IMG_DIR):
    utl.init_dir(staging_dir)

# 2. Copy XHTML chapter(s) into OEBPS
#
# Copy each chapter_xxx.xhtml into OEBPS, and build TOC entries from H1 Title and H2 Subtitle tags.
# - Rules for TOC entries:
#   - "CHAPTER " is stripped from the H1 text
#   - H1 without H2 siblings: one entry, the H1 text in str.title() case
#   - Exactly one H2 sibling: one collapsed entry "H1-text - H2-text", with the H2 text
#     title-cased by utl.titlecase (H1 left as-is); an empty H2 falls back to a title-only entry
#   - Several H2 siblings: the H2 entries are nested as an ordered list inside the H1 entry
#   - Chapter I (Loomings), the Epilogue and the Front Matter link to named Page_* anchors
#     instead of the generated title_NNN / subtitle_NNN ids
# - Add each chapter to manifest and spine in content.opf


def _chapter_number(xhtml_name):
    """Return the chapter number encoded in a ``chapter_<n>.xhtml`` file name.

    Raises:
        ValueError: if what remains after removing "chapter_" and ".xhtml" is not an integer.
    """
    return int(xhtml_name.replace("chapter_", "").replace(".xhtml", ""))


# os.listdir() makes no promise about ordering, and the append order below becomes
# the TOC order and the spine (reading) order - so process chapters in numeric order.
chapter_files = sorted(
    (name for name in os.listdir(XHTML_SRC) if name.endswith(".xhtml")),
    key=_chapter_number,
)

for fname in chapter_files:
    chapter_number = _chapter_number(fname)

    # For debugging: build only chapter 0
    if debugging and chapter_number != 0:
        continue

    with open(os.path.join(XHTML_SRC, fname), "r", encoding="utf-8") as f:
        content = f.read()

    # For back-reference to header IDs in chapters
    ttlcnt = 0
    sttlcnt = 0

    soup = BeautifulSoup(content, "html.parser")
    h1_tags = soup.find_all("h1")

    # Build a TOC entry for every H1 title
    toc_entry = ""
    for h1_tag in h1_tags:
        ttl = h1_tag.get_text().replace("CHAPTER ", "").strip()
        ttlcnt += 1

        # Second and later H1 entries of the same file start on a new line
        extra_br = "\n" if ttlcnt > 1 else ""

        h2_tags = h1_tag.find_next_siblings("h2")

        if not h2_tags:
            # No Subtitles, only the Chapter entry for PMD "chapters" like 137 through 150
            ttl = ttl.title()
            toc_entry += f'{extra_br}        <li><a href="{fname}#title_{ttlcnt:03d}">{ttl}</a></li>'
            logger.info(f'Created Title-only TOC entry for {fname}, from title "{ttl}".')
        elif len(h2_tags) == 1:
            # Handle custom TOC entry cases for PMD chapters, below.
            # Case-insensitive comparison, while preserving original PMD casing in toc_entry
            sttlcnt += 1
            # Use custom Title Case function that handles contractions, and lowercases small words
            subtitle = utl.titlecase(h2_tags[0].get_text().strip(), ignore=ttl_lower)
            if subtitle:
                # Exactly one Subtitle for most PMD chapters, 1 through 135
                # 1:   I. - LOOMINGS.
                if ttl.upper() == "I." and subtitle.upper() == "LOOMINGS.":
                    toc_entry += f'{extra_br}        <li><a href="{fname}#Page_Loomings">{ttl} - {subtitle}</a></li>'
                else:
                    toc_entry += f'{extra_br}        <li><a href="{fname}#title_{ttlcnt:03d}">{ttl} - {subtitle}</a></li>'
                logger.info(f'Created Title - Subtitle TOC entry for {fname}, from title "{ttl}" and "{subtitle}".')
            else:
                # Title only, empty H2 Subtitle tag, for Epilogue chapter 136
                # 136: CXXXVI. EPILOGUE.
                if ttl.upper() == "CXXXVI. EPILOGUE.":
                    toc_entry += f'{extra_br}        <li><a href="{fname}#Page_Epilogue">CXXXVI. Epilogue.</a></li>'
                else:
                    ttl = ttl.title()
                    toc_entry += f'{extra_br}        <li><a href="{fname}#title_{ttlcnt:03d}">{ttl}</a></li>'
                logger.info(f'Created Title-only TOC entry for {fname}, from title "{ttl}" and "{subtitle}".')
        else:
            # Multiple Subtitle sections. Nest the subtitles in PMD front-matter "chapter" 0
            # 0:   (H1)FRONT MATTER., (H2)ETYMOLOGY AND EXTRACTS., (H2)ETYMOLOGY., (H2)EXTRACTS.
            if ttl.upper() == "FRONT MATTER.":
                ttl = "Front Matter."
                toc_entry += f'{extra_br}        <li><a href="{fname}#Page_FrontMatter">{ttl}</a><ol class="nav-toc">'
            else:
                ttl = ttl.title()
                toc_entry += f'{extra_br}        <li><a href="{fname}#title_{ttlcnt:03d}">{ttl}</a><ol class="nav-toc">'

            for h2_tag in h2_tags:
                # Counted even when the H2 is empty, so the number tracks the H2's position
                sttlcnt += 1
                subtitle = h2_tag.get_text().strip().title()
                if subtitle:
                    if ttl.upper() == "FRONT MATTER.":
                        # Only these three front-matter subtitles get an entry; others are omitted
                        if subtitle.upper() == "ETYMOLOGY AND EXTRACTS.":
                            subtitle = "Dedication."
                            toc_entry += f'\n            <li><a href="{fname}#Page_Dedication">{subtitle}</a></li>'
                        elif subtitle.upper() == "ETYMOLOGY.":
                            toc_entry += f'\n            <li><a href="{fname}#Page_Etymology">{subtitle}</a></li>'
                        elif subtitle.upper() == "EXTRACTS.":
                            toc_entry += f'\n            <li><a href="{fname}#Page_Extracts">{subtitle}</a></li>'
                    else:
                        toc_entry += f'\n            <li><a href="{fname}#subtitle_{sttlcnt:03d}">{subtitle}</a></li>'
            toc_entry += '\n        </ol></li>'
            logger.info(f'Created Title nested Subtitle TOC entries for {fname}.')

    chapters.append(toc_entry)

    with open(os.path.join(OEB_DIR, fname), "w", encoding="utf-8") as f:
        f.write(content)

    # Log manifest and spine for each chapter, for content.opf.
    # The href is the name the file was actually written under (and that the TOC links to).
    opf_mani.append(f'    <item id="chapter_{chapter_number:03d}" href="{fname}" media-type="application/xhtml+xml"/>')
    opf_spin.append(f'    <itemref idref="chapter_{chapter_number:03d}"/>')

    # Logged per copied chapter only - never for skipped or non-chapter files
    logger.info(f"Copied chapter {chapter_number:03d} to EPUB {OEB_DIR}.")

# Add in the custom pages (.xhtml) and fonts (.ttf) from CUSTOM_SRC.
# The copy source is the same directory that is listed, rather than a
# hard-coded "custom" path that only works when the two happen to coincide.
for fname in os.listdir(CUSTOM_SRC):
    if fname.endswith((".xhtml", ".ttf")):
        shutil.copy(os.path.join(CUSTOM_SRC, fname), OEB_DIR)
        # Only report files that were actually copied
        logger.info(f"Copied custom file {fname} to EPUB {OEB_DIR}.")

# Back-matter TOC entries
chapters.append('        <li><a href="license.xhtml">Ebook license.</a></li>')
chapters.append('        <li><a href="cz-001.xhtml">Back pages and cover 1851.</a></li>')

# Back-matter and navigation-document manifest entries
opf_mani.append('    <item id="license" href="license.xhtml" media-type="application/xhtml+xml"/>')
opf_mani.append('    <item id="nav" href="nav.xhtml" media-type="application/xhtml+xml" properties="nav"/>')
opf_mani.append('    <item id="cz-001" href="cz-001.xhtml" media-type="application/xhtml+xml" properties="svg"/>')

# Back-matter spine entries. The navigation document is deliberately
# listed in the manifest only; it is not added to the spine.
opf_spin.append('    <itemref idref="license"/>')
opf_spin.append('    <itemref idref="cz-001"/>')

# 3. Copy every .css file from CSS_SRC into CSS_DIR (inside EPUB_DIR) and
#    register each one in the manifest under a running css_NNN id.
cssidx = 0
for fname in os.listdir(CSS_SRC):
    if not fname.endswith(".css"):
        continue
    cssidx += 1

    # Read the stylesheet fully before opening the destination
    with open(os.path.join(CSS_SRC, fname), "r", encoding="utf-8") as css_in:
        css_content = css_in.read()
    with open(os.path.join(CSS_DIR, fname), "w", encoding="utf-8") as css_out:
        css_out.write(css_content)

    opf_mani.append(f'    <item id="css_{cssidx:03d}" href="css/{fname}" media-type="text/css"/>')

# 3b. Copy .jpg images from IMG_SRC and from CUSTOM_IMG to IMG_DIR in EPUB_DIR,
#     adding a manifest item per image. A failed copy is logged and skipped,
#     so one bad image does not stop the build.
for dirpath in (IMG_SRC, CUSTOM_IMG):
    for fname in os.listdir(dirpath):
        if not fname.endswith('.jpg'):
            continue
        try:
            shutil.copy(os.path.join(dirpath, fname), IMG_DIR)
            # cover.jpg is flagged as the book's cover image in the manifest
            prop_cover = 'properties="cover-image"' if fname == "cover.jpg" else ""
            item_id = fname.replace(".jpg", "")
            opf_mani.append(f'    <item id="{item_id}" href="images/{fname}" media-type="image/jpeg" {prop_cover}/>')
        except Exception as exc:
            logger.error(f"Failed to copy image {fname} from {dirpath} to EPUB images: {exc}")

# 4. Write the "mimetype" marker file (it must end up uncompressed in the archive)
mimetype_path = f"{EPUB_DIR}/mimetype"
with open(mimetype_path, "w", encoding="utf-8") as mimetype_file:
    mimetype_file.write("application/epub+zip")

# 5. Write META-INF/container.xml, which names OEBPS/content.opf as the package document
container_xml = '''<?xml version="1.0" encoding="UTF-8" ?>
<container version="1.0"
           xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
  <rootfiles>
    <rootfile full-path="OEBPS/content.opf"
              media-type="application/oebps-package+xml"/>
  </rootfiles>
</container>
'''
container_path = os.path.join(MET_DIR, "container.xml")
with open(container_path, "w", encoding="utf-8") as container_file:
    container_file.write(container_xml)

# 6. Create content.opf: package metadata, manifest and spine

# Manifest entries for the two embedded fonts
opf_mani.append('    <item id="id-4" href="font_CSIL.ttf" media-type="application/vnd.ms-opentype"/>')
opf_mani.append('    <item id="id-2" href="font_DanteMT.ttf" media-type="application/vnd.ms-opentype"/>')

# Timestamp for the dcterms:modified metadata entry
created_date = utl.get_utc_now().strip()

# Join outside the f-string: a backslash inside an f-string replacement field
# is a SyntaxError on Python versions before 3.12.
manifest_items = "\n".join(opf_mani)
spine_items = "\n".join(opf_spin)

opf_all = f'''<package xmlns="http://www.idpf.org/2007/opf" unique-identifier="uuid_id" version="3.0">
  <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dcterms="http://purl.org/dc/terms/"
            xmlns:epub="http://www.idpf.org/2007/ops" xmlns:opf="http://www.idpf.org/2007/opf"
            xmlns:svg="http://www.w3.org/2000/svg" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
    <dc:title id="title">Moby-Dick; Or, The Whale (Power)</dc:title>
    <dc:title id="fulltitle">Moby-Dick; Or, The Whale (Power, {epub_ref}, v{version})</dc:title>
    <dc:creator id="author">Herman Melville</dc:creator>
    <dc:publisher>Power Moby Dick</dc:publisher>
    <dc:language>en</dc:language>
    <dc:identifier id="uuid_id">urn:uuid:{book_id}</dc:identifier>
    <dc:date>1851-11-14T00:00:00+00:00</dc:date>
    <opf:meta property="dcterms:modified">{created_date}</opf:meta>
    <opf:meta refines="#title" property="title-type">main</opf:meta>
    <opf:meta refines="#title" property="file-as">Moby-Dick; Or, The Whale (Power)</opf:meta>
    <opf:meta refines="#fulltitle" property="title-type">extended</opf:meta>
    <opf:meta refines="#author" property="role" scheme="marc:relators">aut</opf:meta>
    <opf:meta refines="#author" property="file-as">Melville, Herman</opf:meta>
  </metadata>

  <manifest>
    {manifest_items}
  </manifest>

  <spine>
    {spine_items}
  </spine>
</package>
'''
with open(os.path.join(OEB_DIR, "content.opf"), "w", encoding="utf-8") as f:
    f.write(opf_all)

# 7. Create toc.xhtml chapter, and nav.xhtml navigation element
# Create separate TOC and Nav, with similar content, since e-readers don't agree
# 1 - as an OEBPS/toc.xhtml, with images and without item attribute properties="nav"
# 2 - without images, as a root nav.xhtml and with item attribute properties="nav"
def write_nav_xhtml(dest="nav") -> int:
    """Write the table of contents into OEB_DIR as nav.xhtml or toc.xhtml.

    Both files list the entries collected in the module-level ``chapters``
    list under a "Contents." heading; they differ in what surrounds the list.

    Args:
        dest: "nav" writes nav.xhtml, with the list wrapped in
            <nav epub:type="toc" id="nav">. Any other value writes toc.xhtml,
            with cover and logo images around the list and no <nav> wrapper.

    Returns:
        Always 0.
    """
    if dest == "nav":
        nav_id = "nav"
        head = '''<head>
        <title>Navigation</title>
        <link type="text/css" rel="stylesheet" href="css/mobydick.css"/>
    </head>'''
        toc_top = f'<nav epub:type="toc" id="{nav_id}">'
        toc_end = '</nav>'
    else:
        nav_id = "toc"
        head = '''<head>
        <title>Contents</title>
        <link type="text/css" rel="stylesheet" href="css/mobydick.css"/>
    </head>'''
        toc_top = '''<div id="Title_00004"><img class="full_page_image" src="images/cover-add-004-toc.jpg"/></div>
        <div id="Title_00005"><img class="full_page_image" src="images/cover-add-005-toc.jpg"/></div>
        <div>
          <a href="http://www.powermobydick.com/"><img class="center_img" src="images/PowerMobyDickLogo.jpg"/></a>
        </div>'''
        toc_end = '''<div><figure>
          <a href="http://www.powermobydick.com/"><img class="center_img" src="images/mobydicklightlowres.jpg"/></a>
          <h2>Visit <a href="http://www.powermobydick.com/">Power Moby Dick</a>.</h2>
        </figure></div>'''

    # Join outside the f-string: a backslash inside an f-string replacement
    # field is a SyntaxError on Python versions before 3.12.
    toc_items = "\n".join(chapters)

    nav_xhtml = f'''<?xml version="1.0" encoding="UTF-8"?>
    <!DOCTYPE html>
    <html lang="en-US" xml:lang="en-US" xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
    {head}
    <body>
        {toc_top}
        <h1>Contents.</h1>
        <ol class="nav-toc">
            {toc_items}
        </ol>
        {toc_end}
    </body>
    </html>
    '''
    with open(os.path.join(OEB_DIR, f"{nav_id}.xhtml"), "w", encoding="utf-8") as f:
        f.write(nav_xhtml)

    logger.info(f"{dest.upper()} written successfully to epub as {nav_id}.xhtml")

    return 0

# Write both navigation files directly into the OEBPS staging directory
write_nav_xhtml("nav")
write_nav_xhtml("toc")

# 8. Create EPUB zip
# ZipFile defaults to ZIP_STORED, which would leave every entry uncompressed.
# Deflate is used for the content; only "mimetype" has to stay stored.
with zipfile.ZipFile(EPUB_BOOK, 'w', compression=zipfile.ZIP_DEFLATED) as epub:
    # mimetype must be first and uncompressed
    epub.write(f"{EPUB_DIR}/mimetype", "mimetype", compress_type=zipfile.ZIP_STORED)

    # Add META-INF folder (files are placed directly under META-INF/)
    for root, dirs, files in os.walk(MET_DIR):
        for file in files:
            epub.write(os.path.join(root, file),
                       os.path.join("META-INF", file))

    # Add OEBPS folder, keeping the sub-directory layout (css/, images/, ...)
    for root, dirs, files in os.walk(OEB_DIR):
        for file in files:
            full_path = os.path.join(root, file)
            arc_path = os.path.join("OEBPS", os.path.relpath(full_path, OEB_DIR))
            epub.write(full_path, arc_path)

# Book created
logger.info(f"EPUB created: {EPUB_BOOK}")

# Validate through the epubcheck Python wrapper and log the outcome
pyresult = EpubCheck(EPUB_BOOK)

if pyresult.valid:
    logger.info("EpubCheck validation SUCCESS!")
else:
    logger.warning(f"EpubCheck validation FAIL! Messages {pyresult.messages}")

# Create XLS as record of any epubcheck messages.
# The command is passed as an argument list: a single command string without
# shell=True is treated as the program name on POSIX and cannot be started.
# Output is captured so that the log line below has something to report.
xls_report = f"EPUB-{epub_ref}.xls"
try:
    sysresult = subprocess.run(
        ["epubcheck", "-x", xls_report, EPUB_BOOK],
        capture_output=True,
        text=True,
        errors="replace",  # never fail the build on undecodable tool output
        check=False,
    )
except OSError as exc:
    # e.g. the epubcheck command is not on PATH; the wrapper result above still stands
    logger.error(f"Could not run system epubcheck: {exc}")
else:
    logger.info(f"System EpubCheck stdout: {sysresult.stdout}, stderr: {sysresult.stderr}")

logger.info(f"EPUB created and checked: {EPUB_BOOK}.")
# Only point at the report if it was actually produced
if os.path.exists(xls_report):
    logger.info(f"See {xls_report}.")