book-translation/build_epub.py at master · DynamicDevices/book-translation · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#!/usr/bin/env python3
"""
Build an EPUB from OCR text files and optional cover image.

When the first page is the book title/cover, it is kept as the cover image only;
OCR text from page 2 onward is used for the body.

Requires: pip install ebooklib

Usage:
  python build_epub.py [--pages-dir DIR] [--output FILE] [--title TITLE] [--author AUTHOR]
  python build_epub.py --pages-dir love-and-limerence-pages -o Love_and_Limerence.epub --title "Love and Limerence" --author "Dorothy Tennov"
"""

import argparse
import re
from pathlib import Path
from typing import List

try:
    import ebooklib
    from ebooklib import epub
except ImportError:
    print("Install ebooklib: pip install ebooklib", file=__import__("sys").stderr)
    raise SystemExit(1)


def natural_sort_key(p: Path) -> tuple:
    """Return a sort key that orders paths by the number ending their stem.

    The trailing run of digits in ``p.stem`` (optionally followed by
    whitespace) is compared numerically, so ``page_2`` sorts before
    ``page_10``. Paths whose stem does not end in digits get the number 0.

    The file name is included as a second element so that paths sharing the
    same number (or having none) always sort in the same, deterministic order
    instead of depending on the order of the input iterable.

    Args:
        p: Path whose stem is inspected.

    Returns:
        A ``(number, file_name)`` tuple suitable for ``sorted(..., key=...)``.
    """
    m = re.search(r"(\d+)\s*$", p.stem)
    number = int(m.group(1)) if m else 0
    # Tie-break on the name: callers may pass unordered collections (e.g. a set).
    return (number, p.name)


def find_text_files(pages_dir: Path) -> List[Path]:
    """Return the ``*.txt`` files directly inside *pages_dir*.

    The matches are de-duplicated and ordered with ``natural_sort_key``.
    """
    unique_paths = set(pages_dir.glob("*.txt"))
    ordered = list(unique_paths)
    ordered.sort(key=natural_sort_key)
    return ordered


def html_escape(s: str) -> str:
    """Escape ``&``, ``<``, ``>`` and ``"`` in *s* as HTML/XML entities.

    Single quotes and all other characters are returned unchanged.
    """
    entity_table = str.maketrans(
        {
            "&": "&amp;",
            "<": "&lt;",
            ">": "&gt;",
            '"': "&quot;",
        }
    )
    # One pass over the string; each character is mapped independently, so
    # the ampersands of inserted entities are never escaped a second time.
    return s.translate(entity_table)


def _identifier_from_title(title: str) -> str:
    """Derive an EPUB identifier such as ``stand-on-zanzibar-ocr`` from *title*."""
    slug = re.sub(r"[^a-z0-9]+", "-", title.lower()).strip("-")
    # Fall back to a generic slug when the title has no ASCII letters/digits.
    return f"{slug or 'book'}-ocr"


def _build_chapter(title: str, file_name: str, paragraphs: List[str]):
    """Create an ``epub.EpubHtml`` item holding *paragraphs* as ``<p>`` elements.

    Each paragraph is HTML-escaped; every newline inside a paragraph starts a
    new ``<p>`` element.
    """
    body = "\n".join(
        "<p>" + html_escape(p).replace("\n", "</p><p>") + "</p>" for p in paragraphs
    )
    chapter = epub.EpubHtml(title=title, file_name=file_name, lang="en")
    chapter.content = f"""<?xml version='1.0' encoding='utf-8'?>
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>{title}</title></head>
<body>{body}</body>
</html>"""
    return chapter


def main():
    """Command-line entry point: build an EPUB from a directory of OCR pages.

    Steps:
      1. Parse the command-line options.
      2. Collect the ``.txt`` files of the pages directory in page order,
         skipping the first one when it is treated as the cover page.
      3. Use the first image (png, then jpg, then jpeg) as the cover, if any.
      4. Split the OCR text into paragraphs on blank lines.
      5. Emit either one chapter or one chapter per N paragraphs, add the
         NCX/nav items and write the EPUB file.

    Exits through ``argparse`` (``SystemExit``) when the pages directory is
    missing, contains no ``.txt`` files, or yields no text.
    """
    ap = argparse.ArgumentParser(description="Build EPUB from OCR text files.")
    ap.add_argument("--pages-dir", type=Path, default=Path("stand-on-zanzibar-pages"), help="Directory with .txt (and optional images)")
    ap.add_argument("--output", "-o", type=Path, default=Path("Stand_on_Zanzibar.epub"), help="Output EPUB path")
    ap.add_argument("--title", default="Stand on Zanzibar", help="Book title")
    ap.add_argument("--author", default="John Brunner", help="Author")
    # The grouping below works on paragraphs, so the help text says so.
    ap.add_argument("--chapters-per-part", type=int, default=50, help="Group N paragraphs per chapter (0 = one chapter for whole book)")
    ap.add_argument("--first-page-is-cover", action="store_true", default=True, help="Use first page as cover image only; body text starts from page 2 (default)")
    ap.add_argument("--no-first-page-is-cover", action="store_false", dest="first_page_is_cover", help="Include first page OCR in body (don't treat as cover-only)")
    args = ap.parse_args()

    pages_dir = args.pages_dir.resolve()
    if not pages_dir.is_dir():
        ap.error(f"Directory not found: {pages_dir}")

    txt_files = find_text_files(pages_dir)
    if not txt_files:
        ap.error("No .txt files found.")

    # First page = cover image only; body text starts from page 2 (skip first .txt).
    # With a single .txt file there is nothing to skip, so it is kept as body.
    if args.first_page_is_cover and len(txt_files) > 1:
        txt_files_for_body = txt_files[1:]
    else:
        txt_files_for_body = txt_files

    book = epub.EpubBook()
    # Derive the identifier from the title so different books do not share one;
    # the default title still yields "stand-on-zanzibar-ocr".
    book.set_identifier(_identifier_from_title(args.title))
    book.set_title(args.title)
    book.set_language("en")
    book.add_author(args.author)

    # Optional cover: first image in directory (page 1 = title/cover).
    # Extensions are tried in order; the first one with any match wins.
    for ext in ["png", "jpg", "jpeg"]:
        cover_candidates = sorted(pages_dir.glob(f"*.{ext}"), key=natural_sort_key)
        if cover_candidates:
            cover_path = cover_candidates[0]
            cover_content = cover_path.read_bytes()
            book.set_cover(f"cover.{ext}", cover_content)
            if args.first_page_is_cover:
                print(f"Cover: {cover_path.name} (first page as image)")
            break

    # Collect body text from OCR (page 2 onward when first page is cover)
    all_paragraphs = []  # type: List[str]
    for tf in txt_files_for_body:
        raw = tf.read_text(encoding="utf-8", errors="replace").strip()
        if not raw:
            continue
        # Blank lines (possibly containing whitespace) separate paragraphs.
        for block in re.split(r"\n\s*\n", raw):
            block = block.strip()
            if block:
                all_paragraphs.append(block)

    if not all_paragraphs:
        ap.error("No text content in .txt files (or only one page when --first-page-is-cover).")

    chapters_per_part = max(0, args.chapters_per_part)
    if chapters_per_part <= 0:
        # One chapter for the whole book
        chapters = [_build_chapter("Contents", "content.xhtml", all_paragraphs)]
    else:
        # Multiple chapters: one per run of N consecutive paragraphs
        chapters = [
            _build_chapter(
                f"Part {part_num}",
                f"part_{part_num:04d}.xhtml",
                all_paragraphs[start : start + chapters_per_part],
            )
            for part_num, start in enumerate(
                range(0, len(all_paragraphs), chapters_per_part), start=1
            )
        ]

    for ch in chapters:
        book.add_item(ch)
    book.toc = tuple(chapters)
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())
    book.spine = ["nav", *chapters]

    out_path = args.output.resolve()
    epub.write_epub(str(out_path), book)
    print(f"Wrote: {out_path}")


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()