-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhtmlpdf.py
executable file
·142 lines (114 loc) · 5.09 KB
/
htmlpdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#!/usr/bin/env python3
"""A tool for creating PDF documents from YAML data and Jinja2 HTML templates."""
import re
import collections.abc
import os.path
import logging
import numbers
import argparse
from typing import Mapping, Iterable, Callable, Union
import yaml
import jinja2
import weasyprint
from unidecode import unidecode
# A scalar value in the loaded YAML data: text, a real number, or null (None).
Leaf = Union[str, numbers.Real, None]
# Any value in the loaded YAML data tree: a mapping, some other iterable
# (e.g. a list), or a scalar Leaf.
Node = Union[Mapping, Iterable, Leaf]
def create_pdf(html: str, base_url: str) -> bytes:
    """Render an HTML document into the contents of a PDF file.

    Args:
        html: The complete HTML document as a string.
        base_url: Passed to WeasyPrint as ``base_url`` for the document.

    Returns:
        The bytes of the generated PDF (written with ``pdf_version="1.4"``).
    """
    fonts = weasyprint.text.fonts.FontConfiguration()
    rendered: bytes = weasyprint.HTML(string=html, base_url=base_url).write_pdf(
        font_config=fonts,
        pdf_version="1.4",
    )
    return rendered
def render_html_files(data_filenames: Iterable[str], html_template_filename: str) -> str:
    """Render a Jinja2 template using YAML files as data sources.

    Data from any subsequent YAML file overwrites data loaded from previous
    file(s). You can refer to nodes from previous files using YAML anchors and
    aliases: https://yaml.org/spec/1.2/spec.html#alias//

    Args:
        data_filenames: Paths of the YAML data files, in loading order.
        html_template_filename: Path of the Jinja2 HTML template.

    Returns:
        The rendered HTML document.

    Raises:
        OSError: If any of the files cannot be opened or read.
    """
    # Context managers guarantee that every file handle is closed, even when a
    # later file fails to open or rendering raises.
    with open(html_template_filename, "r", encoding="utf-8") as template_file:
        html_template = template_file.read()

    yaml_inputs = []
    for data_filename in data_filenames:
        with open(data_filename, "r", encoding="utf-8") as data_file:
            yaml_inputs.append(data_file.read())

    return render_html(yaml_inputs, html_template)
def render_html(yaml_inputs: Iterable[str], html_template: str) -> str:
    """Render a Jinja2 template using YAML strings as data sources.

    The YAML strings are joined with newlines and parsed as a single YAML
    document; the parsed data is run through ``process_tags`` and then used
    as the template context.

    Args:
        yaml_inputs: YAML source strings, in loading order.
        html_template: Source text of the Jinja2 template.

    Returns:
        The rendered HTML document.
    """
    # Templates referenced from html_template are looked up relative to the
    # current working directory.
    # NOTE(review): autoescape is not enabled on this environment, so YAML
    # values are presumably inserted verbatim - confirm the data is trusted.
    environment = jinja2.Environment(loader=jinja2.FileSystemLoader("."))
    template = environment.from_string(html_template)
    data = process_tags(yaml.safe_load("\n".join(yaml_inputs)))
    # Empty YAML input loads as None, which render() cannot turn into a
    # context (it would raise TypeError); render with no variables instead.
    return template.render({} if data is None else data)
def process_tags(node: Node) -> Node:
    """Recursively process a limited subset of Markdown into HTML.

    Every scalar leaf of the data tree is passed through
    ``process_tags_for_scalar``. Mappings are rebuilt as ``dict`` (keys are
    kept as they are) and every other iterable is rebuilt as ``list``.

    Args:
        node: The data tree to process.

    Returns:
        A new tree of the same shape with processed leaves.

    Raises:
        ValueError: If the tree contains a value that is neither a string,
            a real number, ``None``, a mapping nor an iterable.
    """

    def recursive_map(node: Node, func: Callable[[Leaf], Leaf]) -> Node:
        """Apply a function recursively to a node (similar to map(), but recursive)."""
        # Leaves are tested first: a str is itself iterable and must not be
        # split into characters by the Iterable branch below.
        if node is None or isinstance(node, (str, numbers.Real)):
            return func(node)
        if isinstance(node, collections.abc.Mapping):
            return {key: recursive_map(value, func) for key, value in node.items()}
        if isinstance(node, collections.abc.Iterable):
            return [recursive_map(elem, func) for elem in node]
        # Name the offending value so it can be located in the input data.
        raise ValueError(
            f"unsupported value of type {type(node).__name__} in data: {node!r}"
        )

    return recursive_map(node, process_tags_for_scalar)
def process_tags_for_scalar(inp: "Leaf") -> "Leaf":
    """Process a limited subset of Markdown into HTML.

    Supported markup:

    * ``__text__``    becomes ``<strong>text</strong>``
    * ``_text_``      becomes ``<em>text</em>``
    * ``[text](url)`` becomes ``<a href="url">text</a>``

    Link targets are copied verbatim: underscores inside a URL are never
    interpreted as emphasis markers. Emphasis inside the link text is still
    processed.

    Args:
        inp: A scalar value; only strings are transformed.

    Returns:
        The processed string, or ``inp`` itself when it is a number or None.
    """
    if inp is None or isinstance(inp, numbers.Real):
        return inp

    link_pattern = r"\[([^]]+)\]\(([^)]+)\)"

    # Placeholder delimiter: a run of NUL characters longer than any run in
    # the input, so a placeholder can never collide with the original text.
    marker = "\x00"
    while marker in inp:
        marker += "\x00"

    urls = []

    def stash_url(match):
        """Swap the link target for a placeholder that contains no markup."""
        urls.append(match.group(2))
        return f"[{match.group(1)}]({marker}{len(urls) - 1}{marker})"

    # Hide the URLs first; otherwise the emphasis patterns below would
    # rewrite underscores inside them and corrupt the href attribute.
    out = re.sub(link_pattern, stash_url, inp)

    tags = [
        (r"__([^_]+)__", r"<strong>\1</strong>"),
        (r"_([^_]+)_", r"<em>\1</em>"),
        (link_pattern, r'<a href="\2">\1</a>'),
    ]
    for pattern, replacement in tags:
        out = re.sub(pattern, replacement, out)

    # Put the untouched URLs back in place of their placeholders.
    for index, url in enumerate(urls):
        out = out.replace(f"{marker}{index}{marker}", url)
    return out
def get_title(html: str) -> str:
    """Return the ``title`` entry of the metadata WeasyPrint extracts from *html*."""
    parsed_document = weasyprint.HTML(string=html)
    metadata: Mapping[str, str] = weasyprint.html.get_html_metadata(parsed_document)
    return metadata["title"]
def create_output_filename(title: str) -> str:
    """Build a CLI friendly PDF filename (no spaces, no accented characters etc.)

    The title is transliterated with ``unidecode``, every run of non-word
    characters is collapsed into a single underscore, and ``.pdf`` is appended.
    """
    transliterated: str = unidecode(title)
    stem = re.sub(r"\W+", "_", transliterated)
    return stem + ".pdf"
def get_base_url(filename: str) -> str:
    """Return the directory of *filename* for use as WeasyPrint's base_url.

    A filename without a directory component yields ``"."``.
    """
    return os.path.dirname(filename) or "."
def enable_weasyprint_logging() -> None:
    """Attach a stream handler to the WeasyPrint logger so warnings are shown.

    Most HTML / CSS errors are not fatal and will not prevent WeasyPrint from
    rendering a document, but they can make the document appear ugly or broken.
    Therefore it's important to take heed of the warning messages.
    https://weasyprint.readthedocs.io/en/latest/tutorial.html#logging
    """
    formatter = logging.Formatter("WeasyPrint %(levelname)s: %(message)s")
    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(formatter)
    weasyprint.LOGGER.addHandler(stream_handler)
def main() -> None:
    """A simple CLI for the module. Run with `-h` for help.

    Renders the template with the given YAML data, converts the result to PDF
    and writes it to the requested output file. When no output filename is
    given, one is generated from the title of the rendered HTML document.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-i", action="append", dest="data_filenames", required=True,
        help="YAML data filename (repeat to load multiple files)", metavar="DATA_FILENAME",
    )
    parser.add_argument(
        "-t", dest="html_template_filename", required=True,
        help="HTML template filename"
    )
    parser.add_argument(
        "-o", dest="output_filename", required=False,
        help="output PDF filename (default: generated based on HTML document title)"
    )
    args = parser.parse_args()

    enable_weasyprint_logging()
    html = render_html_files(args.data_filenames, args.html_template_filename)
    output_filename = args.output_filename or create_output_filename(get_title(html))

    # Render before opening the output file: opening in "wb" mode truncates it,
    # so a rendering failure must not destroy an existing PDF or leave an
    # empty file behind.
    pdf = create_pdf(html, get_base_url(args.html_template_filename))
    with open(output_filename, "wb") as output_file:
        output_file.write(pdf)
    print(f"output saved to {output_filename}")


if __name__ == "__main__":
    main()