Skip to content

Commit 025226a

Browse files
committed
ENH: Add support for BrotliDecode filter (PDF 2.0) #3223
1 parent 96ba79c commit 025226a

15 files changed

+293
-24
lines changed

pypdf/_doc_common.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,8 @@ class PdfDocCommon:
270270

271271
strict: bool = False # default
272272

273+
flattened_pages: Optional[List[PageObject]] = None
274+
273275
_encryption: Optional[Encryption] = None
274276

275277
_readonly: bool = False
@@ -333,8 +335,6 @@ def viewer_preferences(self) -> Optional[ViewerPreferences]:
333335
self.root_object[NameObject(CD.VIEWER_PREFERENCES)] = o
334336
return o
335337

336-
flattened_pages: Optional[List[PageObject]] = None
337-
338338
def get_num_pages(self) -> int:
339339
"""
340340
Calculate the number of pages in this PDF file.
@@ -1128,7 +1128,16 @@ def _flatten(
11281128
indirect_reference: Optional[IndirectObject] = None,
11291129
) -> None:
11301130
"""
1131-
Prepare the document pages to ease searching
1131+
Process the document pages to ease searching.
1132+
1133+
Attributes of a page may inherit from ancestor nodes
1134+
in the page tree. Flattening means moving
1135+
any inheritance data into descendant nodes,
1136+
effectively removing the inheritance dependency.
1137+
1138+
Note: It is distinct from another use of "flattening" applied to PDFs.
1139+
Flattening a PDF also means combining all the contents into one single layer
1140+
and making the file less editable.
11321141
11331142
Args:
11341143
list_only: Will only list the pages within _flatten_pages.
@@ -1156,7 +1165,7 @@ def _flatten(
11561165

11571166
if PA.TYPE in pages:
11581167
t = cast(str, pages[PA.TYPE])
1159-
# if pdf has no type, considered as a page if /Kids is missing
1168+
# if the page tree node has no /Type, treat it as a page when /Kids is also missing
11601169
elif PA.KIDS not in pages:
11611170
t = "/Page"
11621171
else:
@@ -1181,8 +1190,8 @@ def _flatten(
11811190
)
11821191
elif t == "/Page":
11831192
for attr_in, value in inherit.items():
1184-
# if the page has it's own value, it does not inherit the
1185-
# parent's value:
1193+
# if the page has its own value, it does not inherit the
1194+
# parent's value
11861195
if attr_in not in pages:
11871196
pages[attr_in] = value
11881197
page_obj = PageObject(self, indirect_reference)

pypdf/_page.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2009,8 +2009,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
20092009
# A special case is a translating only tm:
20102010
# tm = [1, 0, 0, 1, e, f]
20112011
# i.e. tm[4] += tx, tm[5] += ty.
2012-
tx = float(operands[0])
2013-
ty = float(operands[1])
2012+
tx, ty = float(operands[0]), float(operands[1])
20142013
tm_matrix[4] += tx * tm_matrix[0] + ty * tm_matrix[2]
20152014
tm_matrix[5] += tx * tm_matrix[1] + ty * tm_matrix[3]
20162015
str_widths = compute_str_widths(_actual_str_size["str_widths"])
@@ -2022,7 +2021,10 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
20222021
_actual_str_size["str_widths"] = 0.0
20232022
elif operator == b"T*":
20242023
check_crlf_space = True
2025-
tm_matrix[5] -= TL
2024+
tm_matrix[4] -= TL * tm_matrix[2]
2025+
tm_matrix[5] -= TL * tm_matrix[3]
2026+
str_widths = compute_str_widths(_actual_str_size["str_widths"])
2027+
_actual_str_size["str_widths"] = 0.0
20262028
elif operator == b"Tj":
20272029
check_crlf_space = True
20282030
text, rtl_dir, _actual_str_size = self._handle_tj(

pypdf/_writer.py

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1502,13 +1502,12 @@ def _write_increment(self, stream: StreamType) -> None:
15021502

15031503
def _write_pdf_structure(self, stream: StreamType) -> Tuple[List[int], List[int]]:
15041504
object_positions = []
1505-
free_objects = [] # will contain list of all free entries
1505+
free_objects = []
15061506
stream.write(self.pdf_header.encode() + b"\n")
15071507
stream.write(b"%\xE2\xE3\xCF\xD3\n")
15081508

1509-
for i, obj in enumerate(self._objects):
1509+
for idnum, obj in enumerate(self._objects, start=1):
15101510
if obj is not None:
1511-
idnum = i + 1
15121511
object_positions.append(stream.tell())
15131512
stream.write(f"{idnum} 0 obj\n".encode())
15141513
if self._encryption and obj != self._encrypt_entry:
@@ -1517,8 +1516,8 @@ def _write_pdf_structure(self, stream: StreamType) -> Tuple[List[int], List[int]
15171516
stream.write(b"\nendobj\n")
15181517
else:
15191518
object_positions.append(-1)
1520-
free_objects.append(i + 1)
1521-
free_objects.append(0) # add 0 to loop in accordance with PDF spec
1519+
free_objects.append(idnum)
1520+
free_objects.append(0) # add 0 to loop in accordance with specification
15221521
return object_positions, free_objects
15231522

15241523
def _write_xref_table(
@@ -1760,7 +1759,7 @@ def get_reference(self, obj: PdfObject) -> IndirectObject:
17601759

17611760
def get_outline_root(self) -> TreeObject:
17621761
if CO.OUTLINES in self._root_object:
1763-
# Table 3.25 Entries in the catalog dictionary
1762+
# Entries in the catalog dictionary
17641763
outline = cast(TreeObject, self._root_object[CO.OUTLINES])
17651764
if not isinstance(outline, TreeObject):
17661765
t = TreeObject(outline)
@@ -1784,12 +1783,12 @@ def get_threads_root(self) -> ArrayObject:
17841783
See §12.4.3 of the PDF 1.7 or PDF 2.0 specification.
17851784
17861785
Returns:
1787-
An array (possibly empty) of Dictionaries with ``/F`` and
1788-
``/I`` properties.
1786+
An array (possibly empty) of Dictionaries with an ``/F`` key,
1787+
and optionally information about the thread in ``/I`` or ``/Metadata`` keys.
17891788
17901789
"""
17911790
if CO.THREADS in self._root_object:
1792-
# Table 3.25 Entries in the catalog dictionary
1791+
# Entries in the catalog dictionary
17931792
threads = cast(ArrayObject, self._root_object[CO.THREADS])
17941793
else:
17951794
threads = ArrayObject()
@@ -1801,9 +1800,10 @@ def threads(self) -> ArrayObject:
18011800
"""
18021801
Read-only property for the list of threads.
18031802
1804-
See §8.3.2 from PDF 1.7 spec.
1803+
See §12.4.3 of the PDF 1.7 or PDF 2.0 specification.
18051804
1806-
Each element is a dictionary with ``/F`` and ``/I`` keys.
1805+
Each element is a dictionary with an ``/F`` key, and optionally
1806+
information about the thread in ``/I`` or ``/Metadata`` keys.
18071807
"""
18081808
return self.get_threads_root()
18091809

pypdf/constants.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -245,6 +245,7 @@ class FilterTypes(StrEnum):
245245
CCITT_FAX_DECODE = "/CCITTFaxDecode" # abbreviation: CCF
246246
DCT_DECODE = "/DCTDecode" # abbreviation: DCT
247247
JPX_DECODE = "/JPXDecode"
248+
BROTLI_DECODE = "/BrotliDecode" # abbreviation: Br, PDF 2.0
248249

249250

250251
class FilterTypeAbbreviations:
@@ -257,6 +258,7 @@ class FilterTypeAbbreviations:
257258
RL = "/RL"
258259
CCF = "/CCF"
259260
DCT = "/DCT"
261+
BR = "/Br"
260262

261263

262264
class LzwFilterParameters:

pypdf/filters.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,11 @@
6565
NullObject,
6666
)
6767

68+
try:
69+
import brotli
70+
except ImportError:
71+
brotli = None
72+
6873

6974
def decompress(data: bytes) -> bytes:
7075
"""
@@ -481,6 +486,68 @@ def decode(
481486
return data
482487

483488

489+
class BrotliDecode:
    """
    Stream filter backed by the optional ``brotli`` package.

    ``decode`` decompresses Brotli data and ``encode`` produces it. Both are
    static methods, and both raise :class:`ImportError` when the module-level
    ``brotli`` name is ``None`` (the fallback assigned when the package
    could not be imported).

    NOTE(review): the filter was previously documented as
    "ISO 32000-2:2020, Section 7.4.11" -- confirm that reference.
    """

    @staticmethod
    def decode(
        data: bytes,
        decode_parms: Optional[DictionaryObject] = None,
        **kwargs: Any,
    ) -> bytes:
        """
        Decompress a Brotli-encoded byte string.

        Args:
            data: The Brotli-compressed bytes.
            decode_parms: Accepted for signature compatibility; not read.
            **kwargs: Accepted for signature compatibility; not read.

        Returns:
            The bytes returned by ``brotli.decompress``.

        Raises:
            ImportError: If the 'brotli' library is not installed.

        """
        if brotli is not None:
            return brotli.decompress(data)
        raise ImportError("Brotli library not installed. Required for BrotliDecode filter.")

    @staticmethod
    def encode(data: bytes, **kwargs: Any) -> bytes:
        """
        Compress a byte string with Brotli.

        Args:
            data: The bytes to compress.
            **kwargs: Accepted for signature compatibility; not read.

        Returns:
            The bytes returned by ``brotli.compress`` called with its
            default settings.

        Raises:
            ImportError: If the 'brotli' library is not installed.

        """
        if brotli is not None:
            return brotli.compress(data)
        raise ImportError("Brotli library not installed. Required for BrotliDecode filter.")
549+
550+
484551
@dataclass
485552
class CCITTParameters:
486553
"""§7.4.6, optional parameters for the CCITTFaxDecode filter."""
@@ -666,6 +733,8 @@ def decode_stream_data(stream: Any) -> bytes:
666733
data = DCTDecode.decode(data)
667734
elif filter_name == FT.JPX_DECODE:
668735
data = JPXDecode.decode(data)
736+
elif filter_name == FT.BROTLI_DECODE:
737+
data = BrotliDecode.decode(data)
669738
elif filter_name == "/Crypt":
670739
if "/Name" in params or "/Type" in params:
671740
raise NotImplementedError(

pyproject.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,11 @@ Source = "https://github.com/py-pdf/pypdf"
4242
crypto = ["cryptography"]
4343
cryptodome = ["PyCryptodome"]
4444
image = ["Pillow>=8.0.0"]
45+
brotli = ["brotli"]
4546
full = [
4647
"cryptography",
47-
"Pillow>=8.0.0"
48+
"Pillow>=8.0.0",
49+
"brotli",
4850
]
4951
dev = [
5052
"black",

requirements/ci-3.11.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,3 +75,5 @@ typing-extensions==4.12.2
7575
# via
7676
# mypy
7777
# typeguard
78+
brotli==1.1.0
79+
# via -r requirements/ci.in

requirements/ci.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,4 @@ pytest-cov
1313
typeguard
1414
types-Pillow
1515
pyyaml
16+
brotli

requirements/ci.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,3 +77,5 @@ typing-extensions==4.12.2
7777
# typeguard
7878
zipp==3.20.2
7979
# via importlib-metadata
80+
brotli==1.1.0
81+
# via -r requirements/ci.in

requirements/dev.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,4 @@ pre-commit
44
pytest-cov
55
flit
66
wheel
7+
brotli

requirements/dev.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,3 +86,5 @@ zipp==3.20.2
8686
# The following packages are considered to be unsafe in a requirements file:
8787
# pip
8888
# setuptools
89+
brotli==1.1.0
90+
# via -r requirements/dev.in
Binary file not shown.

resources/create_brotli_test_pdf.py

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
#!/usr/bin/env python
"""
Generate a tiny single-page PDF whose content stream is Brotli-compressed.

The resulting file is a fixture for exercising the BrotliDecode filter in
pypdf. It is written to ``brotli-test-pdfs/minimal-brotli-compressed.pdf``
next to this script.

Note: /BrotliDecode is not a standard PDF filter. This file is specifically
for testing PDF library support for this filter (e.g., in pypdf).
Standard PDF viewers will likely not render this file correctly.
"""

import logging
from pathlib import Path

import brotli

logging.basicConfig(level=logging.INFO, format="%(name)s: %(levelname)s: %(message)s")
logger = logging.getLogger(__name__)

content_stream = b"BT /F1 24 Tf 100 700 Td (Hello, Brotli!) Tj ET"
compressed_content = brotli.compress(content_stream, quality=5)

# Version line followed by a comment line of high-bit bytes (binary marker).
pdf_header = b"%PDF-1.7\n%\xc2\xa5\xc2\xb1\xc3\xab\xc3\xbf\n"

stream_dictionary = (
    f"4 0 obj\n<< /Length {len(compressed_content)} /Filter /BrotliDecode >>\nstream\n"
).encode("ascii")

# Indirect objects 1..5, in the order they appear in the file.
indirect_objects = [
    b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n",
    b"2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n",
    (
        b"3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] "
        b"/Contents 4 0 R /Resources << /Font << /F1 5 0 R >> >> >>\nendobj\n"
    ),
    stream_dictionary + compressed_content + b"\nendstream\nendobj\n",
    b"5 0 obj\n<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>\nendobj\n",
]

# Append each object while recording the byte offset at which it starts;
# the cross-reference table needs exactly those offsets.
document = bytearray(pdf_header)
object_offsets = []
for indirect_object in indirect_objects:
    object_offsets.append(len(document))
    document += indirect_object

# The xref table begins right after the last object.
xref_table_start_offset = len(document)

document += b"xref\n0 6\n"
document += b"0000000000 65535 f \n"  # entry for object 0 (free)
for object_offset in object_offsets:
    document += f"{object_offset:010d} 00000 n \n".encode("ascii")

document += (
    f"trailer\n<< /Size 6 /Root 1 0 R >>\nstartxref\n{xref_table_start_offset}\n%%EOF"
).encode("ascii")

output_dir = Path(__file__).resolve().parent / "brotli-test-pdfs"
output_path = output_dir / "minimal-brotli-compressed.pdf"
output_dir.mkdir(parents=True, exist_ok=True)

try:
    with open(output_path, "wb") as f:
        f.write(document)
    logger.info(f"Created test PDF with Brotli compression at: {output_path}")
except OSError:
    # Write failures are logged with a traceback and not re-raised.
    logger.exception("Error writing PDF file")

0 commit comments

Comments
 (0)