ENH: Add support for BrotliDecode filter (PDF 2.0) #3223

ash01ish · ash01ish · commit 025226adf106 · 2025-04-21T16:40:25.000+05:30
diff --git a/pypdf/_doc_common.py b/pypdf/_doc_common.py
@@ -270,6 +270,8 @@ class PdfDocCommon:
 
     strict: bool = False  # default
 
+    flattened_pages: Optional[List[PageObject]] = None
+
     _encryption: Optional[Encryption] = None
 
     _readonly: bool = False
@@ -333,8 +335,6 @@ def viewer_preferences(self) -> Optional[ViewerPreferences]:
                 self.root_object[NameObject(CD.VIEWER_PREFERENCES)] = o
         return o
 
-    flattened_pages: Optional[List[PageObject]] = None
-
     def get_num_pages(self) -> int:
         """
         Calculate the number of pages in this PDF file.
@@ -1128,7 +1128,16 @@ def _flatten(
         indirect_reference: Optional[IndirectObject] = None,
     ) -> None:
         """
-        Prepare the document pages to ease searching
+        Process the document pages to ease searching.
+
+        Attributes of a page may inherit from ancestor nodes
+        in the page tree. Flattening means moving
+        any inheritance data into descendant nodes,
+        effectively removing the inheritance dependency.
+
+        Note: It is distinct from another use of "flattening" applied to PDFs.
+        Flattening a PDF also means combining all the contents into one single layer
+        and making the file less editable.
 
         Args:
             list_only: Will only list the pages within _flatten_pages.
@@ -1156,7 +1165,7 @@ def _flatten(
 
         if PA.TYPE in pages:
             t = cast(str, pages[PA.TYPE])
-        # if pdf has no type, considered as a page if /Kids is missing
+        # if the page tree node has no /Type, consider as a page if /Kids is also missing
         elif PA.KIDS not in pages:
             t = "/Page"
         else:
@@ -1181,8 +1190,8 @@ def _flatten(
                         )
         elif t == "/Page":
             for attr_in, value in inherit.items():
-                # if the page has it's own value, it does not inherit the
-                # parent's value:
+                # if the page has its own value, it does not inherit the
+                # parent's value
                 if attr_in not in pages:
                     pages[attr_in] = value
             page_obj = PageObject(self, indirect_reference)
diff --git a/pypdf/_page.py b/pypdf/_page.py
@@ -2009,8 +2009,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
                 # A special case is a translating only tm:
                 # tm = [1, 0, 0, 1, e, f]
                 # i.e. tm[4] += tx, tm[5] += ty.
-                tx = float(operands[0])
-                ty = float(operands[1])
+                tx, ty = float(operands[0]), float(operands[1])
                 tm_matrix[4] += tx * tm_matrix[0] + ty * tm_matrix[2]
                 tm_matrix[5] += tx * tm_matrix[1] + ty * tm_matrix[3]
                 str_widths = compute_str_widths(_actual_str_size["str_widths"])
@@ -2022,7 +2021,10 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
                 _actual_str_size["str_widths"] = 0.0
             elif operator == b"T*":
                 check_crlf_space = True
-                tm_matrix[5] -= TL
+                tm_matrix[4] -= TL * tm_matrix[2]
+                tm_matrix[5] -= TL * tm_matrix[3]
+                str_widths = compute_str_widths(_actual_str_size["str_widths"])
+                _actual_str_size["str_widths"] = 0.0
             elif operator == b"Tj":
                 check_crlf_space = True
                 text, rtl_dir, _actual_str_size = self._handle_tj(
diff --git a/pypdf/_writer.py b/pypdf/_writer.py
@@ -1502,13 +1502,12 @@ def _write_increment(self, stream: StreamType) -> None:
 
     def _write_pdf_structure(self, stream: StreamType) -> Tuple[List[int], List[int]]:
         object_positions = []
-        free_objects = []  # will contain list of all free entries
+        free_objects = []
         stream.write(self.pdf_header.encode() + b"\n")
         stream.write(b"%\xE2\xE3\xCF\xD3\n")
 
-        for i, obj in enumerate(self._objects):
+        for idnum, obj in enumerate(self._objects, start=1):
             if obj is not None:
-                idnum = i + 1
                 object_positions.append(stream.tell())
                 stream.write(f"{idnum} 0 obj\n".encode())
                 if self._encryption and obj != self._encrypt_entry:
@@ -1517,8 +1516,8 @@ def _write_pdf_structure(self, stream: StreamType) -> Tuple[List[int], List[int]
                 stream.write(b"\nendobj\n")
             else:
                 object_positions.append(-1)
-                free_objects.append(i + 1)
-        free_objects.append(0)  # add 0 to loop in accordance with PDF spec
+                free_objects.append(idnum)
+        free_objects.append(0)  # add 0 to loop in accordance with specification
         return object_positions, free_objects
 
     def _write_xref_table(
@@ -1760,7 +1759,7 @@ def get_reference(self, obj: PdfObject) -> IndirectObject:
 
     def get_outline_root(self) -> TreeObject:
         if CO.OUTLINES in self._root_object:
-            # Table 3.25 Entries in the catalog dictionary
+            # Entries in the catalog dictionary
             outline = cast(TreeObject, self._root_object[CO.OUTLINES])
             if not isinstance(outline, TreeObject):
                 t = TreeObject(outline)
@@ -1784,12 +1783,12 @@ def get_threads_root(self) -> ArrayObject:
         See §12.4.3 of the PDF 1.7 or PDF 2.0 specification.
 
         Returns:
-            An array (possibly empty) of Dictionaries with ``/F`` and
-            ``/I`` properties.
+            An array (possibly empty) of Dictionaries with an ``/F`` key,
+            and optionally information about the thread in ``/I`` or ``/Metadata`` keys.
 
         """
         if CO.THREADS in self._root_object:
-            # Table 3.25 Entries in the catalog dictionary
+            # Entries in the catalog dictionary
             threads = cast(ArrayObject, self._root_object[CO.THREADS])
         else:
             threads = ArrayObject()
@@ -1801,9 +1800,10 @@ def threads(self) -> ArrayObject:
         """
         Read-only property for the list of threads.
 
-        See §8.3.2 from PDF 1.7 spec.
+        See §12.4.3 of the PDF 1.7 or PDF 2.0 specification.
 
-        Each element is a dictionary with ``/F`` and ``/I`` keys.
+        Each element is a dictionary with an ``/F`` key, and optionally
+        information about the thread in ``/I`` or ``/Metadata`` keys.
         """
         return self.get_threads_root()
 
diff --git a/pypdf/constants.py b/pypdf/constants.py
@@ -245,6 +245,7 @@ class FilterTypes(StrEnum):
     CCITT_FAX_DECODE = "/CCITTFaxDecode"  # abbreviation: CCF
     DCT_DECODE = "/DCTDecode"  # abbreviation: DCT
     JPX_DECODE = "/JPXDecode"
+    BROTLI_DECODE = "/BrotliDecode"   # abbreviation: Br, PDF 2.0
 
 
 class FilterTypeAbbreviations:
@@ -257,6 +258,7 @@ class FilterTypeAbbreviations:
     RL = "/RL"
     CCF = "/CCF"
     DCT = "/DCT"
+    BR = "/Br"
 
 
 class LzwFilterParameters:
diff --git a/pypdf/filters.py b/pypdf/filters.py
@@ -65,6 +65,11 @@
     NullObject,
 )
 
+try:
+    import brotli
+except ImportError:
+    brotli = None
+
 
 def decompress(data: bytes) -> bytes:
     """
@@ -481,6 +486,68 @@ def decode(
         return data
 
 
+class BrotliDecode:
+    """
+    Stream filter for Brotli-compressed data (``/BrotliDecode``).
+
+    Brotli is a general-purpose lossless compression algorithm built on
+    LZ77 and Huffman coding. It typically yields better compression ratios
+    than Flate encoding, at the cost of slightly slower compression.
+
+    Both operations are static methods and depend on the optional
+    ``brotli`` package; an ImportError is raised when it is missing.
+    Neither method uses its optional parameters at present.
+
+    See ISO 32000-2:2020, Section 7.4.11.
+
+    Methods:
+        decode: Decompress Brotli-encoded bytes.
+        encode: Compress bytes with Brotli.
+    """
+    @staticmethod
+    def decode(
+        data: bytes,
+        decode_parms: Optional[DictionaryObject] = None,
+        **kwargs: Any,
+    ) -> bytes:
+        """
+        Decompress Brotli-encoded stream data.
+
+        Args:
+            data: The Brotli-compressed bytes.
+            decode_parms: Decode parameters of the stream; ignored.
+            **kwargs: Extra keyword arguments; ignored.
+
+        Returns:
+            The decompressed bytes.
+
+        Raises:
+            ImportError: If the 'brotli' library is not installed.
+        """
+        if brotli is not None:
+            return brotli.decompress(data)
+        raise ImportError("Brotli library not installed. Required for BrotliDecode filter.")
+
+    @staticmethod
+    def encode(data: bytes, **kwargs: Any) -> bytes:
+        """
+        Compress data with Brotli.
+
+        Args:
+            data: The raw bytes to compress.
+            **kwargs: Extra keyword arguments; ignored.
+
+        Returns:
+            The Brotli-compressed bytes.
+
+        Raises:
+            ImportError: If the 'brotli' library is not installed.
+        """
+        if brotli is not None:
+            return brotli.compress(data)
+        raise ImportError("Brotli library not installed. Required for BrotliDecode filter.")
+
+
 @dataclass
 class CCITTParameters:
     """§7.4.6, optional parameters for the CCITTFaxDecode filter."""
@@ -666,6 +733,8 @@ def decode_stream_data(stream: Any) -> bytes:
             data = DCTDecode.decode(data)
         elif filter_name == FT.JPX_DECODE:
             data = JPXDecode.decode(data)
+        elif filter_name == FT.BROTLI_DECODE:
+            data = BrotliDecode.decode(data)
         elif filter_name == "/Crypt":
             if "/Name" in params or "/Type" in params:
                 raise NotImplementedError(
diff --git a/pyproject.toml b/pyproject.toml
@@ -42,9 +42,11 @@ Source = "https://github.com/py-pdf/pypdf"
 crypto = ["cryptography"]
 cryptodome = ["PyCryptodome"]
 image = ["Pillow>=8.0.0"]
+brotli = ["brotli"]
 full = [
     "cryptography",
-    "Pillow>=8.0.0"
+    "Pillow>=8.0.0",
+    "brotli",
 ]
 dev = [
     "black",
diff --git a/requirements/ci-3.11.txt b/requirements/ci-3.11.txt
@@ -75,3 +75,5 @@ typing-extensions==4.12.2
     # via
     #   mypy
     #   typeguard
+brotli==1.1.0
+    # via -r requirements/ci.in
diff --git a/requirements/ci.in b/requirements/ci.in
@@ -13,3 +13,4 @@ pytest-cov
 typeguard
 types-Pillow
 pyyaml
+brotli
diff --git a/requirements/ci.txt b/requirements/ci.txt
@@ -77,3 +77,5 @@ typing-extensions==4.12.2
     #   typeguard
 zipp==3.20.2
     # via importlib-metadata
+brotli==1.1.0
+    # via -r requirements/ci.in
diff --git a/requirements/dev.in b/requirements/dev.in
@@ -4,3 +4,4 @@ pre-commit
 pytest-cov
 flit
 wheel
+brotli
diff --git a/requirements/dev.txt b/requirements/dev.txt
@@ -86,3 +86,5 @@ zipp==3.20.2
 # The following packages are considered to be unsafe in a requirements file:
 # pip
 # setuptools
+brotli==1.1.0
+    # via -r requirements/dev.in
diff --git a/resources/brotli-test-pdfs/minimal-brotli-compressed.pdf b/resources/brotli-test-pdfs/minimal-brotli-compressed.pdf
diff --git a/resources/create_brotli_test_pdf.py b/resources/create_brotli_test_pdf.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python
+"""
+Create a minimal PDF with Brotli compression for testing purposes.
+
+This script generates a simple PDF file that uses Brotli compression
+for the content stream, allowing for testing of the BrotliDecode filter
+in pypdf.
+
+Note: /BrotliDecode is not a standard PDF filter. This file is specifically
+for testing PDF library support for this filter (e.g., in pypdf).
+Standard PDF viewers will likely not render this file correctly.
+"""
+
+import logging
+from pathlib import Path
+
+import brotli
+
+logging.basicConfig(level=logging.INFO, format="%(name)s: %(levelname)s: %(message)s")  # INFO level so the success message below is emitted
+logger = logging.getLogger(__name__)
+
+content_stream = b"BT /F1 24 Tf 100 700 Td (Hello, Brotli!) Tj ET"  # Uncompressed page content (PDF text operators)
+compressed_content = brotli.compress(content_stream, quality=5)  # Payload stored in object 4's stream
+
+xref_offsets = [0] * 6  # Byte offset of objects 1-5, indexed by object number (slot 0 is never read)
+current_offset = 0  # Running byte length of the parts appended so far
+pdf_parts = []  # Byte chunks, written to the output file in this order
+
+part = b"%PDF-1.7\n%\xc2\xa5\xc2\xb1\xc3\xab\xc3\xbf\n"  # Header line plus binary-marker comment line
+pdf_parts.append(part)
+current_offset += len(part)
+xref_offsets[1] = current_offset  # Object 1 starts right after the header
+
+part = b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n"  # Object 1: document catalog
+pdf_parts.append(part)
+current_offset += len(part)
+xref_offsets[2] = current_offset  # Object 2 starts where object 1 ends
+
+part = b"2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n"  # Object 2: page tree root with a single kid
+pdf_parts.append(part)
+current_offset += len(part)
+xref_offsets[3] = current_offset  # Object 3 starts where object 2 ends
+
+part = (  # Object 3: the only page; its /Contents is object 4 and its font /F1 is object 5
+    b"3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] "
+    b"/Contents 4 0 R /Resources << /Font << /F1 5 0 R >> >> >>\nendobj\n"
+)
+pdf_parts.append(part)
+current_offset += len(part)
+xref_offsets[4] = current_offset  # Object 4 starts where object 3 ends
+
+part_header = (  # Object 4: stream dictionary; /Length is the size of the compressed payload
+    f"4 0 obj\n<< /Length {len(compressed_content)} /Filter /BrotliDecode >>\nstream\n"
+).encode("ascii")
+part_footer = b"\nendstream\nendobj\n"
+pdf_parts.append(part_header)
+pdf_parts.append(compressed_content)
+pdf_parts.append(part_footer)
+current_offset += len(part_header) + len(compressed_content) + len(part_footer)  # Object 4 is made of three parts
+xref_offsets[5] = current_offset  # Object 5 starts where object 4 ends
+
+part = b"5 0 obj\n<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>\nendobj\n"  # Object 5: the font used as /F1
+pdf_parts.append(part)
+current_offset += len(part)
+xref_table_start_offset = current_offset  # The xref table begins right after object 5
+
+xref_lines = [b"xref\n0 6\n", b"0000000000 65535 f \n"]  # Section header for 6 entries, then the free entry for object 0
+xref_lines.extend(  # One in-use entry per object 1-5, each with its recorded byte offset
+    f"{xref_offsets[i]:010d} 00000 n \n".encode("ascii") for i in range(1, 6)
+)
+pdf_parts.extend(xref_lines)
+
+trailer = (  # Trailer: entry count, catalog reference and the byte offset of the xref table
+    f"trailer\n<< /Size 6 /Root 1 0 R >>\nstartxref\n{xref_table_start_offset}\n%%EOF"
+).encode("ascii")
+pdf_parts.append(trailer)
+
+script_path = Path(__file__).resolve()
+output_dir = script_path.parent / "brotli-test-pdfs"  # Output goes into a directory next to this script
+output_path = output_dir / "minimal-brotli-compressed.pdf"
+
+output_dir.mkdir(parents=True, exist_ok=True)  # Create the output directory if it does not exist yet
+
+try:
+    with open(output_path, "wb") as f:
+        for part in pdf_parts:
+            f.write(part)
+    logger.info(f"Created test PDF with Brotli compression at: {output_path}")
+except OSError:  # A write failure is logged with its traceback and not re-raised
+    logger.exception("Error writing PDF file")
diff --git a/tests/test_filters.py b/tests/test_filters.py
diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py

-Original file line number
+Diff line change
 pytest-cov
 flit
 wheel
 +brotli