Remove invalid hyperlink annotations to satisfy Ghostscript 10.x during PDF/A conversion

jbarlow83 · jbarlow83 · commit a659f83d67bf · 2024-11-16T19:02:10.000-08:00
Closes #1425
diff --git a/pyproject.toml b/pyproject.toml
@@ -155,3 +155,6 @@ convention = "google"
 
 [tool.ruff.format]
 quote-style = "preserve"
+
+[dependency-groups]
+dev = ["mypy>=1.13.0"]
diff --git a/src/ocrmypdf/_annots.py b/src/ocrmypdf/_annots.py
@@ -0,0 +1,66 @@
+# SPDX-FileCopyrightText: 2024 James R. Barlow
+# SPDX-License-Identifier: MPL-2.0
+
+"""OCRmyPDF PDF annotation cleanup."""
+
+from __future__ import annotations
+
+import logging
+
+from pikepdf import Dictionary, Name, NameTree, Pdf, String
+
+log = logging.getLogger(__name__)
+
+
+def remove_broken_goto_annotations(pdf: Pdf) -> bool:
+    """Disable GoTo links whose named destination is not defined.
+
+    If a PDF contains a GoTo Action that points to a named destination that does not
+    exist, Ghostscript PDF/A conversion will fail. The annotation is kept; only the
+    dangling /D entry of its action is deleted, leaving a valid but inert link.
+
+    Only string-valued /D entries are validated, because those are the ones that
+    are resolved through the /Names /Dests name tree. Explicit destination arrays
+    and name-valued destinations are never looked up there and are left alone.
+
+    Args:
+        pdf: Opened PDF file. It is modified in place.
+
+    Returns:
+        bool: True if the file was modified, False if not.
+    """
+    # Without a /Names /Dests name tree there is nothing to validate against.
+    if Name.Names not in pdf.Root or Name.Dests not in pdf.Root[Name.Names]:
+        return False
+    dests = pdf.Root[Name.Names][Name.Dests]
+    if not isinstance(dests, Dictionary):
+        return False
+    names = set(NameTree(dests).keys())
+
+    modified = False
+    for n, page in enumerate(pdf.pages):
+        if Name.Annots not in page:
+            continue
+        for annot in page[Name.Annots]:
+            if not isinstance(annot, Dictionary):
+                continue
+            action = annot.get(Name.A)
+            if not isinstance(action, Dictionary) or Name.D not in action:
+                continue
+            destination = action[Name.D]
+            if not isinstance(destination, String):
+                # An explicit destination array or a name object is not resolved
+                # through the name tree, so it cannot be dangling there.
+                continue
+            named_destination = str(destination)
+            if named_destination in names:
+                continue
+            log.warning(
+                f"Disabling a hyperlink annotation on page {n + 1} to a "
+                "non-existent named destination "
+                f"{named_destination}."
+            )
+            del action[Name.D]
+            modified = True
+
+    return modified
diff --git a/src/ocrmypdf/_metadata.py b/src/ocrmypdf/_metadata.py
@@ -15,6 +15,7 @@
 from pikepdf import __version__ as PIKEPDF_VERSION
 from pikepdf.models.metadata import PdfMetadata, encode_pdf_date
 
+from ocrmypdf._annots import remove_broken_goto_annotations
 from ocrmypdf._defaults import PROGRAM_NAME
 from ocrmypdf._jobcontext import PdfContext
 from ocrmypdf._version import __version__ as OCRMYPF_VERSION
diff --git a/src/ocrmypdf/_pipelines/_common.py b/src/ocrmypdf/_pipelines/_common.py
@@ -20,7 +20,9 @@
 from typing import NamedTuple, cast
 
 import PIL
+from pikepdf import Pdf
 
+from ocrmypdf._annots import remove_broken_goto_annotations
 from ocrmypdf._concurrent import Executor, setup_executor
 from ocrmypdf._jobcontext import PageContext, PdfContext
 from ocrmypdf._logging import PageNumberFilter
@@ -438,7 +440,14 @@ def postprocess(
     pdf_file: Path, context: PdfContext, executor: Executor
 ) -> tuple[Path, Sequence[str]]:
     """Postprocess the PDF file."""
-    pdf_out = pdf_file
+    # Drop dangling GoTo destinations before any PDF/A conversion below.
+    with Pdf.open(pdf_file) as pdf:
+        fix_annots = context.get_path('fix_annots.pdf')
+        if remove_broken_goto_annotations(pdf):
+            pdf.save(fix_annots)
+            pdf_out = fix_annots
+        else:
+            pdf_out = pdf_file
     if context.options.output_type.startswith('pdfa'):
         ps_stub_out = generate_postscript_stub(context)
         pdf_out = convert_to_pdfa(pdf_out, ps_stub_out, context)
diff --git a/tests/test_annots.py b/tests/test_annots.py
@@ -0,0 +1,31 @@
+# SPDX-FileCopyrightText: 2024 James R. Barlow
+# SPDX-License-Identifier: MPL-2.0
+
+from __future__ import annotations
+
+import pytest
+from pikepdf import Array, Dictionary, Name, NameTree, Pdf
+
+from ocrmypdf._annots import remove_broken_goto_annotations
+
+
+def test_remove_broken_goto_annotations(resources):
+    """Only the link whose named destination is undefined loses its /D entry."""
+    with Pdf.open(resources / 'link.pdf') as pdf:
+        # Before a Dests name tree is installed, the file must be left untouched.
+        assert not remove_broken_goto_annotations(pdf), "File should not be modified"
+
+        # Install a Dests name tree under the document catalog's /Names entry.
+        dest_tree = NameTree.new(pdf)
+        catalog_names = pdf.make_indirect(Dictionary())
+        pdf.Root[Name.Names] = catalog_names
+        catalog_names[Name.Dests] = dest_tree.obj
+        # Register two names; 'Missing' is deliberately never registered.
+        dest_tree['Invalid'] = pdf.make_indirect(Dictionary())
+        dest_tree['Valid'] = Array([pdf.pages[0].obj, Name.XYZ, 0, 0, 0])
+        # Point page 1's link at the unregistered name, page 2's at a real one.
+        for page_index, target in enumerate(('Missing', 'Valid')):
+            pdf.pages[page_index].Annots[0].A.D = target
+        assert remove_broken_goto_annotations(pdf), "File should be modified"
+        assert Name.D not in pdf.pages[0].Annots[0].A
+        assert Name.D in pdf.pages[1].Annots[0].A
diff --git a/tests/test_page_reducing.py b/tests/test_page_reducing.py
@@ -0,0 +1,2 @@
+import hypothesis
+import pytest