Skip to content

Commit a659f83

Browse files
committed
Remove invalid hyperlink annotations to satisfy Ghostscript 10.x during PDF/A conversion
Closes #1425
1 parent 08f95c0 commit a659f83

File tree

6 files changed

+113
-1
lines changed

6 files changed

+113
-1
lines changed

pyproject.toml

+3
Original file line numberDiff line numberDiff line change
@@ -155,3 +155,6 @@ convention = "google"
155155

156156
[tool.ruff.format]
157157
quote-style = "preserve"
158+
159+
[dependency-groups]
160+
dev = ["mypy>=1.13.0"]

src/ocrmypdf/_annots.py

+66
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
# SPDX-FileCopyrightText: 2024 James R. Barlow
2+
# SPDX-License-Identifier: MPL-2.0
3+
4+
"""OCRmyPDF PDF annotation cleanup."""
5+
6+
from __future__ import annotations
7+
8+
import logging
9+
10+
from pikepdf import Dictionary, Name, NameTree, Pdf
11+
12+
log = logging.getLogger(__name__)
13+
14+
15+
def remove_broken_goto_annotations(pdf: Pdf) -> bool:
    """Remove broken goto annotations from a PDF.

    If a PDF contains a GoTo Action that points to a named destination that does not
    exist, Ghostscript PDF/A conversion will fail. In any event, a named destination
    that is not defined is not useful.

    Only the ``/D`` entry of the offending action is deleted; the annotation
    itself is kept, which leaves a valid but non-functional link. Explicit
    destinations (arrays such as ``[page /XYZ left top zoom]``) are not names
    and are left untouched.

    Args:
        pdf: Opened PDF file.

    Returns:
        bool: True if the file was modified, False if not.
    """
    # Only needed for the explicit-destination check below; imported locally so
    # the module-level import list does not have to change.
    from pikepdf import Array

    root = pdf.Root

    # Without a /Names /Dests name tree there is nothing to validate against.
    if Name.Names not in root:
        return False
    if Name.Dests not in root[Name.Names]:
        return False

    dests = root[Name.Names][Name.Dests]
    if not isinstance(dests, Dictionary):
        return False

    # All named destinations that are actually defined in the document.
    names = set(NameTree(dests).keys())

    modified = False
    for n, page in enumerate(pdf.pages):
        if Name.Annots not in page:
            continue
        for annot in page[Name.Annots]:
            if not isinstance(annot, Dictionary):
                continue
            if Name.A not in annot:
                continue
            action = annot[Name.A]
            if Name.D not in action:
                continue

            destination = action[Name.D]
            if isinstance(destination, Array):
                # An explicit destination is not a name and can never appear
                # in the name tree; treating it as one would strip a valid link.
                continue

            # We found an annotation that points to a named destination
            named_destination = str(destination)
            if named_destination in names:
                continue

            # There is no corresponding named destination, so drop the
            # action's destination. Having no destination set is still valid
            # and just makes the link non-functional.
            log.warning(
                f"Disabling a hyperlink annotation on page {n + 1} to a "
                "non-existent named destination "
                f"{named_destination}."
            )
            del action[Name.D]
            modified = True

    return modified

src/ocrmypdf/_metadata.py

+1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from pikepdf import __version__ as PIKEPDF_VERSION
1616
from pikepdf.models.metadata import PdfMetadata, encode_pdf_date
1717

18+
from ocrmypdf._annots import remove_broken_goto_annotations
1819
from ocrmypdf._defaults import PROGRAM_NAME
1920
from ocrmypdf._jobcontext import PdfContext
2021
from ocrmypdf._version import __version__ as OCRMYPF_VERSION

src/ocrmypdf/_pipelines/_common.py

+10-1
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,9 @@
2020
from typing import NamedTuple, cast
2121

2222
import PIL
23+
from pikepdf import Pdf
2324

25+
from ocrmypdf._annots import remove_broken_goto_annotations
2426
from ocrmypdf._concurrent import Executor, setup_executor
2527
from ocrmypdf._jobcontext import PageContext, PdfContext
2628
from ocrmypdf._logging import PageNumberFilter
@@ -438,7 +440,14 @@ def postprocess(
438440
pdf_file: Path, context: PdfContext, executor: Executor
439441
) -> tuple[Path, Sequence[str]]:
440442
"""Postprocess the PDF file."""
441-
pdf_out = pdf_file
443+
# pdf_out = pdf_file
444+
with Pdf.open(pdf_file) as pdf:
445+
fix_annots = context.get_path('fix_annots.pdf')
446+
if remove_broken_goto_annotations(pdf):
447+
pdf.save(fix_annots)
448+
pdf_out = fix_annots
449+
else:
450+
pdf_out = pdf_file
442451
if context.options.output_type.startswith('pdfa'):
443452
ps_stub_out = generate_postscript_stub(context)
444453
pdf_out = convert_to_pdfa(pdf_out, ps_stub_out, context)

tests/test_annots.py

+31
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# SPDX-FileCopyrightText: 2024 James R. Barlow
2+
# SPDX-License-Identifier: MPL-2.0
3+
4+
from __future__ import annotations
5+
6+
import pytest
7+
from pikepdf import Array, Dictionary, Name, NameTree, Pdf
8+
9+
from ocrmypdf._annots import remove_broken_goto_annotations
10+
11+
12+
def test_remove_broken_goto_annotations(resources):
    """Check that only links to undefined named destinations lose their /D."""
    with Pdf.open(resources / 'link.pdf') as pdf:
        # First pass over the unmodified resource file must change nothing.
        assert not remove_broken_goto_annotations(pdf), "File should not be modified"

        # Build a Dests name tree and hook it into the document catalog.
        dest_tree = NameTree.new(pdf)
        names_dict = pdf.make_indirect(Dictionary())
        pdf.Root[Name.Names] = names_dict
        names_dict[Name.Dests] = dest_tree.obj

        # One entry whose value is not a usable destination...
        dest_tree['Invalid'] = pdf.make_indirect(Dictionary())
        # ...and one that points at the first page.
        first_page = pdf.pages[0].obj
        dest_tree['Valid'] = Array([first_page, Name.XYZ, 0, 0, 0])

        # Page 1 links to a name absent from the tree; page 2 to a defined one.
        broken_action = pdf.pages[0].Annots[0].A
        intact_action = pdf.pages[1].Annots[0].A
        broken_action.D = 'Missing'
        intact_action.D = 'Valid'

        assert remove_broken_goto_annotations(pdf), "File should be modified"

        assert Name.D not in pdf.pages[0].Annots[0].A
        assert Name.D in pdf.pages[1].Annots[0].A

tests/test_page_reducing.py

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
import hypothesis
2+
import pytest

0 commit comments

Comments
 (0)