Skip to content

Commit b94d203

Browse files
ENH: Support alternative (U)F names for embedded file retrieval (#3072)
Closes #3070.
1 parent ad97deb commit b94d203

File tree

3 files changed

+121
-18
lines changed

3 files changed

+121
-18
lines changed

.github/workflows/github-ci.yaml

+1-1
Original file line number | Diff line number | Diff line change
@@ -70,7 +70,7 @@ jobs:
7070
sudo apt-get update
7171
- name: Install APT dependencies
7272
run:
73-
sudo apt-get install ghostscript
73+
sudo apt-get install ghostscript poppler-utils
7474
- name: Checkout Code
7575
uses: actions/checkout@v4
7676
with:

pypdf/_doc_common.py

+49-17
Original file line number | Diff line number | Diff line change
@@ -1350,7 +1350,8 @@ def _list_attachments(self) -> List[str]:
13501350
catalog = self.root_object
13511351
# From the catalog get the embedded file names
13521352
try:
1353-
filenames = cast(
1353+
# This is a name tree of the format [name_1, reference_1, name_2, reference_2, ...]
1354+
names = cast(
13541355
ArrayObject,
13551356
cast(
13561357
DictionaryObject,
@@ -1359,8 +1360,23 @@ def _list_attachments(self) -> List[str]:
13591360
)
13601361
except KeyError:
13611362
return []
1362-
attachments_names = [f for f in filenames if isinstance(f, str)]
1363-
return attachments_names
1363+
attachment_names: List[str] = []
1364+
for i, name in enumerate(names):
1365+
if isinstance(name, str):
1366+
attachment_names.append(name)
1367+
else:
1368+
name = name.get_object()
1369+
for key in ["/UF", "/F"]:
1370+
# PDF 2.0 reference, table 43:
1371+
# > A PDF reader shall use the value of the UF key, when present, instead of the F key.
1372+
if key in name:
1373+
name = name[key].get_object()
1374+
if name == names[i - 1]:
1375+
# Avoid duplicates for the same entry.
1376+
continue
1377+
attachment_names.append(name)
1378+
break
1379+
return attachment_names
13641380

13651381
def _get_attachment_list(self, name: str) -> List[bytes]:
13661382
out = self._get_attachments(name)[name]
@@ -1389,7 +1405,8 @@ def _get_attachments(
13891405
catalog = self.root_object
13901406
# From the catalog get the embedded file names
13911407
try:
1392-
filenames = cast(
1408+
# This is a name tree of the format [name_1, reference_1, name_2, reference_2, ...]
1409+
names = cast(
13931410
ArrayObject,
13941411
cast(
13951412
DictionaryObject,
@@ -1399,21 +1416,36 @@ def _get_attachments(
13991416
except KeyError:
14001417
return {}
14011418
attachments: Dict[str, Union[bytes, List[bytes]]] = {}
1419+
14021420
# Loop through attachments
1403-
for i in range(len(filenames)):
1404-
f = filenames[i]
1405-
if isinstance(f, str):
1406-
if filename is not None and f != filename:
1407-
continue
1408-
name = f
1409-
f_dict = filenames[i + 1].get_object()
1410-
f_data = f_dict["/EF"]["/F"].get_data()
1411-
if name in attachments:
1412-
if not isinstance(attachments[name], list):
1413-
attachments[name] = [attachments[name]] # type:ignore
1414-
attachments[name].append(f_data) # type:ignore
1421+
for i, name in enumerate(names):
1422+
if isinstance(name, str):
1423+
# Retrieve the corresponding reference.
1424+
file_dictionary = names[i + 1].get_object()
1425+
else:
1426+
# We have the reference, but need to determine the name.
1427+
file_dictionary = name.get_object()
1428+
for key in ["/UF", "/F"]:
1429+
# PDF 2.0 reference, table 43:
1430+
# > A PDF reader shall use the value of the UF key, when present, instead of the F key.
1431+
if key in file_dictionary:
1432+
name = file_dictionary[key].get_object()
1433+
break
14151434
else:
1416-
attachments[name] = f_data
1435+
continue
1436+
if name == names[i - 1]:
1437+
# Avoid extracting the same file twice.
1438+
continue
1439+
1440+
if filename is not None and name != filename:
1441+
continue
1442+
file_data = file_dictionary["/EF"]["/F"].get_data()
1443+
if name in attachments:
1444+
if not isinstance(attachments[name], list):
1445+
attachments[name] = [attachments[name]] # type:ignore
1446+
attachments[name].append(file_data) # type:ignore
1447+
else:
1448+
attachments[name] = file_data
14171449
return attachments
14181450

14191451
@abstractmethod

tests/test_doc_common.py

+71
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,71 @@
1+
"""Test the pypdf._doc_common module."""
2+
import re
3+
import shutil
4+
import subprocess
5+
from pathlib import Path
6+
7+
import pytest
8+
9+
from pypdf import PdfReader, PdfWriter
10+
11+
TESTS_ROOT = Path(__file__).parent.resolve()
12+
PROJECT_ROOT = TESTS_ROOT.parent
13+
SAMPLE_ROOT = PROJECT_ROOT / "sample-files"
14+
15+
PDFATTACH_BINARY = shutil.which("pdfattach")
16+
17+
18+
@pytest.mark.skipif(PDFATTACH_BINARY is None, reason="Requires poppler-utils")
def test_attachments(tmpdir):
    """Check attachment listing and retrieval for different /UF and /F layouts.

    A PDF with one embedded file is produced with poppler's ``pdfattach``.
    The raw bytes of that PDF are then patched to cover the variants of the
    file specification dictionary: /UF equal to the name tree key, /UF
    different from it, neither /UF nor /F, both keys, and only /F.
    """
    # No attachments.
    clean_path = SAMPLE_ROOT / "002-trivial-libre-office-writer" / "002-trivial-libre-office-writer.pdf"
    with PdfReader(clean_path) as pdf:
        assert pdf._list_attachments() == []

    # UF = name.
    attached_path = tmpdir / "attached.pdf"
    file_path = tmpdir / "test.txt"
    file_path.write_binary(b"Hello World\n")
    # `check=True` makes a failing `pdfattach` call abort the test right here
    # instead of surfacing later as a confusing missing-file or parse error.
    subprocess.run(  # noqa: S603
        [PDFATTACH_BINARY, clean_path, file_path, attached_path],
        check=True,
    )
    with PdfReader(str(attached_path)) as pdf:
        assert pdf._list_attachments() == ["test.txt"]
        assert pdf._get_attachments("test.txt") == {"test.txt": b"Hello World\n"}

    # UF != name: the file must be reachable under both the name tree key
    # and the /UF value.
    different_path = tmpdir / "different.pdf"
    different_path.write_binary(re.sub(rb" /UF [^/]+ /", b" /UF(my-file.txt) /", attached_path.read_binary()))
    with PdfReader(str(different_path)) as pdf:
        assert pdf._list_attachments() == ["test.txt", "my-file.txt"]
        assert pdf._get_attachments("test.txt") == {"test.txt": b"Hello World\n"}
        assert pdf._get_attachments("my-file.txt") == {"my-file.txt": b"Hello World\n"}

    # Only name: the /UF entry is stripped, so only the name tree key remains.
    no_f_path = tmpdir / "no-f.pdf"
    no_f_path.write_binary(re.sub(rb" /UF [^/]+ /", b" /", attached_path.read_binary()))
    with PdfReader(str(no_f_path)) as pdf:
        assert pdf._list_attachments() == ["test.txt"]
        assert pdf._get_attachments("test.txt") == {"test.txt": b"Hello World\n"}

    # UF and F: an additional /F(file.txt) entry is injected; it must not
    # show up because /UF takes precedence.
    uf_f_path = tmpdir / "uf-f.pdf"
    uf_f_path.write_binary(attached_path.read_binary().replace(b" /UF ", b"/F(file.txt) /UF "))
    with PdfReader(str(uf_f_path)) as pdf:
        assert pdf._list_attachments() == ["test.txt"]
        assert pdf._get_attachments("test.txt") == {"test.txt": b"Hello World\n"}

    # Only F: the /UF key is renamed to /F, keeping its value.
    only_f_path = tmpdir / "f.pdf"
    only_f_path.write_binary(attached_path.read_binary().replace(b" /UF ", b" /F "))
    with PdfReader(str(only_f_path)) as pdf:
        assert pdf._list_attachments() == ["test.txt"]
        assert pdf._get_attachments("test.txt") == {"test.txt": b"Hello World\n"}
62+
63+
64+
def test_get_attachments__same_attachment_more_than_twice():
    """Attaching one name five times must return all five payloads, in order."""
    attachment_name = "test.txt"
    expected_payloads = [b"content0", b"content1", b"content2", b"content3", b"content4"]

    pdf_writer = PdfWriter()
    pdf_writer.add_blank_page(100, 100)
    # Add one attachment per expected payload, all under the same name.
    for i, _ in enumerate(expected_payloads):
        pdf_writer.add_attachment(attachment_name, f"content{i}")

    collected = pdf_writer._get_attachments(attachment_name)
    assert collected == {attachment_name: expected_payloads}

0 commit comments

Comments
 (0)