@@ -1350,7 +1350,8 @@ def _list_attachments(self) -> List[str]:
1350
1350
catalog = self .root_object
1351
1351
# From the catalog get the embedded file names
1352
1352
try :
1353
- filenames = cast (
1353
+ # This is a name tree of the format [name_1, reference_1, name_2, reference_2, ...]
1354
+ names = cast (
1354
1355
ArrayObject ,
1355
1356
cast (
1356
1357
DictionaryObject ,
@@ -1359,8 +1360,23 @@ def _list_attachments(self) -> List[str]:
1359
1360
)
1360
1361
except KeyError :
1361
1362
return []
1362
- attachments_names = [f for f in filenames if isinstance (f , str )]
1363
- return attachments_names
1363
+ attachment_names : List [str ] = []
1364
+ for i , name in enumerate (names ):
1365
+ if isinstance (name , str ):
1366
+ attachment_names .append (name )
1367
+ else :
1368
+ name = name .get_object ()
1369
+ for key in ["/UF" , "/F" ]:
1370
+ # PDF 2.0 reference, table 43:
1371
+ # > A PDF reader shall use the value of the UF key, when present, instead of the F key.
1372
+ if key in name :
1373
+ name = name [key ].get_object ()
1374
+ if name == names [i - 1 ]:
1375
+ # Avoid duplicates for the same entry.
1376
+ continue
1377
+ attachment_names .append (name )
1378
+ break
1379
+ return attachment_names
1364
1380
1365
1381
def _get_attachment_list (self , name : str ) -> List [bytes ]:
1366
1382
out = self ._get_attachments (name )[name ]
@@ -1389,7 +1405,8 @@ def _get_attachments(
1389
1405
catalog = self .root_object
1390
1406
# From the catalog get the embedded file names
1391
1407
try :
1392
- filenames = cast (
1408
+ # This is a name tree of the format [name_1, reference_1, name_2, reference_2, ...]
1409
+ names = cast (
1393
1410
ArrayObject ,
1394
1411
cast (
1395
1412
DictionaryObject ,
@@ -1399,21 +1416,36 @@ def _get_attachments(
1399
1416
except KeyError :
1400
1417
return {}
1401
1418
attachments : Dict [str , Union [bytes , List [bytes ]]] = {}
1419
+
1402
1420
# Loop through attachments
1403
- for i in range (len (filenames )):
1404
- f = filenames [i ]
1405
- if isinstance (f , str ):
1406
- if filename is not None and f != filename :
1407
- continue
1408
- name = f
1409
- f_dict = filenames [i + 1 ].get_object ()
1410
- f_data = f_dict ["/EF" ]["/F" ].get_data ()
1411
- if name in attachments :
1412
- if not isinstance (attachments [name ], list ):
1413
- attachments [name ] = [attachments [name ]] # type:ignore
1414
- attachments [name ].append (f_data ) # type:ignore
1421
+ for i , name in enumerate (names ):
1422
+ if isinstance (name , str ):
1423
+ # Retrieve the corresponding reference.
1424
+ file_dictionary = names [i + 1 ].get_object ()
1425
+ else :
1426
+ # We have the reference, but need to determine the name.
1427
+ file_dictionary = name .get_object ()
1428
+ for key in ["/UF" , "/F" ]:
1429
+ # PDF 2.0 reference, table 43:
1430
+ # > A PDF reader shall use the value of the UF key, when present, instead of the F key.
1431
+ if key in file_dictionary :
1432
+ name = file_dictionary [key ].get_object ()
1433
+ break
1415
1434
else :
1416
- attachments [name ] = f_data
1435
+ continue
1436
+ if name == names [i - 1 ]:
1437
+ # Avoid extracting the same file twice.
1438
+ continue
1439
+
1440
+ if filename is not None and name != filename :
1441
+ continue
1442
+ file_data = file_dictionary ["/EF" ]["/F" ].get_data ()
1443
+ if name in attachments :
1444
+ if not isinstance (attachments [name ], list ):
1445
+ attachments [name ] = [attachments [name ]] # type:ignore
1446
+ attachments [name ].append (file_data ) # type:ignore
1447
+ else :
1448
+ attachments [name ] = file_data
1417
1449
return attachments
1418
1450
1419
1451
@abstractmethod
0 commit comments