Skip to content

Commit e713ef6

Browse files
refactor: organize PDF link extraction functions in fast strategy
1 parent 9835fe4 commit e713ef6

File tree

2 files changed

+120
-115
lines changed

2 files changed

+120
-115
lines changed

unstructured/partition/pdf.py

Lines changed: 1 addition & 114 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
import wrapt
1414
from pdfminer import psparser
1515
from pdfminer.layout import LTChar, LTContainer, LTImage, LTItem, LTTextBox
16-
from pdfminer.pdftypes import PDFObjRef
1716
from pdfminer.utils import open_filename
1817
from pi_heif import register_heif_opener
1918
from PIL import Image as PILImage
@@ -69,6 +68,7 @@
6968
)
7069
from unstructured.partition.pdf_image.pdfminer_processing import (
7170
clean_pdfminer_inner_elements,
71+
get_uris,
7272
merge_inferred_with_extracted_layout,
7373
)
7474
from unstructured.partition.pdf_image.pdfminer_utils import (
@@ -1076,119 +1076,6 @@ def check_coords_within_boundary(
10761076
return x_within_boundary and y_within_boundary
10771077

10781078

1079-
def get_uris(
1080-
annots: PDFObjRef | list[PDFObjRef],
1081-
height: float,
1082-
coordinate_system: PixelSpace | PointSpace,
1083-
page_number: int,
1084-
) -> list[dict[str, Any]]:
1085-
"""
1086-
Extracts URI annotations from a single or a list of PDF object references on a specific page.
1087-
The type of annots (list or not) depends on the pdf formatting. The function detects the type
1088-
of annots and then passes them on to the get_uris_from_annots function as a list.
1089-
1090-
Args:
1091-
annots (PDFObjRef | list[PDFObjRef]): A single or a list of PDF object references
1092-
representing annotations on the page.
1093-
height (float): The height of the page in the specified coordinate system.
1094-
coordinate_system (PixelSpace | PointSpace): The coordinate system used to represent
1095-
the annotations' coordinates.
1096-
page_number (int): The page number from which to extract annotations.
1097-
1098-
Returns:
1099-
list[dict]: A list of dictionaries, each containing information about a URI annotation,
1100-
including its coordinates, bounding box, type, URI link, and page number.
1101-
"""
1102-
if isinstance(annots, list):
1103-
return get_uris_from_annots(annots, height, coordinate_system, page_number)
1104-
resolved_annots = annots.resolve()
1105-
if resolved_annots is None:
1106-
return []
1107-
return get_uris_from_annots(resolved_annots, height, coordinate_system, page_number)
1108-
1109-
1110-
def get_uris_from_annots(
1111-
annots: list[PDFObjRef],
1112-
height: int | float,
1113-
coordinate_system: PixelSpace | PointSpace,
1114-
page_number: int,
1115-
) -> list[dict[str, Any]]:
1116-
"""
1117-
Extracts URI annotations from a list of PDF object references.
1118-
1119-
Args:
1120-
annots (list[PDFObjRef]): A list of PDF object references representing annotations on
1121-
a page.
1122-
height (int | float): The height of the page in the specified coordinate system.
1123-
coordinate_system (PixelSpace | PointSpace): The coordinate system used to represent
1124-
the annotations' coordinates.
1125-
page_number (int): The page number from which to extract annotations.
1126-
1127-
Returns:
1128-
list[dict]: A list of dictionaries, each containing information about a URI annotation,
1129-
including its coordinates, bounding box, type, URI link, and page number.
1130-
"""
1131-
annotation_list = []
1132-
for annotation in annots:
1133-
# Check annotation is valid for extraction
1134-
annotation_dict = try_resolve(annotation)
1135-
if not isinstance(annotation_dict, dict):
1136-
continue
1137-
subtype = annotation_dict.get("Subtype", None)
1138-
if not subtype or isinstance(subtype, PDFObjRef) or str(subtype) != "/'Link'":
1139-
continue
1140-
# Extract bounding box and update coordinates
1141-
rect = annotation_dict.get("Rect", None)
1142-
if not rect or isinstance(rect, PDFObjRef) or len(rect) != 4:
1143-
continue
1144-
x1, y1, x2, y2 = rect_to_bbox(rect, height)
1145-
points = ((x1, y1), (x1, y2), (x2, y2), (x2, y1))
1146-
coordinates_metadata = CoordinatesMetadata(
1147-
points=points,
1148-
system=coordinate_system,
1149-
)
1150-
# Extract type
1151-
if "A" not in annotation_dict:
1152-
continue
1153-
uri_dict = try_resolve(annotation_dict["A"])
1154-
if not isinstance(uri_dict, dict):
1155-
continue
1156-
uri_type = None
1157-
if "S" in uri_dict and not isinstance(uri_dict["S"], PDFObjRef):
1158-
uri_type = str(uri_dict["S"])
1159-
# Extract URI link
1160-
uri = None
1161-
try:
1162-
if uri_type == "/'URI'":
1163-
uri = try_resolve(try_resolve(uri_dict["URI"])).decode("utf-8")
1164-
if uri_type == "/'GoTo'":
1165-
uri = try_resolve(try_resolve(uri_dict["D"])).decode("utf-8")
1166-
except Exception:
1167-
pass
1168-
1169-
annotation_list.append(
1170-
{
1171-
"coordinates": coordinates_metadata,
1172-
"bbox": (x1, y1, x2, y2),
1173-
"type": uri_type,
1174-
"uri": uri,
1175-
"page_number": page_number,
1176-
},
1177-
)
1178-
return annotation_list
1179-
1180-
1181-
def try_resolve(annot: PDFObjRef):
1182-
"""
1183-
Attempt to resolve a PDF object reference. If successful, returns the resolved object;
1184-
otherwise, returns the original reference.
1185-
"""
1186-
try:
1187-
return annot.resolve()
1188-
except Exception:
1189-
return annot
1190-
1191-
11921079
def calculate_intersection_area(
11931080
bbox1: tuple[float, float, float, float],
11941081
bbox2: tuple[float, float, float, float],

unstructured/partition/pdf_image/pdfminer_processing.py

Lines changed: 119 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,13 @@
1-
from typing import TYPE_CHECKING, BinaryIO, List, Optional, Union, cast
1+
from __future__ import annotations
2+
3+
from typing import TYPE_CHECKING, Any, BinaryIO, List, Optional, Union, cast
24

35
import numpy as np
6+
from pdfminer.pdftypes import PDFObjRef
47
from pdfminer.utils import open_filename
58

9+
from unstructured.documents.coordinates import PixelSpace, PointSpace
10+
from unstructured.documents.elements import CoordinatesMetadata
611
from unstructured.partition.pdf_image.pdf_image_utils import remove_control_characters
712
from unstructured.partition.pdf_image.pdfminer_utils import (
813
extract_image_objects,
@@ -306,3 +311,116 @@ def aggregate_embedded_text_by_block(
306311

307312
text = " ".join([obj.text for i, obj in enumerate(pdf_objects) if (mask[i] and obj.text)])
308313
return text
314+
315+
316+
def get_uris(
    annots: PDFObjRef | list[PDFObjRef],
    height: float,
    coordinate_system: PixelSpace | PointSpace,
    page_number: int,
) -> list[dict[str, Any]]:
    """
    Extracts URI annotations from a single or a list of PDF object references on a specific page.
    The type of annots (list or not) depends on the pdf formatting. The function detects the type
    of annots and then passes the annotations on to get_uris_from_annots as a list.

    Args:
        annots (PDFObjRef | list[PDFObjRef]): A single or a list of PDF object references
            representing annotations on the page.
        height (float): The height of the page in the specified coordinate system.
        coordinate_system (PixelSpace | PointSpace): The coordinate system used to represent
            the annotations' coordinates.
        page_number (int): The page number from which to extract annotations.

    Returns:
        list[dict]: A list of dictionaries, each containing information about a URI annotation,
            including its coordinates, bounding box, type, URI link, and page number. An empty
            list is returned when a single reference resolves to anything other than a list
            (including None).
    """
    if isinstance(annots, list):
        return get_uris_from_annots(annots, height, coordinate_system, page_number)

    resolved_annots = annots.resolve()
    # Only a list can be iterated as a collection of annotations. Guarding on the type (rather
    # than just on None) keeps an unexpected resolved value from raising while being iterated.
    if not isinstance(resolved_annots, list):
        return []
    return get_uris_from_annots(resolved_annots, height, coordinate_system, page_number)
345+
346+
347+
def get_uris_from_annots(
    annots: list[PDFObjRef],
    height: int | float,
    coordinate_system: PixelSpace | PointSpace,
    page_number: int,
) -> list[dict[str, Any]]:
    """
    Extracts URI annotations from a list of PDF object references.

    An annotation is skipped unless it resolves to a dict whose "Subtype" has the string form
    "/'Link'", carries a four-element "Rect", and has an "A" entry that resolves to a dict.
    For kept annotations the link target is read from "URI" (type "/'URI'") or "D" (type
    "/'GoTo'"); any other type, or a target that is missing or cannot be decoded, yields a
    "uri" of None.

    Args:
        annots (list[PDFObjRef]): A list of PDF object references representing annotations on
            a page.
        height (int | float): The height of the page in the specified coordinate system.
        coordinate_system (PixelSpace | PointSpace): The coordinate system used to represent
            the annotations' coordinates.
        page_number (int): The page number from which to extract annotations.

    Returns:
        list[dict]: A list of dictionaries, each containing information about a URI annotation,
            including its coordinates, bounding box, type, URI link, and page number.
    """
    annotation_list = []
    for annotation in annots:
        # Check annotation is valid for extraction
        annotation_dict = try_resolve(annotation)
        if not isinstance(annotation_dict, dict):
            continue
        subtype = annotation_dict.get("Subtype", None)
        if not subtype or isinstance(subtype, PDFObjRef) or str(subtype) != "/'Link'":
            continue

        # Extract bounding box and update coordinates
        rect = annotation_dict.get("Rect", None)
        if not rect or isinstance(rect, PDFObjRef) or len(rect) != 4:
            continue
        x1, y1, x2, y2 = rect_to_bbox(rect, height)
        points = ((x1, y1), (x1, y2), (x2, y2), (x2, y1))
        coordinates_metadata = CoordinatesMetadata(
            points=points,
            system=coordinate_system,
        )

        # Extract type
        if "A" not in annotation_dict:
            continue
        uri_dict = try_resolve(annotation_dict["A"])
        if not isinstance(uri_dict, dict):
            continue
        uri_type = None
        if "S" in uri_dict and not isinstance(uri_dict["S"], PDFObjRef):
            uri_type = str(uri_dict["S"])

        # Extract URI link; the two supported types are mutually exclusive
        uri = None
        if uri_type == "/'URI'":
            uri = _decode_action_target(uri_dict, "URI")
        elif uri_type == "/'GoTo'":
            uri = _decode_action_target(uri_dict, "D")

        annotation_list.append(
            {
                "coordinates": coordinates_metadata,
                "bbox": (x1, y1, x2, y2),
                "type": uri_type,
                "uri": uri,
                "page_number": page_number,
            },
        )
    return annotation_list


def _decode_action_target(action_dict: dict[str, Any], key: str) -> str | None:
    """Return action_dict[key], resolved and decoded as UTF-8, or None when unavailable.

    None is returned when the key is absent, when the (doubly) resolved value is not a byte
    string, or when the bytes are not valid UTF-8. Link extraction is best-effort, so none of
    these cases raises.
    """
    if key not in action_dict:
        return None
    # Resolve twice: the entry may be a reference that itself resolves to another reference.
    target = try_resolve(try_resolve(action_dict[key]))
    if not isinstance(target, (bytes, bytearray)):
        return None
    try:
        return target.decode("utf-8")
    except UnicodeDecodeError:
        return None
416+
417+
418+
def try_resolve(annot: Any) -> Any:
    """
    Attempt to resolve a PDF object reference. If successful, returns the resolved object;
    otherwise, returns the original value.

    The argument is not required to be a reference: callers in this module also pass values
    that have already been resolved. A value without a usable ``resolve()`` method is simply
    returned unchanged.

    Args:
        annot (Any): A PDF object reference, or any other value.

    Returns:
        Any: The result of ``annot.resolve()``, or ``annot`` itself if that call fails for
            any reason.
    """
    try:
        return annot.resolve()
    except Exception:
        # Deliberately broad: resolution is best-effort, and a missing ``resolve`` attribute
        # (non-reference input) is handled the same way as a failed resolution.
        return annot

0 commit comments

Comments
 (0)