|
13 | 13 | import wrapt |
14 | 14 | from pdfminer import psparser |
15 | 15 | from pdfminer.layout import LTChar, LTContainer, LTImage, LTItem, LTTextBox |
16 | | -from pdfminer.pdftypes import PDFObjRef |
17 | 16 | from pdfminer.utils import open_filename |
18 | 17 | from pi_heif import register_heif_opener |
19 | 18 | from PIL import Image as PILImage |
|
69 | 68 | ) |
70 | 69 | from unstructured.partition.pdf_image.pdfminer_processing import ( |
71 | 70 | clean_pdfminer_inner_elements, |
| 71 | + get_uris, |
72 | 72 | merge_inferred_with_extracted_layout, |
73 | 73 | ) |
74 | 74 | from unstructured.partition.pdf_image.pdfminer_utils import ( |
@@ -1076,119 +1076,6 @@ def check_coords_within_boundary( |
1076 | 1076 | return x_within_boundary and y_within_boundary |
1077 | 1077 |
|
1078 | 1078 |
|
1079 | | -def get_uris( |
1080 | | - annots: PDFObjRef | list[PDFObjRef], |
1081 | | - height: float, |
1082 | | - coordinate_system: PixelSpace | PointSpace, |
1083 | | - page_number: int, |
1084 | | -) -> list[dict[str, Any]]: |
1085 | | - """ |
1086 | | - Extracts URI annotations from a single or a list of PDF object references on a specific page. |
1087 | | - The type of annots (list or not) depends on the pdf formatting. The function detects the type |
1088 | | - of annots and then passes them on to the get_uris_from_annots function as a list. |
1089 | | -
|
1090 | | - Args: |
1091 | | - annots (PDFObjRef | list[PDFObjRef]): A single or a list of PDF object references |
1092 | | - representing annotations on the page. |
1093 | | - height (float): The height of the page in the specified coordinate system. |
1094 | | - coordinate_system (PixelSpace | PointSpace): The coordinate system used to represent |
1095 | | - the annotations' coordinates. |
1096 | | - page_number (int): The page number from which to extract annotations. |
1097 | | -
|
1098 | | - Returns: |
1099 | | - list[dict]: A list of dictionaries, each containing information about a URI annotation, |
1100 | | - including its coordinates, bounding box, type, URI link, and page number. |
1101 | | - """ |
1102 | | - if isinstance(annots, list): |
1103 | | - return get_uris_from_annots(annots, height, coordinate_system, page_number) |
1104 | | - resolved_annots = annots.resolve() |
1105 | | - if resolved_annots is None: |
1106 | | - return [] |
1107 | | - return get_uris_from_annots(resolved_annots, height, coordinate_system, page_number) |
1108 | | - |
1109 | | - |
1110 | | -def get_uris_from_annots( |
1111 | | - annots: list[PDFObjRef], |
1112 | | - height: int | float, |
1113 | | - coordinate_system: PixelSpace | PointSpace, |
1114 | | - page_number: int, |
1115 | | -) -> list[dict[str, Any]]: |
1116 | | - """ |
1117 | | - Extracts URI annotations from a list of PDF object references. |
1118 | | -
|
1119 | | - Args: |
1120 | | - annots (list[PDFObjRef]): A list of PDF object references representing annotations on |
1121 | | - a page. |
1122 | | - height (int | float): The height of the page in the specified coordinate system. |
1123 | | - coordinate_system (PixelSpace | PointSpace): The coordinate system used to represent |
1124 | | - the annotations' coordinates. |
1125 | | - page_number (int): The page number from which to extract annotations. |
1126 | | -
|
1127 | | - Returns: |
1128 | | - list[dict]: A list of dictionaries, each containing information about a URI annotation, |
1129 | | - including its coordinates, bounding box, type, URI link, and page number. |
1130 | | - """ |
1131 | | - annotation_list = [] |
1132 | | - for annotation in annots: |
1133 | | - # Check annotation is valid for extraction |
1134 | | - annotation_dict = try_resolve(annotation) |
1135 | | - if not isinstance(annotation_dict, dict): |
1136 | | - continue |
1137 | | - subtype = annotation_dict.get("Subtype", None) |
1138 | | - if not subtype or isinstance(subtype, PDFObjRef) or str(subtype) != "/'Link'": |
1139 | | - continue |
1140 | | - # Extract bounding box and update coordinates |
1141 | | - rect = annotation_dict.get("Rect", None) |
1142 | | - if not rect or isinstance(rect, PDFObjRef) or len(rect) != 4: |
1143 | | - continue |
1144 | | - x1, y1, x2, y2 = rect_to_bbox(rect, height) |
1145 | | - points = ((x1, y1), (x1, y2), (x2, y2), (x2, y1)) |
1146 | | - coordinates_metadata = CoordinatesMetadata( |
1147 | | - points=points, |
1148 | | - system=coordinate_system, |
1149 | | - ) |
1150 | | - # Extract type |
1151 | | - if "A" not in annotation_dict: |
1152 | | - continue |
1153 | | - uri_dict = try_resolve(annotation_dict["A"]) |
1154 | | - if not isinstance(uri_dict, dict): |
1155 | | - continue |
1156 | | - uri_type = None |
1157 | | - if "S" in uri_dict and not isinstance(uri_dict["S"], PDFObjRef): |
1158 | | - uri_type = str(uri_dict["S"]) |
1159 | | - # Extract URI link |
1160 | | - uri = None |
1161 | | - try: |
1162 | | - if uri_type == "/'URI'": |
1163 | | - uri = try_resolve(try_resolve(uri_dict["URI"])).decode("utf-8") |
1164 | | - if uri_type == "/'GoTo'": |
1165 | | - uri = try_resolve(try_resolve(uri_dict["D"])).decode("utf-8") |
1166 | | - except Exception: |
1167 | | - pass |
1168 | | - |
1169 | | - annotation_list.append( |
1170 | | - { |
1171 | | - "coordinates": coordinates_metadata, |
1172 | | - "bbox": (x1, y1, x2, y2), |
1173 | | - "type": uri_type, |
1174 | | - "uri": uri, |
1175 | | - "page_number": page_number, |
1176 | | - }, |
1177 | | - ) |
1178 | | - return annotation_list |
1179 | | - |
1180 | | - |
1181 | | -def try_resolve(annot: PDFObjRef): |
1182 | | - """ |
1183 | | - Attempt to resolve a PDF object reference. If successful, returns the resolved object; |
1184 | | - otherwise, returns the original reference. |
1185 | | - """ |
1186 | | - try: |
1187 | | - return annot.resolve() |
1188 | | - except Exception: |
1189 | | - return annot |
1190 | | - |
1191 | | - |
1192 | 1079 | def calculate_intersection_area( |
1193 | 1080 | bbox1: tuple[float, float, float, float], |
1194 | 1081 | bbox2: tuple[float, float, float, float], |
|
0 commit comments