Skip to content

Commit 881d6fc

Browse files
add elements links using _get_links_in_element()
1 parent 7223a13 commit 881d6fc

File tree

1 file changed

+24
-0
lines changed

1 file changed

+24
-0
lines changed

unstructured/partition/common/common.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
import emoji
1111
import psutil
12+
from unstructured_inference.inference.elements import Rectangle
1213

1314
from unstructured.documents.coordinates import CoordinateSystem, PixelSpace
1415
from unstructured.documents.elements import (
@@ -464,6 +465,9 @@ def document_to_element_list(
464465
image_height = page_image_metadata.get("height")
465466

466467
translation_mapping: list[tuple["LayoutElement", Element]] = []
468+
469+
links = layouts_links[page_number - 1]
470+
467471
for layout_element in page.elements:
468472
if image_width and image_height and hasattr(layout_element.bbox, "coordinates"):
469473
coordinate_system = PixelSpace(width=image_width, height=image_height)
@@ -485,6 +489,8 @@ def document_to_element_list(
485489
translation_mapping.extend([(layout_element, el) for el in element])
486490
continue
487491
else:
492+
element.metadata.links = _get_links_in_element(links, layout_element.bbox)
493+
488494
if last_modification_date:
489495
element.metadata.last_modified = last_modification_date
490496
element.metadata.text_as_html = getattr(layout_element, "text_as_html", None)
@@ -539,6 +545,24 @@ def document_to_element_list(
539545

540546
return elements
541547

548+
def _get_links_in_element(
549+
page_links: list,
550+
region: Rectangle
551+
) -> list:
552+
from unstructured.partition.pdf_image.pdfminer_processing import bboxes1_is_almost_subregion_of_bboxes2
553+
554+
links_bboxes = [Rectangle(*link.get('bbox')) for link in page_links]
555+
results = bboxes1_is_almost_subregion_of_bboxes2(links_bboxes, [region])
556+
links = [
557+
{
558+
"text": page_links[idx].get("text"),
559+
"url": page_links[idx].get("url"),
560+
"start_index": page_links[idx].get("start_index"),
561+
}
562+
for idx, result in enumerate(results) if any(result)
563+
]
564+
565+
return links
542566

543567
def ocr_data_to_elements(
544568
ocr_data: list["LayoutElement"],

0 commit comments

Comments
 (0)