99
1010import emoji
1111import psutil
12+ from unstructured_inference .inference .elements import Rectangle
1213
1314from unstructured .documents .coordinates import CoordinateSystem , PixelSpace
1415from unstructured .documents .elements import (
@@ -464,6 +465,9 @@ def document_to_element_list(
464465 image_height = page_image_metadata .get ("height" )
465466
466467 translation_mapping : list [tuple ["LayoutElement" , Element ]] = []
468+
469+ links = layouts_links [page_number - 1 ]
470+
467471 for layout_element in page .elements :
468472 if image_width and image_height and hasattr (layout_element .bbox , "coordinates" ):
469473 coordinate_system = PixelSpace (width = image_width , height = image_height )
@@ -485,6 +489,8 @@ def document_to_element_list(
485489 translation_mapping .extend ([(layout_element , el ) for el in element ])
486490 continue
487491 else :
492+ element .metadata .links = _get_links_in_element (links , layout_element .bbox )
493+
488494 if last_modification_date :
489495 element .metadata .last_modified = last_modification_date
490496 element .metadata .text_as_html = getattr (layout_element , "text_as_html" , None )
@@ -539,6 +545,24 @@ def document_to_element_list(
539545
540546 return elements
541547
548+ def _get_links_in_element (
549+ page_links : list ,
550+ region : Rectangle
551+ ) -> list :
552+ from unstructured .partition .pdf_image .pdfminer_processing import bboxes1_is_almost_subregion_of_bboxes2
553+
554+ links_bboxes = [Rectangle (* link .get ('bbox' )) for link in page_links ]
555+ results = bboxes1_is_almost_subregion_of_bboxes2 (links_bboxes , [region ])
556+ links = [
557+ {
558+ "text" : page_links [idx ].get ("text" ),
559+ "url" : page_links [idx ].get ("url" ),
560+ "start_index" : page_links [idx ].get ("start_index" ),
561+ }
562+ for idx , result in enumerate (results ) if any (result )
563+ ]
564+
565+ return links
542566
543567def ocr_data_to_elements (
544568 ocr_data : list ["LayoutElement" ],
0 commit comments