Feat/remove reference of PageLayout.elements (#3943)

badGarnet · web-flow · commit 2dceac34b596 · 2025-03-12T15:21:21.000Z
This PR removes usage of `PageLayout.elements` from the partition functions,
except when `analysis=True`. The partition logic now uses
`PageLayout.elements_array` everywhere else to save memory and CPU cost.
Since the analysis function is intended for investigation and not for
general document processing purposes, this part of the code is left for
a future refactor.

`PageLayout.elements` stores layout elements' data in a list, while
`elements_array` stores the data in `numpy` arrays, which have much
lower memory requirements. Measured with `memory_profiler`, the
difference in memory usage is usually around 10x.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,10 +1,13 @@
-## 0.16.26-dev3
+## 0.17.0
 
 ### Enhancements
 
 - **Add support for images in html partitioner** `<img>` tags will now be parsed as `Image` elements. When `extract_image_block_types` includes `Image` and `extract_image_block_to_payload`=True then the `image_base64` will be included for images that specify the base64 data (rather than url) as the source.
+
 - **Use kwargs instead of env to specify `ocr_agent` and `table_ocr_agent`** for `hi_res` strategy.
 
+- **Stop using `PageLayout.elements` to save memory and CPU cost**. Partitioning now uses only `PageLayout.elements_array`, except when `analysis=True`, where the drawing logic still uses `elements`.
+
 ### Features
 
 ### Fixes
@@ -28,6 +31,7 @@
   in unstructured and `register_partitioner` to enable registering your own partitioner for any file type.
 
 - **`extract_image_block_types` now also works for CamelCase element type names**. Previously `NarrativeText` and similar CamelCase element types can't be extracted using the mentioned parameter in `partition`. Now figures for those elements can be extracted like `Image` and `Table` elements
+
 - **use block matrix to reduce peak memory usage for pdf/image partition**.
 
 ### Features
diff --git a/requirements/deps/constraints.txt b/requirements/deps/constraints.txt
@@ -20,3 +20,5 @@ botocore<1.34.132
 importlib-metadata>=8.5.0
 # (austin): Versions below this have a different interface for passing parameters
 unstructured-client>=0.23.0,<0.26.0
+# paddle constrains protobuf; maybe we should put paddle here since its version is pinned in .in file
+protobuf>=6.30.0
diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in
@@ -11,5 +11,5 @@ google-cloud-vision
 effdet
 # Do not move to constraints.in, otherwise unstructured-inference will not be upgraded
 # when unstructured library is.
-unstructured-inference>=0.8.7
+unstructured-inference>=0.8.9
 unstructured.pytesseract>=0.3.12
diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py
@@ -1479,8 +1479,7 @@ def test_document_to_element_list_omits_coord_system_when_coord_points_absent():
     # can't be None and it has to be a Rectangle object that has x1, y1, x2, y2 attributes.
     layout_elem_absent_coordinates = MockSinglePageDocumentLayout()
     for page in layout_elem_absent_coordinates.pages:
-        for el in page.elements:
-            el.bbox = None
+        page.elements_array.element_coords[:, :] = None
     elements = pdf.document_to_element_list(layout_elem_absent_coordinates)
     assert elements[0].metadata.coordinates is None
 
diff --git a/test_unstructured/partition/pdf_image/test_pdfminer_processing.py b/test_unstructured/partition/pdf_image/test_pdfminer_processing.py
@@ -12,6 +12,7 @@
     TextRegions,
 )
 from unstructured_inference.inference.layout import DocumentLayout, LayoutElement, PageLayout
+from unstructured_inference.inference.layoutelement import LayoutElements
 
 from test_unstructured.unit_utils import example_doc_path
 from unstructured.partition.auto import partition
@@ -108,15 +109,15 @@ def test_valid_bbox(bbox, is_valid):
 def test_clean_pdfminer_inner_elements(elements, length_extra_info, expected_document_length):
     # create a sample document with pdfminer elements inside tables
     page = PageLayout(number=1, image=Image.new("1", (1, 1)))
-    page.elements = elements
+    page.elements_array = LayoutElements.from_list(elements)
     document_with_table = DocumentLayout(pages=[page])
     document = document_with_table
 
     # call the function to clean the pdfminer inner elements
     cleaned_doc = clean_pdfminer_inner_elements(document)
 
     # check that the pdfminer elements were stored in the extra_info dictionary
-    assert len(cleaned_doc.pages[0].elements) == expected_document_length
+    assert len(cleaned_doc.pages[0].elements_array) == expected_document_length
 
 
 elements_with_duplicate_images = [
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.16.26-dev3"  # pragma: no cover
+__version__ = "0.17.0"  # pragma: no cover
diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py
@@ -766,10 +766,6 @@ def _partition_pdf_or_image_local(
     # vectorization of the data structure ends here
     final_document_layout = clean_pdfminer_inner_elements(final_document_layout)
 
-    for page in final_document_layout.pages:
-        for el in page.elements:
-            el.text = el.text or ""
-
     elements = document_to_element_list(
         final_document_layout,
         sortable=True,
@@ -1199,11 +1195,24 @@ def document_to_element_list(
             else None
         )
 
-        for layout_element in page.elements:
+        head_line_type_class_ids = [
+            idx
+            for idx, class_type in page.elements_array.element_class_id_map.items()
+            if class_type in ("Headline", "Subheadline")
+        ]
+        if head_line_type_class_ids:
+            has_headline = any(
+                np.any(page.elements_array.element_class_ids == idx)
+                for idx in head_line_type_class_ids
+            )
+        else:
+            has_headline = False
+
+        for layout_element in page.elements_array.iter_elements():
             if (
                 image_width
                 and image_height
-                and getattr(layout_element.bbox, "x1") not in (None, np.nan)
+                and not np.isnan(getattr(layout_element.bbox, "x1", np.nan))
             ):
                 coordinate_system = PixelSpace(width=image_width, height=image_height)
             else:
@@ -1234,8 +1243,8 @@ def document_to_element_list(
                 element.metadata.text_as_html = getattr(layout_element, "text_as_html", None)
                 element.metadata.table_as_cells = getattr(layout_element, "table_as_cells", None)
 
-                if (isinstance(element, Title) and element.metadata.category_depth is None) and any(
-                    getattr(el, "type", "") in ["Headline", "Subheadline"] for el in page.elements
+                if (isinstance(element, Title) and element.metadata.category_depth is None) and (
+                    has_headline
                 ):
                     element.metadata.category_depth = 0
 
diff --git a/unstructured/partition/pdf_image/ocr.py b/unstructured/partition/pdf_image/ocr.py
@@ -281,7 +281,6 @@ def supplement_page_layout_with_ocr(
             ocr_agent=_table_ocr_agent,
             extracted_regions=extracted_regions,
         )
-    page_layout.elements = page_layout.elements_array.as_list()
 
     return page_layout
 
diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py
@@ -657,8 +657,6 @@ def merge_inferred_with_extracted_layout(
             merged_layout.texts[i] = remove_control_characters(text)
 
         inferred_page.elements_array = merged_layout
-        # NOTE: once we drop reference to elements we can remove this step below
-        inferred_page.elements[:] = merged_layout.as_list()
 
     return inferred_document_layout
 
@@ -670,34 +668,26 @@ def clean_pdfminer_inner_elements(document: "DocumentLayout") -> "DocumentLayout
     """
 
     for page in document.pages:
-        non_pdfminer_element_boxes = [e.bbox for e in page.elements if e.source != Source.PDFMINER]
-        element_boxes = []
-        element_to_subregion_map = {}
-        subregion_indice = 0
-        for i, element in enumerate(page.elements):
-            if element.source != Source.PDFMINER:
-                continue
-            element_boxes.append(element.bbox)
-            element_to_subregion_map[i] = subregion_indice
-            subregion_indice += 1
+        pdfminer_mask = page.elements_array.sources == Source.PDFMINER
+        non_pdfminer_element_boxes = page.elements_array.slice(~pdfminer_mask).element_coords
+        pdfminer_element_boxes = page.elements_array.slice(pdfminer_mask).element_coords
+
+        if len(pdfminer_element_boxes) == 0 or len(non_pdfminer_element_boxes) == 0:
+            continue
 
         is_element_subregion_of_other_elements = (
             bboxes1_is_almost_subregion_of_bboxes2(
-                element_boxes,
+                pdfminer_element_boxes,
                 non_pdfminer_element_boxes,
                 env_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD,
             ).sum(axis=1)
             == 1
         )
 
-        page.elements = [
-            e
-            for i, e in enumerate(page.elements)
-            if (
-                (i not in element_to_subregion_map)
-                or not is_element_subregion_of_other_elements[element_to_subregion_map[i]]
-            )
-        ]
+        pdfminer_to_keep = np.where(pdfminer_mask)[0][~is_element_subregion_of_other_elements]
+        page.elements_array = page.elements_array.slice(
+            np.sort(np.concatenate((np.where(~pdfminer_mask)[0], pdfminer_to_keep)))
+        )
 
     return document
 

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.16.26-dev3" # pragma: no cover`
	`1`	`+__version__ = "0.17.0" # pragma: no cover`
Original file line number	Diff line number	Diff line change
`@@ -281,7 +281,6 @@ def supplement_page_layout_with_ocr(`
`281`	`281`	`ocr_agent=_table_ocr_agent,`
`282`	`282`	`extracted_regions=extracted_regions,`
`283`	`283`	`)`
`284`		`- page_layout.elements = page_layout.elements_array.as_list()`
`285`	`284`
`286`	`285`	`return page_layout`
`287`	`286`