feat: use block matrix to reduce peak memory usage for matmul (#3947)

badGarnet · web-flow · commit 961c8d5b118e · 2025-03-07T00:28:36.000Z
This PR targets the most memory-expensive operation in partitioning PDFs and images: deduplicating pdfminer elements. On large pages the number of elements can exceed 10k, which would generate multiple 10k x 10k square double-precision float matrices during deduplication, pushing peak memory usage close to 13 GB ![Screenshot 2025-03-06 at 3 22 52 PM](https://github.com/user-attachments/assets/fdc26806-947b-4b5a-9d8e-4faeb0179b9f) This PR breaks this computation down by computing partial IOUs. More precisely, it computes the IOU of one block of 2000 elements at a time against all the elements, reducing peak memory usage by about 10x to around 1.6 GB. ![image](https://github.com/user-attachments/assets/e7b9f149-2b6a-4fc9-83c7-652e20849b76) The block size is configurable based on the user's preference for peak memory usage; it is set via the environment variable `UNST_MATMUL_MEMORY_CAP_IN_GB`.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,11 +1,12 @@
-## 0.16.24-dev5
+## 0.16.24
 
 ### Enhancements
 
 - **Support dynamic partitioner file type registration**. Use `create_file_type` to create new file type that can be handled
   in unstructured and `register_partitioner` to enable registering your own partitioner for any file type.
 
 - **`extract_image_block_types` now also works for CamelCase element type names**. Previously `NarrativeText` and similar CamelCase element types couldn't be extracted using the mentioned parameter in `partition`. Now figures for those elements can be extracted just like `Image` and `Table` elements.
+- **use block matrix to reduce peak memory usage for pdf/image partition**.
 
 ### Features
 
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.16.24-dev5"  # pragma: no cover
+__version__ = "0.16.24"  # pragma: no cover
diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import os
 from typing import TYPE_CHECKING, Any, BinaryIO, Iterable, List, Optional, Union, cast
 
 import numpy as np
@@ -708,10 +709,16 @@ def remove_duplicate_elements(
 ) -> TextRegions:
     """Removes duplicate text elements extracted by PDFMiner from a document layout."""
 
-    iou = boxes_self_iou(elements.element_coords, threshold)
-    # this is equivalent of finding those rows where `not iou[i, i + 1 :].any()`, i.e., any element
-    # that has no overlap above the threshold with any other elements
-    return elements.slice(~np.triu(iou, k=1).any(axis=1))
+    coords = elements.element_coords
+    # experiments show 2e3 is the block size that constrains the peak memory around 1Gb for this
+    # function; that accounts for all the intermediate matricies allocated and memory for storing
+    # final results
+    memory_cap_in_gb = os.getenv("UNST_MATMUL_MEMORY_CAP_IN_GB", 1)
+    n_split = np.ceil(coords.shape[0] / 2e3 / memory_cap_in_gb)
+    splits = np.array_split(coords, n_split, axis=0)
+
+    ious = [~np.triu(boxes_iou(split, coords, threshold), k=1).any(axis=1) for split in splits]
+    return elements.slice(np.concatenate(ious))
 
 
 def aggregate_embedded_text_by_block(

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.16.24-dev5" # pragma: no cover`
	`1`	`+__version__ = "0.16.24" # pragma: no cover`