Skip to content

Commit 2dceac3

Browse files
authored
Feat/remove reference of PageLayout.elements (#3943)
This PR removes usage of `PageLayout.elements` from the partition function, except when `analysis=True`. The partition logic now uses `PageLayout.elements_array` everywhere to save memory and CPU cost. Since the analysis function is intended for investigation and not for general document processing, that part of the code is left for a future refactor. `PageLayout.elements` stores layout elements' data in a list, while `elements_array` stores it in `numpy` arrays, which have much lower memory requirements. When measured with `memory_profiler`, the difference in memory usage is usually around 10x.
1 parent 8759b0a commit 2dceac3

File tree

9 files changed

+41
-37
lines changed

9 files changed

+41
-37
lines changed

CHANGELOG.md

+5-1
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
1-
## 0.16.26-dev3
1+
## 0.17.0
22

33
### Enhancements
44

55
- **Add support for images in html partitioner** `<img>` tags will now be parsed as `Image` elements. When `extract_image_block_types` includes `Image` and `extract_image_block_to_payload`=True then the `image_base64` will be included for images that specify the base64 data (rather than url) as the source.
6+
67
- **Use kwargs instead of env to specify `ocr_agent` and `table_ocr_agent`** for `hi_res` strategy.
78

9+
- **Stop using `PageLayout.elements` to save memory and CPU cost**. Partitioning now uses only `PageLayout.elements_array` throughout, except when `analysis=True`, where the drawing logic still uses `elements`.
10+
811
### Features
912

1013
### Fixes
@@ -28,6 +31,7 @@
2831
in unstructured and `register_partitioner` to enable registering your own partitioner for any file type.
2932

3033
- **`extract_image_block_types` now also works for CamelCase element type names**. Previously `NarrativeText` and similar CamelCase element types couldn't be extracted using the mentioned parameter in `partition`. Now figures for those elements can be extracted like `Image` and `Table` elements.
34+
3135
- **use block matrix to reduce peak memory usage for pdf/image partition**.
3236

3337
### Features

requirements/deps/constraints.txt

+2
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,5 @@ botocore<1.34.132
2020
importlib-metadata>=8.5.0
2121
# (austin): Versions below this have a different interface for passing parameters
2222
unstructured-client>=0.23.0,<0.26.0
23+
# paddle constrains protobuf; maybe we should put paddle here since its version is pinned in .in file
24+
protobuf>=6.30.0

requirements/extra-pdf-image.in

+1-1
Original file line numberDiff line numberDiff line change
@@ -11,5 +11,5 @@ google-cloud-vision
1111
effdet
1212
# Do not move to constraints.in, otherwise unstructured-inference will not be upgraded
1313
# when unstructured library is.
14-
unstructured-inference>=0.8.7
14+
unstructured-inference>=0.8.9
1515
unstructured.pytesseract>=0.3.12

test_unstructured/partition/pdf_image/test_pdf.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -1479,8 +1479,7 @@ def test_document_to_element_list_omits_coord_system_when_coord_points_absent():
14791479
# can't be None and it has to be a Rectangle object that has x1, y1, x2, y2 attributes.
14801480
layout_elem_absent_coordinates = MockSinglePageDocumentLayout()
14811481
for page in layout_elem_absent_coordinates.pages:
1482-
for el in page.elements:
1483-
el.bbox = None
1482+
page.elements_array.element_coords[:, :] = None
14841483
elements = pdf.document_to_element_list(layout_elem_absent_coordinates)
14851484
assert elements[0].metadata.coordinates is None
14861485

test_unstructured/partition/pdf_image/test_pdfminer_processing.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
TextRegions,
1313
)
1414
from unstructured_inference.inference.layout import DocumentLayout, LayoutElement, PageLayout
15+
from unstructured_inference.inference.layoutelement import LayoutElements
1516

1617
from test_unstructured.unit_utils import example_doc_path
1718
from unstructured.partition.auto import partition
@@ -108,15 +109,15 @@ def test_valid_bbox(bbox, is_valid):
108109
def test_clean_pdfminer_inner_elements(elements, length_extra_info, expected_document_length):
109110
# create a sample document with pdfminer elements inside tables
110111
page = PageLayout(number=1, image=Image.new("1", (1, 1)))
111-
page.elements = elements
112+
page.elements_array = LayoutElements.from_list(elements)
112113
document_with_table = DocumentLayout(pages=[page])
113114
document = document_with_table
114115

115116
# call the function to clean the pdfminer inner elements
116117
cleaned_doc = clean_pdfminer_inner_elements(document)
117118

118119
# check that the pdfminer elements were stored in the extra_info dictionary
119-
assert len(cleaned_doc.pages[0].elements) == expected_document_length
120+
assert len(cleaned_doc.pages[0].elements_array) == expected_document_length
120121

121122

122123
elements_with_duplicate_images = [

unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.16.26-dev3" # pragma: no cover
1+
__version__ = "0.17.0" # pragma: no cover

unstructured/partition/pdf.py

+17-8
Original file line numberDiff line numberDiff line change
@@ -766,10 +766,6 @@ def _partition_pdf_or_image_local(
766766
# vectorization of the data structure ends here
767767
final_document_layout = clean_pdfminer_inner_elements(final_document_layout)
768768

769-
for page in final_document_layout.pages:
770-
for el in page.elements:
771-
el.text = el.text or ""
772-
773769
elements = document_to_element_list(
774770
final_document_layout,
775771
sortable=True,
@@ -1199,11 +1195,24 @@ def document_to_element_list(
11991195
else None
12001196
)
12011197

1202-
for layout_element in page.elements:
1198+
head_line_type_class_ids = [
1199+
idx
1200+
for idx, class_type in page.elements_array.element_class_id_map.items()
1201+
if class_type in ("Headline", "Subheadline")
1202+
]
1203+
if head_line_type_class_ids:
1204+
has_headline = any(
1205+
np.any(page.elements_array.element_class_ids == idx)
1206+
for idx in head_line_type_class_ids
1207+
)
1208+
else:
1209+
has_headline = False
1210+
1211+
for layout_element in page.elements_array.iter_elements():
12031212
if (
12041213
image_width
12051214
and image_height
1206-
and getattr(layout_element.bbox, "x1") not in (None, np.nan)
1215+
and not np.isnan(getattr(layout_element.bbox, "x1", np.nan))
12071216
):
12081217
coordinate_system = PixelSpace(width=image_width, height=image_height)
12091218
else:
@@ -1234,8 +1243,8 @@ def document_to_element_list(
12341243
element.metadata.text_as_html = getattr(layout_element, "text_as_html", None)
12351244
element.metadata.table_as_cells = getattr(layout_element, "table_as_cells", None)
12361245

1237-
if (isinstance(element, Title) and element.metadata.category_depth is None) and any(
1238-
getattr(el, "type", "") in ["Headline", "Subheadline"] for el in page.elements
1246+
if (isinstance(element, Title) and element.metadata.category_depth is None) and (
1247+
has_headline
12391248
):
12401249
element.metadata.category_depth = 0
12411250

unstructured/partition/pdf_image/ocr.py

-1
Original file line numberDiff line numberDiff line change
@@ -281,7 +281,6 @@ def supplement_page_layout_with_ocr(
281281
ocr_agent=_table_ocr_agent,
282282
extracted_regions=extracted_regions,
283283
)
284-
page_layout.elements = page_layout.elements_array.as_list()
285284

286285
return page_layout
287286

unstructured/partition/pdf_image/pdfminer_processing.py

+11-21
Original file line numberDiff line numberDiff line change
@@ -657,8 +657,6 @@ def merge_inferred_with_extracted_layout(
657657
merged_layout.texts[i] = remove_control_characters(text)
658658

659659
inferred_page.elements_array = merged_layout
660-
# NOTE: once we drop reference to elements we can remove this step below
661-
inferred_page.elements[:] = merged_layout.as_list()
662660

663661
return inferred_document_layout
664662

@@ -670,34 +668,26 @@ def clean_pdfminer_inner_elements(document: "DocumentLayout") -> "DocumentLayout
670668
"""
671669

672670
for page in document.pages:
673-
non_pdfminer_element_boxes = [e.bbox for e in page.elements if e.source != Source.PDFMINER]
674-
element_boxes = []
675-
element_to_subregion_map = {}
676-
subregion_indice = 0
677-
for i, element in enumerate(page.elements):
678-
if element.source != Source.PDFMINER:
679-
continue
680-
element_boxes.append(element.bbox)
681-
element_to_subregion_map[i] = subregion_indice
682-
subregion_indice += 1
671+
pdfminer_mask = page.elements_array.sources == Source.PDFMINER
672+
non_pdfminer_element_boxes = page.elements_array.slice(~pdfminer_mask).element_coords
673+
pdfminer_element_boxes = page.elements_array.slice(pdfminer_mask).element_coords
674+
675+
if len(pdfminer_element_boxes) == 0 or len(non_pdfminer_element_boxes) == 0:
676+
continue
683677

684678
is_element_subregion_of_other_elements = (
685679
bboxes1_is_almost_subregion_of_bboxes2(
686-
element_boxes,
680+
pdfminer_element_boxes,
687681
non_pdfminer_element_boxes,
688682
env_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD,
689683
).sum(axis=1)
690684
== 1
691685
)
692686

693-
page.elements = [
694-
e
695-
for i, e in enumerate(page.elements)
696-
if (
697-
(i not in element_to_subregion_map)
698-
or not is_element_subregion_of_other_elements[element_to_subregion_map[i]]
699-
)
700-
]
687+
pdfminer_to_keep = np.where(pdfminer_mask)[0][~is_element_subregion_of_other_elements]
688+
page.elements_array = page.elements_array.slice(
689+
np.sort(np.concatenate((np.where(~pdfminer_mask)[0], pdfminer_to_keep)))
690+
)
701691

702692
return document
703693

0 commit comments

Comments
 (0)