Skip to content

Commit d41730d

Browse files
authored
feat: make PageLayout.elements a cached property (#414)
- `PageLayout.get_elements_with_detection_model` now returns `LayoutElements` by default
- `PageLayout.elements` is now a cached property computed from the `elements_array` property to save memory and CPU costs
1 parent 5d6e50b commit d41730d

File tree

4 files changed

+16
-10
lines changed

4 files changed

+16
-10
lines changed

CHANGELOG.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1-
## 0.8.8-dev0
1+
## 0.8.8-dev1
22

33
* fix: pdfminer-six dependencies
4+
* feat: `PageLayout.elements` is now a `cached_property` to reduce unnecessary memory and CPU costs
45

56
## 0.8.7
67

test_unstructured_inference/inference/test_layout.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ def test_get_page_elements(monkeypatch, mock_final_layout):
116116
)
117117
elements = page.get_elements_with_detection_model(inplace=False)
118118
page.get_elements_with_detection_model(inplace=True)
119-
assert elements == page.elements
119+
assert elements == page.elements_array
120120

121121

122122
class MockPool:

unstructured_inference/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.8.8-dev0" # pragma: no cover
1+
__version__ = "0.8.8-dev1" # pragma: no cover

unstructured_inference/inference/layout.py

+12-7
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import os
44
import tempfile
5+
from functools import cached_property
56
from pathlib import PurePath
67
from typing import Any, BinaryIO, Collection, List, Optional, Union, cast
78

@@ -149,7 +150,6 @@ def __init__(
149150
self.number = number
150151
self.detection_model = detection_model
151152
self.element_extraction_model = element_extraction_model
152-
self.elements: Collection[LayoutElement] = []
153153
self.elements_array: LayoutElements | None = None
154154
self.password = password
155155
# NOTE(alan): Dropped LocationlessLayoutElement that was created for chipper - chipper has
@@ -159,10 +159,18 @@ def __init__(
159159
def __str__(self) -> str:
160160
return "\n\n".join([str(element) for element in self.elements])
161161

162+
@cached_property
163+
def elements(self) -> Collection[LayoutElement]:
164+
"""return a list of layout elements from the array data structure; intended for backward
165+
compatibility"""
166+
if self.elements_array is None:
167+
return []
168+
return self.elements_array.as_list()
169+
162170
def get_elements_using_image_extraction(
163171
self,
164172
inplace=True,
165-
) -> Optional[List[LayoutElement]]:
173+
) -> Optional[list[LayoutElement]]:
166174
"""Uses end-to-end text element extraction model to extract the elements on the page."""
167175
if self.element_extraction_model is None:
168176
raise ValueError(
@@ -178,8 +186,7 @@ def get_elements_using_image_extraction(
178186
def get_elements_with_detection_model(
179187
self,
180188
inplace: bool = True,
181-
array_only: bool = False,
182-
) -> Optional[List[LayoutElement]]:
189+
) -> Optional[LayoutElements]:
183190
"""Uses specified model to detect the elements on the page."""
184191
if self.detection_model is None:
185192
model = get_model()
@@ -198,11 +205,9 @@ def get_elements_with_detection_model(
198205

199206
if inplace:
200207
self.elements_array = inferred_layout
201-
if not array_only:
202-
self.elements = inferred_layout.as_list()
203208
return None
204209

205-
return inferred_layout.as_list()
210+
return inferred_layout
206211

207212
def _get_image_array(self) -> Union[np.ndarray[Any, Any], None]:
208213
"""Converts the raw image into a numpy array."""

0 commit comments

Comments
 (0)