Skip to content

Commit 81549a7

Browse files
refactor: remove code related to embedded text extraction (#349)
This PR removes all code related to filling the text of inferred elements from embedded text (`pdfminer`). It is the first part of moving embedded-text-related code from `unstructured-inference` to `unstructured` and works together with Unstructured-IO/unstructured#3061.
1 parent 76619ca commit 81549a7

File tree

10 files changed

+7
-234
lines changed

10 files changed

+7
-234
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
1-
## 0.7.32-dev1
1+
## 0.7.32
22

3+
* refactor: remove all code related to filling inferred elements text from embedded text (pdfminer).
34
* bug: set the Chipper max_length variable
45

56
## 0.7.31

test_unstructured_inference/inference/test_layout.py

Lines changed: 0 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -312,16 +312,6 @@ def test_from_image_file_raises_isadirectoryerror_with_dir():
312312
layout.DocumentLayout.from_image_file(tempdir)
313313

314314

315-
@pytest.mark.parametrize("idx", range(2))
316-
def test_get_elements_from_layout(mock_initial_layout, idx):
317-
page = MockPageLayout()
318-
block = mock_initial_layout[idx]
319-
block.bbox.pad(3)
320-
fixed_layout = [block]
321-
elements = page.get_elements_from_layout(fixed_layout)
322-
assert elements[0].text == block.text
323-
324-
325315
def test_page_numbers_in_page_objects():
326316
with patch(
327317
"unstructured_inference.inference.layout.PageLayout.get_elements_with_detection_model",
@@ -331,40 +321,6 @@ def test_page_numbers_in_page_objects():
331321
assert [page.number for page in doc.pages] == list(range(1, len(doc.pages) + 1))
332322

333323

334-
@pytest.mark.parametrize(
335-
("fixed_layouts", "called_method", "not_called_method"),
336-
[
337-
(
338-
[MockLayout()],
339-
"get_elements_from_layout",
340-
"get_elements_with_detection_model",
341-
),
342-
(None, "get_elements_with_detection_model", "get_elements_from_layout"),
343-
],
344-
)
345-
def test_from_file_fixed_layout(fixed_layouts, called_method, not_called_method):
346-
with patch.object(
347-
layout.PageLayout,
348-
"get_elements_with_detection_model",
349-
return_value=[],
350-
), patch.object(
351-
layout.PageLayout,
352-
"get_elements_from_layout",
353-
return_value=[],
354-
):
355-
layout.DocumentLayout.from_file("sample-docs/loremipsum.pdf", fixed_layouts=fixed_layouts)
356-
getattr(layout.PageLayout, called_method).assert_called()
357-
getattr(layout.PageLayout, not_called_method).assert_not_called()
358-
359-
360-
@pytest.mark.parametrize(
361-
("text", "expected"),
362-
[("c\to\x0cn\ftrol\ncharacter\rs\b", "control characters"), ("\"'\\", "\"'\\")],
363-
)
364-
def test_remove_control_characters(text, expected):
365-
assert elements.remove_control_characters(text) == expected
366-
367-
368324
no_text_region = EmbeddedTextRegion.from_coords(0, 0, 100, 100)
369325
text_region = EmbeddedTextRegion.from_coords(0, 0, 100, 100, text="test")
370326
overlapping_rect = ImageTextRegion.from_coords(50, 50, 150, 150)
@@ -417,12 +373,6 @@ def check_annotated_image():
417373
check_annotated_image()
418374

419375

420-
@pytest.mark.parametrize(("text", "expected"), [("asdf", "asdf"), (None, "")])
421-
def test_embedded_text_region(text, expected):
422-
etr = elements.EmbeddedTextRegion.from_coords(0, 0, 24, 24, text=text)
423-
assert etr.extract_text(objects=None) == expected
424-
425-
426376
class MockDetectionModel(layout.UnstructuredObjectDetectionModel):
427377
def initialize(self, *args, **kwargs):
428378
pass

test_unstructured_inference/inference/test_layout_element.py

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,18 +5,6 @@
55
from unstructured_inference.inference.layoutelement import LayoutElement, TextRegion
66

77

8-
def test_layout_element_extract_text(
9-
mock_layout_element,
10-
mock_text_region,
11-
):
12-
extracted_text = mock_layout_element.extract_text(
13-
objects=[mock_text_region],
14-
)
15-
16-
assert isinstance(extracted_text, str)
17-
assert "Sample text" in extracted_text
18-
19-
208
def test_layout_element_do_dict(mock_layout_element):
219
expected = {
2210
"coordinates": ((100, 100), (100, 300), (300, 300), (300, 100)),

test_unstructured_inference/test_elements.py

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -272,16 +272,3 @@ def test_merge_inferred_layout_with_extracted_layout():
272272
assert merged_layout[0].text == "Example Section Header"
273273
assert merged_layout[1].type == ElementType.TEXT
274274
assert merged_layout[1].text == "Example Title"
275-
276-
277-
def test_aggregate_by_block():
278-
expected = "Inside region1 Inside region2"
279-
embedded_regions = [
280-
TextRegion.from_coords(0, 0, 20, 20, "Inside region1"),
281-
TextRegion.from_coords(50, 50, 150, 150, "Inside region2"),
282-
TextRegion.from_coords(250, 250, 350, 350, "Outside region"),
283-
]
284-
target_region = TextRegion.from_coords(0, 0, 300, 300)
285-
286-
text = elements.aggregate_by_block(target_region, embedded_regions)
287-
assert text == expected

unstructured_inference/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.7.32-dev1" # pragma: no cover
1+
__version__ = "0.7.32" # pragma: no cover

unstructured_inference/config.py

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -92,16 +92,6 @@ def LAYOUT_SUBREGION_THRESHOLD(self) -> float:
9292
"""
9393
return self._get_float("LAYOUT_SUBREGION_THRESHOLD", 0.75)
9494

95-
@property
96-
def EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD(self) -> float:
97-
"""threshold to determine if an embedded region is a sub-region of a given block
98-
when aggregating the text from embedded elements that lie within the given block
99-
100-
When the intersection region area divided by self area is larger than this threshold self is
101-
considered a subregion of the other
102-
"""
103-
return self._get_float("EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD", 0.99)
104-
10595
@property
10696
def ELEMENTS_H_PADDING_COEF(self) -> float:
10797
"""When extending the boundaries of a PDF object for the purpose of determining which other

unstructured_inference/inference/elements.py

Lines changed: 3 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,11 @@
11
from __future__ import annotations
22

3-
import unicodedata
43
from copy import deepcopy
54
from dataclasses import dataclass
6-
from typing import Collection, Optional, Union
5+
from typing import Optional, Union
76

87
import numpy as np
98

10-
from unstructured_inference.config import inference_config
119
from unstructured_inference.constants import Source
1210
from unstructured_inference.math import safe_division
1311

@@ -184,21 +182,6 @@ class TextRegion:
184182
def __str__(self) -> str:
185183
return str(self.text)
186184

187-
def extract_text(
188-
self,
189-
objects: Optional[Collection[TextRegion]],
190-
) -> str:
191-
"""Extracts text contained in region."""
192-
if self.text is not None:
193-
# If block text is already populated, we'll assume it's correct
194-
text = self.text
195-
elif objects is not None:
196-
text = aggregate_by_block(self, objects)
197-
else:
198-
text = ""
199-
cleaned_text = remove_control_characters(text)
200-
return cleaned_text
201-
202185
@classmethod
203186
def from_coords(
204187
cls,
@@ -217,54 +200,11 @@ def from_coords(
217200

218201

219202
class EmbeddedTextRegion(TextRegion):
220-
def extract_text(
221-
self,
222-
objects: Optional[Collection[TextRegion]],
223-
) -> str:
224-
"""Extracts text contained in region."""
225-
if self.text is None:
226-
return ""
227-
else:
228-
return self.text
203+
pass
229204

230205

231206
class ImageTextRegion(TextRegion):
232-
def extract_text(
233-
self,
234-
objects: Optional[Collection[TextRegion]],
235-
) -> str:
236-
"""Extracts text contained in region."""
237-
if self.text is None:
238-
return ""
239-
else:
240-
return super().extract_text(objects)
241-
242-
243-
def aggregate_by_block(
244-
text_region: TextRegion,
245-
pdf_objects: Collection[TextRegion],
246-
) -> str:
247-
"""Extracts the text aggregated from the elements of the given layout that lie within the given
248-
block."""
249-
250-
subregion_threshold = inference_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD
251-
filtered_blocks = [
252-
obj
253-
for obj in pdf_objects
254-
if obj.bbox.is_almost_subregion_of(text_region.bbox, subregion_threshold)
255-
]
256-
text = " ".join([x.text for x in filtered_blocks if x.text])
257-
return text
258-
259-
260-
def remove_control_characters(text: str) -> str:
261-
"""Removes control characters from text."""
262-
263-
# Replace newline character with a space
264-
text = text.replace("\n", " ")
265-
# Remove other control characters
266-
out_text = "".join(c for c in text if unicodedata.category(c)[0] != "C")
267-
return out_text
207+
pass
268208

269209

270210
def region_bounding_boxes_are_almost_the_same(

unstructured_inference/inference/layout.py

Lines changed: 1 addition & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,8 @@
1515
from unstructured_inference.inference.layoutelement import (
1616
LayoutElement,
1717
)
18-
from unstructured_inference.inference.ordering import order_layout
1918
from unstructured_inference.logger import logger
2019
from unstructured_inference.models.base import get_model
21-
from unstructured_inference.models.chipper import UnstructuredChipperModel
2220
from unstructured_inference.models.unstructuredmodel import (
2321
UnstructuredElementExtractionModel,
2422
UnstructuredObjectDetectionModel,
@@ -201,29 +199,6 @@ def get_elements_with_detection_model(
201199

202200
return inferred_layout
203201

204-
def get_elements_from_layout(
205-
self,
206-
layout: List[TextRegion],
207-
pdf_objects: Optional[List[TextRegion]] = None,
208-
) -> List[LayoutElement]:
209-
"""Uses the given Layout to separate the page text into elements, either extracting the
210-
text from the discovered layout blocks."""
211-
212-
# If the model is a chipper model, we don't want to order the
213-
# elements, as they are already ordered
214-
order_elements = not isinstance(self.detection_model, UnstructuredChipperModel)
215-
if order_elements:
216-
layout = order_layout(layout)
217-
218-
elements = [
219-
get_element_from_block(
220-
block=e,
221-
pdf_objects=pdf_objects,
222-
)
223-
for e in layout
224-
]
225-
return elements
226-
227202
def _get_image_array(self) -> Union[np.ndarray, None]:
228203
"""Converts the raw image into a numpy array."""
229204
if self.image_array is None:
@@ -330,7 +305,7 @@ def from_image(
330305
elif fixed_layout is None:
331306
page.get_elements_with_detection_model()
332307
else:
333-
page.elements = page.get_elements_from_layout(fixed_layout)
308+
page.elements = []
334309

335310
page.image_metadata = {
336311
"format": page.image.format if page.image else None,
@@ -405,19 +380,6 @@ def process_file_with_model(
405380
return layout
406381

407382

408-
def get_element_from_block(
409-
block: TextRegion,
410-
pdf_objects: Optional[List[TextRegion]] = None,
411-
) -> LayoutElement:
412-
"""Creates a LayoutElement from a given layout or image by finding all the text that lies within
413-
a given block."""
414-
element = block if isinstance(block, LayoutElement) else LayoutElement.from_region(block)
415-
element.text = element.extract_text(
416-
objects=pdf_objects,
417-
)
418-
return element
419-
420-
421383
def convert_pdf_to_image(
422384
filename: str,
423385
dpi: int = 200,

unstructured_inference/inference/layoutelement.py

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -32,16 +32,6 @@ class LayoutElement(TextRegion):
3232
image_path: Optional[str] = None
3333
parent: Optional[LayoutElement] = None
3434

35-
def extract_text(
36-
self,
37-
objects: Optional[Collection[TextRegion]],
38-
):
39-
"""Extracts text contained in region"""
40-
text = super().extract_text(
41-
objects=objects,
42-
)
43-
return text
44-
4535
def to_dict(self) -> dict:
4636
"""Converts the class instance to dictionary form."""
4737
out_dict = {

unstructured_inference/inference/ordering.py

Lines changed: 0 additions & 35 deletions
This file was deleted.

0 commit comments

Comments
 (0)