Skip to content

Commit 4814a72

Browse files
authored
fix: ocr when no elements are found in block (#68)
Stopgap fix for a bug that causes the parsing procedure to ignore PDF elements that lie outside the bounds of every inferred or specified layout element.
1 parent 237d69d commit 4814a72

File tree

5 files changed

+25
-2
lines changed

5 files changed

+25
-2
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 0.2.11
2+
3+
* Fixed some cases where image elements were not being OCR'd
4+
15
## 0.2.10
26

37
* Removed control characters from tesseract output

scripts/version-sync.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ done
1616
# Version appearing earliest in CHANGELOGFILE will be used as ground truth.
1717
CHANGELOGFILE="CHANGELOG.md"
1818
VERSIONFILE="unstructured_inference/__version__.py"
19-
RE_SEMVER_FULL="(0|[1-9]\d*)\.(0|[1-9]\d*)\.(0|[1-9]\d*)(-((0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)(\.(0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(\+([0-9a-zA-Z-]+(\.[0-9a-zA-Z-]+)*))?"
19+
RE_SEMVER_FULL="(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)(-((0|[1-9][0-9]*|[0-9]*[a-zA-Z-][0-9a-zA-Z-]*)(\.(0|[1-9][0-9]*|[0-9]*[a-zA-Z-][0-9a-zA-Z-]*))*))?(\+([0-9a-zA-Z-]+(\.[0-9a-zA-Z-]+)*))?"
2020
# Pull out semver appearing earliest in CHANGELOGFILE.
2121
LAST_VERSION=$(grep -o -m 1 -E "${RE_SEMVER_FULL}" "$CHANGELOGFILE")
2222

test_unstructured_inference/inference/test_layout.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,9 @@ class MockLayout:
230230
def __init__(self, *elements):
231231
self.elements = elements
232232

233+
def __len__(self):
234+
return len(self.elements)
235+
233236
def sort(self, key, inplace):
234237
return self.elements
235238

@@ -239,6 +242,9 @@ def __iter__(self):
239242
def get_texts(self):
240243
return [el.text for el in self.elements]
241244

245+
def filter_by(self, *args, **kwargs):
246+
return MockLayout()
247+
242248

243249
@pytest.mark.parametrize(
244250
"block_text, layout_texts, expected_text",
@@ -334,3 +340,9 @@ def test_invalid_ocr_strategy_raises(mock_image):
334340
)
335341
def test_remove_control_characters(text, expected):
336342
assert layout.remove_control_characters(text) == expected
343+
344+
345+
def test_interpret_called_when_filter_empty(mock_image):
346+
with patch("unstructured_inference.inference.layout.interpret_text_block"):
347+
layout.aggregate_by_block(MockTextBlock(), mock_image, MockLayout())
348+
layout.interpret_text_block.assert_called_once()
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.2.10" # pragma: no cover
1+
__version__ = "0.2.11" # pragma: no cover

unstructured_inference/inference/layout.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,13 @@ def aggregate_by_block(
298298
"""Extracts the text aggregated from the elements of the given layout that lie within the given
299299
block."""
300300
filtered_blocks = layout.filter_by(text_block, center=True)
301+
# NOTE(alan): For now, if none of the elements discovered by layoutparser are in the block
302+
# we can try interpreting the whole block. This still doesn't handle edge cases, like when there
303+
# are some text elements within the block, but there are image elements overlapping the block
304+
# with text lying within the block. In this case the text in the image would likely be ignored.
305+
if not filtered_blocks:
306+
text = interpret_text_block(text_block, image, ocr_strategy)
307+
return text
301308
for little_block in filtered_blocks:
302309
little_block.text = interpret_text_block(little_block, image, ocr_strategy)
303310
text = " ".join([x for x in filtered_blocks.get_texts() if x])

0 commit comments

Comments
 (0)