Skip to content

Commit 1b6aadd

Browse files
authored
feat: ocr when too many chars unrecognized (#8)
Certain PDF documents have text blocks within their layout whose text contains, or consists entirely of, unrecognized characters, represented in the text as (cid:n) where n is an integer. The current logic accepts whatever text is present, applying OCR only when the layout text is None. This PR changes the logic to also apply OCR when more than 50% of the characters are unrecognized.
1 parent 79b147d commit 1b6aadd

File tree

4 files changed

+69
-5
lines changed

4 files changed

+69
-5
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 0.2.2-dev0
2+
3+
* Add logic to use OCR when layout text is full of unknown characters
4+
15
## 0.2.1
26

37
* Refactor to facilitate local inference

test_unstructured_inference/inference/test_layout.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,3 +176,39 @@ def test_process_file_with_model(monkeypatch, mock_page_layout, model_name):
176176
def test_process_file_with_model_raises_on_invalid_model_name():
    """An invalid model name must raise UnknownModelException."""
    expected_error = models.UnknownModelException
    with pytest.raises(expected_error):
        layout.process_file_with_model("", model_name="fake")
179+
180+
181+
class MockPageLayout(layout.PageLayout):
    """PageLayout stand-in whose OCR step returns a preset string."""

    def __init__(self, ocr_text):
        # Only the canned OCR result is stored; the parent initializer is not invoked.
        self.ocr_text = ocr_text

    def ocr(self, text_block):
        """Return the preset OCR text, ignoring the given text block."""
        return self.ocr_text
187+
188+
189+
class MockTextBlock(lp.TextBlock):
    """TextBlock stand-in that carries nothing but its text."""

    def __init__(self, text):
        # The parent initializer is not invoked; only the text attribute is set.
        self.text = text
192+
193+
194+
def test_interpret_text_block_use_ocr_when_text_symbols_cid():
    """A block made up entirely of (cid:n) tokens is replaced by the OCR result."""
    ocr_result = "ocrme"
    cid_only_block = MockTextBlock("(cid:1)(cid:2)(cid:3)(cid:4)(cid:5)")
    page = MockPageLayout(ocr_result)
    assert page.interpret_text_block(cid_only_block) == ocr_result
199+
200+
201+
@pytest.mark.parametrize(
    "text, expected",
    [
        ("base", 0.0),
        ("", 0.0),
        ("(cid:2)", 1.0),
        ("(cid:1)a", 0.5),
        ("c(cid:1)ab", 0.25),
    ],
)
def test_cid_ratio(text, expected):
    """cid_ratio yields the expected fraction for each sample text."""
    ratio = layout.cid_ratio(text)
    assert ratio == expected
207+
208+
209+
@pytest.mark.parametrize(
    "text, expected",
    [
        ("base", False),
        ("(cid:2)", True),
        ("(cid:1234567890)", True),
        ("jkl;(cid:12)asdf", True),
    ],
)
def test_is_cid_present(text, expected):
    """is_cid_present reports the expected flag for each sample text."""
    found = layout.is_cid_present(text)
    assert found == expected
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.2.1" # pragma: no cover
1+
__version__ = "0.2.2-dev0" # pragma: no cover

unstructured_inference/inference/layout.py

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from __future__ import annotations
22
from dataclasses import dataclass
3+
import re
34
import tempfile
45
from typing import List, Optional, Tuple, Union, BinaryIO
56

@@ -109,10 +110,7 @@ def get_elements(self, inplace=True) -> Optional[List[LayoutElement]]:
109110
text_blocks = self.layout.filter_by(item, center=True)
110111
text = str()
111112
for text_block in text_blocks:
112-
# NOTE(robinson) - If the text attribute is None, that means the PDF isn't
113-
# already OCR'd and we have to send the snippet out for OCRing.
114-
if text_block.text is None:
115-
text_block.text = self.ocr(text_block)
113+
text_block.text = self.interpret_text_block(text_block)
116114
text = " ".join([x for x in text_blocks.get_texts() if x])
117115

118116
elements.append(
@@ -124,6 +122,16 @@ def get_elements(self, inplace=True) -> Optional[List[LayoutElement]]:
124122
return None
125123
return elements
126124

125+
def interpret_text_block(self, text_block: lp.TextBlock) -> str:
    """Returns the text to use for a TextBlock, falling back to OCR when needed.

    OCR is used when the block has no text, or when cid_ratio reports that more
    than half of its text is unrecognized "(cid:n)" placeholders. Otherwise the
    block's existing text is returned unchanged.
    """
    embedded = text_block.text
    # NOTE(robinson) - If the text attribute is None, that means the PDF isn't
    # already OCR'd and we have to send the snippet out for OCRing.
    if embedded is None:
        return self.ocr(text_block)
    # Text dominated by unrecognized characters is not usable either, so OCR it.
    if cid_ratio(embedded) > 0.5:
        return self.ocr(text_block)
    return embedded
134+
127135
def ocr(self, text_block: lp.TextBlock) -> str:
128136
"""Runs a cropped text block image through and OCR agent."""
129137
logger.debug("Running OCR on text block ...")
@@ -156,3 +164,19 @@ def process_file_with_model(filename: str, model_name: str) -> DocumentLayout:
156164
model = None if model_name is None else get_model(model_name)
157165
layout = DocumentLayout.from_file(filename, model=model)
158166
return layout
167+
168+
169+
def cid_ratio(text: str) -> float:
    """Returns the fraction of text made up of unrecognized '(cid:n)' placeholders.

    Each placeholder counts as a single character: the denominator is the number
    of placeholders plus the number of characters left once they are removed.
    Returns 0.0 when is_cid_present finds no placeholder marker in the text.
    """
    if is_cid_present(text):
        # subn strips every placeholder and reports how many were removed.
        remainder, cid_count = re.subn(r"\(cid\:(\d+)\)", "", text)
        return cid_count / (cid_count + len(remainder))
    return 0.0
177+
178+
179+
def is_cid_present(text: str) -> bool:
    """Checks whether text could contain a '(cid:n)' placeholder.

    Returns True only when the text is at least as long as the shortest possible
    placeholder, "(cid:x)", and contains the substring "(cid:".
    """
    shortest_token = "(cid:x)"
    return len(text) >= len(shortest_token) and "(cid:" in text

0 commit comments

Comments
 (0)