Skip to content

Commit 237d69d

Browse files
authored
fix: removing control characters (#66)
Tesseract is putting some control characters in out_text; this commit just deletes all of them * Style correction * Version sync
1 parent d332b65 commit 237d69d

File tree

4 files changed

+20
-2
lines changed

4 files changed

+20
-2
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 0.2.10
2+
3+
* Removed control characters from tesseract output
4+
15
## 0.2.9
26

37
* Removed multithreading from OCR (DocumentLayout.get_elements_from_layout)

test_unstructured_inference/inference/test_layout.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -327,3 +327,10 @@ def test_from_file_fixed_layout(fixed_layouts, called_method, not_called_method)
327327
def test_invalid_ocr_strategy_raises(mock_image):
328328
with pytest.raises(ValueError):
329329
layout.PageLayout(0, mock_image, MockLayout(), ocr_strategy="fake_strategy")
330+
331+
332+
# Case 1 mixes letters with tab, form feed (written both as \x0c and \f), newline,
# carriage return and backspace; only the letters are expected to remain.
# Case 2 contains only quotes and a backslash and is expected to come back unchanged.
@pytest.mark.parametrize(
333+
("text", "expected"), [("a\ts\x0cd\nfas\fd\rf\b", "asdfasdf"), ("\"'\\", "\"'\\")]
334+
)
335+
def test_remove_control_characters(text, expected):
336+
assert layout.remove_control_characters(text) == expected
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.2.9" # pragma: no cover
1+
__version__ = "0.2.10" # pragma: no cover

unstructured_inference/inference/layout.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import tempfile
66
from tqdm import tqdm
77
from typing import List, Optional, Tuple, Union, BinaryIO
8-
8+
import unicodedata
99
from layoutparser.io.pdf import load_pdf
1010
from layoutparser.elements.layout_elements import TextBlock
1111
from layoutparser.elements.layout import Layout
@@ -318,6 +318,7 @@ def interpret_text_block(
318318
out_text = ocr(text_block, image)
319319
else:
320320
out_text = "" if text_block.text is None else text_block.text
321+
out_text = remove_control_characters(out_text)
321322
return out_text
322323

323324

@@ -329,3 +330,9 @@ def ocr(text_block: TextBlock, image: Image.Image) -> str:
329330
padded_block = text_block.pad(left=5, right=5, top=5, bottom=5)
330331
cropped_image = padded_block.crop_image(image_array)
331332
return tesseract.ocr_agent.detect(cropped_image)
333+
334+
335+
def remove_control_characters(text: str) -> str:
336+
"""Removes control characters from text."""
337+
# Keep only characters whose Unicode general category does not start with "C".
# Per the accompanying test this also drops tab, newline, carriage return,
# form feed and backspace, so the surrounding characters are joined with no
# separator inserted in their place.
out_text = "".join(c for c in text if unicodedata.category(c)[0] != "C")
338+
return out_text

0 commit comments

Comments
 (0)