Skip to content

Commit 8685905

Browse files
authored
Character confidence threshold (#3860)
This change adds the ability to filter out characters predicted by Tesseract with low confidence scores. Some notes: - I intentionally disabled it by default; I think some low threshold (like 0.9-0.95 for Tesseract) could be a safe choice, though. - I wanted to use character bboxes and combine them into a word bbox later. However, a bug in Tesseract returns incorrect character bboxes in some specific scenarios (unit tests caught it 🥳). More details are in a comment in the code.
1 parent 8378c26 commit 8685905

File tree

6 files changed

+177
-15
lines changed

6 files changed

+177
-15
lines changed

Diff for: CHANGELOG.md

+3-2
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
1-
## 0.16.13-dev0
1+
## 0.16.13-dev1
22

33
### Enhancements
4+
- **Add character-level filtering for Tesseract output**. It is controllable via the `TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD` environment variable.
45

56
### Features
67

78
### Fixes
89

910
- **Fix NLTK Download** to use nltk assets in docker image
1011
- removed the ability to automatically download nltk package if missing
11-
12+
1213
## 0.16.12
1314

1415
### Enhancements

Diff for: test_unstructured/partition/pdf_image/test_ocr.py

+82-4
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import pandas as pd
77
import pytest
88
import unstructured_pytesseract
9+
from bs4 import BeautifulSoup, Tag
910
from pdf2image.exceptions import PDFPageCountError
1011
from PIL import Image, UnidentifiedImageError
1112
from unstructured_inference.inference.elements import EmbeddedTextRegion, TextRegion
@@ -71,8 +72,8 @@ def test_supplement_page_layout_with_ocr_invalid_ocr(monkeypatch):
7172

7273
def test_get_ocr_layout_from_image_tesseract(monkeypatch):
7374
monkeypatch.setattr(
74-
unstructured_pytesseract,
75-
"image_to_data",
75+
OCRAgentTesseract,
76+
"image_to_data_with_character_confidence_filter",
7677
lambda *args, **kwargs: pd.DataFrame(
7778
{
7879
"left": [10, 20, 30, 0],
@@ -445,8 +446,8 @@ def test_auto_zoom_not_exceed_tesseract_limit(monkeypatch):
445446
monkeypatch.setenv("TESSERACT_MIN_TEXT_HEIGHT", "1000")
446447
monkeypatch.setenv("TESSERACT_OPTIMUM_TEXT_HEIGHT", "100000")
447448
monkeypatch.setattr(
448-
unstructured_pytesseract,
449-
"image_to_data",
449+
OCRAgentTesseract,
450+
"image_to_data_with_character_confidence_filter",
450451
lambda *args, **kwargs: pd.DataFrame(
451452
{
452453
"left": [10, 20, 30, 0],
@@ -484,3 +485,80 @@ def test_merge_out_layout_with_cid_code(mock_out_layout, mock_ocr_regions):
484485
# Check if the final layout contains both original elements and OCR-derived elements
485486
assert all(element in final_layout for element in mock_out_layout)
486487
assert any(element in final_layout for element in ocr_elements)
488+
489+
490+
def _create_hocr_word_span(
    characters: list[tuple[str, str]], word_bbox: tuple[int, int, int, int]
) -> Tag:
    """Build a synthetic hOCR word span for tests.

    The returned `ocrx_word` span carries `word_bbox` in its title and contains one
    `ocrx_cinfo` child span per `(character, x_conf)` pair in `characters`. Character
    bounding boxes are always zeroed; only the character confidence varies.
    """
    word_markup = (
        f"<span class='ocrx_word' title='"
        f"bbox {word_bbox[0]} {word_bbox[1]} {word_bbox[2]} {word_bbox[3]}"
        f"; x_wconf 64'></span>"
    )
    word_span = BeautifulSoup(word_markup, "html.parser").span

    for char, x_conf in characters:
        char_markup = f"""
            <span class='ocrx_cinfo' title='x_bboxes 0 0 0 0; x_conf {x_conf}'>{char}</span>
        """  # noqa : E501
        # `.span` picks the element out of the surrounding whitespace text nodes.
        word_span.append(BeautifulSoup(char_markup, "html.parser").span)

    return word_span
508+
509+
510+
def test_extract_word_from_hocr():
    """Characters whose confidence is below the threshold are dropped from the word."""
    word_span = _create_hocr_word_span(
        characters=[
            ("w", "99.0"),
            ("o", "98.5"),
            ("r", "97.5"),
            ("d", "96.0"),
            ("!", "50.0"),
            ("@", "45.0"),
        ],
        word_bbox=(10, 9, 70, 22),
    )

    # Thresholds are on a 0-1 scale; hOCR confidences above are percentages.
    expected_text_by_threshold = {
        0.0: "word!@",
        0.960: "word",
        0.990: "w",
        0.999: "",
    }

    for threshold, expected_text in expected_text_by_threshold.items():
        text = OCRAgentTesseract.extract_word_from_hocr(word_span, threshold)
        assert text == expected_text
533+
534+
535+
def test_hocr_to_dataframe():
    """A single hOCR word becomes one row with the word bbox and the filtered text."""
    word_span = _create_hocr_word_span(
        characters=[
            ("w", "99.0"),
            ("o", "98.5"),
            ("r", "97.5"),
            ("d", "96.0"),
            ("!", "50.0"),
            ("@", "45.0"),
        ],
        word_bbox=(10, 9, 70, 22),
    )

    df = OCRAgentTesseract().hocr_to_dataframe(
        hocr=str(word_span),
        character_confidence_threshold=0.960,
    )

    assert df.shape == (1, 5)
    first_row = df.iloc[0]
    assert first_row["left"] == 10
    assert first_row["top"] == 9
    # width/height are derived from the word bbox: 70 - 10 and 22 - 9.
    assert first_row["width"] == 60
    assert first_row["height"] == 13
    assert first_row["text"] == "word"
555+
556+
def test_hocr_to_dataframe_when_no_prediction_empty_df():
    """Empty hOCR input yields an empty dataframe that still has all five output columns."""
    df = OCRAgentTesseract().hocr_to_dataframe(hocr="")

    assert df.shape == (0, 5)
    assert "left" in df.columns
    assert "top" in df.columns
    assert "width" in df.columns
    # The original asserted "text" twice and never checked "height".
    assert "height" in df.columns
    assert "text" in df.columns

Diff for: test_unstructured/partition/pdf_image/test_pdf.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -995,11 +995,11 @@ def test_partition_hi_res_model_name_default_to_None():
995995
[
996996
(
997997
PartitionStrategy.HI_RES,
998-
"unstructured_pytesseract.image_to_data",
998+
"unstructured_pytesseract.image_to_pdf_or_hocr",
999999
),
10001000
(
10011001
PartitionStrategy.OCR_ONLY,
1002-
"unstructured_pytesseract.image_to_data",
1002+
"unstructured_pytesseract.image_to_pdf_or_hocr",
10031003
),
10041004
(
10051005
PartitionStrategy.OCR_ONLY,

Diff for: unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.16.13-dev0" # pragma: no cover
1+
__version__ = "0.16.13-dev1" # pragma: no cover

Diff for: unstructured/partition/utils/config.py

+5
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,11 @@ def TESSERACT_OPTIMUM_TEXT_HEIGHT(self) -> int:
9696
"""optimum text height for tesseract OCR"""
9797
return self._get_int("TESSERACT_OPTIMUM_TEXT_HEIGHT", 20)
9898

99+
@property
def TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD(self) -> float:
    """Tesseract predictions with confidence below this threshold are ignored.

    Read as a float from the `TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD` environment
    variable; defaults to 0.0.
    """
    return self._get_float("TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD", 0.0)
103+
99104
@property
100105
def GOOGLEVISION_API_ENDPOINT(self) -> str:
101106
"""API endpoint to use for Google Vision"""

Diff for: unstructured/partition/utils/ocr_models/tesseract_ocr.py

+84-6
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
11
from __future__ import annotations
22

33
import os
4+
import re
45
from typing import TYPE_CHECKING, List
56

67
import cv2
78
import numpy as np
89
import pandas as pd
910
import unstructured_pytesseract
11+
from bs4 import BeautifulSoup, Tag
1012
from PIL import Image as PILImage
11-
from unstructured_pytesseract import Output
1213

1314
from unstructured.logger import trace_logger
1415
from unstructured.partition.utils.config import env_config
@@ -47,10 +48,10 @@ def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]:
4748

4849
trace_logger.detail("Processing entire page OCR with tesseract...")
4950
zoom = 1
50-
ocr_df: pd.DataFrame = unstructured_pytesseract.image_to_data(
51+
ocr_df: pd.DataFrame = self.image_to_data_with_character_confidence_filter(
5152
np.array(image),
5253
lang=self.language,
53-
output_type=Output.DATAFRAME,
54+
character_confidence_threshold=env_config.TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD,
5455
)
5556
ocr_df = ocr_df.dropna()
5657

@@ -76,17 +77,94 @@ def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]:
7677
np.round(env_config.TESSERACT_OPTIMUM_TEXT_HEIGHT / text_height, 1),
7778
max_zoom,
7879
)
79-
ocr_df = unstructured_pytesseract.image_to_data(
80+
ocr_df = self.image_to_data_with_character_confidence_filter(
8081
np.array(zoom_image(image, zoom)),
8182
lang=self.language,
82-
output_type=Output.DATAFRAME,
83+
character_confidence_threshold=env_config.TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD,
8384
)
8485
ocr_df = ocr_df.dropna()
85-
8686
ocr_regions = self.parse_data(ocr_df, zoom=zoom)
8787

8888
return ocr_regions
8989

90+
def image_to_data_with_character_confidence_filter(
    self,
    image: np.ndarray,
    lang: str = "eng",
    config: str = "",
    character_confidence_threshold: float = 0.0,
) -> pd.DataFrame:
    """Run Tesseract on `image` and return word-level OCR results as a dataframe.

    The image is rendered to hOCR and then converted by `hocr_to_dataframe`, which
    applies `character_confidence_threshold` to the individual characters of each word.
    Any caller-supplied `config` is appended after the character-box option.
    """
    # `hocr_char_boxes=1` is always passed first; `hocr_to_dataframe` reads the
    # per-character `ocrx_cinfo` spans from the resulting hOCR.
    tesseract_config = "-c hocr_char_boxes=1 " + config
    hocr: str = unstructured_pytesseract.image_to_pdf_or_hocr(
        image,
        lang=lang,
        config=tesseract_config,
        extension="hocr",
    )
    return self.hocr_to_dataframe(hocr, character_confidence_threshold)
105+
106+
def hocr_to_dataframe(
    self, hocr: str, character_confidence_threshold: float = 0.0
) -> pd.DataFrame:
    """Convert hOCR markup into a dataframe with one row per recognized word.

    Each `ocrx_word` span contributes a row with `left`, `top`, `text`, `width` and
    `height`. Words with no bbox in their title, or with no text left after
    character-confidence filtering, are skipped. Empty input gives an empty dataframe
    with the same columns.
    """
    parsed_hocr = BeautifulSoup(hocr, "html.parser")

    rows = []
    for word_span in parsed_hocr.find_all("span", class_="ocrx_word"):
        bbox_match = re.search(r"bbox (\d+) (\d+) (\d+) (\d+)", word_span.get("title", ""))

        # Note: word bbox is used instead of combining characters together due to tesseract
        # bug that causes the character bboxes to be outside the word bbox, and they have 0
        # height or width when text is horizontal
        text = self.extract_word_from_hocr(
            word=word_span, character_confidence_threshold=character_confidence_threshold
        )
        if not (text and bbox_match):
            continue

        left, top, right, bottom = (int(coordinate) for coordinate in bbox_match.groups())
        rows.append({"left": left, "top": top, "right": right, "bottom": bottom, "text": text})

    ocr_df = pd.DataFrame(rows, columns=["left", "top", "right", "bottom", "text"])

    # Expose width/height in place of the right/bottom edges.
    ocr_df["width"] = ocr_df["right"] - ocr_df["left"]
    ocr_df["height"] = ocr_df["bottom"] - ocr_df["top"]
    return ocr_df.drop(columns=["right", "bottom"])
142+
143+
@staticmethod
def extract_word_from_hocr(word: Tag, character_confidence_threshold: float = 0.0) -> str:
    """Extracts a word from an hOCR word tag, filtering out characters with low confidence.

    Args:
        word: hOCR word tag whose `ocrx_cinfo` child spans hold the individual characters.
        character_confidence_threshold: minimum confidence, on a 0-1 scale, a character
            needs in order to be kept. The `x_conf` value in the span title is divided by
            100 before the comparison.

    Returns:
        The kept characters joined in document order. Returns an empty string when the
        word has no character spans or none of them passes the threshold.
    """
    kept_characters = []
    for character_span in word.find_all("span", class_="ocrx_cinfo"):
        char = character_span.text

        # Accept confidences written with or without a fractional part (e.g. "99" as
        # well as "99.0"), so integer-formatted values do not silently drop the character.
        conf_match = re.search(r"x_conf (\d+(?:\.\d+)?)", character_span.get("title", ""))

        # Spans with no text or no parsable confidence are skipped.
        if not (char and conf_match):
            continue

        character_probability = float(conf_match.group(1)) / 100
        if character_probability >= character_confidence_threshold:
            kept_characters.append(char)

    return "".join(kept_characters)
167+
90168
@requires_dependencies("unstructured_inference")
91169
def get_layout_elements_from_image(self, image: PILImage.Image) -> List["LayoutElement"]:
92170
from unstructured.partition.pdf_image.inference_utils import (

0 commit comments

Comments
 (0)