We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent c1e9b8e · commit 0e44926 · Copy full SHA for 0e44926
unstructured/partition/utils/ocr_models/tesseract_ocr.py
@@ -136,14 +136,12 @@ def hocr_to_dataframe(
136
return ocr_df
137
138
@staticmethod
139
- def extract_word_from_hocr(
140
- word: Tag, character_confidence_threshold: float = 0.0
141
- ) -> str | None:
+ def extract_word_from_hocr(word: Tag, character_confidence_threshold: float = 0.0) -> str:
142
"""Extracts a word from an hOCR word tag, filtering out characters with low confidence."""
143
144
character_spans = word.find_all("span", class_="ocrx_cinfo")
145
if len(character_spans) == 0:
146
- return None
+ return ""
147
148
word_text = ""
149
for character_span in character_spans:
0 commit comments