Skip to content

Commit 4e424ef

Browse files
authored
feat: use lxml instead of bs4 to parse hOCR data (#3960)
- `lxml` is a much faster library than `bs4` when the input data is regular. - Since the hOCR data is guaranteed to be regular (programmatically generated), we don't need `bs4` here to parse the data. - `lxml` improves parsing speed by about 10x. Example runtime profiling, run locally using the same `hocr` data from a 1-page PDF, where `agent.hocr_to_dataframe_bs4` is the current method on main and `agent.hocr_to_dataframe` is the PR's method: ![Screenshot 2025-03-17 at 12 14 59 PM](https://github.com/user-attachments/assets/7c483857-8711-4d72-8954-e83510fef783)
1 parent 66bf4b0 commit 4e424ef

File tree

4 files changed

+46
-37
lines changed

4 files changed

+46
-37
lines changed

Diff for: CHANGELOG.md

+3-2
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1-
## 0.17.1-dev0
1+
## 0.17.1-dev1
22

33
### Enhancements
44

55
- **Add image_url of images in html partitioner** `<img>` tags with non-data content include a new image_url metadata field with the content of the src attribute.
6-
6+
- **Use `lxml` instead of `bs4` to parse hOCR data.** `lxml` is much faster than `bs4` given the hOCR data format is regular (guaranteed because it is programmatically generated).
7+
78
### Features
89

910
### Fixes

Diff for: test_unstructured/partition/pdf_image/test_ocr.py

+28-25
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import pandas as pd
77
import pytest
88
import unstructured_pytesseract
9-
from bs4 import BeautifulSoup, Tag
9+
from lxml import etree
1010
from pdf2image.exceptions import PDFPageCountError
1111
from PIL import Image, UnidentifiedImageError
1212
from unstructured_inference.inference.elements import EmbeddedTextRegion, TextRegion, TextRegions
@@ -536,23 +536,24 @@ def test_merge_out_layout_with_cid_code(mock_out_layout, mock_ocr_regions):
536536

537537

538538
def _create_hocr_word_span(
539-
characters: list[tuple[str, str]], word_bbox: tuple[int, int, int, int]
540-
) -> Tag:
541-
word_span = BeautifulSoup(
542-
f"<span class='ocrx_word' title='"
543-
f"bbox {word_bbox[0]} {word_bbox[1]} {word_bbox[2]} {word_bbox[3]}"
544-
f"; x_wconf 64'></span>",
545-
"html.parser",
546-
).span
539+
characters: list[tuple[str, str]], word_bbox: tuple[int, int, int, int], namespace_map: dict
540+
) -> etree.Element:
541+
word_span = [
542+
'<root xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">\n',
543+
(
544+
f"<span class='ocrx_word' title='"
545+
f"bbox {word_bbox[0]} {word_bbox[1]} {word_bbox[2]} {word_bbox[3]}"
546+
f"; x_wconf 64'>"
547+
),
548+
]
547549
for char, x_conf in characters:
548-
char_span = BeautifulSoup(
549-
f"""
550-
<span class='ocrx_cinfo' title='x_bboxes 0 0 0 0; x_conf {x_conf}'>{char}</span>
551-
""", # noqa : E501
552-
"html.parser",
553-
).span
554-
word_span.append(char_span)
555-
return word_span
550+
word_span.append(
551+
f"<span class='ocrx_cinfo' title='x_bboxes 0 0 0 0; x_conf {x_conf}'>{char}</span>"
552+
)
553+
word_span.append("</span>")
554+
word_span.append("</root>")
555+
root = etree.fromstring("\n".join(word_span))
556+
return root
556557

557558

558559
def test_extract_word_from_hocr():
@@ -565,18 +566,19 @@ def test_extract_word_from_hocr():
565566
("@", "45.0"),
566567
]
567568
word_bbox = (10, 9, 70, 22)
568-
word_span = _create_hocr_word_span(characters, word_bbox)
569+
agent = OCRAgentTesseract()
570+
word_span = _create_hocr_word_span(characters, word_bbox, agent.hocr_namespace)
569571

570-
text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.0)
572+
text = agent.extract_word_from_hocr(word_span, 0.0)
571573
assert text == "word!@"
572574

573-
text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.960)
575+
text = agent.extract_word_from_hocr(word_span, 0.960)
574576
assert text == "word"
575577

576-
text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.990)
578+
text = agent.extract_word_from_hocr(word_span, 0.990)
577579
assert text == "w"
578580

579-
text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.999)
581+
text = agent.extract_word_from_hocr(word_span, 0.999)
580582
assert text == ""
581583

582584

@@ -590,8 +592,9 @@ def test_hocr_to_dataframe():
590592
("@", "45.0"),
591593
]
592594
word_bbox = (10, 9, 70, 22)
593-
hocr = str(_create_hocr_word_span(characters, word_bbox))
594-
df = OCRAgentTesseract().hocr_to_dataframe(hocr=hocr, character_confidence_threshold=0.960)
595+
agent = OCRAgentTesseract()
596+
hocr = etree.tostring(_create_hocr_word_span(characters, word_bbox, agent.hocr_namespace))
597+
df = agent.hocr_to_dataframe(hocr=hocr, character_confidence_threshold=0.960)
595598

596599
assert df.shape == (1, 5)
597600
assert df["left"].iloc[0] == 10
@@ -608,7 +611,7 @@ def test_hocr_to_dataframe_when_no_prediction_empty_df():
608611
assert "left" in df.columns
609612
assert "top" in df.columns
610613
assert "width" in df.columns
611-
assert "text" in df.columns
614+
assert "height" in df.columns
612615
assert "text" in df.columns
613616

614617

Diff for: unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.17.1-dev0" # pragma: no cover
1+
__version__ = "0.17.1-dev1" # pragma: no cover

Diff for: unstructured/partition/utils/ocr_models/tesseract_ocr.py

+14-9
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
import numpy as np
99
import pandas as pd
1010
import unstructured_pytesseract
11-
from bs4 import BeautifulSoup, Tag
11+
from lxml import etree
1212
from PIL import Image as PILImage
1313

1414
from unstructured.logger import trace_logger
@@ -34,6 +34,8 @@
3434
class OCRAgentTesseract(OCRAgent):
3535
"""OCR service implementation for Tesseract."""
3636

37+
hocr_namespace = {"h": "http://www.w3.org/1999/xhtml"}
38+
3739
def __init__(self, language: str = "eng"):
3840
self.language = language
3941

@@ -106,17 +108,19 @@ def image_to_data_with_character_confidence_filter(
106108
def hocr_to_dataframe(
107109
self, hocr: str, character_confidence_threshold: float = 0.0
108110
) -> pd.DataFrame:
109-
soup = BeautifulSoup(hocr, "html.parser")
110-
word_spans = soup.find_all("span", class_="ocrx_word")
111111

112112
df_entries = []
113+
114+
if not hocr:
115+
return pd.DataFrame(df_entries, columns=["left", "top", "width", "height", "text"])
116+
117+
root = etree.fromstring(hocr)
118+
word_spans = root.findall('.//h:span[@class="ocrx_word"]', self.hocr_namespace)
119+
113120
for word_span in word_spans:
114121
word_title = word_span.get("title", "")
115122
bbox_match = re.search(r"bbox (\d+) (\d+) (\d+) (\d+)", word_title)
116123

117-
# Note: word bbox is used instead of combining characters together due to tesseract
118-
# bug that causes the character bboxes to be outside the word bbox, and they have 0
119-
# height or width when text is horizontal
120124
text = self.extract_word_from_hocr(
121125
word=word_span, character_confidence_threshold=character_confidence_threshold
122126
)
@@ -140,11 +144,12 @@ def hocr_to_dataframe(
140144
ocr_df = ocr_df.drop(columns=["right", "bottom"])
141145
return ocr_df
142146

143-
@staticmethod
144-
def extract_word_from_hocr(word: Tag, character_confidence_threshold: float = 0.0) -> str:
147+
def extract_word_from_hocr(
148+
self, word: etree.Element, character_confidence_threshold: float = 0.0
149+
) -> str:
145150
"""Extracts a word from an hOCR word tag, filtering out characters with low confidence."""
146151

147-
character_spans = word.find_all("span", class_="ocrx_cinfo")
152+
character_spans = word.findall('.//h:span[@class="ocrx_cinfo"]', self.hocr_namespace)
148153
if len(character_spans) == 0:
149154
return ""
150155

0 commit comments

Comments
 (0)