feat: use lxml instead of bs4 to parse hOCR data (#3960)

badGarnet · web-flow · commit 4e424efd2261 · 2025-03-18T00:36:19.000Z
- `lxml` is a much faster library than `bs4` when the input data is regular. - Since the hOCR data is guaranteed to be regular (it is programmatically generated), we don't need `bs4` here to parse the data. - `lxml` improves parsing speed by about 10x. Example runtime profile, measured locally using the same `hocr` data from a 1-page PDF, where `agent.hocr_to_dataframe_bs4` is the current method on main and `agent.hocr_to_dataframe` is this PR's method: ![Screenshot 2025-03-17 at 12 14 59 PM](https://github.com/user-attachments/assets/7c483857-8711-4d72-8954-e83510fef783)
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,9 +1,10 @@
-## 0.17.1-dev0
+## 0.17.1-dev1
 
 ### Enhancements
 
 - **Add image_url of images in html partitioner** `<img>` tags with non-data content include a new image_url metadata field with the content of the src attribute.
-  
+- **Use `lxml` instead of `bs4` to parse hOCR data.** `lxml` is much faster than `bs4` given that the hOCR data format is regular (guaranteed because it is programmatically generated).
+
 ### Features
 
 ### Fixes
diff --git a/test_unstructured/partition/pdf_image/test_ocr.py b/test_unstructured/partition/pdf_image/test_ocr.py
@@ -6,7 +6,7 @@
 import pandas as pd
 import pytest
 import unstructured_pytesseract
-from bs4 import BeautifulSoup, Tag
+from lxml import etree
 from pdf2image.exceptions import PDFPageCountError
 from PIL import Image, UnidentifiedImageError
 from unstructured_inference.inference.elements import EmbeddedTextRegion, TextRegion, TextRegions
@@ -536,23 +536,24 @@ def test_merge_out_layout_with_cid_code(mock_out_layout, mock_ocr_regions):
 
 
 def _create_hocr_word_span(
-    characters: list[tuple[str, str]], word_bbox: tuple[int, int, int, int]
-) -> Tag:
-    word_span = BeautifulSoup(
-        f"<span class='ocrx_word' title='"
-        f"bbox {word_bbox[0]} {word_bbox[1]} {word_bbox[2]} {word_bbox[3]}"
-        f"; x_wconf 64'></span>",
-        "html.parser",
-    ).span
+    characters: list[tuple[str, str]], word_bbox: tuple[int, int, int, int], namespace_map: dict
+) -> etree.Element:
+    word_span = [
+        '<root xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">\n',
+        (
+            f"<span class='ocrx_word' title='"
+            f"bbox {word_bbox[0]} {word_bbox[1]} {word_bbox[2]} {word_bbox[3]}"
+            f"; x_wconf 64'>"
+        ),
+    ]
     for char, x_conf in characters:
-        char_span = BeautifulSoup(
-            f"""
-            <span class='ocrx_cinfo' title='x_bboxes 0 0 0 0; x_conf {x_conf}'>{char}</span>
-            """,  # noqa : E501
-            "html.parser",
-        ).span
-        word_span.append(char_span)
-    return word_span
+        word_span.append(
+            f"<span class='ocrx_cinfo' title='x_bboxes 0 0 0 0; x_conf {x_conf}'>{char}</span>"
+        )
+    word_span.append("</span>")
+    word_span.append("</root>")
+    root = etree.fromstring("\n".join(word_span))
+    return root
 
 
 def test_extract_word_from_hocr():
@@ -565,18 +566,19 @@ def test_extract_word_from_hocr():
         ("@", "45.0"),
     ]
     word_bbox = (10, 9, 70, 22)
-    word_span = _create_hocr_word_span(characters, word_bbox)
+    agent = OCRAgentTesseract()
+    word_span = _create_hocr_word_span(characters, word_bbox, agent.hocr_namespace)
 
-    text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.0)
+    text = agent.extract_word_from_hocr(word_span, 0.0)
     assert text == "word!@"
 
-    text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.960)
+    text = agent.extract_word_from_hocr(word_span, 0.960)
     assert text == "word"
 
-    text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.990)
+    text = agent.extract_word_from_hocr(word_span, 0.990)
     assert text == "w"
 
-    text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.999)
+    text = agent.extract_word_from_hocr(word_span, 0.999)
     assert text == ""
 
 
@@ -590,8 +592,9 @@ def test_hocr_to_dataframe():
         ("@", "45.0"),
     ]
     word_bbox = (10, 9, 70, 22)
-    hocr = str(_create_hocr_word_span(characters, word_bbox))
-    df = OCRAgentTesseract().hocr_to_dataframe(hocr=hocr, character_confidence_threshold=0.960)
+    agent = OCRAgentTesseract()
+    hocr = etree.tostring(_create_hocr_word_span(characters, word_bbox, agent.hocr_namespace))
+    df = agent.hocr_to_dataframe(hocr=hocr, character_confidence_threshold=0.960)
 
     assert df.shape == (1, 5)
     assert df["left"].iloc[0] == 10
@@ -608,7 +611,7 @@ def test_hocr_to_dataframe_when_no_prediction_empty_df():
     assert "left" in df.columns
     assert "top" in df.columns
     assert "width" in df.columns
-    assert "text" in df.columns
+    assert "height" in df.columns
     assert "text" in df.columns
 
 
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.17.1-dev0"  # pragma: no cover
+__version__ = "0.17.1-dev1"  # pragma: no cover
diff --git a/unstructured/partition/utils/ocr_models/tesseract_ocr.py b/unstructured/partition/utils/ocr_models/tesseract_ocr.py
@@ -8,7 +8,7 @@
 import numpy as np
 import pandas as pd
 import unstructured_pytesseract
-from bs4 import BeautifulSoup, Tag
+from lxml import etree
 from PIL import Image as PILImage
 
 from unstructured.logger import trace_logger
@@ -34,6 +34,8 @@
 class OCRAgentTesseract(OCRAgent):
     """OCR service implementation for Tesseract."""
 
+    hocr_namespace = {"h": "http://www.w3.org/1999/xhtml"}
+
     def __init__(self, language: str = "eng"):
         self.language = language
 
@@ -106,17 +108,19 @@ def image_to_data_with_character_confidence_filter(
     def hocr_to_dataframe(
         self, hocr: str, character_confidence_threshold: float = 0.0
     ) -> pd.DataFrame:
-        soup = BeautifulSoup(hocr, "html.parser")
-        word_spans = soup.find_all("span", class_="ocrx_word")
 
         df_entries = []
+
+        if not hocr:
+            return pd.DataFrame(df_entries, columns=["left", "top", "width", "height", "text"])
+
+        root = etree.fromstring(hocr)
+        word_spans = root.findall('.//h:span[@class="ocrx_word"]', self.hocr_namespace)
+
         for word_span in word_spans:
             word_title = word_span.get("title", "")
             bbox_match = re.search(r"bbox (\d+) (\d+) (\d+) (\d+)", word_title)
 
-            # Note: word bbox is used instead of combining characters together due to tesseract
-            # bug that causes the character bboxes to be outside the word bbox, and they have 0
-            # height or width when text is horizontal
             text = self.extract_word_from_hocr(
                 word=word_span, character_confidence_threshold=character_confidence_threshold
             )
@@ -140,11 +144,12 @@ def hocr_to_dataframe(
         ocr_df = ocr_df.drop(columns=["right", "bottom"])
         return ocr_df
 
-    @staticmethod
-    def extract_word_from_hocr(word: Tag, character_confidence_threshold: float = 0.0) -> str:
+    def extract_word_from_hocr(
+        self, word: etree.Element, character_confidence_threshold: float = 0.0
+    ) -> str:
         """Extracts a word from an hOCR word tag, filtering out characters with low confidence."""
 
-        character_spans = word.find_all("span", class_="ocrx_cinfo")
+        character_spans = word.findall('.//h:span[@class="ocrx_cinfo"]', self.hocr_namespace)
         if len(character_spans) == 0:
             return ""
 

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.17.1-dev0" # pragma: no cover`
	`1`	`+__version__ = "0.17.1-dev1" # pragma: no cover`