feat: consider rotated text as low fidelity (#4190)

badGarnet · ryannikolaidis · web-flow · commit d64c57d33bd8 · 2026-01-15T09:44:08.000-06:00
This PR updates the function `is_text_embedded`:
- now considers whether chars are invisible or rotated (as a result, this
includes some refactoring of variable names)
- rotated text elements can have the wrong character order compared to
natural reading order -> if fed into downstream applications like
text embedding, the element loses its semantic meaning
- as a result this update flags texts with too many rotated characters
as only partially embedded: its source is technically embedded but it
may need post processing to be useful

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: badGarnet <badGarnet@users.noreply.github.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,7 @@
-## 0.18.30-dev1
+## 0.18.30-dev2
 
 ### Enhancement
+- `is_text_embedded` now considers rotated text as low fidelity, and elements with a non-trivial amount of it are considered not embedded
 - Replace `pdf2image` with PyPDFium2 for PDF rendering
 
 ### Fixes
diff --git a/test_unstructured/partition/pdf_image/test_pdfminer_processing.py b/test_unstructured/partition/pdf_image/test_pdfminer_processing.py
@@ -284,7 +284,9 @@ def test_process_file_with_pdfminer():
 
 def test_process_file_with_pdfminer_is_extracted_array():
     layout, _ = process_file_with_pdfminer(example_doc_path("pdf/layout-parser-paper-fast.pdf"))
-    assert all(is_extracted is IsExtracted.TRUE for is_extracted in layout[0].is_extracted_array)
+    # first page contains rotated text that are considered low fidelity, i.e., is_extracted=partial
+    assert layout[0].is_extracted_array[0] is None
+    assert all(is_extracted is IsExtracted.TRUE for is_extracted in layout[1].is_extracted_array)
 
 
 def test_process_file_hidden_ocr_text():
@@ -310,13 +312,14 @@ def test_laprams_are_passed_from_partition_to_pdfminer(pdfminer_mock):
     }
 
 
-def create_mock_ltchar(text, invisible=False):
+def create_mock_ltchar(text, invisible=False, rotated=False):
     """Create a mock LTChar object"""
 
     graphicstate = Mock()
+    matrix = (1, 0.5, 0, 1, 0, 0) if rotated else (1, 0, 0, 1, 0, 0)
 
     char = LTChar(
-        matrix=(1, 0, 0, 1, 0, 0),  # transformation matrix
+        matrix=matrix,  # transformation matrix
         font=Mock(),  # you'd need to mock PDFFont
         fontsize=12,
         scaling=1,
@@ -351,11 +354,11 @@ def test_text_is_embedded():
         create_mock_ltchar("H"),
         create_mock_ltchar("e"),
         create_mock_ltchar("l"),
-        create_mock_ltchar("l"),
+        create_mock_ltchar("l", rotated=True),
         create_mock_ltchar("o", invisible=True),
     ]
 
     container = create_mock_ltcontainer(chars)
 
     assert text_is_embedded(container, threshold=0.5)
-    assert not text_is_embedded(container, threshold=0.1)
+    assert not text_is_embedded(container, threshold=0.3)
diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json
@@ -4,7 +4,6 @@
     "element_id": "04fa31034847cbbf6c840f4da683ccf8",
     "text": "1",
     "metadata": {
-      "is_extracted": "true",
       "filetype": "application/pdf",
       "languages": [
         "eng"
@@ -49,7 +48,6 @@
     "element_id": "fc05a198b2ff732119edea8986775994",
     "text": "2",
     "metadata": {
-      "is_extracted": "true",
       "filetype": "application/pdf",
       "languages": [
         "eng"
@@ -72,7 +70,6 @@
     "element_id": "4a90480c2297c31b4d7ad43b0801ae98",
     "text": "0",
     "metadata": {
-      "is_extracted": "true",
       "filetype": "application/pdf",
       "languages": [
         "eng"
@@ -95,7 +92,7 @@
     "element_id": "e3a383b7e9439f39773c13ea769297b7",
     "text": "2 n u J 1 2 ] V C . s c [ 2 v 8 4 3 5 1 . 3 0 1 2 :",
     "metadata": {
-      "is_extracted": "true",
+      "is_extracted": "partial",
       "filetype": "application/pdf",
       "languages": [
         "eng"
@@ -118,7 +115,6 @@
     "element_id": "4608f9aa33a0cab158565817b0d15743",
     "text": "v",
     "metadata": {
-      "is_extracted": "true",
       "filetype": "application/pdf",
       "languages": [
         "eng"
@@ -163,7 +159,6 @@
     "element_id": "ed4e590932b333f40d0e1367b6b0e32e",
     "text": "i",
     "metadata": {
-      "is_extracted": "true",
       "filetype": "application/pdf",
       "languages": [
         "eng"
@@ -186,7 +181,6 @@
     "element_id": "8cb024fb60457b7c572b167801037f75",
     "text": "X",
     "metadata": {
-      "is_extracted": "true",
       "filetype": "application/pdf",
       "languages": [
         "eng"
@@ -209,7 +203,6 @@
     "element_id": "c202bdacd2daf4c52fa3a6ddd64a0728",
     "text": "r",
     "metadata": {
-      "is_extracted": "true",
       "filetype": "application/pdf",
       "languages": [
         "eng"
@@ -232,7 +225,6 @@
     "element_id": "3db474893ec321c81ef9d1a2afd5f660",
     "text": "a",
     "metadata": {
-      "is_extracted": "true",
       "filetype": "application/pdf",
       "languages": [
         "eng"
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.18.30-dev1"  # pragma: no cover
+__version__ = "0.18.30-dev2"  # pragma: no cover
diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import math
 import os
 from typing import TYPE_CHECKING, Any, BinaryIO, Iterable, List, Optional, Union, cast
 
@@ -380,22 +381,44 @@ def array_merge_inferred_layout_with_extracted_layout(
     return final_layout
 
 
-def text_is_embedded(obj, threshold=env_config.PDF_MAX_EMBED_INVISIBLE_TEXT_RATIO):
-    """Check if text object contains visible embedded text vs invisible OCR text."""
-    invisible_chars = 0
+def _ltchar_is_rotated(char: LTChar) -> bool:
+    # Calculate rotation angle in degrees
+    # For standard text: a=1, b=0, c=0, d=1 (no rotation)
+    rotation_radians = math.atan2(char.matrix[1], char.matrix[0])
+    # 0.001 is the tolerance for nearly flat angles; mainly for handling numerical precision
+    return abs(rotation_radians) > 0.001
+
+
+def text_is_embedded(obj, threshold=env_config.PDF_MAX_EMBED_LOW_FIDELITY_TEXT_RATIO):
+    """Check if text object contains too many low_fidelity text: invisible or rotated
+
+    Low fidelity text means that even though the text is extracted from pdf data but its
+    representation in the partitioned elements may require post processing to make senmatic sense.
+    This includes:
+      - invisible text: text not rendered on the pdf are not present visually when reading the page
+        so those texts may not be high quality information for understanding the page
+      - rotated text: text rotated usually are extracted in the order they appear in the dominant
+        reading order of the page (e.g., left->right, top->down). But if a text is rotated so the
+        last character is at the top (y position) and first character is at the bottom the extracted
+        element would contain words written in reverse order. This makes the extraction low quality.
+    """
+    low_fidelity_chars = 0
     total_chars = 0
 
     def extract_chars(layout_obj):
         """Recursively extract all LTChar objects from layout."""
-        nonlocal invisible_chars, total_chars
+        nonlocal low_fidelity_chars, total_chars
 
         if isinstance(layout_obj, LTChar):
             total_chars += 1
 
-            # Check if text is invisible:
-            #   - rendering mode 3 (requires custom pdf interpreter comes with this library)
-            if hasattr(layout_obj, "rendermode") and layout_obj.rendermode == 3:
-                invisible_chars += 1
+            # Check if text is low_fidelity:
+            #  - rendering mode 3 (requires custom pdf interpreter comes with this library)
+            #  - text is rotated
+            if (
+                hasattr(layout_obj, "rendermode") and layout_obj.rendermode == 3
+            ) or _ltchar_is_rotated(layout_obj):
+                low_fidelity_chars += 1
         elif isinstance(layout_obj, LTContainer):
             # Recursively process container's children
             for child in layout_obj:
@@ -406,8 +429,8 @@ def extract_chars(layout_obj):
         # when there are no-trivial amount of hidden characters in the object it means there are
         # text that is not rendered -> most likely OCR'ed text for the image content overlying the
         # text and not embedded text that also shows in the rendered pdf
-        invisible_ratio = invisible_chars / total_chars
-        return invisible_ratio < threshold
+        low_fidelity_ratio = low_fidelity_chars / total_chars
+        return low_fidelity_ratio < threshold
     return True
 
 
diff --git a/unstructured/partition/utils/config.py b/unstructured/partition/utils/config.py
@@ -174,9 +174,9 @@ def PDF_ANNOTATION_THRESHOLD(self) -> float:
         return self._get_float("PDF_ANNOTATION_THRESHOLD", 0.9)
 
     @property
-    def PDF_MAX_EMBED_INVISIBLE_TEXT_RATIO(self) -> float:
-        """maximum ratio of invisible text for a text to be considered embedded text"""
-        return self._get_float("PDF_MAX_EMBED_INVISIBLE_TEXT_RATIO", 0.1)
+    def PDF_MAX_EMBED_LOW_FIDELITY_TEXT_RATIO(self) -> float:
+        """maximum ratio of low fidelity charcaters for a text to be considered embedded text"""
+        return self._get_float("PDF_MAX_EMBED_LOW_FIDELITY_TEXT_RATIO", 0.1)
 
     @property
     def GLOBAL_WORKING_DIR_ENABLED(self) -> bool:

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.18.30-dev1" # pragma: no cover`
	`1`	`+__version__ = "0.18.30-dev2" # pragma: no cover`