Skip to content

Commit d64c57d

Browse files
feat: consider rotated text as low fidelity (#4190)
This PR updates the function `text_is_embedded`: - it now considers whether chars are invisible or rotated (and, as a result, includes some refactoring of variable names) - rotated text elements can have the wrong character order compared to natural reading order -> if fed into downstream applications such as text embedding, the element loses its semantic meaning - as a result, this update flags text with too many rotated characters as only partially embedded: its source is technically embedded, but it may need post-processing to be useful --------- Co-authored-by: ryannikolaidis <[email protected]> Co-authored-by: badGarnet <[email protected]>
1 parent 138661a commit d64c57d

File tree

6 files changed

+48
-29
lines changed

6 files changed

+48
-29
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1-
## 0.18.30-dev1
1+
## 0.18.30-dev2
22

33
### Enhancement
4+
- `text_is_embedded` now considers rotated text as low fidelity, and elements with a non-trivial amount of it are considered not embedded
45
- Replace `pdf2image` with PyPDFium2 for PDF rendering
56

67
### Fixes

test_unstructured/partition/pdf_image/test_pdfminer_processing.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -284,7 +284,9 @@ def test_process_file_with_pdfminer():
284284

285285
def test_process_file_with_pdfminer_is_extracted_array():
286286
layout, _ = process_file_with_pdfminer(example_doc_path("pdf/layout-parser-paper-fast.pdf"))
287-
assert all(is_extracted is IsExtracted.TRUE for is_extracted in layout[0].is_extracted_array)
287+
# first page contains rotated text that are considered low fidelity, i.e., is_extracted=partial
288+
assert layout[0].is_extracted_array[0] is None
289+
assert all(is_extracted is IsExtracted.TRUE for is_extracted in layout[1].is_extracted_array)
288290

289291

290292
def test_process_file_hidden_ocr_text():
@@ -310,13 +312,14 @@ def test_laprams_are_passed_from_partition_to_pdfminer(pdfminer_mock):
310312
}
311313

312314

313-
def create_mock_ltchar(text, invisible=False):
315+
def create_mock_ltchar(text, invisible=False, rotated=False):
314316
"""Create a mock LTChar object"""
315317

316318
graphicstate = Mock()
319+
matrix = (1, 0.5, 0, 1, 0, 0) if rotated else (1, 0, 0, 1, 0, 0)
317320

318321
char = LTChar(
319-
matrix=(1, 0, 0, 1, 0, 0), # transformation matrix
322+
matrix=matrix, # transformation matrix
320323
font=Mock(), # you'd need to mock PDFFont
321324
fontsize=12,
322325
scaling=1,
@@ -351,11 +354,11 @@ def test_text_is_embedded():
351354
create_mock_ltchar("H"),
352355
create_mock_ltchar("e"),
353356
create_mock_ltchar("l"),
354-
create_mock_ltchar("l"),
357+
create_mock_ltchar("l", rotated=True),
355358
create_mock_ltchar("o", invisible=True),
356359
]
357360

358361
container = create_mock_ltcontainer(chars)
359362

360363
assert text_is_embedded(container, threshold=0.5)
361-
assert not text_is_embedded(container, threshold=0.1)
364+
assert not text_is_embedded(container, threshold=0.3)

test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
"element_id": "04fa31034847cbbf6c840f4da683ccf8",
55
"text": "1",
66
"metadata": {
7-
"is_extracted": "true",
87
"filetype": "application/pdf",
98
"languages": [
109
"eng"
@@ -49,7 +48,6 @@
4948
"element_id": "fc05a198b2ff732119edea8986775994",
5049
"text": "2",
5150
"metadata": {
52-
"is_extracted": "true",
5351
"filetype": "application/pdf",
5452
"languages": [
5553
"eng"
@@ -72,7 +70,6 @@
7270
"element_id": "4a90480c2297c31b4d7ad43b0801ae98",
7371
"text": "0",
7472
"metadata": {
75-
"is_extracted": "true",
7673
"filetype": "application/pdf",
7774
"languages": [
7875
"eng"
@@ -95,7 +92,7 @@
9592
"element_id": "e3a383b7e9439f39773c13ea769297b7",
9693
"text": "2 n u J 1 2 ] V C . s c [ 2 v 8 4 3 5 1 . 3 0 1 2 :",
9794
"metadata": {
98-
"is_extracted": "true",
95+
"is_extracted": "partial",
9996
"filetype": "application/pdf",
10097
"languages": [
10198
"eng"
@@ -118,7 +115,6 @@
118115
"element_id": "4608f9aa33a0cab158565817b0d15743",
119116
"text": "v",
120117
"metadata": {
121-
"is_extracted": "true",
122118
"filetype": "application/pdf",
123119
"languages": [
124120
"eng"
@@ -163,7 +159,6 @@
163159
"element_id": "ed4e590932b333f40d0e1367b6b0e32e",
164160
"text": "i",
165161
"metadata": {
166-
"is_extracted": "true",
167162
"filetype": "application/pdf",
168163
"languages": [
169164
"eng"
@@ -186,7 +181,6 @@
186181
"element_id": "8cb024fb60457b7c572b167801037f75",
187182
"text": "X",
188183
"metadata": {
189-
"is_extracted": "true",
190184
"filetype": "application/pdf",
191185
"languages": [
192186
"eng"
@@ -209,7 +203,6 @@
209203
"element_id": "c202bdacd2daf4c52fa3a6ddd64a0728",
210204
"text": "r",
211205
"metadata": {
212-
"is_extracted": "true",
213206
"filetype": "application/pdf",
214207
"languages": [
215208
"eng"
@@ -232,7 +225,6 @@
232225
"element_id": "3db474893ec321c81ef9d1a2afd5f660",
233226
"text": "a",
234227
"metadata": {
235-
"is_extracted": "true",
236228
"filetype": "application/pdf",
237229
"languages": [
238230
"eng"

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.18.30-dev1" # pragma: no cover
1+
__version__ = "0.18.30-dev2" # pragma: no cover

unstructured/partition/pdf_image/pdfminer_processing.py

Lines changed: 33 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from __future__ import annotations
22

3+
import math
34
import os
45
from typing import TYPE_CHECKING, Any, BinaryIO, Iterable, List, Optional, Union, cast
56

@@ -380,22 +381,44 @@ def array_merge_inferred_layout_with_extracted_layout(
380381
return final_layout
381382

382383

383-
def text_is_embedded(obj, threshold=env_config.PDF_MAX_EMBED_INVISIBLE_TEXT_RATIO):
384-
"""Check if text object contains visible embedded text vs invisible OCR text."""
385-
invisible_chars = 0
384+
def _ltchar_is_rotated(char: LTChar) -> bool:
385+
# Calculate rotation angle in radians from the first two entries of the text matrix
386+
# For standard text: a=1, b=0, c=0, d=1 (no rotation)
387+
rotation_radians = math.atan2(char.matrix[1], char.matrix[0])
388+
# 0.001 is the tolerance for nearly flat angles; mainly for handling numerical precision
389+
return abs(rotation_radians) > 0.001
390+
391+
392+
def text_is_embedded(obj, threshold=env_config.PDF_MAX_EMBED_LOW_FIDELITY_TEXT_RATIO):
393+
"""Check if text object contains too many low_fidelity text: invisible or rotated
394+
395+
Low fidelity text means that even though the text is extracted from pdf data, its
396+
representation in the partitioned elements may require post processing to make semantic sense.
397+
This includes:
398+
- invisible text: text not rendered on the pdf is not present visually when reading the page
399+
so those texts may not be high quality information for understanding the page
400+
- rotated text: text rotated usually are extracted in the order they appear in the dominant
401+
reading order of the page (e.g., left->right, top->down). But if a text is rotated so the
402+
last character is at the top (y position) and first character is at the bottom the extracted
403+
element would contain words written in reverse order. This makes the extraction low quality.
404+
"""
405+
low_fidelity_chars = 0
386406
total_chars = 0
387407

388408
def extract_chars(layout_obj):
389409
"""Recursively extract all LTChar objects from layout."""
390-
nonlocal invisible_chars, total_chars
410+
nonlocal low_fidelity_chars, total_chars
391411

392412
if isinstance(layout_obj, LTChar):
393413
total_chars += 1
394414

395-
# Check if text is invisible:
396-
# - rendering mode 3 (requires custom pdf interpreter comes with this library)
397-
if hasattr(layout_obj, "rendermode") and layout_obj.rendermode == 3:
398-
invisible_chars += 1
415+
# Check if text is low_fidelity:
416+
# - rendering mode 3 (requires custom pdf interpreter comes with this library)
417+
# - text is rotated
418+
if (
419+
hasattr(layout_obj, "rendermode") and layout_obj.rendermode == 3
420+
) or _ltchar_is_rotated(layout_obj):
421+
low_fidelity_chars += 1
399422
elif isinstance(layout_obj, LTContainer):
400423
# Recursively process container's children
401424
for child in layout_obj:
@@ -406,8 +429,8 @@ def extract_chars(layout_obj):
406429
# when there is a non-trivial amount of hidden characters in the object it means there is
407430
# text that is not rendered -> most likely OCR'ed text for the image content overlying the
408431
# text and not embedded text that also shows in the rendered pdf
409-
invisible_ratio = invisible_chars / total_chars
410-
return invisible_ratio < threshold
432+
low_fidelity_ratio = low_fidelity_chars / total_chars
433+
return low_fidelity_ratio < threshold
411434
return True
412435

413436

unstructured/partition/utils/config.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -174,9 +174,9 @@ def PDF_ANNOTATION_THRESHOLD(self) -> float:
174174
return self._get_float("PDF_ANNOTATION_THRESHOLD", 0.9)
175175

176176
@property
177-
def PDF_MAX_EMBED_INVISIBLE_TEXT_RATIO(self) -> float:
178-
"""maximum ratio of invisible text for a text to be considered embedded text"""
179-
return self._get_float("PDF_MAX_EMBED_INVISIBLE_TEXT_RATIO", 0.1)
177+
def PDF_MAX_EMBED_LOW_FIDELITY_TEXT_RATIO(self) -> float:
178+
"""maximum ratio of low fidelity characters for a text to be considered embedded text"""
179+
return self._get_float("PDF_MAX_EMBED_LOW_FIDELITY_TEXT_RATIO", 0.1)
180180

181181
@property
182182
def GLOBAL_WORKING_DIR_ENABLED(self) -> bool:

0 commit comments

Comments
 (0)