Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
### Enhancements
- **Add `group_elements_by_parent_id` utility function**: Groups elements by their `parent_id` metadata field for easier document hierarchy traversal (fixes #1489)

### Fixes
- **Fix duplicate characters in PDF bold text extraction**: Some PDFs render bold text by drawing each character twice at slightly offset positions, causing text like "BOLD" to be extracted as "BBOOLLDD". Added character-level deduplication based on position proximity. Configurable via `PDF_CHAR_DUPLICATE_THRESHOLD` environment variable (default: 3.0 pixels, set to 0 to disable).

## 0.18.32

### Enhancements
Expand Down
91 changes: 91 additions & 0 deletions test_unstructured/partition/pdf_image/test_pdfminer_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from test_unstructured.unit_utils import example_doc_path
from unstructured.partition.auto import partition
from unstructured.partition.pdf_image.pdfminer_processing import (
_deduplicate_ltchars,
_validate_bbox,
aggregate_embedded_text_by_block,
bboxes1_is_almost_subregion_of_bboxes2,
Expand Down Expand Up @@ -362,3 +363,93 @@ def test_text_is_embedded():

assert text_is_embedded(container, threshold=0.5)
assert not text_is_embedded(container, threshold=0.3)


# -- Tests for _deduplicate_ltchars (fake bold fix) --


def _create_positioned_ltchar(text: str, x0: float, y0: float) -> LTChar:
    """Build a real ``LTChar`` carrying ``text`` and translated to ``(x0, y0)``.

    The font, colour space and graphic state are plain mocks; only the text and the
    translation part of the matrix vary between calls, which is what the
    deduplication tests need.
    """
    return LTChar(
        # (a, b, c, d, e, f): identity scale/rotation with e=x0, f=y0 as the translation.
        matrix=(1, 0, 0, 1, x0, y0),
        font=Mock(),
        fontsize=12,
        scaling=1,
        rise=0,
        text=text,
        textwidth=10,
        textdisp=(0, 1),
        ncs=Mock(),
        graphicstate=Mock(),
    )


class TestDeduplicateLtchars:
    """Behavioural checks for ``_deduplicate_ltchars``."""

    def test_empty_list_returns_empty(self):
        """No characters in, no characters out."""
        assert _deduplicate_ltchars([], threshold=3.0) == []

    def test_threshold_zero_disables_deduplication(self):
        """A zero threshold turns deduplication off, so near-coincident glyphs survive."""
        # The second "A" sits 0.5 away and would be dropped with a positive threshold.
        placements = (("A", 10.0), ("A", 10.5))
        glyphs = [_create_positioned_ltchar(letter, x, 20.0) for letter, x in placements]

        kept = _deduplicate_ltchars(glyphs, threshold=0)

        assert len(kept) == 2

    def test_fake_bold_duplicates_removed(self):
        """Double-rendered (fake bold) characters collapse to a single copy each."""
        # "AB" drawn as "AABB": every second glyph is 0.5 away from its twin.
        placements = (("A", 10.0), ("A", 10.5), ("B", 25.0), ("B", 25.5))
        glyphs = [_create_positioned_ltchar(letter, x, 20.0) for letter, x in placements]

        kept = _deduplicate_ltchars(glyphs, threshold=3.0)

        assert len(kept) == 2
        first, second = kept
        assert first.get_text() == "A"
        assert second.get_text() == "B"

    def test_legitimate_repeated_chars_preserved(self):
        """The same letter repeated at clearly distinct positions is kept twice."""
        # "AA" with 15 units between the two glyphs: far beyond the threshold.
        placements = (("A", 10.0), ("A", 25.0))
        glyphs = [_create_positioned_ltchar(letter, x, 20.0) for letter, x in placements]

        kept = _deduplicate_ltchars(glyphs, threshold=3.0)

        assert len(kept) == 2

    def test_single_char_returns_single(self):
        """A lone character passes through untouched."""
        only = _create_positioned_ltchar("X", 10.0, 20.0)

        kept = _deduplicate_ltchars([only], threshold=3.0)

        assert len(kept) == 1
        assert kept[0].get_text() == "X"

    def test_mixed_duplicates_and_normal(self):
        """Only the doubled glyphs are dropped when a line mixes both kinds."""
        # "HELLO" where just H and the first L are fake-bold.
        placements = (
            ("H", 10.0),
            ("H", 10.5),  # twin of H
            ("E", 20.0),
            ("L", 30.0),
            ("L", 30.5),  # twin of the first L
            ("L", 40.0),  # genuine second L at its own position
            ("O", 50.0),
        )
        glyphs = [_create_positioned_ltchar(letter, x, 20.0) for letter, x in placements]

        kept = _deduplicate_ltchars(glyphs, threshold=3.0)

        assert len(kept) == 5
        assert "".join(glyph.get_text() for glyph in kept) == "HELLO"
180 changes: 178 additions & 2 deletions test_unstructured/partition/pdf_image/test_pdfminer_utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
from unittest.mock import MagicMock

from pdfminer.layout import LTContainer, LTTextLine
from pdfminer.layout import LTChar, LTContainer, LTTextLine

from unstructured.partition.pdf_image.pdfminer_utils import extract_text_objects
from unstructured.partition.pdf_image.pdfminer_utils import (
_is_duplicate_char,
deduplicate_chars_in_text_line,
extract_text_objects,
get_text_with_deduplication,
)


def test_extract_text_objects_nested_containers():
Expand All @@ -26,3 +31,174 @@ def test_extract_text_objects_nested_containers():
assert len(result) == 2
assert mock_text_line1 in result
assert mock_text_line2 in result


# -- Tests for character deduplication (fake bold fix) --


def _create_mock_ltchar(text: str, x0: float, y0: float) -> MagicMock:
    """Return an ``LTChar``-spec'd mock reporting ``text`` and the origin ``(x0, y0)``.

    Only ``get_text()``, ``x0`` and ``y0`` are configured; any other attribute is
    whatever ``MagicMock(spec=LTChar)`` provides by default.
    """
    fake_char = MagicMock(spec=LTChar)
    fake_char.x0, fake_char.y0 = x0, y0
    fake_char.get_text.return_value = text
    return fake_char


class TestIsDuplicateChar:
    """Behavioural checks for ``_is_duplicate_char``."""

    def test_same_char_same_position_is_duplicate(self):
        """Identical glyphs stacked on the very same point count as duplicates."""
        original = _create_mock_ltchar("A", 10.0, 20.0)
        overlay = _create_mock_ltchar("A", 10.0, 20.0)

        assert _is_duplicate_char(original, overlay, threshold=3.0) is True

    def test_same_char_close_position_is_duplicate(self):
        """Identical glyphs offset by less than the threshold count as duplicates."""
        original = _create_mock_ltchar("B", 10.0, 20.0)
        # Offsets of 1.5 (x) and 1.0 (y) both stay inside the 3.0 threshold.
        overlay = _create_mock_ltchar("B", 11.5, 21.0)

        assert _is_duplicate_char(original, overlay, threshold=3.0) is True

    def test_same_char_far_position_not_duplicate(self):
        """Identical glyphs further apart than the threshold are separate characters."""
        original = _create_mock_ltchar("C", 10.0, 20.0)
        # A horizontal gap of 5.0 exceeds the 3.0 threshold.
        distant = _create_mock_ltchar("C", 15.0, 20.0)

        assert _is_duplicate_char(original, distant, threshold=3.0) is False

    def test_different_chars_same_position_not_duplicate(self):
        """Different glyphs never count as duplicates, even at the same point."""
        original = _create_mock_ltchar("A", 10.0, 20.0)
        other_letter = _create_mock_ltchar("B", 10.0, 20.0)

        assert _is_duplicate_char(original, other_letter, threshold=3.0) is False

    def test_threshold_boundary(self):
        """The comparison is expected to be strict: exactly-at-threshold is not a duplicate."""
        anchor = _create_mock_ltchar("X", 10.0, 20.0)

        # Offset equal to the threshold: expected to fall outside (strict "<", not "<=").
        at_limit = _create_mock_ltchar("X", 13.0, 20.0)
        assert _is_duplicate_char(anchor, at_limit, threshold=3.0) is False

        # Offset just below the threshold: expected to be treated as a duplicate.
        just_inside = _create_mock_ltchar("X", 12.9, 20.0)
        assert _is_duplicate_char(anchor, just_inside, threshold=3.0) is True


class TestDeduplicateCharsInTextLine:
    """Behavioural checks for ``deduplicate_chars_in_text_line``."""

    def test_no_duplicates_returns_original(self):
        """A line whose glyphs are well separated keeps its text unchanged."""
        glyphs = [_create_mock_ltchar(letter, x, 20.0) for letter, x in (("H", 10.0), ("i", 15.0))]
        line = MagicMock(spec=LTTextLine)
        line.get_text.return_value = "Hi"
        line.__iter__ = lambda _self: iter(glyphs)

        assert deduplicate_chars_in_text_line(line, threshold=3.0) == "Hi"

    def test_fake_bold_duplicates_removed(self):
        """Fake bold text, with every glyph drawn twice, collapses to single glyphs."""
        # "BOLD" rendered as "BBOOLLDD": each letter is followed by a twin 0.5 to its right.
        glyphs = []
        for letter, x in zip("BOLD", (10.0, 20.0, 30.0, 40.0)):
            glyphs.append(_create_mock_ltchar(letter, x, 20.0))
            glyphs.append(_create_mock_ltchar(letter, x + 0.5, 20.0))
        line = MagicMock(spec=LTTextLine)
        line.__iter__ = lambda _self: iter(glyphs)

        assert deduplicate_chars_in_text_line(line, threshold=3.0) == "BOLD"

    def test_threshold_zero_disables_deduplication(self):
        """A zero threshold leaves the line's own text untouched."""
        line = MagicMock(spec=LTTextLine)
        line.get_text.return_value = "BBOOLLDD"

        assert deduplicate_chars_in_text_line(line, threshold=0) == "BBOOLLDD"

    def test_negative_threshold_disables_deduplication(self):
        """A negative threshold behaves like zero: the text is returned as-is."""
        line = MagicMock(spec=LTTextLine)
        line.get_text.return_value = "BBOOLLDD"

        assert deduplicate_chars_in_text_line(line, threshold=-1.0) == "BBOOLLDD"

    def test_empty_text_line(self):
        """A line that yields no characters produces an empty string."""
        line = MagicMock(spec=LTTextLine)
        line.get_text.return_value = ""
        line.__iter__ = lambda _self: iter([])

        assert deduplicate_chars_in_text_line(line, threshold=3.0) == ""

    def test_legitimate_repeated_chars_preserved(self):
        """A genuinely repeated letter at distinct positions is not collapsed."""
        # "AA" with the two glyphs 10 units apart, well beyond the threshold.
        glyphs = [_create_mock_ltchar("A", x, 20.0) for x in (10.0, 20.0)]
        line = MagicMock(spec=LTTextLine)
        line.__iter__ = lambda _self: iter(glyphs)

        assert deduplicate_chars_in_text_line(line, threshold=3.0) == "AA"


class TestGetTextWithDeduplication:
    """Behavioural checks for ``get_text_with_deduplication``."""

    def test_with_text_line(self):
        """An ``LTTextLine`` has its doubled glyphs removed."""
        # "Hi" where the H is drawn twice, 0.5 apart.
        placements = (("H", 10.0), ("H", 10.5), ("i", 20.0))
        glyphs = [_create_mock_ltchar(letter, x, 20.0) for letter, x in placements]
        line = MagicMock(spec=LTTextLine)
        line.__iter__ = lambda _self: iter(glyphs)

        assert get_text_with_deduplication(line, threshold=3.0) == "Hi"

    def test_with_container(self):
        """An ``LTContainer`` is handled through the ``LTTextLine`` nested inside it."""
        # A single "T" drawn twice, 0.5 apart.
        glyphs = [_create_mock_ltchar("T", x, 20.0) for x in (10.0, 10.5)]
        line = MagicMock(spec=LTTextLine)
        line.__iter__ = lambda _self: iter(glyphs)

        container = MagicMock(spec=LTContainer)
        container.__iter__ = lambda _self: iter([line])

        assert get_text_with_deduplication(container, threshold=3.0) == "T"

    def test_with_generic_object(self):
        """Any other object exposing ``get_text()`` has that text returned unchanged."""
        generic = MagicMock()
        generic.get_text.return_value = "fallback text"

        assert get_text_with_deduplication(generic, threshold=3.0) == "fallback text"

    def test_without_get_text(self):
        """An object with no ``get_text`` attribute yields an empty string."""
        # spec=[] gives the mock no attributes at all, so get_text is absent.
        bare = MagicMock(spec=[])

        assert get_text_with_deduplication(bare, threshold=3.0) == ""
46 changes: 45 additions & 1 deletion unstructured/partition/pdf_image/pdfminer_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,10 @@
from unstructured.partition.pdf_image.pdf_image_utils import remove_control_characters
from unstructured.partition.pdf_image.pdfminer_utils import (
PDFMinerConfig,
_is_duplicate_char,
extract_image_objects,
extract_text_objects,
get_text_with_deduplication,
open_pdfminer_pages_generator,
rect_to_bbox,
)
Expand Down Expand Up @@ -466,11 +468,13 @@ def process_page_layout_from_pdfminer(

if hasattr(obj, "get_text"):
inner_text_objects = extract_text_objects(obj)
char_dedup_threshold = env_config.PDF_CHAR_DUPLICATE_THRESHOLD
for inner_obj in inner_text_objects:
inner_bbox = rect_to_bbox(inner_obj.bbox, page_height)
if not _validate_bbox(inner_bbox):
continue
texts.append(inner_obj.get_text())
# Use deduplication to handle fake bold text (characters rendered twice)
texts.append(get_text_with_deduplication(inner_obj, char_dedup_threshold))
element_coords.append(inner_bbox)
element_class.append(0)
is_extracted.append(IsExtracted.TRUE if text_is_embedded(inner_obj) else None)
Expand Down Expand Up @@ -1006,6 +1010,33 @@ def check_annotations_within_element(
return annotations_within_element


def _deduplicate_ltchars(
    chars: list[LTChar],
    threshold: float,
) -> list[LTChar]:
    """Drop characters that were drawn twice to fake a bold typeface.

    Some PDFs produce bold text by rendering each character a second time at a slightly
    offset position. Each character is compared only with the most recently kept one and
    is discarded when ``_is_duplicate_char`` reports the pair as duplicates.

    Args:
        chars: LTChar objects to filter, in the order they should be compared.
        threshold: Maximum pixel distance at which two characters are considered
            duplicates. A value of 0 or below disables deduplication.

    Returns:
        The input list itself when deduplication is disabled or the list is empty;
        otherwise a new list containing the retained characters in their original order.
    """
    if threshold <= 0 or not chars:
        return chars

    kept: list[LTChar] = []
    for candidate in chars:
        # The first character has nothing to be compared against and is always kept.
        if kept and _is_duplicate_char(kept[-1], candidate, threshold):
            continue
        kept.append(candidate)
    return kept


def get_words_from_obj(
obj: LTTextBox,
height: float,
Expand All @@ -1026,13 +1057,25 @@ def get_words_from_obj(
characters = []
words = []
text_len = 0
char_dedup_threshold = env_config.PDF_CHAR_DUPLICATE_THRESHOLD

for text_line in obj:
word = ""
x1, y1, x2, y2 = None, None, None, None
start_index = 0
last_char: LTChar | None = None # Track last character for deduplication

for index, character in enumerate(text_line):
if isinstance(character, LTChar):
# Skip duplicate characters (fake bold fix)
if (
char_dedup_threshold > 0
and last_char is not None
and _is_duplicate_char(last_char, character, char_dedup_threshold)
):
continue

last_char = character
characters.append(character)
char = character.get_text()

Expand Down Expand Up @@ -1066,6 +1109,7 @@ def get_words_from_obj(

word += char
else:
# Non-LTChar items (e.g., LTAnno) act as word boundaries
words.append(
{"text": word, "bbox": (x1, y1, x2, y2), "start_index": start_index},
)
Expand Down
Loading