diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8d7adf2496..fd0ca46a21 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,9 @@
 ### Enhancements
 - **Add `group_elements_by_parent_id` utility function**: Groups elements by their `parent_id` metadata field for easier document hierarchy traversal (fixes #1489)
 
+### Fixes
+- **Fix duplicate characters in PDF bold text extraction**: Some PDFs render bold text by drawing each character twice at slightly offset positions, causing text like "BOLD" to be extracted as "BBOOLLDD". Added character-level deduplication based on position proximity, capped at half the glyph size so adjacent narrow letters (e.g. "ll") are preserved. Configurable via `PDF_CHAR_DUPLICATE_THRESHOLD` environment variable (default: 3.0 pixels, set to 0 to disable).
+
 ## 0.18.32
 
 ### Enhancements
diff --git a/test_unstructured/partition/pdf_image/test_pdfminer_processing.py b/test_unstructured/partition/pdf_image/test_pdfminer_processing.py
index 5d5b28e5e2..ea2f5338ee 100644
--- a/test_unstructured/partition/pdf_image/test_pdfminer_processing.py
+++ b/test_unstructured/partition/pdf_image/test_pdfminer_processing.py
@@ -18,6 +18,7 @@
 from test_unstructured.unit_utils import example_doc_path
 from unstructured.partition.auto import partition
 from unstructured.partition.pdf_image.pdfminer_processing import (
+    _deduplicate_ltchars,
     _validate_bbox,
     aggregate_embedded_text_by_block,
     bboxes1_is_almost_subregion_of_bboxes2,
@@ -362,3 +363,93 @@ def test_text_is_embedded():
 
     assert text_is_embedded(container, threshold=0.5)
     assert not text_is_embedded(container, threshold=0.3)
+
+
+# -- Tests for _deduplicate_ltchars (fake bold fix) --
+
+
+def _create_positioned_ltchar(text: str, x0: float, y0: float) -> LTChar:
+    """Create an LTChar with a specific position for deduplication testing."""
+    graphicstate = Mock()
+    # Matrix format: (a, b, c, d, e, f) where e=x, f=y for translation
+    matrix = (1, 0, 0, 1, x0, y0)
+
+    char = LTChar(
+        matrix=matrix,
+        font=Mock(),
+        fontsize=12,
+        scaling=1,
+        rise=0,
+        text=text,
+        textwidth=10,
+        textdisp=(0, 1),
+        ncs=Mock(),
+        graphicstate=graphicstate,
+    )
+    return char
+
+
+class TestDeduplicateLtchars:
+    """Tests for _deduplicate_ltchars function."""
+
+    def test_empty_list_returns_empty(self):
+        """Empty character list should return empty list."""
+        result = _deduplicate_ltchars([], threshold=3.0)
+        assert result == []
+
+    def test_threshold_zero_disables_deduplication(self):
+        """Threshold of 0 should disable deduplication and return original list."""
+        chars = [
+            _create_positioned_ltchar("A", 10.0, 20.0),
+            _create_positioned_ltchar("A", 10.5, 20.0),  # Would be duplicate
+        ]
+        result = _deduplicate_ltchars(chars, threshold=0)
+        assert len(result) == 2
+
+    def test_fake_bold_duplicates_removed(self):
+        """Fake bold (double-rendered) characters should be deduplicated."""
+        # Simulate "AB" rendered as "AABB" with fake bold
+        chars = [
+            _create_positioned_ltchar("A", 10.0, 20.0),
+            _create_positioned_ltchar("A", 10.5, 20.0),  # Duplicate - close position
+            _create_positioned_ltchar("B", 25.0, 20.0),
+            _create_positioned_ltchar("B", 25.5, 20.0),  # Duplicate - close position
+        ]
+        result = _deduplicate_ltchars(chars, threshold=3.0)
+        assert len(result) == 2
+        assert result[0].get_text() == "A"
+        assert result[1].get_text() == "B"
+
+    def test_legitimate_repeated_chars_preserved(self):
+        """Legitimate repeated characters at different positions should be preserved."""
+        # "AA" where both A's are at legitimately different positions
+        chars = [
+            _create_positioned_ltchar("A", 10.0, 20.0),
+            _create_positioned_ltchar("A", 25.0, 20.0),  # Far enough - not duplicate
+        ]
+        result = _deduplicate_ltchars(chars, threshold=3.0)
+        assert len(result) == 2
+
+    def test_single_char_returns_single(self):
+        """Single character should return single character."""
+        chars = [_create_positioned_ltchar("X", 10.0, 20.0)]
+        result = _deduplicate_ltchars(chars, threshold=3.0)
+        assert len(result) == 1
+        assert result[0].get_text() == "X"
+
+    def test_mixed_duplicates_and_normal(self):
+        """Mix of duplicated and normal characters should be handled correctly."""
+        # "HELLO" where only H and L are fake-bold
+        chars = [
+            _create_positioned_ltchar("H", 10.0, 20.0),
+            _create_positioned_ltchar("H", 10.5, 20.0),  # Duplicate
+            _create_positioned_ltchar("E", 20.0, 20.0),  # Normal
+            _create_positioned_ltchar("L", 30.0, 20.0),
+            _create_positioned_ltchar("L", 30.5, 20.0),  # Duplicate
+            _create_positioned_ltchar("L", 40.0, 20.0),  # Second L (normal, different position)
+            _create_positioned_ltchar("O", 50.0, 20.0),  # Normal
+        ]
+        result = _deduplicate_ltchars(chars, threshold=3.0)
+        assert len(result) == 5
+        text = "".join(c.get_text() for c in result)
+        assert text == "HELLO"
diff --git a/test_unstructured/partition/pdf_image/test_pdfminer_utils.py b/test_unstructured/partition/pdf_image/test_pdfminer_utils.py
index 075a4e151e..cd70a2b18a 100644
--- a/test_unstructured/partition/pdf_image/test_pdfminer_utils.py
+++ b/test_unstructured/partition/pdf_image/test_pdfminer_utils.py
@@ -1,8 +1,13 @@
 from unittest.mock import MagicMock
 
-from pdfminer.layout import LTContainer, LTTextLine
+from pdfminer.layout import LTChar, LTContainer, LTTextLine
 
-from unstructured.partition.pdf_image.pdfminer_utils import extract_text_objects
+from unstructured.partition.pdf_image.pdfminer_utils import (
+    _is_duplicate_char,
+    deduplicate_chars_in_text_line,
+    extract_text_objects,
+    get_text_with_deduplication,
+)
 
 
 def test_extract_text_objects_nested_containers():
@@ -26,3 +31,199 @@ def test_extract_text_objects_nested_containers():
     assert len(result) == 2
     assert mock_text_line1 in result
     assert mock_text_line2 in result
+
+
+# -- Tests for character deduplication (fake bold fix) --
+
+
+def _create_mock_ltchar(text: str, x0: float, y0: float) -> MagicMock:
+    """Helper to create a mock LTChar with specified text and position."""
+    mock_char = MagicMock(spec=LTChar)
+    mock_char.get_text.return_value = text
+    mock_char.x0 = x0
+    mock_char.y0 = y0
+    return mock_char
+
+
+class TestIsDuplicateChar:
+    """Tests for _is_duplicate_char function."""
+
+    def test_same_char_same_position_is_duplicate(self):
+        """Two identical characters at the same position should be duplicates."""
+        char1 = _create_mock_ltchar("A", 10.0, 20.0)
+        char2 = _create_mock_ltchar("A", 10.0, 20.0)
+        assert _is_duplicate_char(char1, char2, threshold=3.0) is True
+
+    def test_same_char_close_position_is_duplicate(self):
+        """Two identical characters at close positions should be duplicates."""
+        char1 = _create_mock_ltchar("B", 10.0, 20.0)
+        char2 = _create_mock_ltchar("B", 11.5, 21.0)  # Within 3.0 threshold
+        assert _is_duplicate_char(char1, char2, threshold=3.0) is True
+
+    def test_same_char_far_position_not_duplicate(self):
+        """Two identical characters at far positions should not be duplicates."""
+        char1 = _create_mock_ltchar("C", 10.0, 20.0)
+        char2 = _create_mock_ltchar("C", 15.0, 20.0)  # 5.0 > 3.0 threshold
+        assert _is_duplicate_char(char1, char2, threshold=3.0) is False
+
+    def test_different_chars_same_position_not_duplicate(self):
+        """Two different characters at the same position should not be duplicates."""
+        char1 = _create_mock_ltchar("A", 10.0, 20.0)
+        char2 = _create_mock_ltchar("B", 10.0, 20.0)
+        assert _is_duplicate_char(char1, char2, threshold=3.0) is False
+
+    def test_threshold_boundary(self):
+        """Test behavior at exact threshold boundary."""
+        char1 = _create_mock_ltchar("X", 10.0, 20.0)
+        char2 = _create_mock_ltchar("X", 13.0, 20.0)  # Exactly at threshold
+        # At threshold means NOT within threshold (uses < not <=)
+        assert _is_duplicate_char(char1, char2, threshold=3.0) is False
+
+        char3 = _create_mock_ltchar("X", 12.9, 20.0)  # Just under threshold
+        assert _is_duplicate_char(char1, char3, threshold=3.0) is True
+
+    def test_adjacent_narrow_glyphs_not_duplicate(self):
+        """Legitimate double letters in a narrow glyph (e.g. "ll") must be preserved."""
+        char1 = _create_mock_ltchar("l", 10.0, 20.0)
+        char2 = _create_mock_ltchar("l", 12.6, 20.0)  # One glyph width to the right
+        char1.width = char2.width = 2.6
+        char1.height = char2.height = 12.0
+        assert _is_duplicate_char(char1, char2, threshold=3.0) is False
+
+
+class TestDeduplicateCharsInTextLine:
+    """Tests for deduplicate_chars_in_text_line function."""
+
+    def test_no_duplicates_returns_original(self):
+        """Text line without duplicates should return original text."""
+        chars = [
+            _create_mock_ltchar("H", 10.0, 20.0),
+            _create_mock_ltchar("i", 15.0, 20.0),
+        ]
+        mock_text_line = MagicMock(spec=LTTextLine)
+        mock_text_line.__iter__ = lambda self: iter(chars)
+        mock_text_line.get_text.return_value = "Hi"
+
+        result = deduplicate_chars_in_text_line(mock_text_line, threshold=3.0)
+        assert result == "Hi"
+
+    def test_fake_bold_duplicates_removed(self):
+        """Fake bold text (each char doubled) should be deduplicated."""
+        # Simulates "BOLD" rendered as "BBOOLLDD" with duplicate positions
+        chars = [
+            _create_mock_ltchar("B", 10.0, 20.0),
+            _create_mock_ltchar("B", 10.5, 20.0),  # Duplicate
+            _create_mock_ltchar("O", 20.0, 20.0),
+            _create_mock_ltchar("O", 20.5, 20.0),  # Duplicate
+            _create_mock_ltchar("L", 30.0, 20.0),
+            _create_mock_ltchar("L", 30.5, 20.0),  # Duplicate
+            _create_mock_ltchar("D", 40.0, 20.0),
+            _create_mock_ltchar("D", 40.5, 20.0),  # Duplicate
+        ]
+        mock_text_line = MagicMock(spec=LTTextLine)
+        mock_text_line.__iter__ = lambda self: iter(chars)
+
+        result = deduplicate_chars_in_text_line(mock_text_line, threshold=3.0)
+        assert result == "BOLD"
+
+    def test_threshold_zero_disables_deduplication(self):
+        """Setting threshold to 0 should disable deduplication."""
+        mock_text_line = MagicMock(spec=LTTextLine)
+        mock_text_line.get_text.return_value = "BBOOLLDD"
+
+        result = deduplicate_chars_in_text_line(mock_text_line, threshold=0)
+        assert result == "BBOOLLDD"
+
+    def test_negative_threshold_disables_deduplication(self):
+        """Setting negative threshold should disable deduplication."""
+        mock_text_line = MagicMock(spec=LTTextLine)
+        mock_text_line.get_text.return_value = "BBOOLLDD"
+
+        result = deduplicate_chars_in_text_line(mock_text_line, threshold=-1.0)
+        assert result == "BBOOLLDD"
+
+    def test_empty_text_line(self):
+        """Empty text line should return original text."""
+        mock_text_line = MagicMock(spec=LTTextLine)
+        mock_text_line.__iter__ = lambda self: iter([])
+        mock_text_line.get_text.return_value = ""
+
+        result = deduplicate_chars_in_text_line(mock_text_line, threshold=3.0)
+        assert result == ""
+
+    def test_legitimate_repeated_chars_preserved(self):
+        """Legitimate repeated characters (different positions) should be preserved."""
+        # "AA" where both A's are at different positions
+        chars = [
+            _create_mock_ltchar("A", 10.0, 20.0),
+            _create_mock_ltchar("A", 20.0, 20.0),  # Different position, not duplicate
+        ]
+        mock_text_line = MagicMock(spec=LTTextLine)
+        mock_text_line.__iter__ = lambda self: iter(chars)
+
+        result = deduplicate_chars_in_text_line(mock_text_line, threshold=3.0)
+        assert result == "AA"
+
+
+class TestGetTextWithDeduplication:
+    """Tests for get_text_with_deduplication function."""
+
+    def test_with_text_line(self):
+        """Should properly deduplicate text from LTTextLine."""
+        chars = [
+            _create_mock_ltchar("H", 10.0, 20.0),
+            _create_mock_ltchar("H", 10.5, 20.0),  # Duplicate
+            _create_mock_ltchar("i", 20.0, 20.0),
+        ]
+        mock_text_line = MagicMock(spec=LTTextLine)
+        mock_text_line.__iter__ = lambda self: iter(chars)
+
+        result = get_text_with_deduplication(mock_text_line, threshold=3.0)
+        assert result == "Hi"
+
+    def test_with_container(self):
+        """Should handle LTContainer with nested LTTextLine."""
+        chars = [
+            _create_mock_ltchar("T", 10.0, 20.0),
+            _create_mock_ltchar("T", 10.5, 20.0),  # Duplicate
+        ]
+        mock_text_line = MagicMock(spec=LTTextLine)
+        mock_text_line.__iter__ = lambda self: iter(chars)
+
+        mock_container = MagicMock(spec=LTContainer)
+        mock_container.__iter__ = lambda self: iter([mock_text_line])
+
+        result = get_text_with_deduplication(mock_container, threshold=3.0)
+        assert result == "T"
+
+    def test_with_nested_container(self):
+        """Should deduplicate text lines nested more than one container deep."""
+        chars = [
+            _create_mock_ltchar("T", 10.0, 20.0),
+            _create_mock_ltchar("T", 10.5, 20.0),  # Duplicate
+        ]
+        mock_text_line = MagicMock(spec=LTTextLine)
+        mock_text_line.__iter__ = lambda self: iter(chars)
+
+        inner_container = MagicMock(spec=LTContainer)
+        inner_container.__iter__ = lambda self: iter([mock_text_line])
+        outer_container = MagicMock(spec=LTContainer)
+        outer_container.__iter__ = lambda self: iter([inner_container])
+
+        result = get_text_with_deduplication(outer_container, threshold=3.0)
+        assert result == "T"
+
+    def test_with_generic_object(self):
+        """Should fall back to get_text() for non-standard objects."""
+        mock_obj = MagicMock()
+        mock_obj.get_text.return_value = "fallback text"
+
+        result = get_text_with_deduplication(mock_obj, threshold=3.0)
+        assert result == "fallback text"
+
+    def test_without_get_text(self):
+        """Should return empty string for objects without get_text."""
+        mock_obj = MagicMock(spec=[])  # No get_text method
+
+        result = get_text_with_deduplication(mock_obj, threshold=3.0)
+        assert result == ""
diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py
index 991d5c5d6f..0a7c7453f2 100644
--- a/unstructured/partition/pdf_image/pdfminer_processing.py
+++ b/unstructured/partition/pdf_image/pdfminer_processing.py
@@ -17,8 +17,10 @@
 from unstructured.partition.pdf_image.pdf_image_utils import remove_control_characters
 from unstructured.partition.pdf_image.pdfminer_utils import (
     PDFMinerConfig,
+    _is_duplicate_char,
     extract_image_objects,
     extract_text_objects,
+    get_text_with_deduplication,
     open_pdfminer_pages_generator,
     rect_to_bbox,
 )
@@ -466,11 +468,13 @@ def process_page_layout_from_pdfminer(
 
         if hasattr(obj, "get_text"):
             inner_text_objects = extract_text_objects(obj)
+            char_dedup_threshold = env_config.PDF_CHAR_DUPLICATE_THRESHOLD
             for inner_obj in inner_text_objects:
                 inner_bbox = rect_to_bbox(inner_obj.bbox, page_height)
                 if not _validate_bbox(inner_bbox):
                     continue
-                texts.append(inner_obj.get_text())
+                # Use deduplication to handle fake bold text (characters rendered twice)
+                texts.append(get_text_with_deduplication(inner_obj, char_dedup_threshold))
                 element_coords.append(inner_bbox)
                 element_class.append(0)
                 is_extracted.append(IsExtracted.TRUE if text_is_embedded(inner_obj) else None)
@@ -1006,6 +1010,33 @@ def check_annotations_within_element(
     return annotations_within_element
 
 
+def _deduplicate_ltchars(
+    chars: list[LTChar],
+    threshold: float,
+) -> list[LTChar]:
+    """Remove duplicate characters caused by fake bold rendering.
+
+    Some PDFs create bold text by rendering the same character twice at slightly offset
+    positions. This function removes such duplicates.
+
+    Args:
+        chars: List of LTChar objects to deduplicate.
+        threshold: Maximum pixel distance to consider characters as duplicates.
+            Set to 0 to disable deduplication.
+
+    Returns:
+        Deduplicated list of LTChar objects.
+    """
+    if threshold <= 0 or not chars:
+        return chars
+
+    result = [chars[0]]
+    for char in chars[1:]:
+        if not _is_duplicate_char(result[-1], char, threshold):
+            result.append(char)
+    return result
+
+
 def get_words_from_obj(
     obj: LTTextBox,
     height: float,
@@ -1026,13 +1057,25 @@ def get_words_from_obj(
     characters = []
     words = []
     text_len = 0
+    char_dedup_threshold = env_config.PDF_CHAR_DUPLICATE_THRESHOLD
 
     for text_line in obj:
         word = ""
         x1, y1, x2, y2 = None, None, None, None
         start_index = 0
+        last_char: LTChar | None = None  # Track last character for deduplication
+
         for index, character in enumerate(text_line):
             if isinstance(character, LTChar):
+                # Skip duplicate characters (fake bold fix)
+                if (
+                    char_dedup_threshold > 0
+                    and last_char is not None
+                    and _is_duplicate_char(last_char, character, char_dedup_threshold)
+                ):
+                    continue
+
+                last_char = character
                 characters.append(character)
                 char = character.get_text()
 
@@ -1066,6 +1109,7 @@ def get_words_from_obj(
 
                 word += char
             else:
+                # Non-LTChar items (e.g., LTAnno) act as word boundaries
                 words.append(
                     {"text": word, "bbox": (x1, y1, x2, y2), "start_index": start_index},
                 )
diff --git a/unstructured/partition/pdf_image/pdfminer_utils.py b/unstructured/partition/pdf_image/pdfminer_utils.py
index 948cf8ba48..ba9c9062a8 100644
--- a/unstructured/partition/pdf_image/pdfminer_utils.py
+++ b/unstructured/partition/pdf_image/pdfminer_utils.py
@@ -1,6 +1,6 @@
 import os
 import tempfile
-from typing import BinaryIO, List, Optional, Tuple
+from typing import BinaryIO, List, Optional, Tuple, Union
 
 from pdfminer.converter import PDFPageAggregator
 from pdfminer.layout import LAParams, LTChar, LTContainer, LTImage, LTItem, LTTextLine
@@ -106,6 +106,112 @@ def rect_to_bbox(
     return (x1, y1, x2, y2)
 
 
+def _is_duplicate_char(char1: LTChar, char2: LTChar, threshold: float) -> bool:
+    """Detect if two characters are duplicates caused by fake bold rendering.
+
+    Some PDF generators create bold text by rendering the same character twice at slightly
+    offset positions. This function detects such duplicates by checking if two characters
+    have the same text content and nearly identical positions.
+
+    The allowed offset on each axis is additionally capped at half of the glyph's own
+    extent whenever `char1` exposes a numeric `width`/`height`. Consecutive narrow letters
+    (e.g. the "ll" in "hello") start one glyph width apart, which can be smaller than
+    `threshold` at common font sizes; without the cap they would be dropped as duplicates.
+
+    Args:
+        char1: First LTChar object.
+        char2: Second LTChar object.
+        threshold: Maximum offset between the `x0`/`y0` origins to consider as duplicate.
+
+    Returns:
+        True if char2 appears to be a duplicate of char1.
+    """
+    # Must be the same character
+    if char1.get_text() != char2.get_text():
+        return False
+
+    def _axis_limit(extent: object) -> float:
+        # A re-drawn glyph overlaps the first one almost entirely, while the next real glyph
+        # starts about one full extent further along; half the extent separates the two.
+        if isinstance(extent, (int, float)) and extent > 0:
+            return min(threshold, 0.5 * extent)
+        return threshold
+
+    # Check if positions are nearly identical (within the per-axis limit)
+    x_diff = abs(char1.x0 - char2.x0)
+    y_diff = abs(char1.y0 - char2.y0)
+    x_limit = _axis_limit(getattr(char1, "width", None))
+    y_limit = _axis_limit(getattr(char1, "height", None))
+
+    return x_diff < x_limit and y_diff < y_limit
+
+
+def deduplicate_chars_in_text_line(text_line: LTTextLine, threshold: float) -> str:
+    """Extract text from an LTTextLine with duplicate characters removed.
+
+    Some PDFs create bold text by rendering each character twice at slightly offset
+    positions. This function removes such duplicates by keeping only the first instance
+    when two identical characters appear at nearly the same position.
+
+    Args:
+        text_line: An LTTextLine object containing characters to extract.
+        threshold: Maximum pixel distance to consider characters as duplicates.
+            Set to 0 to disable deduplication.
+
+    Returns:
+        The extracted text with duplicate characters removed.
+    """
+    if threshold <= 0:
+        return text_line.get_text()
+
+    # Build deduplicated text while preserving non-LTChar items (like LTAnno for spaces)
+    result_parts: List[str] = []
+    last_ltchar: Optional[LTChar] = None
+
+    for item in text_line:
+        if isinstance(item, LTChar):
+            # Check if this is a duplicate of the last LTChar
+            if last_ltchar is not None and _is_duplicate_char(last_ltchar, item, threshold):
+                # Skip this duplicate character
+                continue
+            last_ltchar = item
+            result_parts.append(item.get_text())
+        else:
+            # Non-LTChar items (e.g., LTAnno for spaces) - keep as-is
+            if hasattr(item, "get_text"):
+                result_parts.append(item.get_text())
+
+    return "".join(result_parts)
+
+
+def get_text_with_deduplication(
+    text_obj: Union[LTTextLine, LTContainer, LTItem],
+    threshold: float,
+) -> str:
+    """Get text from a text object with optional character deduplication.
+
+    This is the main entry point for extracting text with fake-bold deduplication.
+    It handles LTTextLine objects and recursively processes containers.
+
+    Args:
+        text_obj: An LTTextLine, LTContainer, or other LTItem object.
+        threshold: Maximum pixel distance to consider characters as duplicates.
+            Set to 0 to disable deduplication.
+
+    Returns:
+        The extracted text with duplicate characters removed.
+    """
+    if isinstance(text_obj, LTTextLine):
+        return deduplicate_chars_in_text_line(text_obj, threshold)
+    elif isinstance(text_obj, LTContainer):
+        # Recurse into every child so text lines nested inside inner containers are
+        # deduplicated too, instead of falling back to the child's raw get_text().
+        return "".join(get_text_with_deduplication(child, threshold) for child in text_obj)
+    elif hasattr(text_obj, "get_text"):
+        return text_obj.get_text()
+    return ""
+
+
 @requires_dependencies(["pikepdf", "pypdf"])
 def open_pdfminer_pages_generator(
     fp: BinaryIO, password: Optional[str] = None, pdfminer_config: Optional[PDFMinerConfig] = None
diff --git a/unstructured/partition/utils/config.py b/unstructured/partition/utils/config.py
index d92457ce1f..133212ac11 100644
--- a/unstructured/partition/utils/config.py
+++ b/unstructured/partition/utils/config.py
@@ -240,6 +240,16 @@ def TEXT_COVERAGE_THRESHOLD(self) -> float:
         the inferred element to be considered contaning extracted text"""
         return self._get_float("TEXT_COVERAGE_THRESHOLD", 0.25)
 
+    @property
+    def PDF_CHAR_DUPLICATE_THRESHOLD(self) -> float:
+        """Maximum pixel distance to consider two characters as duplicates (fake bold rendering).
+
+        Some PDFs create bold text by rendering the same character twice at slightly offset
+        positions. This threshold determines how close two identical characters must be to be
+        considered duplicates. Set to 0 to disable duplicate character removal.
+        """
+        return self._get_float("PDF_CHAR_DUPLICATE_THRESHOLD", 3.0)
+
     @property
     def PDF_RENDER_DPI(self) -> int:
         """The DPI to use for rendering PDF pages"""