Unstructured-IO · bittoby · Jan 28, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,9 @@
 ### Enhancements
 - **Add `group_elements_by_parent_id` utility function**: Groups elements by their `parent_id` metadata field for easier document hierarchy traversal (fixes #1489)
 
+### Fixes
+- **Fix duplicate characters in PDF bold text extraction**: Some PDFs render bold text by drawing each character twice at slightly offset positions, causing text like "BOLD" to be extracted as "BBOOLLDD". Added character-level deduplication based on position proximity. Configurable via the `PDF_CHAR_DUPLICATE_THRESHOLD` environment variable (default: 3.0 pixels; set to 0 or a negative value to disable).
+
 ## 0.18.32
 
 ### Enhancements

diff --git a/test_unstructured/partition/pdf_image/test_pdfminer_processing.py b/test_unstructured/partition/pdf_image/test_pdfminer_processing.py
@@ -18,6 +18,7 @@
 from test_unstructured.unit_utils import example_doc_path
 from unstructured.partition.auto import partition
 from unstructured.partition.pdf_image.pdfminer_processing import (
+    _deduplicate_ltchars,
     _validate_bbox,
     aggregate_embedded_text_by_block,
     bboxes1_is_almost_subregion_of_bboxes2,
@@ -362,3 +363,93 @@ def test_text_is_embedded():
 
     assert text_is_embedded(container, threshold=0.5)
     assert not text_is_embedded(container, threshold=0.3)
+
+
+# -- Tests for _deduplicate_ltchars (fake bold fix) --
+
+
+def _create_positioned_ltchar(text: str, x0: float, y0: float) -> LTChar:
+    """Create an LTChar with a specific position for deduplication testing."""
+    graphicstate = Mock()
+    # Matrix format: (a, b, c, d, e, f) where e=x, f=y for translation
+    matrix = (1, 0, 0, 1, x0, y0)
+
+    char = LTChar(
+        matrix=matrix,
+        font=Mock(),
+        fontsize=12,
+        scaling=1,
+        rise=0,
+        text=text,
+        textwidth=10,
+        textdisp=(0, 1),
+        ncs=Mock(),
+        graphicstate=graphicstate,
+    )
+    return char
+
+
+class TestDeduplicateLtchars:
+    """Tests for _deduplicate_ltchars function."""
+
+    def test_empty_list_returns_empty(self):
+        """Empty character list should return empty list."""
+        result = _deduplicate_ltchars([], threshold=3.0)
+        assert result == []
+
+    def test_threshold_zero_disables_deduplication(self):
+        """Threshold of 0 should disable deduplication and return original list."""
+        chars = [
+            _create_positioned_ltchar("A", 10.0, 20.0),
+            _create_positioned_ltchar("A", 10.5, 20.0),  # Would be duplicate
+        ]
+        result = _deduplicate_ltchars(chars, threshold=0)
+        assert len(result) == 2
+
+    def test_fake_bold_duplicates_removed(self):
+        """Fake bold (double-rendered) characters should be deduplicated."""
+        # Simulate "AB" rendered as "AABB" with fake bold
+        chars = [
+            _create_positioned_ltchar("A", 10.0, 20.0),
+            _create_positioned_ltchar("A", 10.5, 20.0),  # Duplicate - close position
+            _create_positioned_ltchar("B", 25.0, 20.0),
+            _create_positioned_ltchar("B", 25.5, 20.0),  # Duplicate - close position
+        ]
+        result = _deduplicate_ltchars(chars, threshold=3.0)
+        assert len(result) == 2
+        assert result[0].get_text() == "A"
+        assert result[1].get_text() == "B"
+
+    def test_legitimate_repeated_chars_preserved(self):
+        """Legitimate repeated characters at different positions should be preserved."""
+        # "AA" where both A's are at legitimately different positions
+        chars = [
+            _create_positioned_ltchar("A", 10.0, 20.0),
+            _create_positioned_ltchar("A", 25.0, 20.0),  # Far enough - not duplicate
+        ]
+        result = _deduplicate_ltchars(chars, threshold=3.0)
+        assert len(result) == 2
+
+    def test_single_char_returns_single(self):
+        """Single character should return single character."""
+        chars = [_create_positioned_ltchar("X", 10.0, 20.0)]
+        result = _deduplicate_ltchars(chars, threshold=3.0)
+        assert len(result) == 1
+        assert result[0].get_text() == "X"
+
+    def test_mixed_duplicates_and_normal(self):
+        """Mix of duplicated and normal characters should be handled correctly."""
+        # "HELLO" where only H and the first L are double-rendered (fake bold)
+        chars = [
+            _create_positioned_ltchar("H", 10.0, 20.0),
+            _create_positioned_ltchar("H", 10.5, 20.0),  # Duplicate
+            _create_positioned_ltchar("E", 20.0, 20.0),  # Normal
+            _create_positioned_ltchar("L", 30.0, 20.0),
+            _create_positioned_ltchar("L", 30.5, 20.0),  # Duplicate
+            _create_positioned_ltchar("L", 40.0, 20.0),  # Second L (normal, different position)
+            _create_positioned_ltchar("O", 50.0, 20.0),  # Normal
+        ]
+        result = _deduplicate_ltchars(chars, threshold=3.0)
+        assert len(result) == 5
+        text = "".join(c.get_text() for c in result)
+        assert text == "HELLO"
diff --git a/test_unstructured/partition/pdf_image/test_pdfminer_utils.py b/test_unstructured/partition/pdf_image/test_pdfminer_utils.py
@@ -1,8 +1,13 @@
 from unittest.mock import MagicMock
 
-from pdfminer.layout import LTContainer, LTTextLine
+from pdfminer.layout import LTChar, LTContainer, LTTextLine
 
-from unstructured.partition.pdf_image.pdfminer_utils import extract_text_objects
+from unstructured.partition.pdf_image.pdfminer_utils import (
+    _is_duplicate_char,
+    deduplicate_chars_in_text_line,
+    extract_text_objects,
+    get_text_with_deduplication,
+)
 
 
 def test_extract_text_objects_nested_containers():
@@ -26,3 +31,174 @@ def test_extract_text_objects_nested_containers():
     assert len(result) == 2
     assert mock_text_line1 in result
     assert mock_text_line2 in result
+
+
+# -- Tests for character deduplication (fake bold fix) --
+
+
+def _create_mock_ltchar(text: str, x0: float, y0: float) -> MagicMock:
+    """Helper to create a mock LTChar with specified text and position."""
+    mock_char = MagicMock(spec=LTChar)
+    mock_char.get_text.return_value = text
+    mock_char.x0 = x0
+    mock_char.y0 = y0
+    return mock_char
+
+
+class TestIsDuplicateChar:
+    """Tests for _is_duplicate_char function."""
+
+    def test_same_char_same_position_is_duplicate(self):
+        """Two identical characters at the same position should be duplicates."""
+        char1 = _create_mock_ltchar("A", 10.0, 20.0)
+        char2 = _create_mock_ltchar("A", 10.0, 20.0)
+        assert _is_duplicate_char(char1, char2, threshold=3.0) is True
+
+    def test_same_char_close_position_is_duplicate(self):
+        """Two identical characters at close positions should be duplicates."""
+        char1 = _create_mock_ltchar("B", 10.0, 20.0)
+        char2 = _create_mock_ltchar("B", 11.5, 21.0)  # Within 3.0 threshold
+        assert _is_duplicate_char(char1, char2, threshold=3.0) is True
+
+    def test_same_char_far_position_not_duplicate(self):
+        """Two identical characters at far positions should not be duplicates."""
+        char1 = _create_mock_ltchar("C", 10.0, 20.0)
+        char2 = _create_mock_ltchar("C", 15.0, 20.0)  # 5.0 > 3.0 threshold
+        assert _is_duplicate_char(char1, char2, threshold=3.0) is False
+
+    def test_different_chars_same_position_not_duplicate(self):
+        """Two different characters at the same position should not be duplicates."""
+        char1 = _create_mock_ltchar("A", 10.0, 20.0)
+        char2 = _create_mock_ltchar("B", 10.0, 20.0)
+        assert _is_duplicate_char(char1, char2, threshold=3.0) is False
+
+    def test_threshold_boundary(self):
+        """Test behavior at exact threshold boundary."""
+        char1 = _create_mock_ltchar("X", 10.0, 20.0)
+        char2 = _create_mock_ltchar("X", 13.0, 20.0)  # Exactly at threshold
+        # A distance exactly equal to the threshold is NOT a duplicate (strict <, not <=)
+        assert _is_duplicate_char(char1, char2, threshold=3.0) is False
+
+        char3 = _create_mock_ltchar("X", 12.9, 20.0)  # Just under threshold
+        assert _is_duplicate_char(char1, char3, threshold=3.0) is True
+
+
+class TestDeduplicateCharsInTextLine:
+    """Tests for deduplicate_chars_in_text_line function."""
+
+    def test_no_duplicates_returns_original(self):
+        """Text line without duplicates should return original text."""
+        chars = [
+            _create_mock_ltchar("H", 10.0, 20.0),
+            _create_mock_ltchar("i", 15.0, 20.0),
+        ]
+        mock_text_line = MagicMock(spec=LTTextLine)
+        mock_text_line.__iter__ = lambda self: iter(chars)
+        mock_text_line.get_text.return_value = "Hi"
+
+        result = deduplicate_chars_in_text_line(mock_text_line, threshold=3.0)
+        assert result == "Hi"
+
+    def test_fake_bold_duplicates_removed(self):
+        """Fake bold text (each char doubled) should be deduplicated."""
+        # Simulates "BOLD" rendered as "BBOOLLDD" with duplicate positions
+        chars = [
+            _create_mock_ltchar("B", 10.0, 20.0),
+            _create_mock_ltchar("B", 10.5, 20.0),  # Duplicate
+            _create_mock_ltchar("O", 20.0, 20.0),
+            _create_mock_ltchar("O", 20.5, 20.0),  # Duplicate
+            _create_mock_ltchar("L", 30.0, 20.0),
+            _create_mock_ltchar("L", 30.5, 20.0),  # Duplicate
+            _create_mock_ltchar("D", 40.0, 20.0),
+            _create_mock_ltchar("D", 40.5, 20.0),  # Duplicate
+        ]
+        mock_text_line = MagicMock(spec=LTTextLine)
+        mock_text_line.__iter__ = lambda self: iter(chars)
+
+        result = deduplicate_chars_in_text_line(mock_text_line, threshold=3.0)
+        assert result == "BOLD"
+
+    def test_threshold_zero_disables_deduplication(self):
+        """Setting threshold to 0 should disable deduplication."""
+        mock_text_line = MagicMock(spec=LTTextLine)
+        mock_text_line.get_text.return_value = "BBOOLLDD"
+
+        result = deduplicate_chars_in_text_line(mock_text_line, threshold=0)
+        assert result == "BBOOLLDD"
+
+    def test_negative_threshold_disables_deduplication(self):
+        """Setting negative threshold should disable deduplication."""
+        mock_text_line = MagicMock(spec=LTTextLine)
+        mock_text_line.get_text.return_value = "BBOOLLDD"
+
+        result = deduplicate_chars_in_text_line(mock_text_line, threshold=-1.0)
+        assert result == "BBOOLLDD"
+
+    def test_empty_text_line(self):
+        """Empty text line should return original text."""
+        mock_text_line = MagicMock(spec=LTTextLine)
+        mock_text_line.__iter__ = lambda self: iter([])
+        mock_text_line.get_text.return_value = ""
+
+        result = deduplicate_chars_in_text_line(mock_text_line, threshold=3.0)
+        assert result == ""
+
+    def test_legitimate_repeated_chars_preserved(self):
+        """Legitimate repeated characters (different positions) should be preserved."""
+        # "AA" where both A's are at different positions
+        chars = [
+            _create_mock_ltchar("A", 10.0, 20.0),
+            _create_mock_ltchar("A", 20.0, 20.0),  # Different position, not duplicate
+        ]
+        mock_text_line = MagicMock(spec=LTTextLine)
+        mock_text_line.__iter__ = lambda self: iter(chars)
+
+        result = deduplicate_chars_in_text_line(mock_text_line, threshold=3.0)
+        assert result == "AA"
+
+
+class TestGetTextWithDeduplication:
+    """Tests for get_text_with_deduplication function."""
+
+    def test_with_text_line(self):
+        """Should properly deduplicate text from LTTextLine."""
+        chars = [
+            _create_mock_ltchar("H", 10.0, 20.0),
+            _create_mock_ltchar("H", 10.5, 20.0),  # Duplicate
+            _create_mock_ltchar("i", 20.0, 20.0),
+        ]
+        mock_text_line = MagicMock(spec=LTTextLine)
+        mock_text_line.__iter__ = lambda self: iter(chars)
+
+        result = get_text_with_deduplication(mock_text_line, threshold=3.0)
+        assert result == "Hi"
+
+    def test_with_container(self):
+        """Should handle LTContainer with nested LTTextLine."""
+        chars = [
+            _create_mock_ltchar("T", 10.0, 20.0),
+            _create_mock_ltchar("T", 10.5, 20.0),  # Duplicate
+        ]
+        mock_text_line = MagicMock(spec=LTTextLine)
+        mock_text_line.__iter__ = lambda self: iter(chars)
+
+        mock_container = MagicMock(spec=LTContainer)
+        mock_container.__iter__ = lambda self: iter([mock_text_line])
+
+        result = get_text_with_deduplication(mock_container, threshold=3.0)
+        assert result == "T"
+
+    def test_with_generic_object(self):
+        """Should fall back to get_text() for non-standard objects."""
+        mock_obj = MagicMock()
+        mock_obj.get_text.return_value = "fallback text"
+
+        result = get_text_with_deduplication(mock_obj, threshold=3.0)
+        assert result == "fallback text"
+
+    def test_without_get_text(self):
+        """Should return empty string for objects without get_text."""
+        mock_obj = MagicMock(spec=[])  # No get_text method
+
+        result = get_text_with_deduplication(mock_obj, threshold=3.0)
+        assert result == ""
diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py
@@ -17,8 +17,10 @@
 from unstructured.partition.pdf_image.pdf_image_utils import remove_control_characters
 from unstructured.partition.pdf_image.pdfminer_utils import (
     PDFMinerConfig,
+    _is_duplicate_char,
     extract_image_objects,
     extract_text_objects,
+    get_text_with_deduplication,
     open_pdfminer_pages_generator,
     rect_to_bbox,
 )
@@ -466,11 +468,13 @@ def process_page_layout_from_pdfminer(
 
         if hasattr(obj, "get_text"):
             inner_text_objects = extract_text_objects(obj)
+            char_dedup_threshold = env_config.PDF_CHAR_DUPLICATE_THRESHOLD
             for inner_obj in inner_text_objects:
                 inner_bbox = rect_to_bbox(inner_obj.bbox, page_height)
                 if not _validate_bbox(inner_bbox):
                     continue
-                texts.append(inner_obj.get_text())
+                # Use deduplication to handle fake bold text (characters rendered twice)
+                texts.append(get_text_with_deduplication(inner_obj, char_dedup_threshold))
                 element_coords.append(inner_bbox)
                 element_class.append(0)
                 is_extracted.append(IsExtracted.TRUE if text_is_embedded(inner_obj) else None)
@@ -1006,6 +1010,33 @@ def check_annotations_within_element(
     return annotations_within_element
 
 
+def _deduplicate_ltchars(
+    chars: list[LTChar],
+    threshold: float,
+) -> list[LTChar]:
+    """Remove duplicate characters caused by fake bold rendering.
+
+    Some PDFs create bold text by rendering the same character twice at slightly offset
+    positions. This function removes such duplicates.
+
+    Args:
+        chars: List of LTChar objects to deduplicate.
+        threshold: Pixel distance below which two identical characters count as duplicates
+                   (exclusive bound). Values <= 0 disable deduplication.
+
+    Returns:
+        Deduplicated list of LTChar objects.
+    """
+    if threshold <= 0 or not chars:
+        return chars
+
+    result = [chars[0]]
+    for char in chars[1:]:
+        if not _is_duplicate_char(result[-1], char, threshold):
+            result.append(char)
+    return result
+
+
 def get_words_from_obj(
     obj: LTTextBox,
     height: float,
@@ -1026,13 +1057,25 @@ def get_words_from_obj(
     characters = []
     words = []
     text_len = 0
+    char_dedup_threshold = env_config.PDF_CHAR_DUPLICATE_THRESHOLD
 
     for text_line in obj:
         word = ""
         x1, y1, x2, y2 = None, None, None, None
         start_index = 0
+        last_char: LTChar | None = None  # Track last character for deduplication
+
         for index, character in enumerate(text_line):
             if isinstance(character, LTChar):
+                # Skip duplicate characters (fake bold fix)
+                if (
+                    char_dedup_threshold > 0
+                    and last_char is not None
+                    and _is_duplicate_char(last_char, character, char_dedup_threshold)
+                ):
+                    continue
+
+                last_char = character
                 characters.append(character)
                 char = character.get_text()
 
@@ -1066,6 +1109,7 @@ def get_words_from_obj(
 
                 word += char
             else:
+                # Non-LTChar items (e.g., LTAnno) act as word boundaries
                 words.append(
                     {"text": word, "bbox": (x1, y1, x2, y2), "start_index": start_index},
                 )