Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 90 additions & 22 deletions docling/pipeline/vlm_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
DocItem,
DoclingDocument,
ImageRef,
NodeItem,
PictureItem,
ProvenanceItem,
TextItem,
Expand Down Expand Up @@ -185,6 +186,89 @@ def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:

return conv_res

def _append_items_to_document(
    self,
    conv_res: ConversionResult,
    page_doc: DoclingDocument,
    pg_idx: int,
    page_width: float = 1.0,
    page_height: float = 1.0,
) -> None:
    """Copy the items of *page_doc* into the conversion result document.

    Walks the parsed page document in tree order (groups included) and
    re-attaches every node under the correct parent in the target
    document, so nested list structure survives the transfer.
    Fix for issue #2301.

    Args:
        conv_res: Conversion result whose document receives the items.
        page_doc: Parsed per-page document acting as the source tree.
        pg_idx: Zero-based page index (stored 1-based in provenance).
        page_width: Page width used for the full-page provenance bbox.
        page_height: Page height used for the full-page provenance bbox.
    """
    # Parent lookup table: tree level -> last node appended at that level.
    parents_by_level: dict[int, NodeItem] = {}

    # with_groups=True exposes ListGroup containers; without them the
    # nesting of list items cannot be reconstructed correctly.
    for node, depth in page_doc.iterate_items(with_groups=True):
        # Docling's append_child_item() rejects nodes that still carry
        # children, so detach them on a copy; the children follow later
        # in the tree-ordered iteration and are re-attached one by one.
        if hasattr(node, "children") and len(node.children) > 0:
            node = node.model_copy()
            node.children = []

        # Only DocItem subclasses carry provenance; GroupItem and other
        # container types have no `prov` field.
        if hasattr(node, "prov"):
            # VLM output parsed from Markdown/HTML carries no spatial
            # information, so stamp a full-page bbox — more accurate than
            # a zero bbox and keeps downstream consumers working.
            full_page = BoundingBox(l=0.0, t=0.0, r=page_width, b=page_height)
            node.prov = [
                ProvenanceItem(
                    page_no=pg_idx + 1,
                    bbox=full_page,
                    charspan=[0, 0],
                )
            ]

        # The parent of a node at depth d is whatever was last seen at
        # depth d-1. With groups visible:
        #   level 0: Body (GroupItem)            -> parent=None
        #   level 1: root ListGroup              -> parent=None (body default)
        #   level 2: ListItem                    -> ListGroup at level 1
        #   level 3: nested ListGroup            -> ListItem at level 2
        #   level 4: nested ListItem             -> ListGroup at level 3
        # append_child_item() treats parent=None as "attach to body".
        parent = parents_by_level.get(depth - 1) if depth > 1 else None

        # Attach with the resolved parent to preserve the hierarchy.
        conv_res.document.append_child_item(child=node, parent=parent)

        # We have returned from any deeper nesting, so drop stale entries
        # at this depth and below, then record this node as the candidate
        # parent for depth+1.
        for stale in [lvl for lvl in parents_by_level if lvl >= depth]:
            del parents_by_level[stale]
        parents_by_level[depth] = node

def _turn_dt_into_doc(self, conv_res) -> DoclingDocument:
doctags_list = []
image_list = []
Expand Down Expand Up @@ -288,17 +372,9 @@ def _extract_markdown_code(text):
else None,
)

for item, level in page_doc.iterate_items():
item.prov = [
ProvenanceItem(
page_no=pg_idx + 1,
bbox=BoundingBox(
t=0.0, b=0.0, l=0.0, r=0.0
), # FIXME: would be nice not to have to "fake" it
charspan=[0, 0],
)
]
conv_res.document.append_child_item(child=item)
self._append_items_to_document(
conv_res, page_doc, pg_idx, pg_width, pg_height
)

return conv_res.document

Expand Down Expand Up @@ -365,17 +441,9 @@ def _extract_html_code(text):
else None,
)

for item, level in page_doc.iterate_items():
item.prov = [
ProvenanceItem(
page_no=pg_idx + 1,
bbox=BoundingBox(
t=0.0, b=0.0, l=0.0, r=0.0
), # FIXME: would be nice not to have to "fake" it
charspan=[0, 0],
)
]
conv_res.document.append_child_item(child=item)
self._append_items_to_document(
conv_res, page_doc, pg_idx, pg_width, pg_height
)

return conv_res.document

Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ dev = [
"pytest-dependency~=0.6",
"pytest-durations~=1.6.1",
"pytest-xdist~=3.3",
"requests-mock~=1.12",
"ipykernel~=6.29",
"ipywidgets~=8.1",
"nbqa~=1.9",
Expand Down
240 changes: 240 additions & 0 deletions tests/test_vlm_pipeline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,240 @@
"""
Tests for VLM pipeline functionality.

Includes tests for handling nested lists in Markdown responses,
which previously caused: ValueError: Can not append a child with children
See: https://github.com/docling-project/docling/issues/2301

Test structure based on reproducer code contributed by @amomra in issue #2301.
"""

import time

import pytest
import requests_mock
from docling_core.types.doc import GroupItem, ListItem

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.datamodel.pipeline_options_vlm_model import (
ApiVlmOptions,
ResponseFormat,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline


@pytest.fixture
def mock_api_endpoint():
    """Yield an active requests-mock adapter for faking VLM API calls."""
    mocker = requests_mock.Mocker()
    with mocker:
        yield mocker


# Dummy input file: it must exist so the converter can open it, but its
# content is never sent to a real VLM — the mocked API supplies the response.
TEST_PDF = "tests/data/pdf/code_and_formula.pdf"


def create_vlm_converter(mock_endpoint, markdown_response):
    """Build a DocumentConverter whose VLM backend is a mocked HTTP API.

    The mocked endpoint answers every POST with an OpenAI-style chat
    completion whose message content is *markdown_response*.
    """
    test_url = "http://test-vlm-api.com"

    # OpenAI-compatible chat-completion payload returned by the fake API.
    completion_payload = {
        "id": "test-123",
        "choices": [
            {
                "index": 0,
                "message": {"role": "assistant", "content": markdown_response},
                "finish_reason": "stop",
            }
        ],
        "created": int(time.time()),
        "usage": {"prompt_tokens": 1, "completion_tokens": 1, "total_tokens": 2},
    }
    mock_endpoint.post(test_url, json=completion_payload)

    pipeline_options = VlmPipelineOptions(enable_remote_services=True)
    pipeline_options.vlm_options = ApiVlmOptions(
        url=test_url,
        headers={"Authorization": "Bearer test"},
        params={"model": "test-model"},
        prompt="Convert to markdown",
        timeout=90,
        scale=1.0,
        response_format=ResponseFormat.MARKDOWN,
    )

    pdf_option = PdfFormatOption(
        pipeline_options=pipeline_options,
        pipeline_cls=VlmPipeline,
    )
    return DocumentConverter(format_options={InputFormat.PDF: pdf_option})


def test_nested_list_with_html_tag(mock_api_endpoint):
    """Regression test for issue #2301: nested list containing an HTML tag."""
    md = """- item 1
- item 2
    - sub item 1 <text>
    - sub item 2"""

    conv = create_vlm_converter(mock_api_endpoint, md)

    # Conversion must succeed instead of raising ValueError.
    res = conv.convert(TEST_PDF)
    assert res.document is not None

    rendered = res.document.export_to_markdown()

    # All list entries must survive (ordering may vary due to flattening).
    for expected in ("item 1", "item 2", "sub item 1", "sub item 2"):
        assert expected in rendered


def test_simple_nested_list(mock_api_endpoint):
    """A plain two-level list converts cleanly and keeps every entry's text."""
    md = """- item 1
- item 2
    - sub item 1
    - sub item 2"""

    conv = create_vlm_converter(mock_api_endpoint, md)
    res = conv.convert(TEST_PDF)
    assert res.document is not None

    rendered = res.document.export_to_markdown()
    for expected in ("item 1", "item 2", "sub item 1", "sub item 2"):
        assert expected in rendered


def test_parent_item_with_text_and_children(mock_api_endpoint):
    """The text of a list item that owns sub-items must not be dropped."""
    md = """- item 1
- item 2 has some text
    - sub item 1
    - sub item 2
- item 3"""

    conv = create_vlm_converter(mock_api_endpoint, md)
    res = conv.convert(TEST_PDF)
    assert res.document is not None

    rendered = res.document.export_to_markdown()

    # "item 2 has some text" is the parent of the sub-items; losing its
    # text was part of the original bug.
    for expected in (
        "item 1",
        "item 2 has some text",
        "sub item 1",
        "sub item 2",
        "item 3",
    ):
        assert expected in rendered


def test_deeply_nested_list(mock_api_endpoint):
    """Three levels of nesting convert without errors and keep all text."""
    md = """- level 1 item 1
    - level 2 item 1
        - level 3 item 1
    - level 2 item 2"""

    conv = create_vlm_converter(mock_api_endpoint, md)
    res = conv.convert(TEST_PDF)
    assert res.document is not None

    rendered = res.document.export_to_markdown()
    for expected in (
        "level 1 item 1",
        "level 2 item 1",
        "level 3 item 1",
        "level 2 item 2",
    ):
        assert expected in rendered


def test_flat_list_still_works(mock_api_endpoint):
    """A list with no nesting at all must keep working after the fix."""
    md = """- item 1
- item 2
- item 3"""

    conv = create_vlm_converter(mock_api_endpoint, md)
    res = conv.convert(TEST_PDF)
    assert res.document is not None

    rendered = res.document.export_to_markdown()
    for expected in ("item 1", "item 2", "item 3"):
        assert expected in rendered


# Structure Preservation Tests (added for comprehensive fix of issue #2301)
def test_nested_list_structure_preserved(mock_api_endpoint):
    """Nested input yields the expected relative item levels in the output."""
    md = """- item 1
- item 2
    - sub item 1
    - sub item 2
- item 3"""

    conv = create_vlm_converter(mock_api_endpoint, md)
    res = conv.convert(TEST_PDF)
    assert res.document is not None

    # User-facing view: iterate without group containers. The dummy PDF
    # may have several pages, so keep only items whose provenance says
    # page_no == 1.
    page1_items = []
    for node, depth in res.document.iterate_items(with_groups=False):
        prov = getattr(node, "prov", None)
        if prov and prov[0].page_no == 1:
            page1_items.append((node, depth))

    # Expect at least the five list entries from the markdown above.
    assert len(page1_items) >= 5, (
        f"Expected at least 5 items on page 1, got {len(page1_items)}"
    )

    first_five = page1_items[:5]
    levels = [depth for _, depth in first_five]

    # Relative structure: siblings share a level, children sit deeper.
    assert levels[0] == levels[1], (
        "item 1 and item 2 should be at same level (siblings)"
    )
    assert levels[2] > levels[1], "sub item 1 should be deeper than item 2 (nested)"
    assert levels[2] == levels[3], (
        "sub item 1 and sub item 2 should be at same level (siblings)"
    )
    assert levels[4] == levels[0], "item 3 should be at same level as item 1 (siblings)"
    assert levels[2] == levels[1] + 2, (
        "Nested items should be 2 levels deeper (without groups: 2→4)"
    )

    # The text must have landed on the right nodes, in order.
    texts = [getattr(node, "text", "") for node, _ in first_five]
    assert "item 1" in texts[0]
    assert "item 2" in texts[1]
    assert "sub item 1" in texts[2]
    assert "sub item 2" in texts[3]
    assert "item 3" in texts[4]
Loading