Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 90 additions & 22 deletions docling/pipeline/vlm_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
DocItem,
DoclingDocument,
ImageRef,
NodeItem,
PictureItem,
ProvenanceItem,
TextItem,
Expand Down Expand Up @@ -185,6 +186,89 @@ def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:

return conv_res

def _append_items_to_document(
    self,
    conv_res: ConversionResult,
    page_doc: DoclingDocument,
    pg_idx: int,
    page_width: float = 1.0,
    page_height: float = 1.0,
) -> None:
    """Copy the items of *page_doc* into the conversion result document.

    Walks the parsed page document in tree order (groups included) and
    re-attaches every node under the correct parent in the target
    document, so nested list structure survives the transfer.
    Fix for issue #2301.

    Args:
        conv_res: Conversion result whose document receives the items.
        page_doc: Parsed per-page document acting as the source tree.
        pg_idx: Zero-based page index (stored 1-based in provenance).
        page_width: Page width used for the full-page provenance bbox.
        page_height: Page height used for the full-page provenance bbox.
    """
    # Parent lookup table: tree level -> last node appended at that level.
    parents_by_level: dict[int, NodeItem] = {}

    # with_groups=True exposes ListGroup containers; without them the
    # nesting of list items cannot be reconstructed correctly.
    for node, depth in page_doc.iterate_items(with_groups=True):
        # Docling's append_child_item() rejects nodes that still carry
        # children, so detach them on a copy; the children follow later
        # in the tree-ordered iteration and are re-attached one by one.
        if hasattr(node, "children") and len(node.children) > 0:
            node = node.model_copy()
            node.children = []

        # Only DocItem subclasses carry provenance; GroupItem and other
        # container types have no `prov` field.
        if hasattr(node, "prov"):
            # VLM output parsed from Markdown/HTML carries no spatial
            # information, so stamp a full-page bbox — more accurate than
            # a zero bbox and keeps downstream consumers working.
            full_page = BoundingBox(l=0.0, t=0.0, r=page_width, b=page_height)
            node.prov = [
                ProvenanceItem(
                    page_no=pg_idx + 1,
                    bbox=full_page,
                    charspan=[0, 0],
                )
            ]

        # The parent of a node at depth d is whatever was last seen at
        # depth d-1. With groups visible:
        #   level 0: Body (GroupItem)            -> parent=None
        #   level 1: root ListGroup              -> parent=None (body default)
        #   level 2: ListItem                    -> ListGroup at level 1
        #   level 3: nested ListGroup            -> ListItem at level 2
        #   level 4: nested ListItem             -> ListGroup at level 3
        # append_child_item() treats parent=None as "attach to body".
        parent = parents_by_level.get(depth - 1) if depth > 1 else None

        # Attach with the resolved parent to preserve the hierarchy.
        conv_res.document.append_child_item(child=node, parent=parent)

        # We have returned from any deeper nesting, so drop stale entries
        # at this depth and below, then record this node as the candidate
        # parent for depth+1.
        for stale in [lvl for lvl in parents_by_level if lvl >= depth]:
            del parents_by_level[stale]
        parents_by_level[depth] = node

def _turn_dt_into_doc(self, conv_res) -> DoclingDocument:
doctags_list = []
image_list = []
Expand Down Expand Up @@ -288,17 +372,9 @@ def _extract_markdown_code(text):
else None,
)

for item, level in page_doc.iterate_items():
item.prov = [
ProvenanceItem(
page_no=pg_idx + 1,
bbox=BoundingBox(
t=0.0, b=0.0, l=0.0, r=0.0
), # FIXME: would be nice not to have to "fake" it
charspan=[0, 0],
)
]
conv_res.document.append_child_item(child=item)
self._append_items_to_document(
conv_res, page_doc, pg_idx, pg_width, pg_height
)

return conv_res.document

Expand Down Expand Up @@ -365,17 +441,9 @@ def _extract_html_code(text):
else None,
)

for item, level in page_doc.iterate_items():
item.prov = [
ProvenanceItem(
page_no=pg_idx + 1,
bbox=BoundingBox(
t=0.0, b=0.0, l=0.0, r=0.0
), # FIXME: would be nice not to have to "fake" it
charspan=[0, 0],
)
]
conv_res.document.append_child_item(child=item)
self._append_items_to_document(
conv_res, page_doc, pg_idx, pg_width, pg_height
)

return conv_res.document

Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ dev = [
"pytest-dependency~=0.6",
"pytest-durations~=1.6.1",
"pytest-xdist~=3.3",
"requests-mock~=1.12",
"ipykernel~=6.29",
"ipywidgets~=8.1",
"nbqa~=1.9",
Expand Down
240 changes: 240 additions & 0 deletions tests/test_vlm_pipeline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,240 @@
"""
Tests for VLM pipeline functionality.

Includes tests for handling nested lists in Markdown responses,
which previously caused: ValueError: Can not append a child with children
See: https://github.com/docling-project/docling/issues/2301

Test structure based on reproducer code contributed by @amomra in issue #2301.
"""

import time

import pytest
import requests_mock
from docling_core.types.doc import GroupItem, ListItem

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.datamodel.pipeline_options_vlm_model import (
ApiVlmOptions,
ResponseFormat,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline


@pytest.fixture
def mock_api_endpoint():
    """Yield an active requests-mock adapter for faking VLM API calls."""
    mocker = requests_mock.Mocker()
    with mocker:
        yield mocker


# Dummy input file: it must exist so the converter can open it, but its
# content is never sent to a real VLM — the mocked API supplies the response.
TEST_PDF = "tests/data/pdf/code_and_formula.pdf"


def create_vlm_converter(mock_endpoint, markdown_response):
    """Build a DocumentConverter whose VLM backend is a mocked HTTP API.

    The mocked endpoint answers every POST with an OpenAI-style chat
    completion whose message content is *markdown_response*.
    """
    test_url = "http://test-vlm-api.com"

    # OpenAI-compatible chat-completion payload returned by the fake API.
    completion_payload = {
        "id": "test-123",
        "choices": [
            {
                "index": 0,
                "message": {"role": "assistant", "content": markdown_response},
                "finish_reason": "stop",
            }
        ],
        "created": int(time.time()),
        "usage": {"prompt_tokens": 1, "completion_tokens": 1, "total_tokens": 2},
    }
    mock_endpoint.post(test_url, json=completion_payload)

    pipeline_options = VlmPipelineOptions(enable_remote_services=True)
    pipeline_options.vlm_options = ApiVlmOptions(
        url=test_url,
        headers={"Authorization": "Bearer test"},
        params={"model": "test-model"},
        prompt="Convert to markdown",
        timeout=90,
        scale=1.0,
        response_format=ResponseFormat.MARKDOWN,
    )

    pdf_option = PdfFormatOption(
        pipeline_options=pipeline_options,
        pipeline_cls=VlmPipeline,
    )
    return DocumentConverter(format_options={InputFormat.PDF: pdf_option})


def test_nested_list_with_html_tag(mock_api_endpoint):
    """Regression test for issue #2301: nested list containing an HTML tag."""
    md = """- item 1
- item 2
    - sub item 1 <text>
    - sub item 2"""

    conv = create_vlm_converter(mock_api_endpoint, md)

    # Conversion must succeed instead of raising ValueError.
    res = conv.convert(TEST_PDF)
    assert res.document is not None

    rendered = res.document.export_to_markdown()

    # All list entries must survive (ordering may vary due to flattening).
    for expected in ("item 1", "item 2", "sub item 1", "sub item 2"):
        assert expected in rendered


def test_simple_nested_list(mock_api_endpoint):
    """A plain two-level list converts cleanly and keeps every entry's text."""
    md = """- item 1
- item 2
    - sub item 1
    - sub item 2"""

    conv = create_vlm_converter(mock_api_endpoint, md)
    res = conv.convert(TEST_PDF)
    assert res.document is not None

    rendered = res.document.export_to_markdown()
    for expected in ("item 1", "item 2", "sub item 1", "sub item 2"):
        assert expected in rendered


def test_parent_item_with_text_and_children(mock_api_endpoint):
    """The text of a list item that owns sub-items must not be dropped."""
    md = """- item 1
- item 2 has some text
    - sub item 1
    - sub item 2
- item 3"""

    conv = create_vlm_converter(mock_api_endpoint, md)
    res = conv.convert(TEST_PDF)
    assert res.document is not None

    rendered = res.document.export_to_markdown()

    # "item 2 has some text" is the parent of the sub-items; losing its
    # text was part of the original bug.
    for expected in (
        "item 1",
        "item 2 has some text",
        "sub item 1",
        "sub item 2",
        "item 3",
    ):
        assert expected in rendered


def test_deeply_nested_list(mock_api_endpoint):
    """Three levels of nesting convert without errors and keep all text."""
    md = """- level 1 item 1
    - level 2 item 1
        - level 3 item 1
    - level 2 item 2"""

    conv = create_vlm_converter(mock_api_endpoint, md)
    res = conv.convert(TEST_PDF)
    assert res.document is not None

    rendered = res.document.export_to_markdown()
    for expected in (
        "level 1 item 1",
        "level 2 item 1",
        "level 3 item 1",
        "level 2 item 2",
    ):
        assert expected in rendered


def test_flat_list_still_works(mock_api_endpoint):
    """A list with no nesting at all must keep working after the fix."""
    md = """- item 1
- item 2
- item 3"""

    conv = create_vlm_converter(mock_api_endpoint, md)
    res = conv.convert(TEST_PDF)
    assert res.document is not None

    rendered = res.document.export_to_markdown()
    for expected in ("item 1", "item 2", "item 3"):
        assert expected in rendered


# Structure Preservation Tests (added for comprehensive fix of issue #2301)
def test_nested_list_structure_preserved(mock_api_endpoint):
    """Nested input yields the expected relative item levels in the output."""
    md = """- item 1
- item 2
    - sub item 1
    - sub item 2
- item 3"""

    conv = create_vlm_converter(mock_api_endpoint, md)
    res = conv.convert(TEST_PDF)
    assert res.document is not None

    # User-facing view: iterate without group containers. The dummy PDF
    # may have several pages, so keep only items whose provenance says
    # page_no == 1.
    page1_items = []
    for node, depth in res.document.iterate_items(with_groups=False):
        prov = getattr(node, "prov", None)
        if prov and prov[0].page_no == 1:
            page1_items.append((node, depth))

    # Expect at least the five list entries from the markdown above.
    assert len(page1_items) >= 5, (
        f"Expected at least 5 items on page 1, got {len(page1_items)}"
    )

    first_five = page1_items[:5]
    levels = [depth for _, depth in first_five]

    # Relative structure: siblings share a level, children sit deeper.
    assert levels[0] == levels[1], (
        "item 1 and item 2 should be at same level (siblings)"
    )
    assert levels[2] > levels[1], "sub item 1 should be deeper than item 2 (nested)"
    assert levels[2] == levels[3], (
        "sub item 1 and sub item 2 should be at same level (siblings)"
    )
    assert levels[4] == levels[0], "item 3 should be at same level as item 1 (siblings)"
    assert levels[2] == levels[1] + 2, (
        "Nested items should be 2 levels deeper (without groups: 2→4)"
    )

    # The text must have landed on the right nodes, in order.
    texts = [getattr(node, "text", "") for node, _ in first_five]
    assert "item 1" in texts[0]
    assert "item 2" in texts[1]
    assert "sub item 1" in texts[2]
    assert "sub item 2" in texts[3]
    assert "item 3" in texts[4]
Loading