Skip to content

Commit f93a63f

Browse files
committed
test: rechunk_document preserves chunker doc_items (regression for #266 fix)
Locks the fix from the previous commit. The test seeds a completed analysis and a mocked chunker that returns a `ChunkResult` with two `ChunkDocItem` entries, then asserts that (1) the in-memory chunks returned by `rechunk_document` carry the expected `self_ref` list, and (2) the persisted chunks (re-read via `list_chunks`) carry them too. This is the regression net for the silent data-loss bug where the rechunk path created Chunk rows with `doc_items=[]`, breaking the chunk↔bbox highlight on the Chunk view (#266).
1 parent 7522ecb commit f93a63f

1 file changed

Lines changed: 36 additions & 1 deletion

File tree

document-parser/tests/test_chunk_service.py

Lines changed: 36 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@
99
import pytest
1010

1111
from domain.models import AnalysisJob, AnalysisStatus, Chunk, Document
12-
from domain.value_objects import ChunkEditAction, ChunkResult
12+
from domain.value_objects import ChunkDocItem, ChunkEditAction, ChunkResult
1313
from persistence.analysis_repo import SqliteAnalysisRepository
1414
from persistence.chunk_edit_repo import (
1515
SqliteChunkEditRepository,
@@ -283,6 +283,41 @@ async def test_rechunk_503_when_no_chunker(self, service, doc):
283283
await service.rechunk_document(doc.id)
284284
assert exc.value.http_status == 503
285285

286+
async def test_rechunk_preserves_doc_items_from_chunker(self, service, repos, doc):
    """Regression (0.6.1): rechunking must keep the chunker's `doc_items`.

    The bbox↔chunk linking on the Chunk view depends on the canonical
    chunks carrying `doc_items`. The previous implementation dropped them
    because of a stale "ChunkResult has no doc_items" comment.
    """
    # The self_refs the mocked chunker emits and that every assertion expects.
    expected_refs = ["#/texts/0", "#/texts/1"]

    # Seed a completed analysis for the document.
    analysis = AnalysisJob(document_id=doc.id, status=AnalysisStatus.COMPLETED)
    await repos["analyses"].insert(analysis)
    analysis.document_json = json.dumps({"texts": []})
    analysis.completed_at = datetime.now(UTC)
    await repos["analyses"].update_status(analysis)

    # Install a mocked chunker that returns one chunk with two doc items.
    chunk_result = ChunkResult(
        text="t",
        source_page=1,
        token_count=4,
        doc_items=[ChunkDocItem(self_ref=ref, label="text") for ref in expected_refs],
    )
    fake_chunker = MagicMock()
    fake_chunker.chunk = AsyncMock(return_value=[chunk_result])
    service._chunker = fake_chunker

    # In-memory chunks returned by the rechunk call carry the doc items.
    rechunked = await service.rechunk_document(doc.id)
    assert len(rechunked) == 1
    assert [item.self_ref for item in rechunked[0].doc_items] == expected_refs

    # Persisted chunks carry doc_items too.
    persisted = await service.list_chunks(doc.id)
    assert [item.self_ref for item in persisted[0].doc_items] == expected_refs
320+
286321

287322
# ---------------------------------------------------------------------------
288323
# promote_from_analysis_if_empty

0 commit comments

Comments
 (0)