Skip to content
This repository was archived by the owner on Apr 30, 2026. It is now read-only.

Commit 1aab6cd

Browse files
committed
Test to ensure we don't mix chunks across calls to DocumentChunker
This adds a new functional test to verify that when DocumentChunker is called multiple times with different sets of documents, chunks are not mixed across those calls, even if the same output_dir parameter is passed into each call of DocumentChunker. We had an issue with this in the v0.7.x releases, but this test verifies that it is no longer an issue in main.

Signed-off-by: Ben Browning <bbrownin@redhat.com>
1 parent 075bee3 commit 1aab6cd

1 file changed

Lines changed: 35 additions & 0 deletions

File tree

tests/functional/test_chunkers.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,3 +147,38 @@ def test_chunk_documents(
147147
assert (
148148
overlap_count / (len(chunks) - 1) >= 0.3
149149
), "Insufficient overlap between consecutive chunks"
150+
151+
152+
def test_chunk_documents_chunks_not_mixing(tmp_path, tokenizer_model_name):
    """
    Verify that separate DocumentChunker instances pointed at the same
    output directory each return chunks from their own document only.

    The first chunker is given only phoenix.md and the second only
    moo_deng.md; the joined chunk text from each run must mention its own
    subject and must not mention the other one.
    """
    # Both chunkers deliberately share this one output directory.
    shared_output_dir = os.path.join(tmp_path, "output")
    os.makedirs(shared_output_dir, exist_ok=True)

    sample_docs_dir = os.path.join(TEST_DATA_DIR, "sample_documents")
    phoenix_doc = Path(os.path.join(sample_docs_dir, "phoenix.md"))
    moo_deng_doc = Path(os.path.join(sample_docs_dir, "moo_deng.md"))

    # Everything except the input documents is identical for both chunkers.
    common_settings = {
        "output_dir": shared_output_dir,
        "tokenizer_model_name": tokenizer_model_name,
        "server_ctx_size": 4096,
        "chunk_word_count": 500,
    }

    # First run: only the Phoenix document is chunked.
    phoenix_chunker = DocumentChunker(
        document_paths=[phoenix_doc],
        **common_settings,
    )
    phoenix_text = " ".join(phoenix_chunker.chunk_documents())
    assert "Phoenix" in phoenix_text
    assert "Moo Deng" not in phoenix_text

    # Second run: built only after the first run has finished, so any output
    # left behind in the shared directory would be visible to this chunker.
    moo_deng_chunker = DocumentChunker(
        document_paths=[moo_deng_doc],
        **common_settings,
    )
    moo_deng_text = " ".join(moo_deng_chunker.chunk_documents())
    assert "Moo Deng" in moo_deng_text
    assert "Phoenix" not in moo_deng_text

0 commit comments

Comments
 (0)