Skip to content
This repository was archived by the owner on Apr 30, 2026. It is now read-only.

Commit 98189e5

Browse files
committed
Filter out blank content when saving mixed dataset
Ensure we don't save out mixed dataset rows where content=None. Signed-off-by: Derek Higgins <derekh@redhat.com>
1 parent 062cf4c commit 98189e5

2 files changed

Lines changed: 61 additions & 0 deletions

File tree

src/instructlab/sdg/datamixing.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,14 @@ def save_mixed_dataset(self, output_path, num_proc):
231231
as a jsonl file.
232232
"""
233233
mixed_ds = self._create_mixed_dataset(num_proc)
234+
235+
# filter out any records where any message content is None
236+
mixed_ds = mixed_ds.filter(
237+
lambda x: all(
238+
message.get("content") is not None for message in x["messages"]
239+
)
240+
)
241+
234242
mixed_ds.to_json(output_path, orient="records", lines=True)
235243
logger.info(f"Mixed Dataset saved to {output_path}")
236244

tests/unit/test_datamixing.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -422,3 +422,56 @@ def test_mix_instructlab_07x_precomputed_skills_with_unmask(tmp_path):
422422
assert (
423423
sample.get("unmask", None) is not None
424424
), "Mixed sample does not have unmask"
425+
426+
427+
def test_save_mixed_dataset_with_none_content(tmp_path):
    """
    Saving a mixed dataset must drop every record that contains a message
    whose content is None, and keep records whose messages all have content.
    """

    def _qa_record(record_id, answer):
        # Same three-message conversation each time; only the id and the
        # assistant's answer vary.
        return {
            "id": record_id,
            "messages": [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": "What is the capital of Ireland?"},
                {"role": "assistant", "content": answer},
            ],
        }

    # Start from the auxiliary dataset and note its size before adding rows
    dataset = load_auxiliary_dataset()
    number_of_records = len(dataset)

    # First a row that should be filtered out (content=None), then a row
    # that should survive the filter.
    for record_id, answer in (("test_001", None), ("test_002", "Dublin")):
        dataset = dataset.add_item(_qa_record(record_id, answer))

    # Write the input dataset to disk so the recipe can reference it by path
    knowledge_path = os.path.join(tmp_path, "knowledge.jsonl")
    jldump(dataset, knowledge_path)

    # Mix using only that dataset and save the result
    output_path = os.path.join(tmp_path, "output.jsonl")
    recipe = Recipe()
    recipe.add_dataset(knowledge_path, 1.0)
    recipe.save_mixed_dataset(output_path, TEST_NUM_PROCS)

    # Read back what was actually written
    mixed_samples = load_dataset("json", data_files=output_path, split="train")

    # Only the "Dublin" row should have been kept on top of the original rows
    assert (
        len(mixed_samples) == number_of_records + 1
    ), f"Expected {number_of_records + 1} records in mixed dataset"

    # No surviving sample may contain a message with content=None
    for sample in mixed_samples:
        has_all_content = all(
            message.get("content") is not None for message in sample["messages"]
        )
        assert has_all_content, "Mixed sample has content=None"

0 commit comments

Comments
 (0)