@@ -422,3 +422,56 @@ def test_mix_instructlab_07x_precomputed_skills_with_unmask(tmp_path):
422422 assert (
423423 sample .get ("unmask" , None ) is not None
424424 ), "Mixed sample does not have unmask"
425+
426+
def test_save_mixed_dataset_with_none_content(tmp_path):
    """
    Check that records containing a message with content=None are absent
    from the saved mixed dataset.

    Two records are appended to the auxiliary dataset: "test_001", whose
    assistant message has content=None, and "test_002", which is fully
    populated. The saved output is expected to hold the original records
    plus exactly one of the two additions.
    """
    dataset = load_auxiliary_dataset()
    original_count = len(dataset)

    # One record with a missing assistant answer, one with a real answer.
    for record_id, answer in (("test_001", None), ("test_002", "Dublin")):
        dataset = dataset.add_item(
            {
                "id": record_id,
                "messages": [
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": "What is the capital of Ireland?"},
                    {"role": "assistant", "content": answer},
                ],
            }
        )

    # Write the input dataset to disk so the recipe can read it by path.
    knowledge_path = os.path.join(tmp_path, "knowledge.jsonl")
    jldump(dataset, knowledge_path)

    output_path = os.path.join(tmp_path, "output.jsonl")
    recipe = Recipe()
    recipe.add_dataset(knowledge_path, 1.0)
    recipe.save_mixed_dataset(output_path, TEST_NUM_PROCS)

    # Read the saved result back for inspection.
    mixed_samples = load_dataset("json", data_files=output_path, split="train")

    # Only the record with content=None should be missing.
    expected_count = original_count + 1
    assert (
        len(mixed_samples) == expected_count
    ), f"Expected {expected_count} records in mixed dataset"

    # No message in any surviving record may have content=None.
    for sample in mixed_samples:
        assert not any(
            message.get("content") is None for message in sample["messages"]
        ), "Mixed sample has content=None"
0 commit comments