3030 Pipeline ,
3131 PipelineContext ,
3232)
33- from instructlab .sdg .sdg import SDG
3433from instructlab .sdg .utils import GenerateException , models
3534from instructlab .sdg .utils .taxonomy import (
3635 leaf_node_to_samples ,
@@ -241,9 +240,9 @@ def load_pipeline(yaml_basename):
241240 return Pipeline .from_file (ctx , os .path .join (pipeline , yaml_basename ))
242241
243242 return (
244- SDG ([ load_pipeline ("knowledge.yaml" )] ),
245- SDG ([ load_pipeline ("freeform_skills.yaml" )] ),
246- SDG ([ load_pipeline ("grounded_skills.yaml" )] ),
243+ load_pipeline ("knowledge.yaml" ),
244+ load_pipeline ("freeform_skills.yaml" ),
245+ load_pipeline ("grounded_skills.yaml" ),
247246 )
248247
249248
@@ -362,16 +361,15 @@ def generate_data(
362361 batch_num_workers = num_cpus ,
363362 )
364363
365- sdg_knowledge , sdg_freeform_skill , sdg_grounded_skill = _sdg_init (ctx , pipeline )
364+ knowledge_pipe , freeform_skills_pipe , grounded_skills_pipe = _sdg_init (
365+ ctx , pipeline
366+ )
366367
367368 # Make sure checkpointing is disabled (we don't want this pipeline to load checkpoints from the main pipeline)
368369 mmlu_ctx = dataclasses .replace (ctx , checkpoint_dir = None )
369370 mmlu_bench_pipe = mmlubench_pipe_init (mmlu_ctx )
370371
371- # FIXME: remove SDG https://github.com/instructlab/sdg/pull/64
372- mixer = _mixer_init (
373- ctx , output_dir , date_suffix , sdg_knowledge .pipelines [0 ].auxiliary_inst
374- )
372+ mixer = _mixer_init (ctx , output_dir , date_suffix , knowledge_pipe .auxiliary_inst )
375373
376374 if console_output :
377375 logger .info (
@@ -388,19 +386,19 @@ def generate_data(
388386 raise GenerateException ("Error: No samples found in leaf node." )
389387
390388 if samples [0 ].get ("document" ):
391- sdg = sdg_knowledge
389+ pipe = knowledge_pipe
392390 is_knowledge = True
393391
394392 elif samples [0 ].get ("seed_context" ):
395- sdg = sdg_grounded_skill
393+ pipe = grounded_skills_pipe
396394
397395 else :
398- sdg = sdg_freeform_skill
396+ pipe = freeform_skills_pipe
399397
400398 logger .debug ("Samples: %s" , samples )
401399 ds = Dataset .from_list (samples )
402400 logger .debug ("Dataset: %s" , ds )
403- new_generated_data = sdg .generate (ds )
401+ new_generated_data = pipe .generate (ds )
404402 if len (new_generated_data ) == 0 :
405403 raise EmptyDatasetError (
406404 "Pipeline stopped: Empty dataset after running pipe"
0 commit comments