Skip to content
This repository was archived by the owner on Apr 30, 2026. It is now read-only.

Commit b292b7a

Browse files
authored
Merge pull request #209 from derekhiggins/mmlu-orig-samples
Generate mmlu bench data with the original samples
2 parents afadfd5 + 98013bc commit b292b7a

3 files changed

Lines changed: 14 additions & 6 deletions

File tree

src/instructlab/sdg/eval_data.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,11 @@ def _format_mmlu_style(ds: Dataset) -> Dataset:
5555
ds = ds.filter(lambda x: x["choices"])
5656
ds = ds.filter(lambda x: len(x["choices"]) == 4)
5757
ds = ds.filter(lambda x: x["answer"] in ["A", "B", "C", "D"])
58-
ds = ds.class_encode_column("answer")
58+
# We filter out a lot of the dataset above (and in _post_process_mcq)
59+
# If we've managed to filter out all of the results, we don't want to run class_encode_column
60+
# as the answer column might not exist
61+
if len(ds):
62+
ds = ds.class_encode_column("answer")
5963
return ds
6064

6165

@@ -113,7 +117,8 @@ def generate_eval_task_data(
113117
mmlubench_pipe, task_name, samples, output_dir, date_suffix
114118
):
115119
mmlubench_data = mmlubench_pipe.generate(samples)
116-
mmlubench_data = _post_process_mcq(mmlubench_data)
120+
if len(mmlubench_data):
121+
mmlubench_data = _post_process_mcq(mmlubench_data)
117122

118123
eval_data_file_path = (
119124
f"{output_dir}/node_datasets_{date_suffix}/mmlubench_{task_name}.jsonl"

src/instructlab/sdg/generate_data.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
from instructlab.sdg.pipeline import (
2525
FULL_PIPELINES_PACKAGE,
2626
SIMPLE_PIPELINES_PACKAGE,
27+
EmptyDatasetError,
2728
Pipeline,
2829
PipelineContext,
2930
)
@@ -371,6 +372,10 @@ def generate_data(
371372
ds = Dataset.from_list(samples)
372373
logger.debug("Dataset: %s" % ds)
373374
new_generated_data = sdg.generate(ds)
375+
if len(new_generated_data) == 0:
376+
raise EmptyDatasetError(
377+
"Pipeline stopped: Empty dataset after running pipe"
378+
)
374379
generated_data = (
375380
[new_generated_data]
376381
if generated_data is None
@@ -384,7 +389,7 @@ def generate_data(
384389
generate_eval_task_data(
385390
mmlu_bench_pipe,
386391
leaf_node_path,
387-
new_generated_data,
392+
ds,
388393
output_dir,
389394
date_suffix,
390395
)

src/instructlab/sdg/pipeline.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -184,9 +184,7 @@ def _generate_single(self, dataset) -> Dataset:
184184

185185
# If at any point we end up with an empty data set, stop running further blocks and return it; the caller decides whether an empty result is an error
186186
if len(dataset) == 0:
187-
raise EmptyDatasetError(
188-
f"Pipeline stopped: Empty dataset after running block: {block_name}"
189-
)
187+
return dataset
190188

191189
drop_columns_in_ds = [e for e in drop_columns if e in dataset.column_names]
192190
if drop_columns:

0 commit comments

Comments
 (0)