Allow mmlu_bench to return empty results

derekhiggins · derekhiggins · commit 98013bc69b23 · 2024-07-25T17:15:51.000-04:00
We can't control the model outputs, and their ability to produce the required format for the mmlu_bench MCQs varies and is inconsistent. Move the check for an empty dataset so that we can use it to allow the mmlu pipeline to return an empty dataset, while ensuring we still check other pipelines for entries. Fixes #213 Signed-off-by: Derek Higgins <derekh@redhat.com>
diff --git a/src/instructlab/sdg/eval_data.py b/src/instructlab/sdg/eval_data.py
@@ -55,7 +55,11 @@ def _format_mmlu_style(ds: Dataset) -> Dataset:
     ds = ds.filter(lambda x: x["choices"])
     ds = ds.filter(lambda x: len(x["choices"]) == 4)
     ds = ds.filter(lambda x: x["answer"] in ["A", "B", "C", "D"])
-    ds = ds.class_encode_column("answer")
+    # We filter out a lot of the dataset above (and in _post_process_mcq).
+    # If we've managed to filter out all of the results, we don't want to run
+    # class_encode_column, as the answer column might not exist.
+    if len(ds):
+        ds = ds.class_encode_column("answer")
     return ds
 
 
@@ -113,7 +117,8 @@ def generate_eval_task_data(
     mmlubench_pipe, task_name, samples, output_dir, date_suffix
 ):
     mmlubench_data = mmlubench_pipe.generate(samples)
-    mmlubench_data = _post_process_mcq(mmlubench_data)
+    if len(mmlubench_data):
+        mmlubench_data = _post_process_mcq(mmlubench_data)
 
     eval_data_file_path = f"{output_dir}/node_datasets_{date_suffix}/mmlubench_{date_suffix}_{task_name}.jsonl"
     logger.info(f"Saving MMLU Dataset {eval_data_file_path}")
diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py
@@ -24,6 +24,7 @@
 from instructlab.sdg.pipeline import (
     FULL_PIPELINES_PACKAGE,
     SIMPLE_PIPELINES_PACKAGE,
+    EmptyDatasetError,
     Pipeline,
     PipelineContext,
 )
@@ -371,6 +372,10 @@ def generate_data(
         ds = Dataset.from_list(samples)
         logger.debug("Dataset: %s" % ds)
         new_generated_data = sdg.generate(ds)
+        if len(new_generated_data) == 0:
+            raise EmptyDatasetError(
+                "Pipeline stopped: Empty dataset after running pipe"
+            )
         generated_data = (
             [new_generated_data]
             if generated_data is None
diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py
@@ -184,9 +184,7 @@ def _generate_single(self, dataset) -> Dataset:
 
             # If at any point we end up with an empty data set, the pipeline has failed
             if len(dataset) == 0:
-                raise EmptyDatasetError(
-                    f"Pipeline stopped: Empty dataset after running block: {block_name}"
-                )
+                return dataset
 
             drop_columns_in_ds = [e for e in drop_columns if e in dataset.column_names]
             if drop_columns: