Skip to content
This repository was archived by the owner on Apr 30, 2026. It is now read-only.

Commit 7087621

Browse files
authored
Merge pull request #203 from bbrowning/write-recipe-yamls
Introduce data mixing recipe yaml files
2 parents b292b7a + fcfacfc commit 7087621

7 files changed

Lines changed: 227 additions & 34 deletions

File tree

src/instructlab/sdg/datamixing.py

Lines changed: 97 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
# Third Party
99
from datasets import Dataset, concatenate_datasets, load_dataset
10+
import yaml
1011

1112
# First Party
1213
from instructlab.sdg.logger_config import setup_logger
@@ -27,18 +28,12 @@ def _adjust_train_sample_size(ds: Dataset, num_samples: int):
2728
return pandas.dataset_from_pandas_dataframe(df)
2829

2930

30-
def _load_ds(path, sampling_size, num_proc):
31+
def _sample_ds(dataset, sampling_size, num_proc):
3132
"""
32-
Load a dataset from the given file path and select sampling_size
33-
number/ratio of samples from it, ensuring the loaded dataset has only
34-
ALLOWED_COLS columns in it with any additional columns moved to the
35-
metadata section.
33+
Select sampling_size number/ratio of samples from a dataset, ensuring
34+
the returned dataset has only ALLOWED_COLS columns in it with any
35+
additional columns moved to the metadata section.
3636
"""
37-
logger.info(f"Loading dataset from {path} ...")
38-
dataset = load_dataset("json", data_files=path, split="train")
39-
logger.info(f"Dataset columns: {dataset.column_names}")
40-
logger.info(f"Dataset loaded with {len(dataset)} samples")
41-
4237
if sampling_size != 1.0:
4338
if isinstance(sampling_size, int):
4439
num_samples = sampling_size
@@ -94,29 +89,63 @@ class Recipe:
9489
"""
9590

9691
def __init__(
97-
self, initial_datasets: Optional[list] = None, sys_prompt: Optional[str] = ""
92+
self, recipe_path: Optional[str] = None, sys_prompt: Optional[str] = ""
9893
):
99-
self.recipe = {
100-
"datasets": initial_datasets or [],
101-
"sys_prompt": sys_prompt,
102-
}
103-
self.sys_prompt = self.recipe.get("sys_prompt", "")
94+
self.recipe_path = recipe_path or ""
95+
self.sys_prompt = sys_prompt
96+
97+
# Defaults if no recipe path given or these values don't
98+
# exist in the given recipe file
99+
self.datasets = []
100+
if recipe_path is not None:
101+
recipe = self._load_recipe()
102+
if "datasets" in recipe:
103+
self.datasets = recipe["datasets"]
104+
104105
self.dataset_added = False
105106

107+
def _load_recipe(self):
108+
with open(self.recipe_path, encoding="utf-8") as fp:
109+
return yaml.safe_load(fp)
110+
111+
def _load_ds(self, path):
112+
"""
113+
Load a dataset from the given location. If a jsonl file is
114+
given, we load the dataset from disk. Otherwise, we load the
115+
path given from HuggingFace. Relative paths are resolved
116+
respective to the directory the recipe yaml itself resides in.
117+
"""
118+
if not os.path.isabs(path):
119+
path = os.path.join(os.path.dirname(self.recipe_path), path)
120+
logger.info(f"Loading dataset from {path} ...")
121+
dataset = load_dataset("json", data_files=path, split="train")
122+
logger.info(f"Dataset columns: {dataset.column_names}")
123+
logger.info(f"Dataset loaded with {len(dataset)} samples")
124+
return dataset
125+
126+
def _load_and_sample_datasets(self, num_proc):
127+
"""
128+
Load and sample all the datasets in this recipe, taking
129+
into account the desired sampling size from each individual
130+
dataset to control the overall mix of samples in the final
131+
dataset.
132+
"""
133+
return [
134+
_sample_ds(
135+
self._load_ds(dataset["path"]), dataset["sampling_size"], num_proc
136+
)
137+
for dataset in self.datasets
138+
]
139+
106140
def _create_mixed_dataset(self, num_proc):
107141
"""
108-
Create the mixed dataset from its list of included datasets, taking
109-
into account the desired sampling size from each individual dataset
110-
to control the overall mix of samples in the final dataset.
142+
Create the final mixed dataset by loading, sampling, and
143+
concatenating all datasets in this recipe
111144
"""
112145
if not self.dataset_added:
113146
logger.error("No dataset added to the recipe")
114147

115-
mixed_ds = [
116-
_load_ds(dataset["path"], dataset["sampling_size"], num_proc)
117-
for dataset in self.recipe["datasets"]
118-
]
119-
148+
mixed_ds = self._load_and_sample_datasets(num_proc)
120149
mixed_ds = concatenate_datasets(mixed_ds)
121150
mixed_ds = mixed_ds.map(
122151
_add_system_message,
@@ -143,7 +172,20 @@ def add_dataset(self, path, sampling_size):
143172
of the samples, and so on.
144173
"""
145174
self.dataset_added = True
146-
self.recipe["datasets"].append({"path": path, "sampling_size": sampling_size})
175+
self.datasets.append({"path": path, "sampling_size": sampling_size})
176+
177+
def save_recipe(self, output_path):
178+
recipe = {
179+
"datasets": self.datasets,
180+
"metadata": {"sys_prompt": self.sys_prompt},
181+
}
182+
with open(output_path, "w", encoding="utf-8") as fp:
183+
yaml.dump(recipe, fp)
184+
# Update this instance's recipe_path to reflect the path we
185+
# just saved it to so that any subsequent loading of datasets
186+
# (like via save_mixed_dataset) resolves paths relative to the
187+
# saved recipe_path.
188+
self.recipe_path = output_path
147189

148190
def save_mixed_dataset(self, output_path, num_proc):
149191
"""
@@ -398,18 +440,34 @@ class DataMixer:
398440
# once.
399441
NUM_SYNTH_SKILLS = 30
400442

401-
def __init__(self, output_dir, date_suffix, sys_prompt, num_procs):
443+
def __init__(self, data_dirs, output_dir, date_suffix, sys_prompt, num_procs):
444+
self.data_dirs = data_dirs
402445
self.output_dir = output_dir
403446
self.sys_prompt = sys_prompt
404447
self.date_suffix = date_suffix
405448
self.num_procs = num_procs
406449

407-
self.knowledge_recipe = Recipe(sys_prompt=self.sys_prompt)
408-
self.skills_recipe = Recipe(sys_prompt=self.sys_prompt)
450+
self.knowledge_recipe = self._load_default_recipe("knowledge.yaml")
451+
self.skills_recipe = self._load_default_recipe("skills.yaml")
409452

453+
self.output_file_knowledge_recipe = f"knowledge_recipe_{date_suffix}.yaml"
454+
self.output_file_skills_recipe = f"skills_recipe_{date_suffix}.yaml"
410455
self.output_file_mixed_knowledge = f"knowledge_train_msgs_{date_suffix}.jsonl"
411456
self.output_file_mixed_skills = f"skills_train_msgs_{date_suffix}.jsonl"
412457

458+
def _load_default_recipe(self, yaml_basename):
459+
"""
460+
Load a default system recipe from e.g. /usr/share/instructlab/sdg/default_data_recipes
461+
if it exists, otherwise return an empty recipe.
462+
"""
463+
for d in self.data_dirs:
464+
default_recipe_path = os.path.join(d, "default_data_recipes", yaml_basename)
465+
if os.path.exists(default_recipe_path):
466+
return Recipe(
467+
recipe_path=default_recipe_path, sys_prompt=self.sys_prompt
468+
)
469+
return Recipe(sys_prompt=self.sys_prompt)
470+
413471
def _gen_leaf_node_data(
414472
self, leaf_node_data, recipe, output_file_leaf_node, sampling_size=1.0
415473
):
@@ -420,7 +478,7 @@ def _gen_leaf_node_data(
420478
"""
421479
output_file = os.path.join(self.output_dir, output_file_leaf_node)
422480
leaf_node_data.to_json(output_file, orient="records", lines=True)
423-
recipe.add_dataset(output_file, sampling_size)
481+
recipe.add_dataset(output_file_leaf_node, sampling_size)
424482

425483
def collect(self, leaf_node_path, new_generated_data, is_knowledge):
426484
if is_knowledge:
@@ -459,20 +517,27 @@ def collect(self, leaf_node_path, new_generated_data, is_knowledge):
459517
sampling_size=self.NUM_SYNTH_SKILLS,
460518
)
461519

462-
def _gen_mixed_data(self, recipe, output_file_mixed):
520+
def _gen_mixed_data(self, recipe, output_file_recipe, output_file_data):
463521
"""
464522
Mix the generated leaf node data into a single dataset and write it to
465523
disk. The heavy lifting is delegated to the Recipe class.
466524
"""
467525
if recipe.dataset_added:
526+
full_recipe_path = os.path.join(self.output_dir, output_file_recipe)
527+
recipe.save_recipe(full_recipe_path)
468528
recipe.save_mixed_dataset(
469-
os.path.join(self.output_dir, output_file_mixed),
529+
os.path.join(self.output_dir, output_file_data),
470530
self.num_procs,
471531
)
472532

473533
def generate(self):
474-
self._gen_mixed_data(self.knowledge_recipe, self.output_file_mixed_knowledge)
534+
self._gen_mixed_data(
535+
self.knowledge_recipe,
536+
self.output_file_knowledge_recipe,
537+
self.output_file_mixed_knowledge,
538+
)
475539
self._gen_mixed_data(
476540
self.skills_recipe,
541+
self.output_file_skills_recipe,
477542
self.output_file_mixed_skills,
478543
)

src/instructlab/sdg/generate_data.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,20 @@ def load_pipeline(yaml_basename):
241241
)
242242

243243

244+
def _mixer_init(ctx, output_dir, date_suffix):
245+
pd = platformdirs.PlatformDirs(
246+
appname=os.path.join("instructlab", "sdg"), multipath=True
247+
)
248+
data_dirs = list(pd.iter_data_dirs())
249+
return DataMixer(
250+
data_dirs,
251+
output_dir,
252+
date_suffix,
253+
_SYS_PROMPT,
254+
ctx.dataset_num_procs,
255+
)
256+
257+
244258
# This is part of the public API, and used by instructlab.
245259
# TODO - parameter removal needs to be done in sync with a CLI change.
246260
# pylint: disable=unused-argument
@@ -342,7 +356,7 @@ def generate_data(
342356

343357
mmlu_bench_pipe = mmlubench_pipe_init(ctx)
344358

345-
mixer = DataMixer(output_dir, date_suffix, _SYS_PROMPT, ctx.dataset_num_procs)
359+
mixer = _mixer_init(ctx, output_dir, date_suffix)
346360

347361
if console_output:
348362
logger.info(

tests/test_datamixing.py

Lines changed: 104 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,31 @@
22
Unit tests for the top-level datamixing module.
33
"""
44

5+
# Standard
6+
from importlib import resources
7+
from unittest.mock import patch
8+
import os
9+
510
# Third Party
611
from datasets import Dataset
712

813
# First Party
9-
from instructlab.sdg.datamixing import _add_extra_contexts_to_samples
14+
from instructlab.sdg.datamixing import DataMixer, Recipe, _add_extra_contexts_to_samples
15+
16+
# We mock out the actual things that use num_procs anyway, but just
17+
# for a consistent value in the tests...
18+
TEST_NUM_PROCS = 4
19+
TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "testdata")
20+
TEST_RECIPE_PATH = os.path.join(TEST_DATA_DIR, "relative_path_recipe.yaml")
21+
TEST_SAMPLES_ABS_PATH = os.path.join(TEST_DATA_DIR, "datasets/samples.jsonl")
22+
23+
24+
def _empty_recipe(self):
25+
return {}
26+
27+
28+
def _noop_sample(dataset, _sampling_size, _num_procs):
29+
return dataset
1030

1131

1232
def _fake_context(msg_id):
@@ -18,6 +38,89 @@ def _fake_context(msg_id):
1838
}
1939

2040

41+
def test_datamixer_can_load_default_recipes():
42+
"""
43+
Test that DataMixer can load default recipe files by pointing
44+
it at a simple set of test recipe files under the testdata/
45+
directory.
46+
"""
47+
date_suffix = "2024-07-25T15_52_10"
48+
prompt = "You are a useful AI assistant."
49+
mixer = DataMixer(
50+
[TEST_DATA_DIR], TEST_DATA_DIR, date_suffix, prompt, TEST_NUM_PROCS
51+
)
52+
assert mixer.knowledge_recipe.datasets[0]["path"] == "test/knowledge.jsonl"
53+
assert mixer.skills_recipe.datasets[0]["path"] == "test/skills.jsonl"
54+
55+
56+
def test_recipe_init_with_empty_params_adds_dataset():
57+
"""
58+
Test that an empty-initialized recipe can add datasets
59+
"""
60+
recipe = Recipe()
61+
recipe.add_dataset("testdata/datasets/samples.jsonl", 1.0)
62+
assert recipe.dataset_added
63+
64+
65+
def test_recipe_init_with_empty_params_loads_abs_dataset():
66+
"""
67+
Test that an empty-initialized recipe can load datasets from
68+
absolute file paths.
69+
"""
70+
recipe = Recipe()
71+
dataset = recipe._load_ds(TEST_SAMPLES_ABS_PATH)
72+
assert dataset is not None
73+
74+
75+
def test_recipe_init_with_empty_params_loads_rel_dataset():
76+
"""
77+
Test that an empty-initialized recipe looks for dataset files relative
78+
to the current working directory (as opposed to blowing up because of
79+
no recipe_path given).
80+
"""
81+
recipe = Recipe()
82+
rel_path = os.path.relpath(TEST_SAMPLES_ABS_PATH)
83+
dataset = recipe._load_ds(rel_path)
84+
assert dataset is not None
85+
86+
87+
@patch.object(Recipe, "_load_recipe", _empty_recipe)
88+
def test_init_with_empty_recipe_files():
89+
"""
90+
Test that we can initialize a Recipe that points to a recipe
91+
file that does not contain one or more of our expected keys, and
92+
that instead of blowing up (like with a KeyError) we just use sane
93+
defaults.
94+
"""
95+
recipe = Recipe(recipe_path=TEST_RECIPE_PATH)
96+
assert len(recipe.datasets) == 0
97+
assert recipe.sys_prompt == ""
98+
99+
100+
@patch("instructlab.sdg.datamixing._sample_ds", _noop_sample)
101+
def test_load_ds_with_relative_jsonl_path():
102+
"""
103+
Test that the _load_ds function can load datasets from jsonl
104+
files referenced with a path relative to the recipe file
105+
"""
106+
recipe = Recipe(recipe_path=TEST_RECIPE_PATH)
107+
dataset = recipe._load_and_sample_datasets(TEST_NUM_PROCS)
108+
assert dataset is not None
109+
110+
111+
@patch("instructlab.sdg.datamixing._sample_ds", _noop_sample)
112+
def test_load_ds_with_absolute_jsonl_path():
113+
"""
114+
Test that the _load_ds function can load datasets from jsonl
115+
files referenced with an absolute dataset path
116+
"""
117+
recipe = Recipe(recipe_path=TEST_RECIPE_PATH)
118+
# Patch an absolute path into our recipe before loading it
119+
recipe.datasets[0]["path"] = TEST_SAMPLES_ABS_PATH
120+
dataset = recipe._load_and_sample_datasets(TEST_NUM_PROCS)
121+
assert dataset is not None
122+
123+
21124
def test_add_extra_contexts_to_samples_with_one_sample():
22125
"""
23126
Test _add_extra_contexts_to_samples doesn't error out when
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"id": "abc123", "messages": [], "metadata": {}}
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
datasets:
2+
- path: test/knowledge.jsonl
3+
sampling_size: 1.0
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
datasets:
2+
- path: test/skills.jsonl
3+
sampling_size: 1.0
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
datasets:
2+
- path: datasets/samples.jsonl
3+
sampling_size: 1.0
4+
sys_prompt: I am a reliable AI assistant.

0 commit comments

Comments
 (0)