Skip to content
This repository was archived by the owner on Apr 30, 2026. It is now read-only.

Commit fcfacfc

Browse files
bbrowning, shivchander, khaledsulayman, abhi1092, aakankshaduggal
committed
Introduce data mixing recipe yaml files
This introduces Recipe yaml files, which are used both as an input into the data mixing process and as an output of the process. As an input, we have some default recipe files that specify any precomputed datasets that should be mixed with data from new skills when generating the overall mix of samples that will be sent to the training process. If a downstream user/packager wants to add default recipes (and datasets), they should install them to a path like `/usr/share/instructlab/sdg` (varies by platform, uses Python's `platformdirs.PlatformDirs` to respect platform conventions). Recipes should be in sdg/default_data_recipes/{knowledge,skills}.yaml. Datasets should be in sdg/datasets, but this location is not enforced. Currently we are not shipping any default recipe files in the upstream, but there is a unit test in place to ensure the functionality to load default recipes from disk works once we decide how we want to ship a precomputed dataset to our upstream users. As an output of the data generation process, we write recipe yamls to document which datasets were mixed together and in what proportions, along with the system prompt that was used during the generation. Here's an example of a recipe yaml put into the output directory after running data generation: ```yaml datasets: - path: node_datasets_2024-07-25T17_49_46/knowledge_tonsils_overview_e2e-tonsils_p10.jsonl sampling_size: 1.0 metadata: sys_prompt: "I am, Red Hat\xAE Instruct Model based on Granite 7B, an AI language\ \ model developed by Red Hat and IBM Research, based on the Granite-7b-base language\ \ model. My primary function is to be a chat assistant." ``` Datasets may be referenced by relative paths, which are relative to the recipe's own directory. Or, they may use absolute filesystem paths. Anything written out under the metadata section (currently just sys_prompt) is purely informational for the user and ignored when loading recipes. 
Parts of this are extracted and rebased from aakankshaduggal#4 and aakankshaduggal#20. Refs #162, #171, #185, #201. Co-authored-by: shivchander <shivchander.s30@gmail.com> Co-authored-by: Khaled Sulayman <khaled@thesulaymans.com> Co-authored-by: abhi1092 <abhi1092@gmail.com> Co-authored-by: Aakanksha Duggal <aduggal@redhat.com> Co-authored-by: Mark McLoughlin <markmc@redhat.com> Signed-off-by: Ben Browning <bbrownin@redhat.com>
1 parent afadfd5 commit fcfacfc

7 files changed

Lines changed: 227 additions & 34 deletions

File tree

src/instructlab/sdg/datamixing.py

Lines changed: 97 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
# Third Party
99
from datasets import Dataset, concatenate_datasets, load_dataset
10+
import yaml
1011

1112
# First Party
1213
from instructlab.sdg.logger_config import setup_logger
@@ -27,18 +28,12 @@ def _adjust_train_sample_size(ds: Dataset, num_samples: int):
2728
return pandas.dataset_from_pandas_dataframe(df)
2829

2930

30-
def _load_ds(path, sampling_size, num_proc):
31+
def _sample_ds(dataset, sampling_size, num_proc):
3132
"""
32-
Load a dataset from the given file path and select sampling_size
33-
number/ratio of samples from it, ensuring the loaded dataset has only
34-
ALLOWED_COLS columns in it with any additional columns moved to the
35-
metadata section.
33+
Select sampling_size number/ratio of samples from a dataset, ensuring
34+
the returned dataset has only ALLOWED_COLS columns in it with any
35+
additional columns moved to the metadata section.
3636
"""
37-
logger.info(f"Loading dataset from {path} ...")
38-
dataset = load_dataset("json", data_files=path, split="train")
39-
logger.info(f"Dataset columns: {dataset.column_names}")
40-
logger.info(f"Dataset loaded with {len(dataset)} samples")
41-
4237
if sampling_size != 1.0:
4338
if isinstance(sampling_size, int):
4439
num_samples = sampling_size
@@ -94,29 +89,63 @@ class Recipe:
9489
"""
9590

9691
def __init__(
97-
self, initial_datasets: Optional[list] = None, sys_prompt: Optional[str] = ""
92+
self, recipe_path: Optional[str] = None, sys_prompt: Optional[str] = ""
9893
):
99-
self.recipe = {
100-
"datasets": initial_datasets or [],
101-
"sys_prompt": sys_prompt,
102-
}
103-
self.sys_prompt = self.recipe.get("sys_prompt", "")
94+
self.recipe_path = recipe_path or ""
95+
self.sys_prompt = sys_prompt
96+
97+
# Defaults if no recipe path given or these values don't
98+
# exist in the given recipe file
99+
self.datasets = []
100+
if recipe_path is not None:
101+
recipe = self._load_recipe()
102+
if "datasets" in recipe:
103+
self.datasets = recipe["datasets"]
104+
104105
self.dataset_added = False
105106

107+
def _load_recipe(self):
108+
with open(self.recipe_path, encoding="utf-8") as fp:
109+
return yaml.safe_load(fp)
110+
111+
def _load_ds(self, path):
112+
"""
113+
Load a dataset from the given location. If a jsonl file is
114+
given, we load the dataset from disk. Otherwise, we load the
115+
path given from HuggingFace. Relative paths are resolved
116+
respective to the directory the recipe yaml itself resides in.
117+
"""
118+
if not os.path.isabs(path):
119+
path = os.path.join(os.path.dirname(self.recipe_path), path)
120+
logger.info(f"Loading dataset from {path} ...")
121+
dataset = load_dataset("json", data_files=path, split="train")
122+
logger.info(f"Dataset columns: {dataset.column_names}")
123+
logger.info(f"Dataset loaded with {len(dataset)} samples")
124+
return dataset
125+
126+
def _load_and_sample_datasets(self, num_proc):
127+
"""
128+
Load and sample all the datasets in this recipe, taking
129+
into account the desired sampling size from each individual
130+
dataset to control the overall mix of samples in the final
131+
dataset.
132+
"""
133+
return [
134+
_sample_ds(
135+
self._load_ds(dataset["path"]), dataset["sampling_size"], num_proc
136+
)
137+
for dataset in self.datasets
138+
]
139+
106140
def _create_mixed_dataset(self, num_proc):
107141
"""
108-
Create the mixed dataset from its list of included datasets, taking
109-
into account the desired sampling size from each individual dataset
110-
to control the overall mix of samples in the final dataset.
142+
Create the final mixed dataset by loading, sampling, and
143+
concatenating all datasets in this recipe
111144
"""
112145
if not self.dataset_added:
113146
logger.error("No dataset added to the recipe")
114147

115-
mixed_ds = [
116-
_load_ds(dataset["path"], dataset["sampling_size"], num_proc)
117-
for dataset in self.recipe["datasets"]
118-
]
119-
148+
mixed_ds = self._load_and_sample_datasets(num_proc)
120149
mixed_ds = concatenate_datasets(mixed_ds)
121150
mixed_ds = mixed_ds.map(
122151
_add_system_message,
@@ -143,7 +172,20 @@ def add_dataset(self, path, sampling_size):
143172
of the samples, and so on.
144173
"""
145174
self.dataset_added = True
146-
self.recipe["datasets"].append({"path": path, "sampling_size": sampling_size})
175+
self.datasets.append({"path": path, "sampling_size": sampling_size})
176+
177+
def save_recipe(self, output_path):
178+
recipe = {
179+
"datasets": self.datasets,
180+
"metadata": {"sys_prompt": self.sys_prompt},
181+
}
182+
with open(output_path, "w", encoding="utf-8") as fp:
183+
yaml.dump(recipe, fp)
184+
# Update this instance's recipe_path to reflect the path we
185+
# just saved it to so that any subsequent loading of datasets
186+
# (like via save_mixed_dataset) pulls from relative to the
187+
# saved recipe_path.
188+
self.recipe_path = output_path
147189

148190
def save_mixed_dataset(self, output_path, num_proc):
149191
"""
@@ -398,18 +440,34 @@ class DataMixer:
398440
# once.
399441
NUM_SYNTH_SKILLS = 30
400442

401-
def __init__(self, output_dir, date_suffix, sys_prompt, num_procs):
443+
def __init__(self, data_dirs, output_dir, date_suffix, sys_prompt, num_procs):
444+
self.data_dirs = data_dirs
402445
self.output_dir = output_dir
403446
self.sys_prompt = sys_prompt
404447
self.date_suffix = date_suffix
405448
self.num_procs = num_procs
406449

407-
self.knowledge_recipe = Recipe(sys_prompt=self.sys_prompt)
408-
self.skills_recipe = Recipe(sys_prompt=self.sys_prompt)
450+
self.knowledge_recipe = self._load_default_recipe("knowledge.yaml")
451+
self.skills_recipe = self._load_default_recipe("skills.yaml")
409452

453+
self.output_file_knowledge_recipe = f"knowledge_recipe_{date_suffix}.yaml"
454+
self.output_file_skills_recipe = f"skills_recipe_{date_suffix}.yaml"
410455
self.output_file_mixed_knowledge = f"knowledge_train_msgs_{date_suffix}.jsonl"
411456
self.output_file_mixed_skills = f"skills_train_msgs_{date_suffix}.jsonl"
412457

458+
def _load_default_recipe(self, yaml_basename):
459+
"""
460+
Load a default system recipe from e.g. /usr/share/instructlab/sdg/default_data_recipes
461+
if it exists, otherwise return an empty recipe.
462+
"""
463+
for d in self.data_dirs:
464+
default_recipe_path = os.path.join(d, "default_data_recipes", yaml_basename)
465+
if os.path.exists(default_recipe_path):
466+
return Recipe(
467+
recipe_path=default_recipe_path, sys_prompt=self.sys_prompt
468+
)
469+
return Recipe(sys_prompt=self.sys_prompt)
470+
413471
def _gen_leaf_node_data(
414472
self, leaf_node_data, recipe, output_file_leaf_node, sampling_size=1.0
415473
):
@@ -420,7 +478,7 @@ def _gen_leaf_node_data(
420478
"""
421479
output_file = os.path.join(self.output_dir, output_file_leaf_node)
422480
leaf_node_data.to_json(output_file, orient="records", lines=True)
423-
recipe.add_dataset(output_file, sampling_size)
481+
recipe.add_dataset(output_file_leaf_node, sampling_size)
424482

425483
def collect(self, leaf_node_path, new_generated_data, is_knowledge):
426484
if is_knowledge:
@@ -459,20 +517,27 @@ def collect(self, leaf_node_path, new_generated_data, is_knowledge):
459517
sampling_size=self.NUM_SYNTH_SKILLS,
460518
)
461519

462-
def _gen_mixed_data(self, recipe, output_file_mixed):
520+
def _gen_mixed_data(self, recipe, output_file_recipe, output_file_data):
463521
"""
464522
Mix the generated leaf node data into a single dataset and write it to
465523
disk. The heavy lifting is delegated to the Recipe class.
466524
"""
467525
if recipe.dataset_added:
526+
full_recipe_path = os.path.join(self.output_dir, output_file_recipe)
527+
recipe.save_recipe(full_recipe_path)
468528
recipe.save_mixed_dataset(
469-
os.path.join(self.output_dir, output_file_mixed),
529+
os.path.join(self.output_dir, output_file_data),
470530
self.num_procs,
471531
)
472532

473533
def generate(self):
474-
self._gen_mixed_data(self.knowledge_recipe, self.output_file_mixed_knowledge)
534+
self._gen_mixed_data(
535+
self.knowledge_recipe,
536+
self.output_file_knowledge_recipe,
537+
self.output_file_mixed_knowledge,
538+
)
475539
self._gen_mixed_data(
476540
self.skills_recipe,
541+
self.output_file_skills_recipe,
477542
self.output_file_mixed_skills,
478543
)

src/instructlab/sdg/generate_data.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,20 @@ def load_pipeline(yaml_basename):
240240
)
241241

242242

243+
def _mixer_init(ctx, output_dir, date_suffix):
244+
pd = platformdirs.PlatformDirs(
245+
appname=os.path.join("instructlab", "sdg"), multipath=True
246+
)
247+
data_dirs = list(pd.iter_data_dirs())
248+
return DataMixer(
249+
data_dirs,
250+
output_dir,
251+
date_suffix,
252+
_SYS_PROMPT,
253+
ctx.dataset_num_procs,
254+
)
255+
256+
243257
# This is part of the public API, and used by instructlab.
244258
# TODO - parameter removal needs to be done in sync with a CLI change.
245259
# pylint: disable=unused-argument
@@ -341,7 +355,7 @@ def generate_data(
341355

342356
mmlu_bench_pipe = mmlubench_pipe_init(ctx)
343357

344-
mixer = DataMixer(output_dir, date_suffix, _SYS_PROMPT, ctx.dataset_num_procs)
358+
mixer = _mixer_init(ctx, output_dir, date_suffix)
345359

346360
if console_output:
347361
logger.info(

tests/test_datamixing.py

Lines changed: 104 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,31 @@
22
Unit tests for the top-level datamixing module.
33
"""
44

5+
# Standard
6+
from importlib import resources
7+
from unittest.mock import patch
8+
import os
9+
510
# Third Party
611
from datasets import Dataset
712

813
# First Party
9-
from instructlab.sdg.datamixing import _add_extra_contexts_to_samples
14+
from instructlab.sdg.datamixing import DataMixer, Recipe, _add_extra_contexts_to_samples
15+
16+
# We mock out the actual things that use num_procs anyway, but just
17+
# for a consistent value in the tests...
18+
TEST_NUM_PROCS = 4
19+
TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "testdata")
20+
TEST_RECIPE_PATH = os.path.join(TEST_DATA_DIR, "relative_path_recipe.yaml")
21+
TEST_SAMPLES_ABS_PATH = os.path.join(TEST_DATA_DIR, "datasets/samples.jsonl")
22+
23+
24+
def _empty_recipe(self):
25+
return {}
26+
27+
28+
def _noop_sample(dataset, _sampling_size, _num_procs):
29+
return dataset
1030

1131

1232
def _fake_context(msg_id):
@@ -18,6 +38,89 @@ def _fake_context(msg_id):
1838
}
1939

2040

41+
def test_datamixer_can_load_default_recipes():
42+
"""
43+
Test that DataMixer can load default recipe files by pointing
44+
it at a simple set of test recipe files under the testdata/
45+
directory.
46+
"""
47+
date_suffix = "2024-07-25T15_52_10"
48+
prompt = "You are a useful AI assistant."
49+
mixer = DataMixer(
50+
[TEST_DATA_DIR], TEST_DATA_DIR, date_suffix, prompt, TEST_NUM_PROCS
51+
)
52+
assert mixer.knowledge_recipe.datasets[0]["path"] == "test/knowledge.jsonl"
53+
assert mixer.skills_recipe.datasets[0]["path"] == "test/skills.jsonl"
54+
55+
56+
def test_recipe_init_with_empty_params_adds_dataset():
57+
"""
58+
Test that an empty-initialized recipe can add datasets
59+
"""
60+
recipe = Recipe()
61+
recipe.add_dataset("testdata/datasets/samples.jsonl", 1.0)
62+
assert recipe.dataset_added
63+
64+
65+
def test_recipe_init_with_empty_params_loads_abs_dataset():
66+
"""
67+
Test that an empty-initialized recipe can load datasets from
68+
absolute file paths.
69+
"""
70+
recipe = Recipe()
71+
dataset = recipe._load_ds(TEST_SAMPLES_ABS_PATH)
72+
assert dataset is not None
73+
74+
75+
def test_recipe_init_with_empty_params_loads_rel_dataset():
76+
"""
77+
Test that an empty-initialized recipe looks for dataset files relative
78+
to the current working directory (as opposed to blowing up because of
79+
no recipe_path given).
80+
"""
81+
recipe = Recipe()
82+
rel_path = os.path.relpath(TEST_SAMPLES_ABS_PATH)
83+
dataset = recipe._load_ds(rel_path)
84+
assert dataset is not None
85+
86+
87+
@patch.object(Recipe, "_load_recipe", _empty_recipe)
88+
def test_init_with_empty_recipe_files():
89+
"""
90+
Test that we can initialize a Recipe that points to a recipe
91+
file that does not contain one or more of our expected keys, and
92+
that instead of blowing up (like with a KeyError) we just use sane
93+
defaults.
94+
"""
95+
recipe = Recipe(recipe_path=TEST_RECIPE_PATH)
96+
assert len(recipe.datasets) == 0
97+
assert recipe.sys_prompt == ""
98+
99+
100+
@patch("instructlab.sdg.datamixing._sample_ds", _noop_sample)
101+
def test_load_ds_with_relative_jsonl_path():
102+
"""
103+
Test that the _load_ds function can load from datasets from jsonl
104+
files referenced with a path relative to the recipe file
105+
"""
106+
recipe = Recipe(recipe_path=TEST_RECIPE_PATH)
107+
dataset = recipe._load_and_sample_datasets(TEST_NUM_PROCS)
108+
assert dataset is not None
109+
110+
111+
@patch("instructlab.sdg.datamixing._sample_ds", _noop_sample)
112+
def test_load_ds_with_absolute_jsonl_path():
113+
"""
114+
Test that the _load_ds function can load from datasets from jsonl
115+
files referenced with an absolute dataset path
116+
"""
117+
recipe = Recipe(recipe_path=TEST_RECIPE_PATH)
118+
# Patch an absolute path into our recipe before loading it
119+
recipe.datasets[0]["path"] = TEST_SAMPLES_ABS_PATH
120+
dataset = recipe._load_and_sample_datasets(TEST_NUM_PROCS)
121+
assert dataset is not None
122+
123+
21124
def test_add_extra_contexts_to_samples_with_one_sample():
22125
"""
23126
Test _add_extra_contexts_to_samples doesn't error out when
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"id": "abc123", "messages": [], "metadata": {}}
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
datasets:
2+
- path: test/knowledge.jsonl
3+
sampling_size: 1.0
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
datasets:
2+
- path: test/skills.jsonl
3+
sampling_size: 1.0
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
datasets:
2+
- path: datasets/samples.jsonl
3+
sampling_size: 1.0
4+
sys_prompt: I am a reliable AI assistant.

0 commit comments

Comments (0)