From 93718d197df2459d1c31bd09604c8e2788c9760e Mon Sep 17 00:00:00 2001 From: Eugen Hotaj Date: Tue, 14 Jan 2025 10:53:44 -0800 Subject: [PATCH 1/6] [EZ] Pass seed to data sampler. We currently hardcode zero. I assume this is not intentional. --- recipes/dev/early_exit_finetune_distributed.py | 2 +- recipes/full_finetune_distributed.py | 2 +- recipes/full_finetune_single_device.py | 2 +- recipes/knowledge_distillation_distributed.py | 2 +- recipes/knowledge_distillation_single_device.py | 2 +- recipes/lora_dpo_distributed.py | 2 +- recipes/lora_dpo_single_device.py | 2 +- recipes/lora_finetune_distributed.py | 2 +- recipes/lora_finetune_single_device.py | 2 +- recipes/ppo_full_finetune_single_device.py | 2 +- recipes/qat_distributed.py | 2 +- recipes/qat_lora_finetune_distributed.py | 2 +- 12 files changed, 12 insertions(+), 12 deletions(-) diff --git a/recipes/dev/early_exit_finetune_distributed.py b/recipes/dev/early_exit_finetune_distributed.py index 663697e978..1d9edf8d8f 100644 --- a/recipes/dev/early_exit_finetune_distributed.py +++ b/recipes/dev/early_exit_finetune_distributed.py @@ -664,7 +664,7 @@ def _setup_data( collate_fn = _get_component_from_path(collate_fn) sampler = DistributedSampler( - ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=0 + ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=self.seed ) dataloader = DataLoader( dataset=ds, diff --git a/recipes/full_finetune_distributed.py b/recipes/full_finetune_distributed.py index 4f32faefdb..9bac042eea 100644 --- a/recipes/full_finetune_distributed.py +++ b/recipes/full_finetune_distributed.py @@ -657,7 +657,7 @@ def _setup_data( collate_fn = _get_component_from_path(collate_fn) sampler = DistributedSampler( - ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=0 + ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=self.seed ) dataloader = DataLoader( dataset=ds, diff --git a/recipes/full_finetune_single_device.py b/recipes/full_finetune_single_device.py index 946e970206..69964595d6 100644 --- a/recipes/full_finetune_single_device.py +++ b/recipes/full_finetune_single_device.py @@ -573,7 +573,7 @@ def _setup_data( num_replicas=1, rank=0, shuffle=shuffle, - seed=0, + seed=self.seed, ) dataloader = DataLoader( dataset=ds, diff --git a/recipes/knowledge_distillation_distributed.py b/recipes/knowledge_distillation_distributed.py index 77fc50927c..f1c6bd26eb 100644 --- a/recipes/knowledge_distillation_distributed.py +++ b/recipes/knowledge_distillation_distributed.py @@ -662,7 +662,7 @@ def _setup_data( num_replicas=world_size, rank=rank, shuffle=shuffle, - seed=0, + seed=self.seed, ) dataloader = DataLoader( dataset=ds, diff --git a/recipes/knowledge_distillation_single_device.py b/recipes/knowledge_distillation_single_device.py index 71d850d791..9764243e95 100644 --- a/recipes/knowledge_distillation_single_device.py +++ b/recipes/knowledge_distillation_single_device.py @@ -537,7 +537,7 @@ def _setup_data( num_replicas=1, rank=0, shuffle=shuffle, - seed=0, + seed=self.seed, ) dataloader = DataLoader( dataset=ds, diff --git a/recipes/lora_dpo_distributed.py b/recipes/lora_dpo_distributed.py index d54adc2cf4..cb7c97d3f7 100644 --- a/recipes/lora_dpo_distributed.py +++ b/recipes/lora_dpo_distributed.py @@ -504,7 +504,7 @@ def _setup_data( ds = config.instantiate(cfg_dataset, tokenizer=self._tokenizer) sampler = DistributedSampler( - ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=0 + ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=self.seed ) dataloader = 
DataLoader( diff --git a/recipes/lora_dpo_single_device.py b/recipes/lora_dpo_single_device.py index 9b5dc6fb1a..189e6a90fa 100644 --- a/recipes/lora_dpo_single_device.py +++ b/recipes/lora_dpo_single_device.py @@ -389,7 +389,7 @@ def _setup_data( num_replicas=1, rank=0, shuffle=shuffle, - seed=0, + seed=self.seed, ) dataloader = DataLoader( dataset=ds, diff --git a/recipes/lora_finetune_distributed.py b/recipes/lora_finetune_distributed.py index 39c8b104e5..707b7fed4a 100644 --- a/recipes/lora_finetune_distributed.py +++ b/recipes/lora_finetune_distributed.py @@ -602,7 +602,7 @@ def _setup_data( collate_fn = _get_component_from_path(collate_fn) sampler = DistributedSampler( - ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=0 + ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=self.seed ) dataloader = DataLoader( diff --git a/recipes/lora_finetune_single_device.py b/recipes/lora_finetune_single_device.py index 9a3f3eacfb..a5caca9254 100644 --- a/recipes/lora_finetune_single_device.py +++ b/recipes/lora_finetune_single_device.py @@ -543,7 +543,7 @@ def _setup_data( num_replicas=1, rank=0, shuffle=shuffle, - seed=0, + seed=self.seed, ) dataloader = DataLoader( dataset=ds, diff --git a/recipes/ppo_full_finetune_single_device.py b/recipes/ppo_full_finetune_single_device.py index cb6357c3dc..29834d1e70 100644 --- a/recipes/ppo_full_finetune_single_device.py +++ b/recipes/ppo_full_finetune_single_device.py @@ -579,7 +579,7 @@ def _setup_data( num_replicas=1, rank=0, shuffle=shuffle, - seed=0, + seed=self.seed, ) dataloader = DataLoader( dataset=ds, diff --git a/recipes/qat_distributed.py b/recipes/qat_distributed.py index 8c458daa21..fb9226beba 100644 --- a/recipes/qat_distributed.py +++ b/recipes/qat_distributed.py @@ -617,7 +617,7 @@ def _setup_data( collate_fn = _get_component_from_path(collate_fn) sampler = DistributedSampler( - ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=0 + ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=self.seed ) dataloader = DataLoader( dataset=ds, diff --git a/recipes/qat_lora_finetune_distributed.py b/recipes/qat_lora_finetune_distributed.py index c742dae226..c1dcbaf24c 100644 --- a/recipes/qat_lora_finetune_distributed.py +++ b/recipes/qat_lora_finetune_distributed.py @@ -644,7 +644,7 @@ def _setup_data( collate_fn = _get_component_from_path(collate_fn) sampler = DistributedSampler( - ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=0 + ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=self.seed ) dataloader = DataLoader( From e8b9c848374ce84b023acf016ba13933aad02c0f Mon Sep 17 00:00:00 2001 From: Eugen Hotaj Date: Sat, 18 Jan 2025 11:04:17 -0800 Subject: [PATCH 2/6] comments --- recipes/dev/early_exit_finetune_distributed.py | 4 +++- recipes/full_finetune_distributed.py | 4 +++- recipes/full_finetune_single_device.py | 4 +++- recipes/knowledge_distillation_distributed.py | 4 +++- recipes/knowledge_distillation_single_device.py | 4 +++- recipes/lora_dpo_distributed.py | 4 +++- recipes/lora_dpo_single_device.py | 4 +++- recipes/lora_finetune_distributed.py | 4 +++- recipes/ppo_full_finetune_single_device.py | 5 +++-- recipes/qat_distributed.py | 4 +++- recipes/qat_lora_finetune_distributed.py | 4 +++- 11 files changed, 33 insertions(+), 12 deletions(-) diff --git a/recipes/dev/early_exit_finetune_distributed.py b/recipes/dev/early_exit_finetune_distributed.py index 1d9edf8d8f..36785e1ab3 100644 --- a/recipes/dev/early_exit_finetune_distributed.py +++ 
b/recipes/dev/early_exit_finetune_distributed.py @@ -367,6 +367,7 @@ def setup(self, cfg: DictConfig) -> None: shuffle=cfg.shuffle, batch_size=cfg.batch_size, collate_fn=collate_name, + seed=cfg.get("seed", 0), ) # Finally update the recipe state which can only be correctly set after all of the @@ -639,6 +640,7 @@ def _setup_data( shuffle: bool, batch_size: int, collate_fn: str, + seed: int, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. Currently this recipe only supports the @@ -664,7 +666,7 @@ def _setup_data( collate_fn = _get_component_from_path(collate_fn) sampler = DistributedSampler( - ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=self.seed + ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=seed ) dataloader = DataLoader( dataset=ds, diff --git a/recipes/full_finetune_distributed.py b/recipes/full_finetune_distributed.py index 9bac042eea..8b02d3b68e 100644 --- a/recipes/full_finetune_distributed.py +++ b/recipes/full_finetune_distributed.py @@ -318,6 +318,7 @@ def setup(self, cfg: DictConfig) -> None: shuffle=cfg.shuffle, batch_size=cfg.batch_size, collate_fn=collate_name, + seed=cfg.get("seed", 0), ) # Finally update the recipe state which can only be correctly set after all of the @@ -632,6 +633,7 @@ def _setup_data( shuffle: bool, batch_size: int, collate_fn: str, + seed: int, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. Currently this recipe only supports the @@ -657,7 +659,7 @@ def _setup_data( collate_fn = _get_component_from_path(collate_fn) sampler = DistributedSampler( - ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=self.seed + ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=seed ) dataloader = DataLoader( dataset=ds, diff --git a/recipes/full_finetune_single_device.py b/recipes/full_finetune_single_device.py index 69964595d6..810ac4c38b 100644 --- a/recipes/full_finetune_single_device.py +++ b/recipes/full_finetune_single_device.py @@ -305,6 +305,7 @@ def setup(self, cfg: DictConfig) -> None: shuffle=cfg.shuffle, batch_size=cfg.batch_size, collate_fn=collate_name, + seed=cfg.get("seed", 0), ) # Finally update the recipe state which can only be correctly set after all of the @@ -546,6 +547,7 @@ def _setup_data( shuffle: bool, batch_size: int, collate_fn: str, + seed: int, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. Currently this recipe only supports the @@ -573,7 +575,7 @@ def _setup_data( num_replicas=1, rank=0, shuffle=shuffle, - seed=self.seed, + seed=seed, ) dataloader = DataLoader( dataset=ds, diff --git a/recipes/knowledge_distillation_distributed.py b/recipes/knowledge_distillation_distributed.py index f1c6bd26eb..b93330345c 100644 --- a/recipes/knowledge_distillation_distributed.py +++ b/recipes/knowledge_distillation_distributed.py @@ -292,6 +292,7 @@ def setup(self, cfg: DictConfig) -> None: cfg_dataset=cfg.dataset, shuffle=cfg.shuffle, batch_size=cfg.batch_size, + seed=cfg.get("seed", 0), ) # Finally update the recipe state which can only be correctly set after all of the @@ -638,6 +639,7 @@ def _setup_data( cfg_dataset: DictConfig, shuffle: bool, batch_size: int, + seed: int, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. 
Currently this recipe only supports @@ -662,7 +664,7 @@ def _setup_data( num_replicas=world_size, rank=rank, shuffle=shuffle, - seed=self.seed, + seed=seed, ) dataloader = DataLoader( dataset=ds, diff --git a/recipes/knowledge_distillation_single_device.py b/recipes/knowledge_distillation_single_device.py index 9764243e95..d0dd026a75 100644 --- a/recipes/knowledge_distillation_single_device.py +++ b/recipes/knowledge_distillation_single_device.py @@ -283,6 +283,7 @@ def setup(self, cfg: DictConfig) -> None: cfg_dataset=cfg.dataset, shuffle=cfg.shuffle, batch_size=cfg.batch_size, + seed=cfg.get("seed", 0), ) # Finally update the recipe state which can only be correctly set after all of the @@ -515,6 +516,7 @@ def _setup_data( cfg_dataset: DictConfig, shuffle: bool, batch_size: int, + seed: int, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. Currently this recipe only supports @@ -537,7 +539,7 @@ def _setup_data( num_replicas=1, rank=0, shuffle=shuffle, - seed=self.seed, + seed=seed, ) dataloader = DataLoader( dataset=ds, diff --git a/recipes/lora_dpo_distributed.py b/recipes/lora_dpo_distributed.py index cb7c97d3f7..7e464f00e3 100644 --- a/recipes/lora_dpo_distributed.py +++ b/recipes/lora_dpo_distributed.py @@ -296,6 +296,7 @@ def setup(self, cfg: DictConfig) -> None: cfg_dataset=cfg.dataset, shuffle=cfg.shuffle, batch_size=cfg.batch_size, + seed=cfg.get("seed", 0), ) # Finally update the recipe state which can only be correctly set after all of the @@ -486,6 +487,7 @@ def _setup_data( cfg_dataset: DictConfig, shuffle: bool, batch_size: int, + seed: int, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. Currently this recipe only supports the @@ -504,7 +506,7 @@ def _setup_data( ds = config.instantiate(cfg_dataset, tokenizer=self._tokenizer) sampler = DistributedSampler( - ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=self.seed + ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=seed ) dataloader = DataLoader( diff --git a/recipes/lora_dpo_single_device.py b/recipes/lora_dpo_single_device.py index 189e6a90fa..965b3c5849 100644 --- a/recipes/lora_dpo_single_device.py +++ b/recipes/lora_dpo_single_device.py @@ -247,6 +247,7 @@ def setup(self, cfg: DictConfig) -> None: cfg_dataset=cfg.dataset, shuffle=cfg.shuffle, batch_size=cfg.batch_size, + seed=cfg.get("seed", 0), ) # Finally update the recipe state which can only be correctly set after all of the @@ -369,6 +370,7 @@ def _setup_data( cfg_dataset: DictConfig, shuffle: bool, batch_size: int, + seed: int, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. Currently this recipe only supports @@ -389,7 +391,7 @@ def _setup_data( num_replicas=1, rank=0, shuffle=shuffle, - seed=self.seed, + seed=seed, ) dataloader = DataLoader( dataset=ds, diff --git a/recipes/lora_finetune_distributed.py b/recipes/lora_finetune_distributed.py index 707b7fed4a..46848130af 100644 --- a/recipes/lora_finetune_distributed.py +++ b/recipes/lora_finetune_distributed.py @@ -312,6 +312,7 @@ def setup(self, cfg: DictConfig) -> None: shuffle=cfg.shuffle, batch_size=cfg.batch_size, collate_fn=collate_name, + seed=cfg.get("seed", 0), ) # Finally update the recipe state which can only be correctly set after all of the @@ -577,6 +578,7 @@ def _setup_data( shuffle: bool, batch_size: int, collate_fn: str, + seed: int, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. 
Currently this recipe only supports the @@ -602,7 +604,7 @@ def _setup_data( collate_fn = _get_component_from_path(collate_fn) sampler = DistributedSampler( - ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=self.seed + ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=seed ) dataloader = DataLoader( diff --git a/recipes/ppo_full_finetune_single_device.py b/recipes/ppo_full_finetune_single_device.py index 29834d1e70..dcb125c2e2 100644 --- a/recipes/ppo_full_finetune_single_device.py +++ b/recipes/ppo_full_finetune_single_device.py @@ -218,6 +218,7 @@ def setup(self, cfg: DictConfig) -> None: cfg_dataset=cfg.dataset, shuffle=cfg.shuffle, batch_size=cfg.batch_size, + seed=cfg.get("seed", 0), ) self._setup_training_parameters(cfg) @@ -560,7 +561,7 @@ def _setup_optimizer( return optimizer def _setup_data( - self, cfg_dataset: DictConfig, shuffle: bool, batch_size: int + self, cfg_dataset: DictConfig, shuffle: bool, batch_size: int, seed: int ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. @@ -579,7 +580,7 @@ def _setup_data( num_replicas=1, rank=0, shuffle=shuffle, - seed=self.seed, + seed=seed, ) dataloader = DataLoader( dataset=ds, diff --git a/recipes/qat_distributed.py b/recipes/qat_distributed.py index fb9226beba..f0989640a7 100644 --- a/recipes/qat_distributed.py +++ b/recipes/qat_distributed.py @@ -321,6 +321,7 @@ def setup(self, cfg: DictConfig) -> None: shuffle=cfg.shuffle, batch_size=cfg.batch_size, collate_fn=collate_name, + seed=cfg.get("seed", 0), ) # Finally update the recipe state which can only be correctly set after all of the @@ -592,6 +593,7 @@ def _setup_data( shuffle: bool, batch_size: int, collate_fn: str, + seed: int, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. Currently this recipe only supports the @@ -617,7 +619,7 @@ def _setup_data( collate_fn = _get_component_from_path(collate_fn) sampler = DistributedSampler( - ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=self.seed + ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=seed ) dataloader = DataLoader( dataset=ds, diff --git a/recipes/qat_lora_finetune_distributed.py b/recipes/qat_lora_finetune_distributed.py index c1dcbaf24c..a48067b1c4 100644 --- a/recipes/qat_lora_finetune_distributed.py +++ b/recipes/qat_lora_finetune_distributed.py @@ -335,6 +335,7 @@ def setup(self, cfg: DictConfig) -> None: shuffle=cfg.shuffle, batch_size=cfg.batch_size, collate_fn=collate_name, + seed=cfg.get("seed", 0), ) # Finally update the recipe state which can only be correctly set after all of the @@ -619,6 +620,7 @@ def _setup_data( shuffle: bool, batch_size: int, collate_fn: str, + seed: int, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. 
Currently this recipe only supports the @@ -644,7 +646,7 @@ def _setup_data( collate_fn = _get_component_from_path(collate_fn) sampler = DistributedSampler( - ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=self.seed + ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=seed ) dataloader = DataLoader( From 41147aa2eb364d17f95df85f42c2f8df21b89ba3 Mon Sep 17 00:00:00 2001 From: Eugen Hotaj Date: Sat, 18 Jan 2025 16:12:56 -0800 Subject: [PATCH 3/6] missed one --- recipes/lora_finetune_single_device.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/recipes/lora_finetune_single_device.py b/recipes/lora_finetune_single_device.py index a5caca9254..0ff4f03ed7 100644 --- a/recipes/lora_finetune_single_device.py +++ b/recipes/lora_finetune_single_device.py @@ -308,6 +308,7 @@ def setup(self, cfg: DictConfig) -> None: shuffle=cfg.shuffle, batch_size=cfg.batch_size, collate_fn=collate_name, + seed=cfg.get("seed", 0), ) # Finally update the recipe state which can only be correctly set after all of the @@ -516,6 +517,7 @@ def _setup_data( shuffle: bool, batch_size: int, collate_fn: str, + seed: int, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. Currently this recipe only supports @@ -543,7 +545,7 @@ def _setup_data( num_replicas=1, rank=0, shuffle=shuffle, - seed=self.seed, + seed=seed, ) dataloader = DataLoader( dataset=ds, From aec6477510c324c3416de4c6a92f70ae3beedada Mon Sep 17 00:00:00 2001 From: Eugen Hotaj Date: Sun, 19 Jan 2025 13:52:49 -0800 Subject: [PATCH 4/6] Update tests to match new loss values. --- .../recipes/test_full_finetune_distributed.py | 8 +++---- .../test_full_finetune_single_device.py | 4 ++-- ...test_knowledge_distillation_distributed.py | 2 +- ...st_knowledge_distillation_single_device.py | 2 +- .../recipes/test_lora_finetune_distributed.py | 4 ++-- .../test_lora_finetune_single_device.py | 8 +++---- .../test_ppo_full_finetune_single_device.py | 24 +++++++++---------- .../test_qat_lora_finetune_distributed.py | 2 +- 8 files changed, 27 insertions(+), 27 deletions(-) diff --git a/tests/recipes/test_full_finetune_distributed.py b/tests/recipes/test_full_finetune_distributed.py index 4cdc42d96b..ecc2e69899 100644 --- a/tests/recipes/test_full_finetune_distributed.py +++ b/tests/recipes/test_full_finetune_distributed.py @@ -51,15 +51,15 @@ def _get_test_config_overrides(self, epochs: int = 2): def _fetch_expected_loss_values_multi_rank(self, model_type): loss_values_map = { - "llama2": [10.5209, 10.5217, 10.4945, 10.5136], - "llama3": [11.9839, 11.9684, 11.9596, 11.93656], + "llama2": [10.5320, 10.5581, 10.4741, 10.4980], + "llama3": [11.9265, 11.9249, 11.9737, 11.9757], } return loss_values_map[model_type] def _fetch_expected_loss_values_single_rank(self, model_type): loss_values_map = { - "llama2": [10.5051, 10.5572, 10.4780, 10.5678], - "llama3": [11.9742, 12.0049, 11.9382, 12.0464], + "llama2": [10.5509, 10.4980, 10.4821, 10.4682], + "llama3": [11.8887, 11.9787, 11.9533, 11.9979], } return loss_values_map[model_type] diff --git a/tests/recipes/test_full_finetune_single_device.py b/tests/recipes/test_full_finetune_single_device.py index d15601a1b1..69e3d3973b 100644 --- a/tests/recipes/test_full_finetune_single_device.py +++ b/tests/recipes/test_full_finetune_single_device.py @@ -56,8 +56,8 @@ def _get_test_config_overrides(self): def _fetch_expected_loss_values(self, model_type): loss_values_map = { - "llama2": [10.5201, 10.5217, 10.4945, 10.5136], - "llama3": [11.9839, 11.9684, 11.9596, 
11.9366], + "llama2": [10.5320, 10.5581, 10.4740, 10.4984], + "llama3": [11.9265, 11.9249, 11.9737, 11.9757], } return loss_values_map[model_type] diff --git a/tests/recipes/test_knowledge_distillation_distributed.py b/tests/recipes/test_knowledge_distillation_distributed.py index 43c29400c3..8ed82689b8 100644 --- a/tests/recipes/test_knowledge_distillation_distributed.py +++ b/tests/recipes/test_knowledge_distillation_distributed.py @@ -55,7 +55,7 @@ def _get_test_config_overrides(self, epochs: int = 2): def _fetch_expected_loss_values(self, model_type): loss_values_map = { - "llama3": [11.8316, 11.7520, 11.7642, 11.7664], + "llama3": [11.7545, 11.7653, 11.7810, 11.7969], } return loss_values_map[model_type] diff --git a/tests/recipes/test_knowledge_distillation_single_device.py b/tests/recipes/test_knowledge_distillation_single_device.py index 76a6b1479c..e7cee96003 100644 --- a/tests/recipes/test_knowledge_distillation_single_device.py +++ b/tests/recipes/test_knowledge_distillation_single_device.py @@ -56,7 +56,7 @@ def _get_test_config_overrides(self, dtype_str: str = "fp32", epochs: int = 2): def _fetch_expected_loss_values(self, model_type): loss_values_map = { - "llama3": [11.7898, 11.7825, 11.7788, 11.7671], + "llama3": [11.7612, 11.7607, 11.7856, 11.7869], } return loss_values_map[model_type] diff --git a/tests/recipes/test_lora_finetune_distributed.py b/tests/recipes/test_lora_finetune_distributed.py index ef2686aeba..790bc780b3 100644 --- a/tests/recipes/test_lora_finetune_distributed.py +++ b/tests/recipes/test_lora_finetune_distributed.py @@ -54,8 +54,8 @@ def _fetch_expected_loss_values(self, model_type): # These values have been validated against single device recipe test via # https://gist.github.com/ebsmothers/f1c3db7c66655a23a91e0290360960c4 loss_values_map = { - "llama2": [10.5209, 10.5269, 10.5130, 10.5242], - "llama3": [11.9839, 11.9691, 11.9617, 11.9383], + "llama2": [10.5320, 10.5608, 10.4895, 10.5068], + "llama3": [11.9259, 11.9250, 11.9753, 11.9774], } return loss_values_map[model_type] diff --git a/tests/recipes/test_lora_finetune_single_device.py b/tests/recipes/test_lora_finetune_single_device.py index 39b1fa3b6a..a760de0d47 100644 --- a/tests/recipes/test_lora_finetune_single_device.py +++ b/tests/recipes/test_lora_finetune_single_device.py @@ -52,15 +52,15 @@ def _get_test_config_overrides(self, dtype_str: str = "fp32", epochs: int = 2): def _fetch_expected_loss_values(self, model_type): loss_values_map = { - "llama2": [10.5209, 10.5269, 10.5130, 10.5242], - "llama3": [11.9838, 11.9691, 11.9616, 11.9383], + "llama2": [10.5320, 10.5608, 10.4895, 10.5068], + "llama3": [11.9265, 11.9255, 11.9754, 11.9780], } return loss_values_map[model_type] def _fetch_qlora_expected_loss_values(self, dtype): if dtype == "bf16": - return [10.5197, 10.5272, 10.5129, 10.5243] - return [10.5198, 10.5271, 10.5131, 10.5244] + return [10.5313, 10.5575, 10.4884, 10.5073] + return [10.5313, 10.5576, 10.4885, 10.5071] @pytest.mark.integration_test @pytest.mark.parametrize( diff --git a/tests/recipes/test_ppo_full_finetune_single_device.py b/tests/recipes/test_ppo_full_finetune_single_device.py index 412c4c06dd..acb67b016b 100644 --- a/tests/recipes/test_ppo_full_finetune_single_device.py +++ b/tests/recipes/test_ppo_full_finetune_single_device.py @@ -123,18 +123,18 @@ def test_loss(self, tmpdir, monkeypatch): loss_values = get_loss_values_from_metric_logger(log_file) expected_loss_values = [ - 1.0403, - 0.9495, - 0.9084, - 1.0494, - 0.9609, - 0.8846, - 1.0282, - 0.9390, - 0.8915, - 
1.0166, - 0.9231, - 0.9352, + 1.0522, + 0.9608, + 0.9141, + 1.0410, + 0.9544, + 0.8663, + 1.0258, + 0.9375, + 0.8831, + 1.0182, + 0.9241, + 0.9411, ] torch.testing.assert_close( loss_values, expected_loss_values, atol=1e-4, rtol=1e-5 diff --git a/tests/recipes/test_qat_lora_finetune_distributed.py b/tests/recipes/test_qat_lora_finetune_distributed.py index 6c43adcc73..be49c0c80f 100644 --- a/tests/recipes/test_qat_lora_finetune_distributed.py +++ b/tests/recipes/test_qat_lora_finetune_distributed.py @@ -53,7 +53,7 @@ def _get_test_config_overrides(self): def _fetch_expected_loss_values(self, model_type): loss_values_map = { - "llama3": [11.9835, 11.9694, 11.9615, 11.9383], + "llama3": [11.9259, 11.9250, 11.9753, 11.9774], } return loss_values_map[model_type] From 3708bf9e9e7379fe17cbdd8bb4d2367f5c6a3d99 Mon Sep 17 00:00:00 2001 From: Eugen Hotaj Date: Sun, 19 Jan 2025 16:17:49 -0800 Subject: [PATCH 5/6] Update missed test, fix miscopied values. --- tests/recipes/test_lora_finetune_distributed.py | 2 +- tests/recipes/test_qat_distributed.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/recipes/test_lora_finetune_distributed.py b/tests/recipes/test_lora_finetune_distributed.py index 790bc780b3..6ea05c3aa0 100644 --- a/tests/recipes/test_lora_finetune_distributed.py +++ b/tests/recipes/test_lora_finetune_distributed.py @@ -55,7 +55,7 @@ def _fetch_expected_loss_values(self, model_type): # https://gist.github.com/ebsmothers/f1c3db7c66655a23a91e0290360960c4 loss_values_map = { "llama2": [10.5320, 10.5608, 10.4895, 10.5068], - "llama3": [11.9259, 11.9250, 11.9753, 11.9774], + "llama3": [11.9265, 11.9255, 11.9754, 11.9780], } return loss_values_map[model_type] diff --git a/tests/recipes/test_qat_distributed.py b/tests/recipes/test_qat_distributed.py index 34dd190125..9f2193a13f 100644 --- a/tests/recipes/test_qat_distributed.py +++ b/tests/recipes/test_qat_distributed.py @@ -45,8 +45,8 @@ def _get_test_config_overrides(self): def _fetch_expected_loss_values(self, model_type): loss_values_map = { - "llama2": [10.5211, 10.5217, 10.4944, 10.5134], - "llama3": [11.9836, 11.9683, 11.9594, 11.9366], + "llama2": [10.5337, 10.5563, 10.4786, 10.5002], + "llama3": [11.9270, 11.9240, 11.9731, 11.9751], } return loss_values_map[model_type] From 9f7951ec99500ae52ec53d5c03eeb47d373b7a94 Mon Sep 17 00:00:00 2001 From: Eugen Hotaj Date: Tue, 28 Jan 2025 12:05:45 -0800 Subject: [PATCH 6/6] Handle edge case where "seed" does not exist in config. 
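Why cfg.get("seed") or 0 rather than cfg.get("seed", 0): if the key is genuinely absent, both expressions return 0, but recipe configs commonly set seed: null explicitly; in that case .get returns None (the key exists, so the default is skipped), and the "or 0" form coerces that None to 0 as well. A minimal sketch of the difference, using a plain dict and assuming OmegaConf's DictConfig.get treats a null value the same way:

    cfg = {"seed": None}    # e.g. "seed: null" in a recipe YAML
    cfg.get("seed", 0)      # -> None: the key exists, so the default is ignored
    cfg.get("seed") or 0    # -> 0: None is falsy, so we fall back to 0
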
--- recipes/dev/early_exit_finetune_distributed.py | 2 +- recipes/full_finetune_distributed.py | 2 +- recipes/full_finetune_single_device.py | 2 +- recipes/knowledge_distillation_distributed.py | 2 +- recipes/knowledge_distillation_single_device.py | 2 +- recipes/lora_dpo_distributed.py | 2 +- recipes/lora_dpo_single_device.py | 2 +- recipes/lora_finetune_distributed.py | 2 +- recipes/lora_finetune_single_device.py | 2 +- recipes/ppo_full_finetune_single_device.py | 2 +- recipes/qat_distributed.py | 2 +- recipes/qat_lora_finetune_distributed.py | 2 +- 12 files changed, 12 insertions(+), 12 deletions(-) diff --git a/recipes/dev/early_exit_finetune_distributed.py b/recipes/dev/early_exit_finetune_distributed.py index 3d9e054fb8..5492b0d2cd 100644 --- a/recipes/dev/early_exit_finetune_distributed.py +++ b/recipes/dev/early_exit_finetune_distributed.py @@ -367,7 +367,7 @@ def setup(self, cfg: DictConfig) -> None: shuffle=cfg.shuffle, batch_size=cfg.batch_size, collate_fn=collate_name, - seed=cfg.get("seed", 0), + seed=cfg.get("seed") or 0, ) # Finally update the recipe state which can only be correctly set after all of the diff --git a/recipes/full_finetune_distributed.py b/recipes/full_finetune_distributed.py index 80ec3e2495..b0dfdef924 100644 --- a/recipes/full_finetune_distributed.py +++ b/recipes/full_finetune_distributed.py @@ -318,7 +318,7 @@ def setup(self, cfg: DictConfig) -> None: shuffle=cfg.shuffle, batch_size=cfg.batch_size, collate_fn=collate_name, - seed=cfg.get("seed", 0), + seed=cfg.get("seed") or 0, ) # Finally update the recipe state which can only be correctly set after all of the diff --git a/recipes/full_finetune_single_device.py b/recipes/full_finetune_single_device.py index 8e8e5f400a..f842a45017 100644 --- a/recipes/full_finetune_single_device.py +++ b/recipes/full_finetune_single_device.py @@ -305,7 +305,7 @@ def setup(self, cfg: DictConfig) -> None: shuffle=cfg.shuffle, batch_size=cfg.batch_size, collate_fn=collate_name, - seed=cfg.get("seed", 0), + seed=cfg.get("seed") or 0, ) # Finally update the recipe state which can only be correctly set after all of the diff --git a/recipes/knowledge_distillation_distributed.py b/recipes/knowledge_distillation_distributed.py index daef5c79bd..5dfae086ad 100644 --- a/recipes/knowledge_distillation_distributed.py +++ b/recipes/knowledge_distillation_distributed.py @@ -292,7 +292,7 @@ def setup(self, cfg: DictConfig) -> None: cfg_dataset=cfg.dataset, shuffle=cfg.shuffle, batch_size=cfg.batch_size, - seed=cfg.get("seed", 0), + seed=cfg.get("seed") or 0, ) # Finally update the recipe state which can only be correctly set after all of the diff --git a/recipes/knowledge_distillation_single_device.py b/recipes/knowledge_distillation_single_device.py index a2552bdd9f..c9cf058784 100644 --- a/recipes/knowledge_distillation_single_device.py +++ b/recipes/knowledge_distillation_single_device.py @@ -287,7 +287,7 @@ def setup(self, cfg: DictConfig) -> None: cfg_dataset=cfg.dataset, shuffle=cfg.shuffle, batch_size=cfg.batch_size, - seed=cfg.get("seed", 0), + seed=cfg.get("seed") or 0, ) # Finally update the recipe state which can only be correctly set after all of the diff --git a/recipes/lora_dpo_distributed.py b/recipes/lora_dpo_distributed.py index 7e464f00e3..8888266ab5 100644 --- a/recipes/lora_dpo_distributed.py +++ b/recipes/lora_dpo_distributed.py @@ -296,7 +296,7 @@ def setup(self, cfg: DictConfig) -> None: cfg_dataset=cfg.dataset, shuffle=cfg.shuffle, batch_size=cfg.batch_size, - seed=cfg.get("seed", 0), + seed=cfg.get("seed") or 0, 
) # Finally update the recipe state which can only be correctly set after all of the diff --git a/recipes/lora_dpo_single_device.py b/recipes/lora_dpo_single_device.py index 0185a227d6..6dbed68c98 100644 --- a/recipes/lora_dpo_single_device.py +++ b/recipes/lora_dpo_single_device.py @@ -247,7 +247,7 @@ def setup(self, cfg: DictConfig) -> None: cfg_dataset=cfg.dataset, shuffle=cfg.shuffle, batch_size=cfg.batch_size, - seed=cfg.get("seed", 0), + seed=cfg.get("seed") or 0, ) # Finally update the recipe state which can only be correctly set after all of the diff --git a/recipes/lora_finetune_distributed.py b/recipes/lora_finetune_distributed.py index e67b6e66a5..1be8ba71a9 100644 --- a/recipes/lora_finetune_distributed.py +++ b/recipes/lora_finetune_distributed.py @@ -312,7 +312,7 @@ def setup(self, cfg: DictConfig) -> None: shuffle=cfg.shuffle, batch_size=cfg.batch_size, collate_fn=collate_name, - seed=cfg.get("seed", 0), + seed=cfg.get("seed") or 0, ) # Finally update the recipe state which can only be correctly set after all of the diff --git a/recipes/lora_finetune_single_device.py b/recipes/lora_finetune_single_device.py index 5561c1c01b..9c946657e6 100644 --- a/recipes/lora_finetune_single_device.py +++ b/recipes/lora_finetune_single_device.py @@ -308,7 +308,7 @@ def setup(self, cfg: DictConfig) -> None: shuffle=cfg.shuffle, batch_size=cfg.batch_size, collate_fn=collate_name, - seed=cfg.get("seed", 0), + seed=cfg.get("seed") or 0, ) # Finally update the recipe state which can only be correctly set after all of the diff --git a/recipes/ppo_full_finetune_single_device.py b/recipes/ppo_full_finetune_single_device.py index c3c501bcfe..65d7e7eeb4 100644 --- a/recipes/ppo_full_finetune_single_device.py +++ b/recipes/ppo_full_finetune_single_device.py @@ -225,7 +225,7 @@ def setup(self, cfg: DictConfig) -> None: cfg_dataset=cfg.dataset, shuffle=cfg.shuffle, batch_size=cfg.batch_size, - seed=cfg.get("seed", 0), + seed=cfg.get("seed") or 0, ) self._setup_training_parameters(cfg) diff --git a/recipes/qat_distributed.py b/recipes/qat_distributed.py index cffdc0fc9d..15e482e989 100644 --- a/recipes/qat_distributed.py +++ b/recipes/qat_distributed.py @@ -321,7 +321,7 @@ def setup(self, cfg: DictConfig) -> None: shuffle=cfg.shuffle, batch_size=cfg.batch_size, collate_fn=collate_name, - seed=cfg.get("seed", 0), + seed=cfg.get("seed") or 0, ) # Finally update the recipe state which can only be correctly set after all of the diff --git a/recipes/qat_lora_finetune_distributed.py b/recipes/qat_lora_finetune_distributed.py index 926eeb81ce..66637a3d8c 100644 --- a/recipes/qat_lora_finetune_distributed.py +++ b/recipes/qat_lora_finetune_distributed.py @@ -335,7 +335,7 @@ def setup(self, cfg: DictConfig) -> None: shuffle=cfg.shuffle, batch_size=cfg.batch_size, collate_fn=collate_name, - seed=cfg.get("seed", 0), + seed=cfg.get("seed") or 0, ) # Finally update the recipe state which can only be correctly set after all of the
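
A short note on how the pieces above fit together: the configured seed now flows from the recipe config into DistributedSampler, whose shuffle order comes from a generator seeded with seed + epoch. Because the samplers previously always used the hardcoded seed=0, passing the real seed changes the sample order, which is why the expected per-step loss values in the test patches above were updated. A minimal sketch of the pattern, with assumed helper and variable names rather than code copied verbatim from the recipes:

    from torch.utils.data import DataLoader, DistributedSampler

    def build_dataloader(ds, world_size, rank, shuffle, batch_size, seed):
        # DistributedSampler shuffles with a generator seeded by seed + epoch,
        # so a non-zero configured seed yields a different sample order than
        # the previously hardcoded seed=0.
        sampler = DistributedSampler(
            ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=seed
        )
        dataloader = DataLoader(dataset=ds, batch_size=batch_size, sampler=sampler)
        return sampler, dataloader

    # Standard usage then calls sampler.set_epoch(epoch) at the start of each
    # epoch, so the order varies across epochs while staying reproducible for
    # a fixed seed.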