pytorch · EugenHotaj · Jan 14, 2025 · Jan 18, 2025 · Jan 19, 2025 · Jan 19, 2025
diff --git a/recipes/dev/early_exit_finetune_distributed.py b/recipes/dev/early_exit_finetune_distributed.py
@@ -367,6 +367,7 @@ def setup(self, cfg: DictConfig) -> None:
             shuffle=cfg.shuffle,
             batch_size=cfg.batch_size,
             collate_fn=collate_name,
+            seed=cfg.get("seed") or 0,
         )
 
         # Finally update the recipe state which can only be correctly set after all of the
@@ -639,6 +640,7 @@ def _setup_data(
         shuffle: bool,
         batch_size: int,
         collate_fn: str,
+        seed: int,
     ) -> Tuple[DistributedSampler, DataLoader]:
         """
         All data related setup happens here. Currently this recipe only supports the
@@ -664,7 +666,7 @@ def _setup_data(
         collate_fn = _get_component_from_path(collate_fn)
 
         sampler = DistributedSampler(
-            ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=0
+            ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=seed
         )
         dataloader = DataLoader(
             dataset=ds,

diff --git a/recipes/full_finetune_distributed.py b/recipes/full_finetune_distributed.py
@@ -331,6 +331,7 @@ def setup(self, cfg: DictConfig) -> None:
             shuffle=cfg.shuffle,
             batch_size=cfg.batch_size,
             collate_fn=collate_name,
+            seed=cfg.get("seed") or 0,
         )
 
         # Finally update the recipe state which can only be correctly set after all of the
@@ -645,6 +646,7 @@ def _setup_data(
         shuffle: bool,
         batch_size: int,
         collate_fn: str,
+        seed: int,
     ) -> Tuple[DistributedSampler, DataLoader]:
         """
         All data related setup happens here. Currently this recipe only supports the
@@ -670,7 +672,7 @@ def _setup_data(
         collate_fn = _get_component_from_path(collate_fn)
 
         sampler = DistributedSampler(
-            ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=0
+            ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=seed
         )
         dataloader = DataLoader(
             dataset=ds,

diff --git a/recipes/full_finetune_single_device.py b/recipes/full_finetune_single_device.py
@@ -305,6 +305,7 @@ def setup(self, cfg: DictConfig) -> None:
             shuffle=cfg.shuffle,
             batch_size=cfg.batch_size,
             collate_fn=collate_name,
+            seed=cfg.get("seed") or 0,
         )
 
         # Finally update the recipe state which can only be correctly set after all of the
@@ -546,6 +547,7 @@ def _setup_data(
         shuffle: bool,
         batch_size: int,
         collate_fn: str,
+        seed: int,
     ) -> Tuple[DistributedSampler, DataLoader]:
         """
         All data related setup happens here. Currently this recipe only supports the
@@ -573,7 +575,7 @@ def _setup_data(
             num_replicas=1,
             rank=0,
             shuffle=shuffle,
-            seed=0,
+            seed=seed,
         )
         dataloader = DataLoader(
             dataset=ds,

diff --git a/recipes/knowledge_distillation_distributed.py b/recipes/knowledge_distillation_distributed.py
@@ -292,6 +292,7 @@ def setup(self, cfg: DictConfig) -> None:
             cfg_dataset=cfg.dataset,
             shuffle=cfg.shuffle,
             batch_size=cfg.batch_size,
+            seed=cfg.get("seed") or 0,
         )
 
         # Finally update the recipe state which can only be correctly set after all of the
@@ -638,6 +639,7 @@ def _setup_data(
         cfg_dataset: DictConfig,
         shuffle: bool,
         batch_size: int,
+        seed: int,
     ) -> Tuple[DistributedSampler, DataLoader]:
         """
         All data related setup happens here. Currently this recipe only supports
@@ -662,7 +664,7 @@ def _setup_data(
             num_replicas=world_size,
             rank=rank,
             shuffle=shuffle,
-            seed=0,
+            seed=seed,
         )
         dataloader = DataLoader(
             dataset=ds,

diff --git a/recipes/knowledge_distillation_single_device.py b/recipes/knowledge_distillation_single_device.py
@@ -287,6 +287,7 @@ def setup(self, cfg: DictConfig) -> None:
             cfg_dataset=cfg.dataset,
             shuffle=cfg.shuffle,
             batch_size=cfg.batch_size,
+            seed=cfg.get("seed") or 0,
         )
 
         # Finally update the recipe state which can only be correctly set after all of the
@@ -519,6 +520,7 @@ def _setup_data(
         cfg_dataset: DictConfig,
         shuffle: bool,
         batch_size: int,
+        seed: int,
     ) -> Tuple[DistributedSampler, DataLoader]:
         """
         All data related setup happens here. Currently this recipe only supports
@@ -541,7 +543,7 @@ def _setup_data(
             num_replicas=1,
             rank=0,
             shuffle=shuffle,
-            seed=0,
+            seed=seed,
         )
         dataloader = DataLoader(
             dataset=ds,

diff --git a/recipes/lora_dpo_distributed.py b/recipes/lora_dpo_distributed.py
@@ -296,6 +296,7 @@ def setup(self, cfg: DictConfig) -> None:
             cfg_dataset=cfg.dataset,
             shuffle=cfg.shuffle,
             batch_size=cfg.batch_size,
+            seed=cfg.get("seed") or 0,
         )
 
         # Finally update the recipe state which can only be correctly set after all of the
@@ -486,6 +487,7 @@ def _setup_data(
         cfg_dataset: DictConfig,
         shuffle: bool,
         batch_size: int,
+        seed: int,
     ) -> Tuple[DistributedSampler, DataLoader]:
         """
         All data related setup happens here. Currently this recipe only supports the
@@ -504,7 +506,7 @@ def _setup_data(
             ds = config.instantiate(cfg_dataset, tokenizer=self._tokenizer)
 
         sampler = DistributedSampler(
-            ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=0
+            ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=seed
         )
 
         dataloader = DataLoader(

diff --git a/recipes/lora_dpo_single_device.py b/recipes/lora_dpo_single_device.py
@@ -247,6 +247,7 @@ def setup(self, cfg: DictConfig) -> None:
             cfg_dataset=cfg.dataset,
             shuffle=cfg.shuffle,
             batch_size=cfg.batch_size,
+            seed=cfg.get("seed") or 0,
         )
 
         # Finally update the recipe state which can only be correctly set after all of the
@@ -369,6 +370,7 @@ def _setup_data(
         cfg_dataset: DictConfig,
         shuffle: bool,
         batch_size: int,
+        seed: int,
     ) -> Tuple[DistributedSampler, DataLoader]:
         """
         All data related setup happens here. Currently this recipe only supports
@@ -389,7 +391,7 @@ def _setup_data(
             num_replicas=1,
             rank=0,
             shuffle=shuffle,
-            seed=0,
+            seed=seed,
         )
         dataloader = DataLoader(
             dataset=ds,

diff --git a/recipes/lora_finetune_distributed.py b/recipes/lora_finetune_distributed.py
@@ -312,6 +312,7 @@ def setup(self, cfg: DictConfig) -> None:
             shuffle=cfg.shuffle,
             batch_size=cfg.batch_size,
             collate_fn=collate_name,
+            seed=cfg.get("seed") or 0,
         )
 
         # Finally update the recipe state which can only be correctly set after all of the
@@ -577,6 +578,7 @@ def _setup_data(
         shuffle: bool,
         batch_size: int,
         collate_fn: str,
+        seed: int,
     ) -> Tuple[DistributedSampler, DataLoader]:
         """
         All data related setup happens here. Currently this recipe only supports the
@@ -602,7 +604,7 @@ def _setup_data(
         collate_fn = _get_component_from_path(collate_fn)
 
         sampler = DistributedSampler(
-            ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=0
+            ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=seed
         )
 
         dataloader = DataLoader(

diff --git a/recipes/lora_finetune_single_device.py b/recipes/lora_finetune_single_device.py
@@ -308,6 +308,7 @@ def setup(self, cfg: DictConfig) -> None:
             shuffle=cfg.shuffle,
             batch_size=cfg.batch_size,
             collate_fn=collate_name,
+            seed=cfg.get("seed") or 0,
         )
 
         # Finally update the recipe state which can only be correctly set after all of the
@@ -516,6 +517,7 @@ def _setup_data(
         shuffle: bool,
         batch_size: int,
         collate_fn: str,
+        seed: int,
     ) -> Tuple[DistributedSampler, DataLoader]:
         """
         All data related setup happens here. Currently this recipe only supports
@@ -543,7 +545,7 @@ def _setup_data(
             num_replicas=1,
             rank=0,
             shuffle=shuffle,
-            seed=0,
+            seed=seed,
         )
         dataloader = DataLoader(
             dataset=ds,

diff --git a/recipes/ppo_full_finetune_single_device.py b/recipes/ppo_full_finetune_single_device.py
@@ -225,6 +225,7 @@ def setup(self, cfg: DictConfig) -> None:
             cfg_dataset=cfg.dataset,
             shuffle=cfg.shuffle,
             batch_size=cfg.batch_size,
+            seed=cfg.get("seed") or 0,
         )
 
         self._setup_training_parameters(cfg)
@@ -651,7 +652,7 @@ def _setup_optimizer(
             return optimizer
 
     def _setup_data(
-        self, cfg_dataset: DictConfig, shuffle: bool, batch_size: int
+        self, cfg_dataset: DictConfig, shuffle: bool, batch_size: int, seed: int
     ) -> Tuple[DistributedSampler, DataLoader]:
         """
         All data related setup happens here.
@@ -670,7 +671,7 @@ def _setup_data(
             num_replicas=1,
             rank=0,
             shuffle=shuffle,
-            seed=0,
+            seed=seed,
         )
         dataloader = DataLoader(
             dataset=ds,

diff --git a/recipes/qat_distributed.py b/recipes/qat_distributed.py
@@ -321,6 +321,7 @@ def setup(self, cfg: DictConfig) -> None:
             shuffle=cfg.shuffle,
             batch_size=cfg.batch_size,
             collate_fn=collate_name,
+            seed=cfg.get("seed") or 0,
         )
 
         # Finally update the recipe state which can only be correctly set after all of the
@@ -592,6 +593,7 @@ def _setup_data(
         shuffle: bool,
         batch_size: int,
         collate_fn: str,
+        seed: int,
     ) -> Tuple[DistributedSampler, DataLoader]:
         """
         All data related setup happens here. Currently this recipe only supports the
@@ -617,7 +619,7 @@ def _setup_data(
         collate_fn = _get_component_from_path(collate_fn)
 
         sampler = DistributedSampler(
-            ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=0
+            ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=seed
         )
         dataloader = DataLoader(
             dataset=ds,

diff --git a/recipes/qat_lora_finetune_distributed.py b/recipes/qat_lora_finetune_distributed.py
@@ -335,6 +335,7 @@ def setup(self, cfg: DictConfig) -> None:
             shuffle=cfg.shuffle,
             batch_size=cfg.batch_size,
             collate_fn=collate_name,
+            seed=cfg.get("seed") or 0,
         )
 
         # Finally update the recipe state which can only be correctly set after all of the
@@ -619,6 +620,7 @@ def _setup_data(
         shuffle: bool,
         batch_size: int,
         collate_fn: str,
+        seed: int,
     ) -> Tuple[DistributedSampler, DataLoader]:
         """
         All data related setup happens here. Currently this recipe only supports the
@@ -644,7 +646,7 @@ def _setup_data(
         collate_fn = _get_component_from_path(collate_fn)
 
         sampler = DistributedSampler(
-            ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=0
+            ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=seed
         )
 
         dataloader = DataLoader(

diff --git a/tests/recipes/test_full_finetune_distributed.py b/tests/recipes/test_full_finetune_distributed.py
@@ -51,15 +51,15 @@ def _get_test_config_overrides(self, epochs: int = 2):
 
     def _fetch_expected_loss_values_multi_rank(self, model_type):
         loss_values_map = {
-            "llama2": [10.5209, 10.5217, 10.4945, 10.5136],
-            "llama3": [11.9839, 11.9684, 11.9596, 11.93656],
+            "llama2": [10.5320, 10.5581, 10.4741, 10.4980],
+            "llama3": [11.9265, 11.9249, 11.9737, 11.9757],
         }
         return loss_values_map[model_type]
 
     def _fetch_expected_loss_values_single_rank(self, model_type):
         loss_values_map = {
-            "llama2": [10.5051, 10.5572, 10.4780, 10.5678],
-            "llama3": [11.9742, 12.0049, 11.9382, 12.0464],
+            "llama2": [10.5509, 10.4980, 10.4821, 10.4682],
+            "llama3": [11.8887, 11.9787, 11.9533, 11.9979],
         }
         return loss_values_map[model_type]
 

diff --git a/tests/recipes/test_full_finetune_single_device.py b/tests/recipes/test_full_finetune_single_device.py
@@ -56,8 +56,8 @@ def _get_test_config_overrides(self):
 
     def _fetch_expected_loss_values(self, model_type):
         loss_values_map = {
-            "llama2": [10.5201, 10.5217, 10.4945, 10.5136],
-            "llama3": [11.9839, 11.9684, 11.9596, 11.9366],
+            "llama2": [10.5320, 10.5581, 10.4740, 10.4984],
+            "llama3": [11.9265, 11.9249, 11.9737, 11.9757],
         }
 
         return loss_values_map[model_type]

diff --git a/tests/recipes/test_knowledge_distillation_distributed.py b/tests/recipes/test_knowledge_distillation_distributed.py
@@ -55,7 +55,7 @@ def _get_test_config_overrides(self, epochs: int = 2):
 
     def _fetch_expected_loss_values(self, model_type):
         loss_values_map = {
-            "llama3": [11.8316, 11.7520, 11.7642, 11.7664],
+            "llama3": [11.7545, 11.7653, 11.7810, 11.7969],
         }
         return loss_values_map[model_type]
 

diff --git a/tests/recipes/test_knowledge_distillation_single_device.py b/tests/recipes/test_knowledge_distillation_single_device.py
@@ -56,7 +56,7 @@ def _get_test_config_overrides(self, dtype_str: str = "fp32", epochs: int = 2):
 
     def _fetch_expected_loss_values(self, model_type):
         loss_values_map = {
-            "llama3": [11.7898, 11.7825, 11.7788, 11.7671],
+            "llama3": [11.7612, 11.7607, 11.7856, 11.7869],
         }
         return loss_values_map[model_type]
 

diff --git a/tests/recipes/test_lora_finetune_distributed.py b/tests/recipes/test_lora_finetune_distributed.py
@@ -54,8 +54,8 @@ def _fetch_expected_loss_values(self, model_type):
         # These values have been validated against single device recipe test via
         # https://gist.github.com/ebsmothers/f1c3db7c66655a23a91e0290360960c4
         loss_values_map = {
-            "llama2": [10.5209, 10.5269, 10.5130, 10.5242],
-            "llama3": [11.9839, 11.9691, 11.9617, 11.9383],
+            "llama2": [10.5320, 10.5608, 10.4895, 10.5068],
+            "llama3": [11.9265, 11.9255, 11.9754, 11.9780],
         }
         return loss_values_map[model_type]
 

diff --git a/tests/recipes/test_lora_finetune_single_device.py b/tests/recipes/test_lora_finetune_single_device.py
@@ -52,15 +52,15 @@ def _get_test_config_overrides(self, dtype_str: str = "fp32", epochs: int = 2):
 
     def _fetch_expected_loss_values(self, model_type):
         loss_values_map = {
-            "llama2": [10.5209, 10.5269, 10.5130, 10.5242],
-            "llama3": [11.9838, 11.9691, 11.9616, 11.9383],
+            "llama2": [10.5320, 10.5608, 10.4895, 10.5068],
+            "llama3": [11.9265, 11.9255, 11.9754, 11.9780],
         }
         return loss_values_map[model_type]
 
     def _fetch_qlora_expected_loss_values(self, dtype):
         if dtype == "bf16":
-            return [10.5197, 10.5272, 10.5129, 10.5243]
-        return [10.5198, 10.5271, 10.5131, 10.5244]
+            return [10.5313, 10.5575, 10.4884, 10.5073]
+        return [10.5313, 10.5576, 10.4885, 10.5071]
 
     @pytest.mark.integration_test
     @pytest.mark.parametrize(