From 93718d197df2459d1c31bd09604c8e2788c9760e Mon Sep 17 00:00:00 2001 From: Eugen Hotaj Date: Tue, 14 Jan 2025 10:53:44 -0800 Subject: [PATCH 1/6] [EZ] Pass seed to data sampler. We currently hardcode zero. I assume this is not intentional. --- recipes/dev/early_exit_finetune_distributed.py | 2 +- recipes/full_finetune_distributed.py | 2 +- recipes/full_finetune_single_device.py | 2 +- recipes/knowledge_distillation_distributed.py | 2 +- recipes/knowledge_distillation_single_device.py | 2 +- recipes/lora_dpo_distributed.py | 2 +- recipes/lora_dpo_single_device.py | 2 +- recipes/lora_finetune_distributed.py | 2 +- recipes/lora_finetune_single_device.py | 2 +- recipes/ppo_full_finetune_single_device.py | 2 +- recipes/qat_distributed.py | 2 +- recipes/qat_lora_finetune_distributed.py | 2 +- 12 files changed, 12 insertions(+), 12 deletions(-) diff --git a/recipes/dev/early_exit_finetune_distributed.py b/recipes/dev/early_exit_finetune_distributed.py index 663697e978..1d9edf8d8f 100644 --- a/recipes/dev/early_exit_finetune_distributed.py +++ b/recipes/dev/early_exit_finetune_distributed.py @@ -664,7 +664,7 @@ def _setup_data( collate_fn = _get_component_from_path(collate_fn) sampler = DistributedSampler( - ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=0 + ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=self.seed ) dataloader = DataLoader( dataset=ds, diff --git a/recipes/full_finetune_distributed.py b/recipes/full_finetune_distributed.py index 4f32faefdb..9bac042eea 100644 --- a/recipes/full_finetune_distributed.py +++ b/recipes/full_finetune_distributed.py @@ -657,7 +657,7 @@ def _setup_data( collate_fn = _get_component_from_path(collate_fn) sampler = DistributedSampler( - ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=0 + ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=self.seed ) dataloader = DataLoader( dataset=ds, diff --git a/recipes/full_finetune_single_device.py b/recipes/full_finetune_single_device.py index 946e970206..69964595d6 100644 --- a/recipes/full_finetune_single_device.py +++ b/recipes/full_finetune_single_device.py @@ -573,7 +573,7 @@ def _setup_data( num_replicas=1, rank=0, shuffle=shuffle, - seed=0, + seed=self.seed, ) dataloader = DataLoader( dataset=ds, diff --git a/recipes/knowledge_distillation_distributed.py b/recipes/knowledge_distillation_distributed.py index 77fc50927c..f1c6bd26eb 100644 --- a/recipes/knowledge_distillation_distributed.py +++ b/recipes/knowledge_distillation_distributed.py @@ -662,7 +662,7 @@ def _setup_data( num_replicas=world_size, rank=rank, shuffle=shuffle, - seed=0, + seed=self.seed, ) dataloader = DataLoader( dataset=ds, diff --git a/recipes/knowledge_distillation_single_device.py b/recipes/knowledge_distillation_single_device.py index 71d850d791..9764243e95 100644 --- a/recipes/knowledge_distillation_single_device.py +++ b/recipes/knowledge_distillation_single_device.py @@ -537,7 +537,7 @@ def _setup_data( num_replicas=1, rank=0, shuffle=shuffle, - seed=0, + seed=self.seed, ) dataloader = DataLoader( dataset=ds, diff --git a/recipes/lora_dpo_distributed.py b/recipes/lora_dpo_distributed.py index d54adc2cf4..cb7c97d3f7 100644 --- a/recipes/lora_dpo_distributed.py +++ b/recipes/lora_dpo_distributed.py @@ -504,7 +504,7 @@ def _setup_data( ds = config.instantiate(cfg_dataset, tokenizer=self._tokenizer) sampler = DistributedSampler( - ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=0 + ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=self.seed ) dataloader = 
DataLoader( diff --git a/recipes/lora_dpo_single_device.py b/recipes/lora_dpo_single_device.py index 9b5dc6fb1a..189e6a90fa 100644 --- a/recipes/lora_dpo_single_device.py +++ b/recipes/lora_dpo_single_device.py @@ -389,7 +389,7 @@ def _setup_data( num_replicas=1, rank=0, shuffle=shuffle, - seed=0, + seed=self.seed, ) dataloader = DataLoader( dataset=ds, diff --git a/recipes/lora_finetune_distributed.py b/recipes/lora_finetune_distributed.py index 39c8b104e5..707b7fed4a 100644 --- a/recipes/lora_finetune_distributed.py +++ b/recipes/lora_finetune_distributed.py @@ -602,7 +602,7 @@ def _setup_data( collate_fn = _get_component_from_path(collate_fn) sampler = DistributedSampler( - ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=0 + ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=self.seed ) dataloader = DataLoader( diff --git a/recipes/lora_finetune_single_device.py b/recipes/lora_finetune_single_device.py index 9a3f3eacfb..a5caca9254 100644 --- a/recipes/lora_finetune_single_device.py +++ b/recipes/lora_finetune_single_device.py @@ -543,7 +543,7 @@ def _setup_data( num_replicas=1, rank=0, shuffle=shuffle, - seed=0, + seed=self.seed, ) dataloader = DataLoader( dataset=ds, diff --git a/recipes/ppo_full_finetune_single_device.py b/recipes/ppo_full_finetune_single_device.py index cb6357c3dc..29834d1e70 100644 --- a/recipes/ppo_full_finetune_single_device.py +++ b/recipes/ppo_full_finetune_single_device.py @@ -579,7 +579,7 @@ def _setup_data( num_replicas=1, rank=0, shuffle=shuffle, - seed=0, + seed=self.seed, ) dataloader = DataLoader( dataset=ds, diff --git a/recipes/qat_distributed.py b/recipes/qat_distributed.py index 8c458daa21..fb9226beba 100644 --- a/recipes/qat_distributed.py +++ b/recipes/qat_distributed.py @@ -617,7 +617,7 @@ def _setup_data( collate_fn = _get_component_from_path(collate_fn) sampler = DistributedSampler( - ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=0 + ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=self.seed ) dataloader = DataLoader( dataset=ds, diff --git a/recipes/qat_lora_finetune_distributed.py b/recipes/qat_lora_finetune_distributed.py index c742dae226..c1dcbaf24c 100644 --- a/recipes/qat_lora_finetune_distributed.py +++ b/recipes/qat_lora_finetune_distributed.py @@ -644,7 +644,7 @@ def _setup_data( collate_fn = _get_component_from_path(collate_fn) sampler = DistributedSampler( - ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=0 + ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=self.seed ) dataloader = DataLoader( From e8b9c848374ce84b023acf016ba13933aad02c0f Mon Sep 17 00:00:00 2001 From: Eugen Hotaj Date: Sat, 18 Jan 2025 11:04:17 -0800 Subject: [PATCH 2/6] comments --- recipes/dev/early_exit_finetune_distributed.py | 4 +++- recipes/full_finetune_distributed.py | 4 +++- recipes/full_finetune_single_device.py | 4 +++- recipes/knowledge_distillation_distributed.py | 4 +++- recipes/knowledge_distillation_single_device.py | 4 +++- recipes/lora_dpo_distributed.py | 4 +++- recipes/lora_dpo_single_device.py | 4 +++- recipes/lora_finetune_distributed.py | 4 +++- recipes/ppo_full_finetune_single_device.py | 5 +++-- recipes/qat_distributed.py | 4 +++- recipes/qat_lora_finetune_distributed.py | 4 +++- 11 files changed, 33 insertions(+), 12 deletions(-) diff --git a/recipes/dev/early_exit_finetune_distributed.py b/recipes/dev/early_exit_finetune_distributed.py index 1d9edf8d8f..36785e1ab3 100644 --- a/recipes/dev/early_exit_finetune_distributed.py +++ 
b/recipes/dev/early_exit_finetune_distributed.py @@ -367,6 +367,7 @@ def setup(self, cfg: DictConfig) -> None: shuffle=cfg.shuffle, batch_size=cfg.batch_size, collate_fn=collate_name, + seed=cfg.get("seed", 0), ) # Finally update the recipe state which can only be correctly set after all of the @@ -639,6 +640,7 @@ def _setup_data( shuffle: bool, batch_size: int, collate_fn: str, + seed: int, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. Currently this recipe only supports the @@ -664,7 +666,7 @@ def _setup_data( collate_fn = _get_component_from_path(collate_fn) sampler = DistributedSampler( - ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=self.seed + ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=seed ) dataloader = DataLoader( dataset=ds, diff --git a/recipes/full_finetune_distributed.py b/recipes/full_finetune_distributed.py index 9bac042eea..8b02d3b68e 100644 --- a/recipes/full_finetune_distributed.py +++ b/recipes/full_finetune_distributed.py @@ -318,6 +318,7 @@ def setup(self, cfg: DictConfig) -> None: shuffle=cfg.shuffle, batch_size=cfg.batch_size, collate_fn=collate_name, + seed=cfg.get("seed", 0), ) # Finally update the recipe state which can only be correctly set after all of the @@ -632,6 +633,7 @@ def _setup_data( shuffle: bool, batch_size: int, collate_fn: str, + seed: int, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. Currently this recipe only supports the @@ -657,7 +659,7 @@ def _setup_data( collate_fn = _get_component_from_path(collate_fn) sampler = DistributedSampler( - ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=self.seed + ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=seed ) dataloader = DataLoader( dataset=ds, diff --git a/recipes/full_finetune_single_device.py b/recipes/full_finetune_single_device.py index 69964595d6..810ac4c38b 100644 --- a/recipes/full_finetune_single_device.py +++ b/recipes/full_finetune_single_device.py @@ -305,6 +305,7 @@ def setup(self, cfg: DictConfig) -> None: shuffle=cfg.shuffle, batch_size=cfg.batch_size, collate_fn=collate_name, + seed=cfg.get("seed", 0), ) # Finally update the recipe state which can only be correctly set after all of the @@ -546,6 +547,7 @@ def _setup_data( shuffle: bool, batch_size: int, collate_fn: str, + seed: int, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. Currently this recipe only supports the @@ -573,7 +575,7 @@ def _setup_data( num_replicas=1, rank=0, shuffle=shuffle, - seed=self.seed, + seed=seed, ) dataloader = DataLoader( dataset=ds, diff --git a/recipes/knowledge_distillation_distributed.py b/recipes/knowledge_distillation_distributed.py index f1c6bd26eb..b93330345c 100644 --- a/recipes/knowledge_distillation_distributed.py +++ b/recipes/knowledge_distillation_distributed.py @@ -292,6 +292,7 @@ def setup(self, cfg: DictConfig) -> None: cfg_dataset=cfg.dataset, shuffle=cfg.shuffle, batch_size=cfg.batch_size, + seed=cfg.get("seed", 0), ) # Finally update the recipe state which can only be correctly set after all of the @@ -638,6 +639,7 @@ def _setup_data( cfg_dataset: DictConfig, shuffle: bool, batch_size: int, + seed: int, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. 
Currently this recipe only supports @@ -662,7 +664,7 @@ def _setup_data( num_replicas=world_size, rank=rank, shuffle=shuffle, - seed=self.seed, + seed=seed, ) dataloader = DataLoader( dataset=ds, diff --git a/recipes/knowledge_distillation_single_device.py b/recipes/knowledge_distillation_single_device.py index 9764243e95..d0dd026a75 100644 --- a/recipes/knowledge_distillation_single_device.py +++ b/recipes/knowledge_distillation_single_device.py @@ -283,6 +283,7 @@ def setup(self, cfg: DictConfig) -> None: cfg_dataset=cfg.dataset, shuffle=cfg.shuffle, batch_size=cfg.batch_size, + seed=cfg.get("seed", 0), ) # Finally update the recipe state which can only be correctly set after all of the @@ -515,6 +516,7 @@ def _setup_data( cfg_dataset: DictConfig, shuffle: bool, batch_size: int, + seed: int, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. Currently this recipe only supports @@ -537,7 +539,7 @@ def _setup_data( num_replicas=1, rank=0, shuffle=shuffle, - seed=self.seed, + seed=seed, ) dataloader = DataLoader( dataset=ds, diff --git a/recipes/lora_dpo_distributed.py b/recipes/lora_dpo_distributed.py index cb7c97d3f7..7e464f00e3 100644 --- a/recipes/lora_dpo_distributed.py +++ b/recipes/lora_dpo_distributed.py @@ -296,6 +296,7 @@ def setup(self, cfg: DictConfig) -> None: cfg_dataset=cfg.dataset, shuffle=cfg.shuffle, batch_size=cfg.batch_size, + seed=cfg.get("seed", 0), ) # Finally update the recipe state which can only be correctly set after all of the @@ -486,6 +487,7 @@ def _setup_data( cfg_dataset: DictConfig, shuffle: bool, batch_size: int, + seed: int, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. Currently this recipe only supports the @@ -504,7 +506,7 @@ def _setup_data( ds = config.instantiate(cfg_dataset, tokenizer=self._tokenizer) sampler = DistributedSampler( - ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=self.seed + ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=seed ) dataloader = DataLoader( diff --git a/recipes/lora_dpo_single_device.py b/recipes/lora_dpo_single_device.py index 189e6a90fa..965b3c5849 100644 --- a/recipes/lora_dpo_single_device.py +++ b/recipes/lora_dpo_single_device.py @@ -247,6 +247,7 @@ def setup(self, cfg: DictConfig) -> None: cfg_dataset=cfg.dataset, shuffle=cfg.shuffle, batch_size=cfg.batch_size, + seed=cfg.get("seed", 0), ) # Finally update the recipe state which can only be correctly set after all of the @@ -369,6 +370,7 @@ def _setup_data( cfg_dataset: DictConfig, shuffle: bool, batch_size: int, + seed: int, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. Currently this recipe only supports @@ -389,7 +391,7 @@ def _setup_data( num_replicas=1, rank=0, shuffle=shuffle, - seed=self.seed, + seed=seed, ) dataloader = DataLoader( dataset=ds, diff --git a/recipes/lora_finetune_distributed.py b/recipes/lora_finetune_distributed.py index 707b7fed4a..46848130af 100644 --- a/recipes/lora_finetune_distributed.py +++ b/recipes/lora_finetune_distributed.py @@ -312,6 +312,7 @@ def setup(self, cfg: DictConfig) -> None: shuffle=cfg.shuffle, batch_size=cfg.batch_size, collate_fn=collate_name, + seed=cfg.get("seed", 0), ) # Finally update the recipe state which can only be correctly set after all of the @@ -577,6 +578,7 @@ def _setup_data( shuffle: bool, batch_size: int, collate_fn: str, + seed: int, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. 
Currently this recipe only supports the @@ -602,7 +604,7 @@ def _setup_data( collate_fn = _get_component_from_path(collate_fn) sampler = DistributedSampler( - ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=self.seed + ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=seed ) dataloader = DataLoader( diff --git a/recipes/ppo_full_finetune_single_device.py b/recipes/ppo_full_finetune_single_device.py index 29834d1e70..dcb125c2e2 100644 --- a/recipes/ppo_full_finetune_single_device.py +++ b/recipes/ppo_full_finetune_single_device.py @@ -218,6 +218,7 @@ def setup(self, cfg: DictConfig) -> None: cfg_dataset=cfg.dataset, shuffle=cfg.shuffle, batch_size=cfg.batch_size, + seed=cfg.get("seed", 0), ) self._setup_training_parameters(cfg) @@ -560,7 +561,7 @@ def _setup_optimizer( return optimizer def _setup_data( - self, cfg_dataset: DictConfig, shuffle: bool, batch_size: int + self, cfg_dataset: DictConfig, shuffle: bool, batch_size: int, seed: int ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. @@ -579,7 +580,7 @@ def _setup_data( num_replicas=1, rank=0, shuffle=shuffle, - seed=self.seed, + seed=seed, ) dataloader = DataLoader( dataset=ds, diff --git a/recipes/qat_distributed.py b/recipes/qat_distributed.py index fb9226beba..f0989640a7 100644 --- a/recipes/qat_distributed.py +++ b/recipes/qat_distributed.py @@ -321,6 +321,7 @@ def setup(self, cfg: DictConfig) -> None: shuffle=cfg.shuffle, batch_size=cfg.batch_size, collate_fn=collate_name, + seed=cfg.get("seed", 0), ) # Finally update the recipe state which can only be correctly set after all of the @@ -592,6 +593,7 @@ def _setup_data( shuffle: bool, batch_size: int, collate_fn: str, + seed: int, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. Currently this recipe only supports the @@ -617,7 +619,7 @@ def _setup_data( collate_fn = _get_component_from_path(collate_fn) sampler = DistributedSampler( - ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=self.seed + ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=seed ) dataloader = DataLoader( dataset=ds, diff --git a/recipes/qat_lora_finetune_distributed.py b/recipes/qat_lora_finetune_distributed.py index c1dcbaf24c..a48067b1c4 100644 --- a/recipes/qat_lora_finetune_distributed.py +++ b/recipes/qat_lora_finetune_distributed.py @@ -335,6 +335,7 @@ def setup(self, cfg: DictConfig) -> None: shuffle=cfg.shuffle, batch_size=cfg.batch_size, collate_fn=collate_name, + seed=cfg.get("seed", 0), ) # Finally update the recipe state which can only be correctly set after all of the @@ -619,6 +620,7 @@ def _setup_data( shuffle: bool, batch_size: int, collate_fn: str, + seed: int, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. 
Currently this recipe only supports the @@ -644,7 +646,7 @@ def _setup_data( collate_fn = _get_component_from_path(collate_fn) sampler = DistributedSampler( - ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=self.seed + ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=seed ) dataloader = DataLoader( From 41147aa2eb364d17f95df85f42c2f8df21b89ba3 Mon Sep 17 00:00:00 2001 From: Eugen Hotaj Date: Sat, 18 Jan 2025 16:12:56 -0800 Subject: [PATCH 3/6] missed one --- recipes/lora_finetune_single_device.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/recipes/lora_finetune_single_device.py b/recipes/lora_finetune_single_device.py index a5caca9254..0ff4f03ed7 100644 --- a/recipes/lora_finetune_single_device.py +++ b/recipes/lora_finetune_single_device.py @@ -308,6 +308,7 @@ def setup(self, cfg: DictConfig) -> None: shuffle=cfg.shuffle, batch_size=cfg.batch_size, collate_fn=collate_name, + seed=cfg.get("seed", 0), ) # Finally update the recipe state which can only be correctly set after all of the @@ -516,6 +517,7 @@ def _setup_data( shuffle: bool, batch_size: int, collate_fn: str, + seed: int, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. Currently this recipe only supports @@ -543,7 +545,7 @@ def _setup_data( num_replicas=1, rank=0, shuffle=shuffle, - seed=self.seed, + seed=seed, ) dataloader = DataLoader( dataset=ds, From aec6477510c324c3416de4c6a92f70ae3beedada Mon Sep 17 00:00:00 2001 From: Eugen Hotaj Date: Sun, 19 Jan 2025 13:52:49 -0800 Subject: [PATCH 4/6] Update tests to match new loss values. --- .../recipes/test_full_finetune_distributed.py | 8 +++---- .../test_full_finetune_single_device.py | 4 ++-- ...test_knowledge_distillation_distributed.py | 2 +- ...st_knowledge_distillation_single_device.py | 2 +- .../recipes/test_lora_finetune_distributed.py | 4 ++-- .../test_lora_finetune_single_device.py | 8 +++---- .../test_ppo_full_finetune_single_device.py | 24 +++++++++---------- .../test_qat_lora_finetune_distributed.py | 2 +- 8 files changed, 27 insertions(+), 27 deletions(-) diff --git a/tests/recipes/test_full_finetune_distributed.py b/tests/recipes/test_full_finetune_distributed.py index 4cdc42d96b..ecc2e69899 100644 --- a/tests/recipes/test_full_finetune_distributed.py +++ b/tests/recipes/test_full_finetune_distributed.py @@ -51,15 +51,15 @@ def _get_test_config_overrides(self, epochs: int = 2): def _fetch_expected_loss_values_multi_rank(self, model_type): loss_values_map = { - "llama2": [10.5209, 10.5217, 10.4945, 10.5136], - "llama3": [11.9839, 11.9684, 11.9596, 11.93656], + "llama2": [10.5320, 10.5581, 10.4741, 10.4980], + "llama3": [11.9265, 11.9249, 11.9737, 11.9757], } return loss_values_map[model_type] def _fetch_expected_loss_values_single_rank(self, model_type): loss_values_map = { - "llama2": [10.5051, 10.5572, 10.4780, 10.5678], - "llama3": [11.9742, 12.0049, 11.9382, 12.0464], + "llama2": [10.5509, 10.4980, 10.4821, 10.4682], + "llama3": [11.8887, 11.9787, 11.9533, 11.9979], } return loss_values_map[model_type] diff --git a/tests/recipes/test_full_finetune_single_device.py b/tests/recipes/test_full_finetune_single_device.py index d15601a1b1..69e3d3973b 100644 --- a/tests/recipes/test_full_finetune_single_device.py +++ b/tests/recipes/test_full_finetune_single_device.py @@ -56,8 +56,8 @@ def _get_test_config_overrides(self): def _fetch_expected_loss_values(self, model_type): loss_values_map = { - "llama2": [10.5201, 10.5217, 10.4945, 10.5136], - "llama3": [11.9839, 11.9684, 11.9596, 
11.9366], + "llama2": [10.5320, 10.5581, 10.4740, 10.4984], + "llama3": [11.9265, 11.9249, 11.9737, 11.9757], } return loss_values_map[model_type] diff --git a/tests/recipes/test_knowledge_distillation_distributed.py b/tests/recipes/test_knowledge_distillation_distributed.py index 43c29400c3..8ed82689b8 100644 --- a/tests/recipes/test_knowledge_distillation_distributed.py +++ b/tests/recipes/test_knowledge_distillation_distributed.py @@ -55,7 +55,7 @@ def _get_test_config_overrides(self, epochs: int = 2): def _fetch_expected_loss_values(self, model_type): loss_values_map = { - "llama3": [11.8316, 11.7520, 11.7642, 11.7664], + "llama3": [11.7545, 11.7653, 11.7810, 11.7969], } return loss_values_map[model_type] diff --git a/tests/recipes/test_knowledge_distillation_single_device.py b/tests/recipes/test_knowledge_distillation_single_device.py index 76a6b1479c..e7cee96003 100644 --- a/tests/recipes/test_knowledge_distillation_single_device.py +++ b/tests/recipes/test_knowledge_distillation_single_device.py @@ -56,7 +56,7 @@ def _get_test_config_overrides(self, dtype_str: str = "fp32", epochs: int = 2): def _fetch_expected_loss_values(self, model_type): loss_values_map = { - "llama3": [11.7898, 11.7825, 11.7788, 11.7671], + "llama3": [11.7612, 11.7607, 11.7856, 11.7869], } return loss_values_map[model_type] diff --git a/tests/recipes/test_lora_finetune_distributed.py b/tests/recipes/test_lora_finetune_distributed.py index ef2686aeba..790bc780b3 100644 --- a/tests/recipes/test_lora_finetune_distributed.py +++ b/tests/recipes/test_lora_finetune_distributed.py @@ -54,8 +54,8 @@ def _fetch_expected_loss_values(self, model_type): # These values have been validated against single device recipe test via # https://gist.github.com/ebsmothers/f1c3db7c66655a23a91e0290360960c4 loss_values_map = { - "llama2": [10.5209, 10.5269, 10.5130, 10.5242], - "llama3": [11.9839, 11.9691, 11.9617, 11.9383], + "llama2": [10.5320, 10.5608, 10.4895, 10.5068], + "llama3": [11.9259, 11.9250, 11.9753, 11.9774], } return loss_values_map[model_type] diff --git a/tests/recipes/test_lora_finetune_single_device.py b/tests/recipes/test_lora_finetune_single_device.py index 39b1fa3b6a..a760de0d47 100644 --- a/tests/recipes/test_lora_finetune_single_device.py +++ b/tests/recipes/test_lora_finetune_single_device.py @@ -52,15 +52,15 @@ def _get_test_config_overrides(self, dtype_str: str = "fp32", epochs: int = 2): def _fetch_expected_loss_values(self, model_type): loss_values_map = { - "llama2": [10.5209, 10.5269, 10.5130, 10.5242], - "llama3": [11.9838, 11.9691, 11.9616, 11.9383], + "llama2": [10.5320, 10.5608, 10.4895, 10.5068], + "llama3": [11.9265, 11.9255, 11.9754, 11.9780], } return loss_values_map[model_type] def _fetch_qlora_expected_loss_values(self, dtype): if dtype == "bf16": - return [10.5197, 10.5272, 10.5129, 10.5243] - return [10.5198, 10.5271, 10.5131, 10.5244] + return [10.5313, 10.5575, 10.4884, 10.5073] + return [10.5313, 10.5576, 10.4885, 10.5071] @pytest.mark.integration_test @pytest.mark.parametrize( diff --git a/tests/recipes/test_ppo_full_finetune_single_device.py b/tests/recipes/test_ppo_full_finetune_single_device.py index 412c4c06dd..acb67b016b 100644 --- a/tests/recipes/test_ppo_full_finetune_single_device.py +++ b/tests/recipes/test_ppo_full_finetune_single_device.py @@ -123,18 +123,18 @@ def test_loss(self, tmpdir, monkeypatch): loss_values = get_loss_values_from_metric_logger(log_file) expected_loss_values = [ - 1.0403, - 0.9495, - 0.9084, - 1.0494, - 0.9609, - 0.8846, - 1.0282, - 0.9390, - 0.8915, - 
1.0166, - 0.9231, - 0.9352, + 1.0522, + 0.9608, + 0.9141, + 1.0410, + 0.9544, + 0.8663, + 1.0258, + 0.9375, + 0.8831, + 1.0182, + 0.9241, + 0.9411, ] torch.testing.assert_close( loss_values, expected_loss_values, atol=1e-4, rtol=1e-5 diff --git a/tests/recipes/test_qat_lora_finetune_distributed.py b/tests/recipes/test_qat_lora_finetune_distributed.py index 6c43adcc73..be49c0c80f 100644 --- a/tests/recipes/test_qat_lora_finetune_distributed.py +++ b/tests/recipes/test_qat_lora_finetune_distributed.py @@ -53,7 +53,7 @@ def _get_test_config_overrides(self): def _fetch_expected_loss_values(self, model_type): loss_values_map = { - "llama3": [11.9835, 11.9694, 11.9615, 11.9383], + "llama3": [11.9259, 11.9250, 11.9753, 11.9774], } return loss_values_map[model_type] From 3708bf9e9e7379fe17cbdd8bb4d2367f5c6a3d99 Mon Sep 17 00:00:00 2001 From: Eugen Hotaj Date: Sun, 19 Jan 2025 16:17:49 -0800 Subject: [PATCH 5/6] Update missed test, fix miscopied values. --- tests/recipes/test_lora_finetune_distributed.py | 2 +- tests/recipes/test_qat_distributed.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/recipes/test_lora_finetune_distributed.py b/tests/recipes/test_lora_finetune_distributed.py index 790bc780b3..6ea05c3aa0 100644 --- a/tests/recipes/test_lora_finetune_distributed.py +++ b/tests/recipes/test_lora_finetune_distributed.py @@ -55,7 +55,7 @@ def _fetch_expected_loss_values(self, model_type): # https://gist.github.com/ebsmothers/f1c3db7c66655a23a91e0290360960c4 loss_values_map = { "llama2": [10.5320, 10.5608, 10.4895, 10.5068], - "llama3": [11.9259, 11.9250, 11.9753, 11.9774], + "llama3": [11.9265, 11.9255, 11.9754, 11.9780], } return loss_values_map[model_type] diff --git a/tests/recipes/test_qat_distributed.py b/tests/recipes/test_qat_distributed.py index 34dd190125..9f2193a13f 100644 --- a/tests/recipes/test_qat_distributed.py +++ b/tests/recipes/test_qat_distributed.py @@ -45,8 +45,8 @@ def _get_test_config_overrides(self): def _fetch_expected_loss_values(self, model_type): loss_values_map = { - "llama2": [10.5211, 10.5217, 10.4944, 10.5134], - "llama3": [11.9836, 11.9683, 11.9594, 11.9366], + "llama2": [10.5337, 10.5563, 10.4786, 10.5002], + "llama3": [11.9270, 11.9240, 11.9731, 11.9751], } return loss_values_map[model_type] From 9f7951ec99500ae52ec53d5c03eeb47d373b7a94 Mon Sep 17 00:00:00 2001 From: Eugen Hotaj Date: Tue, 28 Jan 2025 12:05:45 -0800 Subject: [PATCH 6/6] Handle edge case where "seed" does not exist in config. 
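Why cfg.get("seed") or 0 rather than cfg.get("seed", 0): if the key is genuinely absent, both expressions return 0, but recipe configs commonly set seed: null explicitly; in that case .get returns None (the key exists, so the default is skipped), and the "or 0" form coerces that None to 0 as well. A minimal sketch of the difference, using a plain dict and assuming OmegaConf's DictConfig.get treats a null value the same way:

    cfg = {"seed": None}    # e.g. "seed: null" in a recipe YAML
    cfg.get("seed", 0)      # -> None: the key exists, so the default is ignored
    cfg.get("seed") or 0    # -> 0: None is falsy, so we fall back to 0
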
--- recipes/dev/early_exit_finetune_distributed.py | 2 +- recipes/full_finetune_distributed.py | 2 +- recipes/full_finetune_single_device.py | 2 +- recipes/knowledge_distillation_distributed.py | 2 +- recipes/knowledge_distillation_single_device.py | 2 +- recipes/lora_dpo_distributed.py | 2 +- recipes/lora_dpo_single_device.py | 2 +- recipes/lora_finetune_distributed.py | 2 +- recipes/lora_finetune_single_device.py | 2 +- recipes/ppo_full_finetune_single_device.py | 2 +- recipes/qat_distributed.py | 2 +- recipes/qat_lora_finetune_distributed.py | 2 +- 12 files changed, 12 insertions(+), 12 deletions(-) diff --git a/recipes/dev/early_exit_finetune_distributed.py b/recipes/dev/early_exit_finetune_distributed.py index 3d9e054fb8..5492b0d2cd 100644 --- a/recipes/dev/early_exit_finetune_distributed.py +++ b/recipes/dev/early_exit_finetune_distributed.py @@ -367,7 +367,7 @@ def setup(self, cfg: DictConfig) -> None: shuffle=cfg.shuffle, batch_size=cfg.batch_size, collate_fn=collate_name, - seed=cfg.get("seed", 0), + seed=cfg.get("seed") or 0, ) # Finally update the recipe state which can only be correctly set after all of the diff --git a/recipes/full_finetune_distributed.py b/recipes/full_finetune_distributed.py index 80ec3e2495..b0dfdef924 100644 --- a/recipes/full_finetune_distributed.py +++ b/recipes/full_finetune_distributed.py @@ -318,7 +318,7 @@ def setup(self, cfg: DictConfig) -> None: shuffle=cfg.shuffle, batch_size=cfg.batch_size, collate_fn=collate_name, - seed=cfg.get("seed", 0), + seed=cfg.get("seed") or 0, ) # Finally update the recipe state which can only be correctly set after all of the diff --git a/recipes/full_finetune_single_device.py b/recipes/full_finetune_single_device.py index 8e8e5f400a..f842a45017 100644 --- a/recipes/full_finetune_single_device.py +++ b/recipes/full_finetune_single_device.py @@ -305,7 +305,7 @@ def setup(self, cfg: DictConfig) -> None: shuffle=cfg.shuffle, batch_size=cfg.batch_size, collate_fn=collate_name, - seed=cfg.get("seed", 0), + seed=cfg.get("seed") or 0, ) # Finally update the recipe state which can only be correctly set after all of the diff --git a/recipes/knowledge_distillation_distributed.py b/recipes/knowledge_distillation_distributed.py index daef5c79bd..5dfae086ad 100644 --- a/recipes/knowledge_distillation_distributed.py +++ b/recipes/knowledge_distillation_distributed.py @@ -292,7 +292,7 @@ def setup(self, cfg: DictConfig) -> None: cfg_dataset=cfg.dataset, shuffle=cfg.shuffle, batch_size=cfg.batch_size, - seed=cfg.get("seed", 0), + seed=cfg.get("seed") or 0, ) # Finally update the recipe state which can only be correctly set after all of the diff --git a/recipes/knowledge_distillation_single_device.py b/recipes/knowledge_distillation_single_device.py index a2552bdd9f..c9cf058784 100644 --- a/recipes/knowledge_distillation_single_device.py +++ b/recipes/knowledge_distillation_single_device.py @@ -287,7 +287,7 @@ def setup(self, cfg: DictConfig) -> None: cfg_dataset=cfg.dataset, shuffle=cfg.shuffle, batch_size=cfg.batch_size, - seed=cfg.get("seed", 0), + seed=cfg.get("seed") or 0, ) # Finally update the recipe state which can only be correctly set after all of the diff --git a/recipes/lora_dpo_distributed.py b/recipes/lora_dpo_distributed.py index 7e464f00e3..8888266ab5 100644 --- a/recipes/lora_dpo_distributed.py +++ b/recipes/lora_dpo_distributed.py @@ -296,7 +296,7 @@ def setup(self, cfg: DictConfig) -> None: cfg_dataset=cfg.dataset, shuffle=cfg.shuffle, batch_size=cfg.batch_size, - seed=cfg.get("seed", 0), + seed=cfg.get("seed") or 0, 
) # Finally update the recipe state which can only be correctly set after all of the diff --git a/recipes/lora_dpo_single_device.py b/recipes/lora_dpo_single_device.py index 0185a227d6..6dbed68c98 100644 --- a/recipes/lora_dpo_single_device.py +++ b/recipes/lora_dpo_single_device.py @@ -247,7 +247,7 @@ def setup(self, cfg: DictConfig) -> None: cfg_dataset=cfg.dataset, shuffle=cfg.shuffle, batch_size=cfg.batch_size, - seed=cfg.get("seed", 0), + seed=cfg.get("seed") or 0, ) # Finally update the recipe state which can only be correctly set after all of the diff --git a/recipes/lora_finetune_distributed.py b/recipes/lora_finetune_distributed.py index e67b6e66a5..1be8ba71a9 100644 --- a/recipes/lora_finetune_distributed.py +++ b/recipes/lora_finetune_distributed.py @@ -312,7 +312,7 @@ def setup(self, cfg: DictConfig) -> None: shuffle=cfg.shuffle, batch_size=cfg.batch_size, collate_fn=collate_name, - seed=cfg.get("seed", 0), + seed=cfg.get("seed") or 0, ) # Finally update the recipe state which can only be correctly set after all of the diff --git a/recipes/lora_finetune_single_device.py b/recipes/lora_finetune_single_device.py index 5561c1c01b..9c946657e6 100644 --- a/recipes/lora_finetune_single_device.py +++ b/recipes/lora_finetune_single_device.py @@ -308,7 +308,7 @@ def setup(self, cfg: DictConfig) -> None: shuffle=cfg.shuffle, batch_size=cfg.batch_size, collate_fn=collate_name, - seed=cfg.get("seed", 0), + seed=cfg.get("seed") or 0, ) # Finally update the recipe state which can only be correctly set after all of the diff --git a/recipes/ppo_full_finetune_single_device.py b/recipes/ppo_full_finetune_single_device.py index c3c501bcfe..65d7e7eeb4 100644 --- a/recipes/ppo_full_finetune_single_device.py +++ b/recipes/ppo_full_finetune_single_device.py @@ -225,7 +225,7 @@ def setup(self, cfg: DictConfig) -> None: cfg_dataset=cfg.dataset, shuffle=cfg.shuffle, batch_size=cfg.batch_size, - seed=cfg.get("seed", 0), + seed=cfg.get("seed") or 0, ) self._setup_training_parameters(cfg) diff --git a/recipes/qat_distributed.py b/recipes/qat_distributed.py index cffdc0fc9d..15e482e989 100644 --- a/recipes/qat_distributed.py +++ b/recipes/qat_distributed.py @@ -321,7 +321,7 @@ def setup(self, cfg: DictConfig) -> None: shuffle=cfg.shuffle, batch_size=cfg.batch_size, collate_fn=collate_name, - seed=cfg.get("seed", 0), + seed=cfg.get("seed") or 0, ) # Finally update the recipe state which can only be correctly set after all of the diff --git a/recipes/qat_lora_finetune_distributed.py b/recipes/qat_lora_finetune_distributed.py index 926eeb81ce..66637a3d8c 100644 --- a/recipes/qat_lora_finetune_distributed.py +++ b/recipes/qat_lora_finetune_distributed.py @@ -335,7 +335,7 @@ def setup(self, cfg: DictConfig) -> None: shuffle=cfg.shuffle, batch_size=cfg.batch_size, collate_fn=collate_name, - seed=cfg.get("seed", 0), + seed=cfg.get("seed") or 0, ) # Finally update the recipe state which can only be correctly set after all of the
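
A short note on how the pieces above fit together: the configured seed now flows from the recipe config into DistributedSampler, whose shuffle order comes from a generator seeded with seed + epoch. Because the samplers previously always used the hardcoded seed=0, passing the real seed changes the sample order, which is why the expected per-step loss values in the test patches above were updated. A minimal sketch of the pattern, with assumed helper and variable names rather than code copied verbatim from the recipes:

    from torch.utils.data import DataLoader, DistributedSampler

    def build_dataloader(ds, world_size, rank, shuffle, batch_size, seed):
        # DistributedSampler shuffles with a generator seeded by seed + epoch,
        # so a non-zero configured seed yields a different sample order than
        # the previously hardcoded seed=0.
        sampler = DistributedSampler(
            ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=seed
        )
        dataloader = DataLoader(dataset=ds, batch_size=batch_size, sampler=sampler)
        return sampler, dataloader

    # Standard usage then calls sampler.set_epoch(epoch) at the start of each
    # epoch, so the order varies across epochs while staying reproducible for
    # a fixed seed.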