From 16eed831752b0dd2e420436365fc3d7b15180852 Mon Sep 17 00:00:00 2001
From: Kyle Sayers
Date: Mon, 10 Mar 2025 18:04:08 -0400
Subject: [PATCH 1/4] remove double initialize

Signed-off-by: Kyle Sayers
---
 src/llmcompressor/core/lifecycle.py                      | 9 ++++++---
 src/llmcompressor/transformers/finetune/session_mixin.py | 4 +++-
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/src/llmcompressor/core/lifecycle.py b/src/llmcompressor/core/lifecycle.py
index e69882800..bfbc297fd 100644
--- a/src/llmcompressor/core/lifecycle.py
+++ b/src/llmcompressor/core/lifecycle.py
@@ -84,10 +84,13 @@ def initialize(
         :return: List of data returned from initialization of modifiers
         :rtype: List[Any]
         """
-        self.state.update(**kwargs)
-        if self.initialized_:  # TODO: do not initialize twice
-            return
+        if self.initialized_:
+            raise ValueError(
+                "Initialize was called twice. To update state values after "
+                "initialization, please use `active_session().state.update()`"
+            )

+        self.state.update(**kwargs)
         logger.debug("Initializing compression lifecycle")
         self.recipe_container.append(recipe, recipe_stage, recipe_args)
         self.modifiers = self.recipe_container.get_modifiers()
diff --git a/src/llmcompressor/transformers/finetune/session_mixin.py b/src/llmcompressor/transformers/finetune/session_mixin.py
index f64916e69..50a92b1f3 100644
--- a/src/llmcompressor/transformers/finetune/session_mixin.py
+++ b/src/llmcompressor/transformers/finetune/session_mixin.py
@@ -222,7 +222,9 @@ def create_optimizer(self):
                 len(self.train_dataset) / total_batch_size
             )

-        initialize(optimizer=self.optimizer, steps_per_epoch=self.total_steps_per_epoch)
+        active_session().state.update(
+            optimizer=self.optimizer, steps_per_epoch=self.total_steps_per_epoch
+        )

         return self.optimizer


From f722ab4d29ca21430dba7db11607b1d0f68bb21d Mon Sep 17 00:00:00 2001
From: Kyle Sayers
Date: Thu, 13 Mar 2025 10:56:00 -0400
Subject: [PATCH 2/4] calculate total_steps_per_epoch earlier, remove compression_ready

Signed-off-by: Kyle Sayers
---
 src/llmcompressor/core/lifecycle.py                |  6 ---
 src/llmcompressor/core/state.py                    | 12 ------
 .../transformers/finetune/session_mixin.py         | 43 +++++++++----------
 tests/unit/core/test_state.py                      | 10 -----
 4 files changed, 20 insertions(+), 51 deletions(-)

diff --git a/src/llmcompressor/core/lifecycle.py b/src/llmcompressor/core/lifecycle.py
index bfbc297fd..ea3b55954 100644
--- a/src/llmcompressor/core/lifecycle.py
+++ b/src/llmcompressor/core/lifecycle.py
@@ -218,12 +218,6 @@ def _check_setup_event_lifecycle(self, event_type: EventType):
                 "Cannot invoke event before recipe, model, and start are set"
             )

-        if not self.state.compression_ready:
-            logger.error("Cannot invoke event before recipe, model, and start are set")
-            raise ValueError(
-                "Cannot invoke event before recipe, model, and start are set"
-            )
-
         logger.debug("Setting up event lifecycle for event type: {}", event_type)

         for mod in self.modifiers:
diff --git a/src/llmcompressor/core/state.py b/src/llmcompressor/core/state.py
index 23b150284..7651eefa0 100644
--- a/src/llmcompressor/core/state.py
+++ b/src/llmcompressor/core/state.py
@@ -119,18 +119,6 @@ class State:
     model_log_cadence: Optional[float] = None
     _last_log_step: Union[float, int, None] = None

-    @property
-    def compression_ready(self) -> bool:
-        """
-        Check if the model and optimizer are set for compression.
-
-        :return: True if model and optimizer are set, False otherwise
-        :rtype: bool
-        """
-        ready = self.model is not None and self.optimizer is not None
-        logger.debug("Compression ready: {}", ready)
-        return ready
-
     def update(
         self,
         model: Any = None,
diff --git a/src/llmcompressor/transformers/finetune/session_mixin.py b/src/llmcompressor/transformers/finetune/session_mixin.py
index 50a92b1f3..2e15b8e0a 100644
--- a/src/llmcompressor/transformers/finetune/session_mixin.py
+++ b/src/llmcompressor/transformers/finetune/session_mixin.py
@@ -149,6 +149,25 @@ def initialize_session(

         train_data = self.get_train_dataloader()

+        # calculate total_steps_per_epoch
+        # n_gpu handled internally by dataloader
+        total_batch_size = (
+            self.args.per_device_train_batch_size
+            * self.args.gradient_accumulation_steps
+        )
+        if isinstance(self.train_dataset, IterableDataset):
+            logger.warning(
+                "Training is being run with a streamed dataset, "
+                "steps_per_epoch cannot be determined and will default to "
+                "1. LLM Compressor modifiers utilizing this statistic may not "
+                "behave as expected. "
+            )
+            self.total_steps_per_epoch = 1
+        else:
+            self.total_steps_per_epoch = math.ceil(
+                len(self.train_dataset) / total_batch_size
+            )
+
         self.accelerator.wait_for_everyone()
         with summon_full_params_context(self.model, offload_to_cpu=True):
             initialize(
@@ -161,6 +180,7 @@ def initialize_session(
             start=epoch,
             copy_data=False,
             fsdp_active=self.is_fsdp_enabled,
+            steps_per_epoch=self.total_steps_per_epoch,
             metadata=self.metadata,
         )
         self.accelerator.wait_for_everyone()
@@ -203,29 +223,6 @@ def create_optimizer(self):
         self._check_super_defined("create_optimizer")
         super().create_optimizer()

-        # n_gpu handled internally by dataloader
-        total_batch_size = (
-            self.args.per_device_train_batch_size
-            * self.args.gradient_accumulation_steps
-        )
-
-        if isinstance(self.train_dataset, IterableDataset):
-            logger.warning(
-                "Training is being run with a streamed dataset, "
-                "steps_per_epoch cannot be determined and will default to "
-                "1. LLM Compressor modifiers utilizing this statistic may not "
-                "behave as expected. "
-            )
-            self.total_steps_per_epoch = 1
-        else:
-            self.total_steps_per_epoch = math.ceil(
-                len(self.train_dataset) / total_batch_size
-            )
-
-        active_session().state.update(
-            optimizer=self.optimizer, steps_per_epoch=self.total_steps_per_epoch
-        )
-
         return self.optimizer

     def create_scheduler(
diff --git a/tests/unit/core/test_state.py b/tests/unit/core/test_state.py
index 3f7f992dc..bd291d895 100644
--- a/tests/unit/core/test_state.py
+++ b/tests/unit/core/test_state.py
@@ -67,16 +67,6 @@ def test_state_update():
     assert state.model_log_cadence == 2


-@pytest.mark.regression
-def test_state_sparsification_ready():
-    state = State()
-    assert not state.compression_ready
-
-    state.model = "model"
-    state.optimizer = "optimizer"
-    assert state.compression_ready
-
-
 @pytest.mark.regression
 def test_state_update_loggers():
     state = State()


From cda02888d9ba3e1ca26a3c623e63e047f7d45f22 Mon Sep 17 00:00:00 2001
From: Kyle Sayers
Date: Thu, 13 Mar 2025 14:24:29 -0400
Subject: [PATCH 3/4] change error wording

Signed-off-by: Kyle Sayers
---
 src/llmcompressor/core/lifecycle.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llmcompressor/core/lifecycle.py b/src/llmcompressor/core/lifecycle.py
index ea3b55954..99570f95c 100644
--- a/src/llmcompressor/core/lifecycle.py
+++ b/src/llmcompressor/core/lifecycle.py
@@ -86,7 +86,7 @@ def initialize(
         """
         if self.initialized_:
             raise ValueError(
-                "Initialize was called twice. To update state values after "
+                "Initialize was called twice. To update state values prior to "
                 "initialization, please use `active_session().state.update()`"
             )


From 800c6198250b590926bdbd09a21c09328789bfd1 Mon Sep 17 00:00:00 2001
From: Kyle Sayers
Date: Thu, 27 Mar 2025 11:19:05 -0400
Subject: [PATCH 4/4] allow double initialization

Signed-off-by: Kyle Sayers
---
 src/llmcompressor/core/lifecycle.py        |  8 +---
 .../transformers/finetune/session_mixin.py | 45 ++++++++++---------
 2 files changed, 25 insertions(+), 28 deletions(-)

diff --git a/src/llmcompressor/core/lifecycle.py b/src/llmcompressor/core/lifecycle.py
index 99570f95c..6ea5795a6 100644
--- a/src/llmcompressor/core/lifecycle.py
+++ b/src/llmcompressor/core/lifecycle.py
@@ -84,14 +84,8 @@ def initialize(
         :return: List of data returned from initialization of modifiers
         :rtype: List[Any]
         """
-        if self.initialized_:
-            raise ValueError(
-                "Initialize was called twice. To update state values prior to "
-                "initialization, please use `active_session().state.update()`"
-            )
-
-        self.state.update(**kwargs)
         logger.debug("Initializing compression lifecycle")
+        self.state.update(**kwargs)
         self.recipe_container.append(recipe, recipe_stage, recipe_args)
         self.modifiers = self.recipe_container.get_modifiers()
         self._set_model_layer_prefix()
diff --git a/src/llmcompressor/transformers/finetune/session_mixin.py b/src/llmcompressor/transformers/finetune/session_mixin.py
index f07251af0..67eac59b4 100644
--- a/src/llmcompressor/transformers/finetune/session_mixin.py
+++ b/src/llmcompressor/transformers/finetune/session_mixin.py
@@ -143,25 +143,6 @@ def initialize_session(

         train_data = self.get_train_dataloader()

-        # calculate total_steps_per_epoch
-        # n_gpu handled internally by dataloader
-        total_batch_size = (
-            self.args.per_device_train_batch_size
-            * self.args.gradient_accumulation_steps
-        )
-        if isinstance(self.train_dataset, IterableDataset):
-            logger.warning(
-                "Training is being run with a streamed dataset, "
-                "steps_per_epoch cannot be determined and will default to "
-                "1. LLM Compressor modifiers utilizing this statistic may not "
-                "behave as expected. "
-            )
-            self.total_steps_per_epoch = 1
-        else:
-            self.total_steps_per_epoch = math.ceil(
-                len(self.train_dataset) / total_batch_size
-            )
-
         self.accelerator.wait_for_everyone()
         with summon_full_params_context(self.model, offload_to_cpu=True):
             active_session().initialize(
@@ -175,7 +156,6 @@ def initialize_session(
             copy_data=False,
             attach_optim_callbacks=True,
             fsdp_active=self.is_fsdp_enabled,
-            steps_per_epoch=self.total_steps_per_epoch,
             metadata=self.metadata,
         )

@@ -219,6 +199,29 @@ def create_optimizer(self):
         self._check_super_defined("create_optimizer")
         super().create_optimizer()

+        # n_gpu handled internally by dataloader
+        total_batch_size = (
+            self.args.per_device_train_batch_size
+            * self.args.gradient_accumulation_steps
+        )
+
+        if isinstance(self.train_dataset, IterableDataset):
+            logger.warning(
+                "Training is being run with a streamed dataset, "
+                "steps_per_epoch cannot be determined and will default to "
+                "1. LLM Compressor modifiers utilizing this statistic may not "
+                "behave as expected. "
+            )
+            self.total_steps_per_epoch = 1
+        else:
+            self.total_steps_per_epoch = math.ceil(
+                len(self.train_dataset) / total_batch_size
+            )
+
+        active_session().initialize(
+            optimizer=self.optimizer, steps_per_epoch=self.total_steps_per_epoch
+        )
+
         return self.optimizer

     def create_scheduler(
@@ -255,7 +258,7 @@ def training_step(
         """
         self._check_super_defined("training_step")

-        callbacks.batch_start(batch_data=inputs)
+        callbacks.batch_start(batch_data=inputs, global_step=self.state.epoch)
         model_outputs = super().training_step(
             model=model, inputs=inputs, num_items_in_batch=num_items_in_batch
         )
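
Taken together, PATCH 1/4 turned a second initialize() call into a hard error, and PATCH 4/4 walks that back: the lifecycle's initialize() now tolerates repeated calls, which lets the trainer mixin call active_session().initialize(...) once when the training session starts and again from create_optimizer() to hand over optimizer and steps_per_epoch. A minimal sketch of that calling pattern follows; the import path, the toy model/optimizer/dataset, and the batch-size numbers are assumptions for illustration only and are not part of this series.

import math

import torch

from llmcompressor.core import active_session  # import path assumed, not shown in these diffs

# Toy stand-ins for the objects the trainer mixin normally supplies.
model = torch.nn.Linear(8, 8)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
train_dataset = list(range(256))  # placeholder dataset
total_batch_size = 16  # per_device_train_batch_size * gradient_accumulation_steps

session = active_session()

# First call, as initialize_session() does: register the model and the starting
# epoch (a recipe would normally be passed here as well).
session.initialize(model=model, start=0.0)

# Second call, as create_optimizer() now does: report the optimizer and the
# dataloader-derived steps_per_epoch. After PATCH 4/4 this no longer raises
# "Initialize was called twice".
steps_per_epoch = math.ceil(len(train_dataset) / total_batch_size)
session.initialize(optimizer=optimizer, steps_per_epoch=steps_per_epoch)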