Skip to content

Commit 0341149

Browse files
unnecessary dataclasses
1 parent c88d8ce commit 0341149

File tree

10 files changed

+36
-70
lines changed

10 files changed

+36
-70
lines changed

examples/stable-diffusion/training/train_dreambooth.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
import torch.nn.functional as F
4040
import torch.utils.checkpoint
4141
import transformers
42+
from accelerate import DistributedType
4243
from accelerate.logging import get_logger
4344
from accelerate.utils import DistributedDataParallelKwargs
4445
from diffusers import (
@@ -61,7 +62,6 @@
6162

6263
from optimum.habana import GaudiConfig
6364
from optimum.habana.accelerate import GaudiAccelerator
64-
from optimum.habana.accelerate.utils.dataclasses import GaudiDistributedType
6565
from optimum.habana.diffusers import GaudiStableDiffusionPipeline
6666
from optimum.habana.transformers.trainer import _is_peft_model
6767
from optimum.habana.utils import set_seed
@@ -1088,7 +1088,7 @@ def unwrap_model(model, training=False):
10881088
if not training:
10891089
return model
10901090
else:
1091-
if accelerator.distributed_type == GaudiDistributedType.MULTI_HPU:
1091+
if accelerator.distributed_type == DistributedType.MULTI_HPU:
10921092
kwargs = {}
10931093
kwargs["gradient_as_bucket_view"] = True
10941094
accelerator.ddp_handler = DistributedDataParallelKwargs(**kwargs)

examples/stable-diffusion/training/train_dreambooth_lora_flux.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
import torch
3535
import torch.utils.checkpoint
3636
import transformers
37+
from accelerate import DistributedType
3738
from accelerate.logging import get_logger
3839
from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration
3940
from datasets import load_dataset
@@ -69,7 +70,6 @@
6970

7071
from optimum.habana import GaudiConfig
7172
from optimum.habana.accelerate import GaudiAccelerator
72-
from optimum.habana.accelerate.utils.dataclasses import GaudiDistributedType
7373
from optimum.habana.utils import set_seed
7474

7575

@@ -762,7 +762,7 @@ def save_model_hook(models, weights, output_dir):
762762
def load_model_hook(models, input_dir):
763763
transformer_ = None
764764

765-
if not accelerator.distributed_type == GaudiDistributedType.DEEPSPEED:
765+
if not accelerator.distributed_type == DistributedType.DEEPSPEED:
766766
while len(models) > 0:
767767
model = models.pop()
768768

@@ -1075,7 +1075,7 @@ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
10751075
progress_bar.update(1)
10761076
global_step += 1
10771077

1078-
if accelerator.is_main_process or accelerator.distributed_type == GaudiDistributedType.DEEPSPEED:
1078+
if accelerator.is_main_process or accelerator.distributed_type == DistributedType.DEEPSPEED:
10791079
if global_step % args.checkpointing_steps == 0:
10801080
# _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
10811081
if args.checkpoints_total_limit is not None:

examples/stable-diffusion/training/train_dreambooth_lora_sdxl.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
import torch.nn.functional as F
3636
import torch.utils.checkpoint
3737
import transformers
38+
from accelerate import DistributedType
3839
from accelerate.logging import get_logger
3940
from accelerate.utils import DistributedDataParallelKwargs
4041
from diffusers import (
@@ -68,7 +69,6 @@
6869

6970
from optimum.habana import GaudiConfig
7071
from optimum.habana.accelerate import GaudiAccelerator
71-
from optimum.habana.accelerate.utils.dataclasses import GaudiDistributedType
7272
from optimum.habana.diffusers import GaudiStableDiffusionXLPipeline
7373
from optimum.habana.transformers.trainer import _is_peft_model
7474
from optimum.habana.utils import set_seed
@@ -1019,7 +1019,7 @@ def unwrap_model(model, training=False):
10191019
if not training:
10201020
return model
10211021
else:
1022-
if accelerator.distributed_type == GaudiDistributedType.MULTI_HPU:
1022+
if accelerator.distributed_type == DistributedType.MULTI_HPU:
10231023
kwargs = {}
10241024
kwargs["gradient_as_bucket_view"] = True
10251025
accelerator.ddp_handler = DistributedDataParallelKwargs(**kwargs)

examples/stable-diffusion/training/train_text_to_image_sdxl.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
import torch.nn.functional as F
4242
import torch.utils.checkpoint
4343
import transformers
44+
from accelerate import DistributedType
4445
from accelerate.logging import get_logger
4546
from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration
4647
from datasets import load_dataset
@@ -62,7 +63,6 @@
6263

6364
from optimum.habana import GaudiConfig
6465
from optimum.habana.accelerate import GaudiAccelerator
65-
from optimum.habana.accelerate.utils.dataclasses import GaudiDistributedType
6666
from optimum.habana.diffusers import (
6767
GaudiDDIMScheduler,
6868
GaudiEulerAncestralDiscreteScheduler,
@@ -896,7 +896,7 @@ def main(args):
896896
for idx, dt in enumerate(dataset["train"]):
897897
dt["image"].save(f"{args.mediapipe}/{idx}.jpg")
898898
f.write(dt["text"] + "\n")
899-
if accelerator.distributed_type != GaudiDistributedType.NO:
899+
if accelerator.distributed_type != DistributedType.NO:
900900
torch.distributed.barrier()
901901

902902
from media_pipe_imgdir import get_dataset_for_pipeline
@@ -1145,7 +1145,7 @@ def unwrap_model(model, training=False):
11451145
if not training:
11461146
return model
11471147
else:
1148-
if accelerator.distributed_type == GaudiDistributedType.MULTI_HPU:
1148+
if accelerator.distributed_type == DistributedType.MULTI_HPU:
11491149
kwargs = {}
11501150
kwargs["gradient_as_bucket_view"] = True
11511151
accelerator.ddp_handler = DistributedDataParallelKwargs(**kwargs)

optimum/habana/accelerate/accelerator.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@
6161

6262
from ..distributed import parallel_state
6363
from .state import GaudiPartialState
64-
from .utils import GaudiDistributedType, GaudiDynamoBackend, convert_model
64+
from .utils import convert_model
6565

6666

6767
logger = get_logger(__name__)
@@ -162,7 +162,7 @@ def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, e
162162
"""
163163
if device_placement is None:
164164
device_placement = self.device_placement and self.distributed_type != DistributedType.FSDP
165-
if not evaluation_mode and self.distributed_type == GaudiDistributedType.MULTI_HPU:
165+
if not evaluation_mode and self.distributed_type == DistributedType.MULTI_HPU:
166166
device_placement = None
167167
self._models.append(model)
168168

@@ -223,13 +223,15 @@ def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, e
223223
elif device_placement and not self.verify_device_map(model):
224224
model = model.to(self.device)
225225
if not evaluation_mode:
226-
if self.distributed_type == GaudiDistributedType.MULTI_HPU and self._distribution_strategy != "fast_ddp":
226+
###############################################################################################################
227+
if self.distributed_type == DistributedType.MULTI_HPU and self._distribution_strategy != "fast_ddp":
227228
if any(p.requires_grad for p in model.parameters()):
228229
kwargs = self.ddp_handler.to_kwargs() if self.ddp_handler is not None else {}
229230
model = torch.nn.parallel.DistributedDataParallel(model, **kwargs)
230231
if self.ddp_handler is not None:
231232
self.ddp_handler.register_comm_hook(model)
232-
elif self.distributed_type == GaudiDistributedType.FSDP:
233+
###############################################################################################################
234+
elif self.distributed_type == DistributedType.FSDP:
233235
from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel as FSDP
234236

235237
# Check if the model is already a FSDP model due to `Manual Wrapping` and if so,
@@ -353,7 +355,7 @@ def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, e
353355
del self._models[-2]
354356
self._models[-1] = model
355357
# torch.compile should be called last and only if the model isn't already compiled.
356-
if self.state.dynamo_plugin.backend != GaudiDynamoBackend.NO and not is_compiled_module(model):
358+
if self.state.dynamo_plugin.backend != DynamoBackend.NO and not is_compiled_module(model):
357359
compile_kwargs = self.state.dynamo_plugin.to_kwargs()
358360
############################################################################################################
359361
if self.use_regional_compilation:
@@ -567,7 +569,7 @@ def _prepare_deepspeed(self, *args):
567569
os.environ["DEEPSPEED_USE_HPU"] = "true"
568570
engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs)
569571
# torch.compile should be called if dynamo plugin backend is set and only if the model isn't already compiled.
570-
if self.state.dynamo_plugin.backend != GaudiDynamoBackend.NO and not is_compiled_module(model):
572+
if self.state.dynamo_plugin.backend != DynamoBackend.NO and not is_compiled_module(model):
571573
compile_kwargs = self.state.dynamo_plugin.to_kwargs()
572574
###############################################################################################################
573575
if self.use_regional_compilation:

optimum/habana/accelerate/state.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,19 +17,20 @@
1717

1818
import accelerate
1919
import torch
20+
from accelerate import DistributedType
2021
from accelerate.state import PartialState
2122
from accelerate.utils import is_deepspeed_available, parse_flag_from_env
2223

2324
from optimum.utils import logging
2425

2526
from ..distributed import parallel_state
26-
from .utils import GaudiDistributedType
2727

2828

2929
logger = logging.get_logger()
3030

3131

32-
# TODO: Remove when minimize_memory is supported in upstream accelerate and sequence parallelism is managed in GaudiTrainer
32+
# TODO: Remove when minimize_memory is supported in upstream accelerate
33+
# and sequence/context parallelism is managed in GaudiTrainer or supported in upstream accelerate
3334
class GaudiPartialState(PartialState):
3435
"""
3536
Adapted from: https://github.com/huggingface/accelerate/blob/8514c35192ac9762920f1ab052e5cea4c0e46eeb/src/accelerate/state.py#L96
@@ -61,7 +62,7 @@ def __init__(self, cpu: bool = False, **kwargs):
6162
"DeepSpeed is not available, install it with: `pip install"
6263
" git+https://github.com/HabanaAI/DeepSpeed.git@1.20.0`."
6364
)
64-
self.distributed_type = GaudiDistributedType.DEEPSPEED
65+
self.distributed_type = DistributedType.DEEPSPEED
6566
import deepspeed
6667

6768
if world_size > 1:
@@ -74,12 +75,12 @@ def __init__(self, cpu: bool = False, **kwargs):
7475
logger.info("DeepSpeed is enabled.")
7576
self._mixed_precision = "no" # deepspeed handles mixed_precision using deepspeed_config
7677
elif os.environ.get("ACCELERATE_USE_FSDP", "false") == "true":
77-
self.distributed_type = GaudiDistributedType.FSDP
78+
self.distributed_type = DistributedType.FSDP
7879
if not torch.distributed.is_initialized():
7980
torch.distributed.init_process_group(backend=self.backend, rank=rank, world_size=world_size)
8081
logger.info("Enabled distributed run.")
8182
else:
82-
self.distributed_type = GaudiDistributedType.MULTI_HPU
83+
self.distributed_type = DistributedType.MULTI_HPU
8384
if not torch.distributed.is_initialized():
8485
torch.distributed.init_process_group(backend=self.backend, rank=rank, world_size=world_size)
8586
logger.info("Enabled distributed run.")
@@ -104,9 +105,9 @@ def __init__(self, cpu: bool = False, **kwargs):
104105
logger.info("FP8 amax reduction group is already initialized.")
105106
else:
106107
self.distributed_type = (
107-
GaudiDistributedType.NO
108+
DistributedType.NO
108109
if os.environ.get("ACCELERATE_USE_DEEPSPEED", "false") == "false"
109-
else GaudiDistributedType.DEEPSPEED
110+
else DistributedType.DEEPSPEED
110111
)
111112
self.num_processes = 1
112113
self.process_index = self.local_process_index = 0

optimum/habana/accelerate/utils/__init__.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,3 @@
1-
from .dataclasses import (
2-
GaudiDistributedType,
3-
GaudiDynamoBackend,
4-
GaudiFP8RecipeKwargs,
5-
GaudiFullyShardedDataParallelPlugin,
6-
GaudiTorchDynamoPlugin,
7-
)
81
from .transformer_engine import (
92
FP8ContextWrapper,
103
convert_model,

optimum/habana/accelerate/utils/dataclasses.py

Lines changed: 0 additions & 30 deletions
This file was deleted.

optimum/habana/transformers/trainer.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@
3535
import huggingface_hub.utils as hf_hub_utils
3636
import numpy as np
3737
import torch
38-
from accelerate import skip_first_batches
38+
from accelerate import DistributedType, skip_first_batches
3939
from accelerate.data_loader import SeedableRandomSampler
4040
from accelerate.utils import (
4141
DistributedDataParallelKwargs,
@@ -47,8 +47,6 @@
4747
)
4848
from huggingface_hub import upload_folder
4949
from torch.utils.data import DataLoader, Dataset, IterableDataset, RandomSampler
50-
51-
from optimum.utils import logging
5250
from transformers import Trainer
5351
from transformers.data.data_collator import DataCollator
5452
from transformers.debug_utils import DebugOption, DebugUnderflowOverflow
@@ -106,8 +104,10 @@
106104
is_safetensors_available,
107105
)
108106

107+
from optimum.utils import logging
108+
109109
from ..accelerate import GaudiAccelerator
110-
from ..accelerate.utils import FP8ContextWrapper, GaudiDistributedType
110+
from ..accelerate.utils import FP8ContextWrapper
111111
from ..utils import (
112112
HabanaProfile,
113113
get_hpu_memory_stats,
@@ -903,7 +903,7 @@ def hpu_deepspeed_checkpointing(function, *checkpoint_args, use_reentrant: Optio
903903
self._globalstep_last_logged = self.state.global_step
904904
self._zero_model_grad(model)
905905
_grad_norm: Optional[float] = None
906-
_should_compute_grad_norm: bool = not self.accelerator.distributed_type == GaudiDistributedType.DEEPSPEED and (
906+
_should_compute_grad_norm: bool = not self.accelerator.distributed_type == DistributedType.DEEPSPEED and (
907907
# Gradient clipping
908908
args.max_grad_norm is not None and args.max_grad_norm > 0
909909
)
@@ -1280,15 +1280,15 @@ def _maybe_log_save_evaluate(self, tr_loss, _grad_norm, model, trial, epoch, ign
12801280

12811281
# This grad_norm block was outside of _maybe_log_save_evaluate method causing perf degradation.
12821282
# Moving it here so the grad tensor is only copied when it's needed.
1283-
if self.accelerator.distributed_type == GaudiDistributedType.DEEPSPEED:
1283+
if self.accelerator.distributed_type == DistributedType.DEEPSPEED:
12841284
grad_norm = model.get_global_grad_norm()
12851285
# In some cases the grad norm may not return a float
12861286
if hasattr(grad_norm, "item"):
12871287
grad_norm = grad_norm.item()
12881288
else:
12891289
if (
12901290
_grad_norm is not None
1291-
and self.accelerator.distributed_type != GaudiDistributedType.FSDP
1291+
and self.accelerator.distributed_type != DistributedType.FSDP
12921292
and _grad_norm.size() == torch.Size([1])
12931293
):
12941294
grad_norm = _grad_norm.detach().item()

optimum/habana/transformers/training_args.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
from pathlib import Path
2323
from typing import Optional, Union
2424

25+
from accelerate import DistributedType
2526
from accelerate.state import AcceleratorState
2627
from packaging import version
2728
from transformers.debug_utils import DebugOption
@@ -47,7 +48,6 @@
4748
from optimum.utils import logging
4849

4950
from ..accelerate.state import GaudiPartialState
50-
from ..accelerate.utils import GaudiDistributedType
5151
from ..utils import get_habana_frameworks_version
5252
from .gaudi_configuration import GaudiConfig
5353

@@ -922,7 +922,7 @@ def _setup_devices(self) -> "torch.device":
922922
)
923923
# We rely on `PartialState` to yell if there's issues here (which it will)
924924
self.distributed_state = GaudiPartialState(cpu=self.use_cpu)
925-
if self.deepspeed and self.distributed_state.distributed_type != GaudiDistributedType.DEEPSPEED:
925+
if self.deepspeed and self.distributed_state.distributed_type != DistributedType.DEEPSPEED:
926926
raise RuntimeError(
927927
"Tried to use an already configured `Accelerator` or `PartialState` that was not initialized for DeepSpeed, "
928928
"but also passed in a `deepspeed` configuration to the `TrainingArguments`. Please set "
@@ -999,7 +999,7 @@ def _setup_devices(self) -> "torch.device":
999999
"In order to use Torch DDP, launch your script with `python -m torch.distributed.launch"
10001000
)
10011001

1002-
if self.distributed_state.distributed_type == GaudiDistributedType.NO:
1002+
if self.distributed_state.distributed_type == DistributedType.NO:
10031003
self._n_gpu = 0
10041004

10051005
return device

0 commit comments

Comments (0)