Modify domino example to run on AMD GPU #958

Open · wants to merge 1 commit into base: master
6 changes: 5 additions & 1 deletion training/DeepSpeed-Domino/domino/arguments.py
@@ -294,7 +294,11 @@ def parse_args():
              'bfloat16 data type.', flush=True)
 
     args.async_tensor_model_parallel_allreduce = True
-    args.gradient_accumulation_fusion = True
+    if torch.cuda.is_available() and torch.version.hip:
+        args.gradient_accumulation_fusion = False
+    elif torch.cuda.is_available() and torch.version.cuda:
+        args.gradient_accumulation_fusion = True
+
     args.padded_vocab_size = 0 # tokenizer.py
     args.model_type = 1
     args.data_parallel_size = 1
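The new default keys off how PyTorch was built: torch.version.hip is a version string on ROCm (AMD) builds and None otherwise, while torch.version.cuda plays the same role for CUDA builds, so the fused gradient-accumulation path stays enabled only where the CUDA kernel is presumably available. A minimal standalone sketch of the same check (the helper name is illustrative, not part of the PR):

import torch

def gradient_accumulation_fusion_default() -> bool:
    # Mirrors the PR's logic: disable the fused gradient-accumulation
    # path on ROCm builds, enable it on CUDA builds.
    if torch.cuda.is_available() and torch.version.hip:
        return False  # ROCm build detected
    if torch.cuda.is_available() and torch.version.cuda:
        return True   # CUDA build detected
    return False      # CPU-only build: an assumption, not covered by the PR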
22 changes: 0 additions & 22 deletions training/DeepSpeed-Domino/domino/initialize.py
@@ -13,7 +13,6 @@
 from domino.modules.fused_func import bias_dropout_add_fused_train
 from domino.modules.fused_bias_gelu import bias_gelu
 
-from megatron import fused_kernels
 
 
 def initialize_domino():
@@ -111,27 +110,6 @@ def _compile_dependencies():
             flush=True,
         )
 
-    # Always build on rank zero first.
-    if torch.distributed.get_rank() == 0:
-        start_time = time.time()
-        print("> compiling and loading fused kernels ...", flush=True)
-        fused_kernels.load(args)
-        torch.distributed.barrier()
-    else:
-        torch.distributed.barrier()
-        fused_kernels.load(args)
-    # Simple barrier to make sure all ranks have passed the
-    # compilation phase successfully before moving on to the
-    # rest of the program. We think this might ensure that
-    # the lock is released.
-    torch.distributed.barrier()
-    if torch.distributed.get_rank() == 0:
-        print(
-            ">>> done with compiling and loading fused kernels. "
-            "Compilation time: {:.3f} seconds".format(time.time() - start_time),
-            flush=True,
-        )
-
 
 def set_jit_fusion_options():
     """Set PyTorch JIT layer fusion options."""
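The deleted block was Megatron's rank-0-first compilation of its fused kernels: rank 0 builds the extensions while the other ranks wait at a barrier, then every rank loads the cached build. Removing the import and the block skips that build step, presumably because those extensions target CUDA. If the fused kernels were still wanted on NVIDIA systems, one alternative to deleting the block outright would be to guard it on the build type; a rough sketch under that assumption (not what this PR does), reusing the removed calls:

import time
import torch
from megatron import fused_kernels

def maybe_load_fused_kernels(args):
    # Hypothetical helper, not part of this PR: skip fused-kernel
    # compilation on ROCm builds, keep rank-0-first loading on CUDA builds.
    if torch.version.cuda is None:
        return
    if torch.distributed.get_rank() == 0:
        start_time = time.time()
        print("> compiling and loading fused kernels ...", flush=True)
        fused_kernels.load(args)
    # All ranks wait until rank 0 has finished compiling.
    torch.distributed.barrier()
    if torch.distributed.get_rank() != 0:
        fused_kernels.load(args)
    # Final barrier so no rank moves on before the build lock is released.
    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print(">>> done with compiling and loading fused kernels. "
              "Compilation time: {:.3f} seconds".format(time.time() - start_time),
              flush=True)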
1 change: 0 additions & 1 deletion training/DeepSpeed-Domino/pretrain_gpt3_6.7b.sh
@@ -45,7 +45,6 @@ GPT_ARGS="
     --weight-decay 1e-2 \
     --lr-warmup-fraction .01 \
     --clip-grad 1.0 \
-    --no-gradient-accumulation-fusion \
     --fp16 \
     --tensor-model-parallel-size $WORLD_SIZE
 "
1 change: 0 additions & 1 deletion training/DeepSpeed-Domino/pretrain_llama_13b.sh
@@ -53,7 +53,6 @@ LLAMA_ARGS="
     --weight-decay 1e-2 \
     --lr-warmup-fraction .01 \
     --clip-grad 1.0 \
-    --no-gradient-accumulation-fusion \
     --fp16 \
     --tensor-model-parallel-size $WORLD_SIZE \
     --seed 3407 \