Commit 7aaff6d

Author: Hongwei Chen
Commit message: run domino example on amd
Parent: 83757d9

File tree: 4 files changed (+5, -25 lines)

training/DeepSpeed-Domino/domino/arguments.py (+5, -1)

@@ -294,7 +294,11 @@ def parse_args():
           'bfloat16 data type.', flush=True)
 
     args.async_tensor_model_parallel_allreduce = True
-    args.gradient_accumulation_fusion = True
+    if torch.cuda.is_available() and torch.version.hip:
+        args.gradient_accumulation_fusion = False
+    elif torch.cuda.is_available() and torch.version.cuda:
+        args.gradient_accumulation_fusion = True
+
     args.padded_vocab_size = 0 # tokenizer.py
     args.model_type = 1
     args.data_parallel_size = 1
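
The new default keys off PyTorch's build metadata: on ROCm builds the HIP backend is exposed through the torch.cuda API, so torch.cuda.is_available() can still return True, but torch.version.hip is a version string and torch.version.cuda is None; on CUDA builds the reverse holds. A minimal standalone sketch of the same idiom (the helper name is illustrative, not part of Domino):

import torch

def fusion_supported() -> bool:
    # ROCm build: torch.version.hip is set, torch.version.cuda is None.
    if torch.cuda.is_available() and torch.version.hip:
        return False  # this commit turns fusion off on AMD GPUs
    # CUDA build: torch.version.cuda is set, torch.version.hip is None.
    return bool(torch.cuda.is_available() and torch.version.cuda)

print(fusion_supported())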

training/DeepSpeed-Domino/domino/initialize.py (-22)

@@ -13,7 +13,6 @@
 from domino.modules.fused_func import bias_dropout_add_fused_train
 from domino.modules.fused_bias_gelu import bias_gelu
 
-from megatron import fused_kernels
 
 
 def initialize_domino():
@@ -111,27 +110,6 @@ def _compile_dependencies():
             flush=True,
         )
 
-    # Always build on rank zero first.
-    if torch.distributed.get_rank() == 0:
-        start_time = time.time()
-        print("> compiling and loading fused kernels ...", flush=True)
-        fused_kernels.load(args)
-        torch.distributed.barrier()
-    else:
-        torch.distributed.barrier()
-        fused_kernels.load(args)
-    # Simple barrier to make sure all ranks have passed the
-    # compilation phase successfully before moving on to the
-    # rest of the program. We think this might ensure that
-    # the lock is released.
-    torch.distributed.barrier()
-    if torch.distributed.get_rank() == 0:
-        print(
-            ">>> done with compiling and loading fused kernels. "
-            "Compilation time: {:.3f} seconds".format(time.time() - start_time),
-            flush=True,
-        )
-
 
 def set_jit_fusion_options():
     """Set PyTorch JIT layer fusion options."""

training/DeepSpeed-Domino/pretrain_gpt3_6.7b.sh (-1)

@@ -45,7 +45,6 @@ GPT_ARGS="
     --weight-decay 1e-2 \
     --lr-warmup-fraction .01 \
     --clip-grad 1.0 \
-    --no-gradient-accumulation-fusion \
     --fp16 \
     --tensor-model-parallel-size $WORLD_SIZE
 "

training/DeepSpeed-Domino/pretrain_llama_13b.sh (-1)

@@ -53,7 +53,6 @@ LLAMA_ARGS="
     --weight-decay 1e-2 \
     --lr-warmup-fraction .01 \
     --clip-grad 1.0 \
-    --no-gradient-accumulation-fusion \
     --fp16 \
     --tensor-model-parallel-size $WORLD_SIZE \
     --seed 3407 \
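
Both launch scripts drop --no-gradient-accumulation-fusion for the same reason: parse_args() now hard-assigns args.gradient_accumulation_fusion per platform after parsing, so forcing it off from the command line is redundant. For reference, a sketch of the usual Megatron-style argparse wiring for such a flag (an assumption, not verified against Domino's parser):

import argparse

parser = argparse.ArgumentParser()
# --no-... flags typically use store_false, so the attribute defaults to True.
parser.add_argument('--no-gradient-accumulation-fusion',
                    action='store_false',
                    dest='gradient_accumulation_fusion',
                    help='Disable fusing gradient accumulation into the '
                         'weight-gradient GEMM.')

args = parser.parse_args([])  # flag omitted -> defaults to True
print(args.gradient_accumulation_fusion)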
