diff --git a/training/DeepSpeed-Domino/domino/arguments.py b/training/DeepSpeed-Domino/domino/arguments.py
index 8bc59223a..e846726b9 100644
--- a/training/DeepSpeed-Domino/domino/arguments.py
+++ b/training/DeepSpeed-Domino/domino/arguments.py
@@ -294,7 +294,11 @@ def parse_args():
                   'bfloat16 data type.', flush=True)

     args.async_tensor_model_parallel_allreduce = True
-    args.gradient_accumulation_fusion = True
+    if torch.cuda.is_available() and torch.version.hip:
+        args.gradient_accumulation_fusion = False
+    elif torch.cuda.is_available() and torch.version.cuda:
+        args.gradient_accumulation_fusion = True
+
     args.padded_vocab_size = 0 # tokenizer.py
     args.model_type = 1
     args.data_parallel_size = 1
diff --git a/training/DeepSpeed-Domino/domino/initialize.py b/training/DeepSpeed-Domino/domino/initialize.py
index 36e0fa1bc..bb697dc4b 100644
--- a/training/DeepSpeed-Domino/domino/initialize.py
+++ b/training/DeepSpeed-Domino/domino/initialize.py
@@ -13,7 +13,6 @@
 from domino.modules.fused_func import bias_dropout_add_fused_train
 from domino.modules.fused_bias_gelu import bias_gelu
-from megatron import fused_kernels


 def initialize_domino():
@@ -111,27 +110,6 @@ def _compile_dependencies():
                 flush=True,
             )

-    # Always build on rank zero first.
-    if torch.distributed.get_rank() == 0:
-        start_time = time.time()
-        print("> compiling and loading fused kernels ...", flush=True)
-        fused_kernels.load(args)
-        torch.distributed.barrier()
-    else:
-        torch.distributed.barrier()
-        fused_kernels.load(args)
-    # Simple barrier to make sure all ranks have passed the
-    # compilation phase successfully before moving on to the
-    # rest of the program. We think this might ensure that
-    # the lock is released.
-    torch.distributed.barrier()
-    if torch.distributed.get_rank() == 0:
-        print(
-            ">>> done with compiling and loading fused kernels. "
-            "Compilation time: {:.3f} seconds".format(time.time() - start_time),
-            flush=True,
-        )
-

 def set_jit_fusion_options():
     """Set PyTorch JIT layer fusion options."""
diff --git a/training/DeepSpeed-Domino/pretrain_gpt3_6.7b.sh b/training/DeepSpeed-Domino/pretrain_gpt3_6.7b.sh
index 131411d2b..9df97b2d0 100644
--- a/training/DeepSpeed-Domino/pretrain_gpt3_6.7b.sh
+++ b/training/DeepSpeed-Domino/pretrain_gpt3_6.7b.sh
@@ -45,7 +45,6 @@ GPT_ARGS="
     --weight-decay 1e-2 \
     --lr-warmup-fraction .01 \
     --clip-grad 1.0 \
-    --no-gradient-accumulation-fusion \
     --fp16 \
     --tensor-model-parallel-size $WORLD_SIZE
 "
diff --git a/training/DeepSpeed-Domino/pretrain_llama_13b.sh b/training/DeepSpeed-Domino/pretrain_llama_13b.sh
index 1a438513a..80df429da 100644
--- a/training/DeepSpeed-Domino/pretrain_llama_13b.sh
+++ b/training/DeepSpeed-Domino/pretrain_llama_13b.sh
@@ -53,7 +53,6 @@ LLAMA_ARGS="
     --weight-decay 1e-2 \
     --lr-warmup-fraction .01 \
     --clip-grad 1.0 \
-    --no-gradient-accumulation-fusion \
     --fp16 \
     --tensor-model-parallel-size $WORLD_SIZE \
     --seed 3407 \