File tree: training/DeepSpeed-Domino (4 files changed, +5 -25 lines)

File 1 of 4:

@@ -294,7 +294,11 @@ def parse_args():
             'bfloat16 data type.', flush=True)
 
     args.async_tensor_model_parallel_allreduce = True
-    args.gradient_accumulation_fusion = True
+    if torch.cuda.is_available() and torch.version.hip:
+        args.gradient_accumulation_fusion = False
+    elif torch.cuda.is_available() and torch.version.cuda:
+        args.gradient_accumulation_fusion = True
+
     args.padded_vocab_size = 0  # tokenizer.py
     args.model_type = 1
     args.data_parallel_size = 1
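
The new branch keys off PyTorch's build metadata: on a ROCm build, torch.version.hip is a version string and torch.version.cuda is None, while a stock CUDA build is the reverse; torch.cuda.is_available() returns True on both, since ROCm is exposed through the CUDA device API. A minimal standalone sketch of the same probe (the helper name is mine, not from this patch):

import torch

def default_gradient_accumulation_fusion() -> bool:
    """Return True only on a stock CUDA build of PyTorch."""
    if not torch.cuda.is_available():
        return False
    if torch.version.hip is not None:
        # ROCm/HIP build: keep the fused weight-gradient path off
        return False
    # CUDA build: torch.version.cuda holds the toolkit version string
    return torch.version.cuda is not None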
File 2 of 4:

@@ -13,7 +13,6 @@
 from domino.modules.fused_func import bias_dropout_add_fused_train
 from domino.modules.fused_bias_gelu import bias_gelu
 
-from megatron import fused_kernels
 
 
 def initialize_domino():

@@ -111,27 +110,6 @@ def _compile_dependencies():
             flush=True,
         )
 
-    # Always build on rank zero first.
-    if torch.distributed.get_rank() == 0:
-        start_time = time.time()
-        print("> compiling and loading fused kernels ...", flush=True)
-        fused_kernels.load(args)
-        torch.distributed.barrier()
-    else:
-        torch.distributed.barrier()
-        fused_kernels.load(args)
-    # Simple barrier to make sure all ranks have passed the
-    # compilation phase successfully before moving on to the
-    # rest of the program. We think this might ensure that
-    # the lock is released.
-    torch.distributed.barrier()
-    if torch.distributed.get_rank() == 0:
-        print(
-            ">>> done with compiling and loading fused kernels. "
-            "Compilation time: {:.3f} seconds".format(time.time() - start_time),
-            flush=True,
-        )
-
 
 def set_jit_fusion_options():
     """Set PyTorch JIT layer fusion options."""
File 3 of 4:

@@ -45,7 +45,6 @@ GPT_ARGS="
     --weight-decay 1e-2 \
     --lr-warmup-fraction .01 \
     --clip-grad 1.0 \
-    --no-gradient-accumulation-fusion \
     --fp16 \
     --tensor-model-parallel-size $WORLD_SIZE
 "
File 4 of 4:

@@ -53,7 +53,6 @@ LLAMA_ARGS="
     --weight-decay 1e-2 \
     --lr-warmup-fraction .01 \
     --clip-grad 1.0 \
-    --no-gradient-accumulation-fusion \
     --fp16 \
     --tensor-model-parallel-size $WORLD_SIZE \
     --seed 3407 \