We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
1 parent 9885ddb · commit 9b3f18b — Copy full SHA for 9b3f18b
megatron/core/optimizer/qk_clip.py
@@ -22,6 +22,8 @@ def clip_qk(model, log_max_only=False) -> float:
22
for model_chunk in model:
23
for transformer_layer in model_chunk.module.module.decoder.layers:
24
if hasattr(transformer_layer.self_attention, 'clip_qk'):
25
+ if transformer_layer.self_attention.core_attention.current_max_attn_logits is None:
26
+ continue
27
torch.distributed.all_reduce(
28
transformer_layer.self_attention.core_attention.current_max_attn_logits,
29
op=torch.distributed.ReduceOp.MAX,
0 commit comments