We read every piece of feedback and take your input very seriously.
To see all available qualifiers, see our documentation.
1 parent b6a88bd · commit 86534bf — Copy full SHA for 86534bf
megatron/core/optimizer/qk_clip.py
@@ -22,7 +22,10 @@ def clip_qk(model, log_max_only=False) -> float:
22
for model_chunk in model:
23
for transformer_layer in model_chunk.module.module.decoder.layers:
24
if hasattr(transformer_layer.self_attention, 'clip_qk'):
25
- if transformer_layer.self_attention.core_attention.current_max_attn_logits is None:
+ if (
26
+ transformer_layer.self_attention.core_attention.current_max_attn_logits
27
+ is None
28
+ ):
29
continue
30
torch.distributed.all_reduce(
31
transformer_layer.self_attention.core_attention.current_max_attn_logits,
0 commit comments