1 parent e3e785c · commit d2ba1bc
praxis/contrib/gpu/scripts_gpu/te_helper.py
@@ -208,7 +208,7 @@ def update_attn_te_tpl(te_tpl, attn_tpl):
   assert (transformer_layer_tpl.tr_fflayer_tpl.has_bias ==
           transformer_layer_tpl.tr_atten_tpl.use_bias), "TE only allows same bias settings."
   te_transformer_tpl.use_bias = transformer_layer_tpl.tr_fflayer_tpl.has_bias
-  te_transformer_tpl.self_attn_mask_type = 'causal' \
+  te_transformer_tpl.self_attn_mask_type = 'causal_padding' \
       if stacked_transformer_obj.mask_self_attention else 'padding'

   te_transformer_tpl.logical_axes_rules = te_flax.extend_logical_axis_rules(tuple())
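
For context, here is a minimal, self-contained sketch of the selection logic this commit changes. The helper name is hypothetical (not part of the repository); the string values and the `mask_self_attention` flag are taken from the diff above. The apparent intent of the change is that, when self-attention is causal, padded positions are masked as well, rather than being left visible as they would be under a plain 'causal' mask.

# Hypothetical helper illustrating the mask-type selection above;
# not the repository's code.
def select_self_attn_mask_type(mask_self_attention: bool) -> str:
    # 'causal_padding' masks both future tokens and padded tokens;
    # plain 'causal' would leave padded positions unmasked
    # (assumption, based on the commit's intent).
    return 'causal_padding' if mask_self_attention else 'padding'

assert select_self_attn_mask_type(True) == 'causal_padding'
assert select_self_attn_mask_type(False) == 'padding'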