@@ -26,7 +26,7 @@ class Qwen3_5MoeGatedDeltaNet(_HuggingFaceModule, _Qwen3_5MoeGatedDeltaNet):
2626 def __init__ (self , config : TransformerConfig , submodules : SelfAttentionSubmodules , layer_number : int , ** kwargs ):
2727 assert config .context_parallel_size == 1 , 'Qwen3_5 currently does not support context parallel.'
2828 assert _Qwen3_5MoeGatedDeltaNet is not object , 'please update the `transformers` version.'
29- if config .args . packing :
29+ if getattr ( config .args , 'packing' , False ) :
3030 raise ValueError ('Please set the environment variable `SWIFT_USE_MCORE_GDN=1` to enable the megatron-core '
3131 'implementation of GatedDeltaNet, which supports packing.' )
3232 _Qwen3_5MoeGatedDeltaNet .__init__ (self , config , layer_number )
@@ -43,7 +43,7 @@ def forward(self, hidden_states: torch.Tensor, **kwargs):
4343 thd_format = packed_seq_params is not None and packed_seq_params .qkv_format == 'thd'
4444 # Note: for packed inputs, we do not perform padding_free unpadding.
4545 # Doing so would allow different sequences to see each other; for efficiency we keep this implementation.
46- if thd_format and not args . packing :
46+ if thd_format :
4747 new_hidden_states = hidden_states .new_zeros (
4848 (packed_seq_params .num_samples , packed_seq_params .max_seqlen_q .item (), hidden_states .shape [- 1 ]))
4949 attention_mask = hidden_states .new_zeros (
@@ -60,7 +60,7 @@ def forward(self, hidden_states: torch.Tensor, **kwargs):
6060 if attention_mask is not None :
6161 attention_mask = (~ attention_mask ).sum (dim = (1 , 2 )) > 0
6262 res = super ().forward (hidden_states = hidden_states , attention_mask = attention_mask )
63- if thd_format and not args . packing :
63+ if thd_format :
6464 res = res [attention_mask ][:, None ]
6565 res = torch .concat ([res , res .new_zeros (seq_len - res .shape [0 ], 1 , res .shape [2 ])])
6666 else :
0 commit comments