Skip to content

Commit 45a1481

Browse files
committed
fix
1 parent b0ddb33 commit 45a1481

2 files changed

Lines changed: 4 additions & 3 deletions

File tree

swift/arguments/base_args/model_args.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,7 @@ def get_model_kwargs(self):
236236
'max_memory': self.max_memory,
237237
'quantization_config': self.get_quantization_config(),
238238
'attn_impl': self.attn_impl,
239+
'experts_impl': self.experts_impl,
239240
'new_special_tokens': self.new_special_tokens,
240241
'rope_scaling': self.rope_scaling,
241242
'max_model_len': self.max_model_len,

swift/megatron/model/mm_gpts/qwen3_5.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ class Qwen3_5MoeGatedDeltaNet(_HuggingFaceModule, _Qwen3_5MoeGatedDeltaNet):
2626
def __init__(self, config: TransformerConfig, submodules: SelfAttentionSubmodules, layer_number: int, **kwargs):
2727
assert config.context_parallel_size == 1, 'Qwen3_5 currently does not support context parallel.'
2828
assert _Qwen3_5MoeGatedDeltaNet is not object, 'please update the `transformers` version.'
29-
if config.args.packing:
29+
if getattr(config.args, 'packing', False):
3030
raise ValueError('Please set the environment variable `SWIFT_USE_MCORE_GDN=1` to enable the megatron-core '
3131
'implementation of GatedDeltaNet, which supports packing.')
3232
_Qwen3_5MoeGatedDeltaNet.__init__(self, config, layer_number)
@@ -43,7 +43,7 @@ def forward(self, hidden_states: torch.Tensor, **kwargs):
4343
thd_format = packed_seq_params is not None and packed_seq_params.qkv_format == 'thd'
4444
# Note: for packed inputs, we do not perform padding_free unpadding.
4545
# Doing so would allow different sequences to see each other; for efficiency we keep this implementation.
46-
if thd_format and not args.packing:
46+
if thd_format:
4747
new_hidden_states = hidden_states.new_zeros(
4848
(packed_seq_params.num_samples, packed_seq_params.max_seqlen_q.item(), hidden_states.shape[-1]))
4949
attention_mask = hidden_states.new_zeros(
@@ -60,7 +60,7 @@ def forward(self, hidden_states: torch.Tensor, **kwargs):
6060
if attention_mask is not None:
6161
attention_mask = (~attention_mask).sum(dim=(1, 2)) > 0
6262
res = super().forward(hidden_states=hidden_states, attention_mask=attention_mask)
63-
if thd_format and not args.packing:
63+
if thd_format:
6464
res = res[attention_mask][:, None]
6565
res = torch.concat([res, res.new_zeros(seq_len - res.shape[0], 1, res.shape[2])])
6666
else:

0 commit comments

Comments
 (0)