Skip to content

Commit 0ce92a3

Browse files
authored
[megatron] support Qwen3-Next apply_wd_to_qk_layernorm (modelscope#7954)
1 parent 9a900bf commit 0ce92a3

File tree

4 files changed

+17
-6
lines changed

4 files changed

+17
-6
lines changed

docs/source/Megatron-SWIFT/Command-line-parameters.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -301,6 +301,7 @@ Megatron训练参数继承自Megatron参数和基本参数(**与ms-swift共用
301301
- 提示:在日志中打印的"learning rate"为llm的学习率。
302302
- aligner_lr: 当训练多模态大模型时,该参数指定aligner的学习率,默认为None,等于learning_rate。
303303
- gradient_checkpointing_kwargs: 传入`torch.utils.checkpoint`中的参数。例如设置为`--gradient_checkpointing_kwargs '{"use_reentrant": false}'`。默认为None。该参数只对`vit_gradient_checkpointing`生效。
304+
- apply_wd_to_qk_layernorm: 用于Qwen3-Next全参数训练,对 qk layernorm 应用权重衰减。默认为False。
304305
- 🔥packing: 使用`padding_free`的方式将不同长度的数据样本打包成**近似**统一长度的样本(packing能保证不对完整的序列进行切分),实现训练时各节点与进程的负载均衡(避免长文本拖慢短文本的训练速度),从而提高GPU利用率,保持显存占用稳定。当使用 `--attention_backend flash` 时,可确保packed样本内的不同序列之间相互独立,互不可见(除Qwen3-Next,因为含有linear-attention)。该参数默认为`False`。Megatron-SWIFT的所有训练任务都支持该参数。注意:**packing会导致数据集样本数减少,请自行调节梯度累加数和学习率**
305306
- packing_length: packing的长度。默认为None,设置为max_length。
306307
- packing_num_proc: packing的进程数,默认为1。需要注意的是,不同的`packing_num_proc`,最终形成的packed数据集是不同的。(该参数在流式packing时不生效)。通常不需要修改该值,packing速度远快于tokenize速度。

docs/source_en/Megatron-SWIFT/Command-line-parameters.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -320,6 +320,7 @@ Megatron training parameters are inherited from Megatron parameters and basic pa
320320
- Note: The "learning rate" printed in the logs is the learning rate of the LLM.
321321
- aligner_lr: Specifies the learning rate for the aligner module in multimodal models. Default is `None`, same as `learning_rate`.
322322
- gradient_checkpointing_kwargs: Arguments passed to `torch.utils.checkpoint`. For example: set `--gradient_checkpointing_kwargs '{"use_reentrant": false}'`. Defaults to `None`. This parameter only takes effect when `vit_gradient_checkpointing` is enabled.
323+
- apply_wd_to_qk_layernorm: Used for Qwen3-Next full-parameter training to apply weight decay to qk layernorm. Defaults to False.
323324
- 🔥packing: Use the `padding_free` method to pack data samples of different lengths into samples of **approximately** uniform length (packing ensures that complete sequences are not split), achieving load balancing across nodes and processes during training (preventing long texts from slowing down short text training), thereby improving GPU utilization and maintaining stable memory usage. When using `--attention_backend flash`, it ensures that different sequences within packed samples remain independent and invisible to each other (except for Qwen3-Next, which contains linear-attention). This parameter defaults to `False`. All training tasks in Megatron-SWIFT support this parameter. Note: **packing will reduce the number of dataset samples, please adjust gradient accumulation steps and learning rate accordingly**.
324325
- packing_length: the length to use for packing. Defaults to None, in which case it is set to max_length.
325326
- packing_num_proc: Number of processes for packing, default is 1. Note that different values of `packing_num_proc` will result in different packed datasets. (This parameter does not take effect during streaming packing). Usually there is no need to modify this value, as packing speed is much faster than tokenization speed.

swift/megatron/arguments/megatron_args.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -379,6 +379,7 @@ class ExtraMegatronArguments(RLHFMegatronArgumentsMixin, MegatronTunerMixin):
379379
linear_value_head_dim: Optional[int] = None
380380
linear_conv_kernel_dim: Optional[int] = None
381381
layer_types: Optional[List[str]] = None
382+
apply_wd_to_qk_layernorm: bool = False
382383
# qwen3_vl, qwen3_omni
383384
mrope_interleaved: Optional[bool] = None
384385

@@ -730,6 +731,8 @@ def __post_init__(self):
730731
os.environ.setdefault('CUDA_DEVICE_MAX_CONNECTIONS', '1')
731732
if self.recompute_granularity == 'none':
732733
self.recompute_granularity = None
734+
if self.apply_wd_to_qk_layernorm and self.hf_model_type != 'qwen3_next':
735+
raise ValueError('apply_wd_to_qk_layernorm is only supported for qwen3_next')
733736
self._set_default()
734737
self.model_info, self.model_meta = get_model_info_meta(
735738
self.model, model_type=self.model_type, use_hf=self.use_hf, hub_token=self.hub_token)

swift/megatron/trainers/base.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import shutil
77
import time
88
from abc import ABC, abstractmethod
9-
from contextlib import contextmanager
9+
from contextlib import contextmanager, nullcontext
1010
from datetime import datetime
1111
from functools import partial
1212
from typing import Callable, Dict, List, Literal, Optional
@@ -317,7 +317,9 @@ def _get_param_groups(
317317
Returns:
318318
List of parameter groups.
319319
"""
320+
args = get_args()
320321
if self.args.vit_lr is not None or self.args.aligner_lr is not None:
322+
assert self.args.megatron_model_meta.is_multimodal
321323
vit_lr = self.args.vit_lr if self.args.vit_lr is not None else self.args.lr
322324
aligner_lr = self.args.aligner_lr if self.args.aligner_lr is not None else self.args.lr
323325
logger.info(f'vit_lr: {vit_lr}, aligner_lr: {aligner_lr}, llm_lr: {self.args.lr}')
@@ -335,6 +337,9 @@ def _get_param_groups(
335337

336338
if no_weight_decay_cond is not None:
337339
no_wd: bool = no_weight_decay_cond(name, param)
340+
elif args.apply_wd_to_qk_layernorm and any(
341+
name.endswith(k) for k in ['q_layernorm.weight', 'k_layernorm.weight']):
342+
no_wd = False
338343
else:
339344
# Do not regularize biases and norm parameters.
340345
# optionally, also skip weight decay for embedding parameters if requested
@@ -423,10 +428,6 @@ def _get_param_groups(
423428

424429
@contextmanager
425430
def _patch_get_param_groups(self):
426-
if not self.args.megatron_model_meta.is_multimodal or (self.args.vit_lr is None
427-
and self.args.aligner_lr is None):
428-
yield
429-
return
430431
from megatron.core import optimizer
431432

432433
_get_param_groups = optimizer._get_param_groups
@@ -497,7 +498,12 @@ def new_model_provider_func(*_args, **kwargs):
497498
# read iteration
498499
if not args.finetune:
499500
args.iteration, args.num_floating_point_operations_so_far = self._load_iteration()
500-
with self._patch_load_state_dict(self._load_base_checkpoint), self._patch_get_param_groups():
501+
502+
if args.apply_wd_to_qk_layernorm or self.args.vit_lr is not None or self.args.aligner_lr is not None:
503+
param_groups_context = self._patch_get_param_groups()
504+
else:
505+
param_groups_context = nullcontext()
506+
with self._patch_load_state_dict(self._load_base_checkpoint), param_groups_context:
501507
model, optimizer, opt_param_scheduler = self._origin_setup_model_and_optimizer(
502508
new_model_provider_func, model_type, *_args, **kwargs)
503509
self.wrapped_models = model

0 commit comments

Comments (0)