3 changes: 3 additions & 0 deletions docs/source/Instruction/Command-line-parameters.md
@@ -257,6 +257,9 @@ gradient_checkpointing: true
- 🔥neftune_noise_alpha: Noise coefficient added by NEFTune. Default is 0; commonly set to 5, 10, or 15.
- 🔥use_liger_kernel: Whether to enable the [Liger](https://github.com/linkedin/Liger-Kernel) kernel to accelerate training and reduce GPU memory usage. Defaults to False. For an example shell script, see [here](https://github.com/modelscope/ms-swift/blob/main/examples/train/liger).
- Note: liger_kernel does not support device_map; use DDP/DeepSpeed for multi-GPU training. liger_kernel currently only supports `task_type='causal_lm'`.
- use_cce: Whether to enable the [cut-cross-entropy](https://github.com/apple/ml-cross-entropy) fused operator to reduce GPU memory usage and accelerate training. Defaults to False. For an example shell script, see [here](https://github.com/modelscope/ms-swift/blob/main/examples/train/cce).
- use_tiled_mlp: Whether to enable Tiled MLP for memory-efficient long-sequence training. When enabled, MLP layers are replaced with a tiled implementation that splits the sequence into shards for computation, reducing GPU memory usage. Defaults to False.
- tiled_mlp_num_shards: Number of shards the sequence is split into for Tiled MLP computation. Defaults to None, which resolves to 4. Larger values reduce GPU memory usage but may increase computation time.
- average_tokens_across_devices: Whether to average token counts across devices. If set to True, `num_tokens_in_batch` is synchronized via all_reduce for accurate loss computation. Defaults to False.
- max_grad_norm: Gradient clipping. Defaults to 1.
- Note: The grad_norm recorded in the logs is the value before clipping.
3 changes: 3 additions & 0 deletions docs/source_en/Instruction/Command-line-parameters.md
@@ -260,6 +260,9 @@ Other important parameters:
- 🔥neftune_noise_alpha: Noise magnitude for NEFTune. Default is 0. Common values: 5, 10, 15.
- 🔥use_liger_kernel: Whether to enable the [Liger](https://github.com/linkedin/Liger-Kernel) kernel to accelerate training and reduce GPU memory consumption. Defaults to False. Example shell script can be found [here](https://github.com/modelscope/ms-swift/blob/main/examples/train/liger).
- Note: Liger kernel does not support `device_map`. Use DDP or DeepSpeed for multi-GPU training. Currently, liger_kernel only supports `task_type='causal_lm'`.
- use_cce: Whether to enable the [cut-cross-entropy](https://github.com/apple/ml-cross-entropy) fused operator to reduce GPU memory usage and accelerate training. Defaults to `False`. Example shell script can be found [here](https://github.com/modelscope/ms-swift/blob/main/examples/train/cce).
- use_tiled_mlp: Whether to enable Tiled MLP for memory-efficient long sequence training. When enabled, MLP layers are replaced with a tiled implementation that processes sequences in chunks to reduce memory usage. Defaults to False.
- tiled_mlp_num_shards: Number of shards to split the sequence for tiled MLP computation. Defaults to None, which sets it to 4. Larger values reduce memory but may increase computation time.
- average_tokens_across_devices: Whether to average token counts across devices. If `True`, `num_tokens_in_batch` is synchronized via `all_reduce` for accurate loss computation. Default is `False`.
- max_grad_norm: Gradient clipping. Default is 1.
- Note: The logged `grad_norm` reflects the value **before** clipping.
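The `use_tiled_mlp` / `tiled_mlp_num_shards` entries above describe chunking the MLP over the sequence dimension. A minimal sketch of the idea follows; it is illustrative only, not the implementation added in `swift/plugin/tiled_mlp.py` (which exports `TiledSwiGLUMLP` per the `swift/plugin/__init__.py` change further down), and the class and argument names are invented for the sketch.

```python
# Illustrative sketch, not the ms-swift implementation: split the sequence into
# `num_shards` chunks and checkpoint each chunk, so the large gate/up intermediate
# activations are recomputed in backward instead of being stored for the full sequence.
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.checkpoint import checkpoint


class TiledSwiGLUMLPSketch(nn.Module):
    def __init__(self, hidden_size: int, intermediate_size: int, num_shards: int = 4):
        super().__init__()
        self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
        self.num_shards = num_shards

    def _mlp(self, x):
        # Standard SwiGLU: down(silu(gate(x)) * up(x))
        return self.down_proj(F.silu(self.gate_proj(x)) * self.up_proj(x))

    def forward(self, x):  # x: (batch, seq_len, hidden_size)
        shards = x.chunk(self.num_shards, dim=1)  # tile along the sequence dimension
        outs = [checkpoint(self._mlp, shard, use_reentrant=False) for shard in shards]
        return torch.cat(outs, dim=1)
```

Peak MLP activation memory then scales with `seq_len / num_shards` instead of `seq_len`, at the cost of recomputing each shard's projections during backward, which matches the memory/compute trade-off described for `tiled_mlp_num_shards`.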
26 changes: 26 additions & 0 deletions examples/train/activation_cpu_offload/fsdp2.json
@@ -0,0 +1,26 @@
{
"_description": "FSDP2 configuration for distributed training (PyTorch native FSDP v2)",
"_requires": "torch>=2.4.0",
"_note": "This is the recommended configuration for multi-GPU training without CPU offloading. NOTE: When using FSDP2, do NOT use --gradient_checkpointing, use activation_checkpointing in fsdp_config instead.",
Contributor review comment (severity: medium):
The _note field is a bit confusing. It states that this configuration is for training "without CPU offloading", but the file is in a directory named activation_cpu_offload and the configuration itself enables activation_cpu_offload. This should be corrected to avoid confusion.
Suggested change:
- "_note": "This is the recommended configuration for multi-GPU training without CPU offloading. NOTE: When using FSDP2, do NOT use --gradient_checkpointing, use activation_checkpointing in fsdp_config instead.",
+ "_note": "This is the recommended configuration for multi-GPU training with CPU offloading. NOTE: When using FSDP2, do NOT use --gradient_checkpointing, use activation_checkpointing in fsdp_config instead.",


"_param_docs": {
"fsdp": "FSDP strategy string. Options: 'full_shard' (ZeRO-3 style, shards params+grads+optimizer), 'shard_grad_op' (ZeRO-2 style, shards grads+optimizer only). Add 'auto_wrap' to enable automatic layer wrapping. Add 'offload' to enable CPU offloading.",
"fsdp_version": "FSDP version. Use 2 for PyTorch native FSDP2 (recommended). FSDP2 uses DTensor for per-parameter sharding, supports LoRA/QLoRA natively.",
"auto_wrap_policy": "How to wrap model layers. 'TRANSFORMER_BASED_WRAP' wraps transformer decoder layers (from model._no_split_modules). 'SIZE_BASED_WRAP' wraps modules exceeding min_num_params.",
"cpu_ram_efficient_loading": "If true, only rank 0 loads full model weights, then broadcasts to other ranks. Reduces CPU RAM usage during initialization.",
"state_dict_type": "'SHARDED_STATE_DICT' (recommended): each rank saves its own shard without extra communication. 'FULL_STATE_DICT': gathers full model on rank 0 (higher memory, slower).",
"reshard_after_forward": "true = FULL_SHARD (ZeRO-3), reshards params after forward pass. false = SHARD_GRAD_OP (ZeRO-2), keeps params gathered during forward/backward.",
"activation_checkpointing": "Use FSDP's native activation checkpointing instead of gradient_checkpointing. This is the correct way to save memory with FSDP.",
"activation_cpu_offload": "true = offload activations to CPU. false = keep activations on GPU,can enable when using activation_checkpointing."
},
"fsdp": "full_shard auto_wrap",
"fsdp_config": {
"fsdp_version": 2,
"reshard_after_forward": true,
"auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
"cpu_ram_efficient_loading": true,
"state_dict_type": "SHARDED_STATE_DICT",
"activation_checkpointing": false,
"activation_cpu_offload": true
}
}
27 changes: 27 additions & 0 deletions examples/train/activation_cpu_offload/train.sh
@@ -0,0 +1,27 @@
#!/bin/bash
CUDA_VISIBLE_DEVICES=0,1 \
swift sft \
--model 'Qwen/Qwen3-0.6B' \
--dataset 'swift/self-cognition#1000' \ \
Contributor review comment (severity: medium):
There is an extra backslash \ at the end of this line, which is unnecessary and could potentially cause issues in some shell environments. It's best to remove it for clarity and correctness.
Suggested change:
- --dataset 'swift/self-cognition#1000' \ \
+ --dataset 'swift/self-cognition#1000' \

--load_from_cache_file true \
--split_dataset_ratio 0.01 \
--train_type lora \
--torch_dtype bfloat16 \
--num_train_epochs 1 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--learning_rate 1e-4 \
--lora_rank 8 \
--lora_alpha 32 \
--target_modules all-linear \
--freeze_vit true \
--gradient_accumulation_steps 16 \
--eval_steps 100 \
--save_steps 100 \
--save_total_limit 2 \
--logging_steps 5 \
--max_length 2048 \
--output_dir output \
--warmup_ratio 0.05 \
--dataloader_num_workers 4 \
--fsdp './examples/train/activation_cpu_offload/fsdp2.json'
17 changes: 17 additions & 0 deletions examples/train/cce/sft.sh
@@ -0,0 +1,17 @@
# Test env: 1 * A10
# With use_cce: 2.62GB
# Without use_cce: 16.24GB

# Install CCE dependency
pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@f643b88"

# Run ms-swift (example)
swift sft \
--model Qwen/Qwen2.5-0.5B-Instruct \
--dataset gsm8k#1024 \
--train_type lora \
--per_device_train_batch_size 64 \
--per_device_eval_batch_size 64 \
--use_hf true \
--use_cce true \
"$@"
25 changes: 25 additions & 0 deletions examples/train/tiled_mlp/fsdp2.json
@@ -0,0 +1,25 @@
{
"compute_environment": "LOCAL_MACHINE",
"debug": false,
"distributed_type": "FSDP",
"downcast_bf16": "no",
"fsdp_config": {
"fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
"fsdp_cpu_ram_efficient_loading": true,
"fsdp_reshard_after_forward": true,
"fsdp_state_dict_type": "FULL_STATE_DICT",
Contributor review comment (severity: medium):
Using FULL_STATE_DICT can lead to high memory usage on rank 0, potentially causing out-of-memory errors, especially with large models. It's generally recommended to use SHARDED_STATE_DICT with FSDP to save memory by having each rank save only its own shard.
Suggested change:
- "fsdp_state_dict_type": "FULL_STATE_DICT",
+ "fsdp_state_dict_type": "SHARDED_STATE_DICT",

"fsdp_activation_checkpointing": true,
"fsdp_version": 2
},
"machine_rank": 0,
"main_training_function": "main",
"mixed_precision": "bf16",
"num_machines": 1,
"num_processes": 2,
"rdzv_backend": "static",
"same_network": true,
"tpu_env": [],
"tpu_use_cluster": false,
"tpu_use_sudo": false,
"use_cpu": false
}
24 changes: 24 additions & 0 deletions examples/train/tiled_mlp/train_deepspeed.sh
@@ -0,0 +1,24 @@
CUDA_VISIBLE_DEVICES=0,1 \
NPROC_PER_NODE=2 \
swift sft \
--model Qwen/Qwen3-4B \
--dataset swift/self-cognition#200 \
--train_type full \
--torch_dtype bfloat16 \
--num_train_epochs 1 \
--per_device_train_batch_size 4 \
--learning_rate 1e-5 \
--weight_decay 0.1 \
--gradient_accumulation_steps 1 \
--eval_steps 100 \
--save_steps 100 \
--save_total_limit 2 \
--logging_steps 1 \
--max_length 2048 \
--output_dir output \
--system 'You are a helpful assistant.' \
--warmup_ratio 0.05 \
--dataloader_num_workers 4 \
--use_tiled_mlp true \
--tiled_mlp_num_shards 4 \
--deepspeed zero3
30 changes: 30 additions & 0 deletions examples/train/tiled_mlp/train_fsdp2.sh
@@ -0,0 +1,30 @@
#!/bin/bash
# FSDP2 training with tiled MLP
# Requires accelerate config with fsdp_version: 2

# First, create the accelerate config (fsdp2.json) or use the one in examples/train/multi-gpu/fsdp2_lora/

# FSDP2 with tiled MLP
accelerate launch --config_file fsdp2.json \
-m swift sft \
--model Qwen/Qwen3-4B \
--dataset swift/self-cognition#200 \
--train_type full \
--torch_dtype bfloat16 \
--num_train_epochs 1 \
--per_device_train_batch_size 4 \
--learning_rate 1e-5 \
--gradient_checkpointing false \
--weight_decay 0.1 \
--gradient_accumulation_steps 1 \
--eval_steps 100 \
--save_steps 100 \
--save_total_limit 2 \
--logging_steps 1 \
--max_length 2048 \
--output_dir output \
--system 'You are a helpful assistant.' \
--warmup_ratio 0.05 \
--dataloader_num_workers 4 \
--use_tiled_mlp true \
--tiled_mlp_num_shards 4
9 changes: 9 additions & 0 deletions swift/llm/train/sft.py
@@ -51,6 +51,10 @@ def _prepare_generation_config(self):
@RayHelper.function(group='default')
def _prepare_model_tokenizer(self, **kwargs):
args = self.args
# Apply tiled MLP before model instantiation
if getattr(args, 'use_tiled_mlp', False):
from swift.plugin.tiled_mlp import apply_tiled_mlp
apply_tiled_mlp(args.model_type, num_shards=getattr(args, 'tiled_mlp_num_shards', None))
self.model, self.processor = args.get_model_processor(**kwargs)
if args.sequence_parallel_size > 1:
from swift.trainers.sequence_parallel import sequence_parallel
@@ -265,6 +269,7 @@ def train(self, trainer):
@RayHelper.function(group='default')
def _prepare_callbacks(self):
from .callback import DynamicLayerActivationCallback, TrainerAdapterCallback
from swift.plugin import ActivationCpuOffloadCallBack
args = self.args
callbacks = []
if args.lisa_activated_layers > 0:
@@ -275,6 +280,10 @@ def _prepare_callbacks(self):
model=self.model)
lisa_callback.switch_active_layers() # Make trainable parameters printing a correct value
callbacks.append(lisa_callback)
# Check activation_cpu_offload from fsdp_config
fsdp_config = getattr(self.args, 'fsdp_config', {})
if isinstance(fsdp_config, dict) and fsdp_config.get('activation_cpu_offload', False):
callbacks.append(ActivationCpuOffloadCallBack())

if args.is_adapter and args.train_type == 'adalora':
callbacks.append(TrainerAdapterCallback(args))
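`ActivationCpuOffloadCallBack` is imported from `swift/plugin/activation_cpu_offload.py`, which is not shown in this diff. As a rough, hypothetical sketch of what such a callback can do (an assumption, not the plugin's actual code, which may rely on FSDP-specific offloading instead), one option is to wrap the model's forward in PyTorch's `save_on_cpu` hook:

```python
# Hypothetical sketch (assumption, not swift's ActivationCpuOffloadCallBack):
# move tensors saved for backward to pinned CPU memory and copy them back on demand.
import torch
from transformers import TrainerCallback


class SaveOnCpuCallbackSketch(TrainerCallback):
    def on_train_begin(self, args, state, control, model=None, **kwargs):
        original_forward = model.forward

        def forward_with_offload(*f_args, **f_kwargs):
            with torch.autograd.graph.save_on_cpu(pin_memory=True):
                return original_forward(*f_args, **f_kwargs)

        model.forward = forward_with_offload
```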
70 changes: 70 additions & 0 deletions swift/llm/train/tuner.py
@@ -86,6 +86,73 @@ def apply_liger(model_type: str):
'by running `pip install -U liger-kernel`')


def apply_cce(model_type: str):
try:
from cut_cross_entropy.transformers import cce_patch
from swift.llm import ModelType
except ImportError:
raise ImportError('Please upgrade cut-cross-entropy to apply cce kernels to this model '
'by running `pip install "cut-cross-entropy[transformers] @ '
'git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@f643b88"`')

model_type_map = {
# llama family
ModelType.llama: 'llama',
ModelType.llama3: 'llama',
ModelType.llama3_1: 'llama',
ModelType.llama3_2: 'llama',
ModelType.llama4: 'llama4',
ModelType.llama3_2_vision: 'mllama',
# mistral & mixtral family
ModelType.mistral: 'mistral',
ModelType.mixtral: 'mixtral',
# phi
ModelType.phi3: 'phi3',
# gemma family
ModelType.gemma: 'gemma',
ModelType.gemma2: 'gemma2',
ModelType.gemma3_text: 'gemma3_text',
ModelType.gemma3_vision: 'gemma3',
ModelType.gemma3n: 'gemma3n',
# glm4 family
ModelType.glm4: 'glm4',
ModelType.glm4_0414: 'glm4',
ModelType.glm4_5: 'glm4_moe',
ModelType.glm4_z1_rumination: 'glm4_moe',
ModelType.glm4v: 'glm4v',
ModelType.glm4_1v: 'glm4v',
ModelType.glm4_5v: 'glm4v_moe',
# llava
ModelType.llava1_5_hf: 'llava',
ModelType.llava_llama3_hf: 'llava',
# qwen2 family
ModelType.qwen2: 'qwen2',
ModelType.qwen2_5: 'qwen2',
ModelType.qwen2_vl: 'qwen2_vl',
ModelType.qwen2_5_vl: 'qwen2_5_vl',
# qwen3 family
ModelType.qwen3: 'qwen3',
ModelType.qwen3_guard: 'qwen3',
ModelType.qwen3_thinking: 'qwen3',
ModelType.qwen3_nothinking: 'qwen3',
ModelType.qwen3_coder: 'qwen3',
ModelType.qwen3_moe: 'qwen3_moe',
ModelType.qwen3_moe_thinking: 'qwen3_moe',
ModelType.qwen3_next: 'qwen3_next',
ModelType.qwen3_next_thinking: 'qwen3_next',
ModelType.qwen3_vl: 'qwen3_vl',
ModelType.qwen3_moe_vl: 'qwen3_vl_moe',
}

cce_model_type = model_type_map.get(model_type)
if cce_model_type:
cce_patch(cce_model_type)
return

supported_models = ', '.join(sorted(set(model_type_map.values())))
raise ValueError(f'Unsupported cce model_type: {model_type}. Supported types: {supported_models}')


def get_multimodal_target_regex(
model,
*,
@@ -375,6 +442,9 @@ def prepare_model(cls, args, model, *, template=None, train_dataset=None, task_t
# Apply liger
apply_liger(args.model_type)

if args.use_cce and 'use_cce' not in inspect.signature(TrainingArguments).parameters:
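# Only patch manually when TrainingArguments does not itself expose a `use_cce`
# parameter (in that case cce is presumably handled natively by the trainer);
# apply_cce() maps args.model_type to a cut-cross-entropy family name and calls cce_patch().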
apply_cce(args.model_type)

if args.is_adapter:
if args.tuner_backend != 'unsloth' and args.train_type not in extra_tuners:
# Fix the name of the layer in xcomposer that contains Plora.
2 changes: 2 additions & 0 deletions swift/plugin/__init__.py
@@ -17,6 +17,8 @@
from .rm_plugin import rm_plugins
from .env import envs, Env
from .context_manager import context_managers, ContextManager
from .tiled_mlp import (TiledSwiGLUMLP, apply_tiled_mlp, is_fsdp2_enabled, is_fsdp1_enabled, get_tiled_mlp_mode)
from swift.plugin.activation_cpu_offload import ActivationCpuOffloadCallBack

else:
_import_structure = {