Commit 14413cf

Add support for GLM5 (#1599)
1 parent 834cf80

File tree

21 files changed: +2120 −34 lines

README.md

Lines changed: 1 addition & 1 deletion

@@ -10,7 +10,7 @@
 1. **High-Performance Training**: Supports efficient training in various modes by connecting Megatron with SGLang;
 2. **Flexible Data Generation**: Enables arbitrary training data generation workflows through custom data generation interfaces and server-based engines.
 
-slime is the RL-framework behind [GLM-4.7](https://z.ai/blog/glm-4.7), [GLM-4.6](https://z.ai/blog/glm-4.6), [GLM-4.5](https://z.ai/blog/glm-4.5) and apart from models from Z.ai, we also supports the following models:
+slime is the RL-framework behind [GLM-5](https://z.ai/blog/glm-5), [GLM-4.7](https://z.ai/blog/glm-4.7), [GLM-4.6](https://z.ai/blog/glm-4.6), [GLM-4.5](https://z.ai/blog/glm-4.5) and apart from models from Z.ai, we also supports the following models:
 - Qwen3 series (Qwen3Next, Qwen3MoE, Qwen3), Qwen2.5 series;
 - DeepSeek V3 series (DeepSeek V3, V3.1, DeepSeek R1);
 - Llama 3.

README_zh.md

Lines changed: 1 addition & 1 deletion

@@ -10,7 +10,7 @@
 1. **High-Performance Training**: Efficient training in various modes by connecting Megatron with SGLang;
 2. **Flexible Data Generation**: Arbitrary training-data generation workflows through custom data generation interfaces and server-based engines.
 
-slime is the RL training framework behind [GLM-4.7](https://z.ai/blog/glm-4.7), [GLM-4.6](https://z.ai/blog/glm-4.6), and [GLM-4.5](https://z.ai/blog/glm-4.5); beyond these, slime also supports:
+slime is the RL training framework behind [GLM-5](https://z.ai/blog/glm-5), [GLM-4.7](https://z.ai/blog/glm-4.7), [GLM-4.6](https://z.ai/blog/glm-4.6), and [GLM-4.5](https://z.ai/blog/glm-4.5); beyond these, slime also supports:
 - Qwen3 series (Qwen3Next, Qwen3MoE, Qwen3), Qwen2.5 series;
 - DeepSeek V3 series (DeepSeek V3, V3.1, DeepSeek R1);
 - Llama 3.

scripts/models/glm5-744B-A40B.sh

Lines changed: 55 additions & 0 deletions (new file)

MOE_ROUTED_EXPERTS=256
MOE_ACTIVE_ROUTED_EXPERTS=8
MOE_SHARED_EXPERTS=1

NHIDDEN=6144
MOE_FFN_HIDDEN=2048
MOE_SHARED_EXPERT_INTERMEDIATE_SIZE=$(($MOE_FFN_HIDDEN * $MOE_SHARED_EXPERTS))
FFN_HIDDEN=12288
N_DENSE_LAYERS=3
N_MOE_LAYERS=75
NHEADS=64

MODEL_ARGS=(
   --spec "slime_plugins.models.glm5.glm5" "get_glm5_spec"
   # quoted so the bracket pattern is never subject to pathname expansion
   --moe-layer-freq "[0]*$N_DENSE_LAYERS+[1]*$N_MOE_LAYERS"
   --num-experts $MOE_ROUTED_EXPERTS
   --moe-shared-expert-intermediate-size $MOE_SHARED_EXPERT_INTERMEDIATE_SIZE
   --moe-router-topk $MOE_ACTIVE_ROUTED_EXPERTS
   --moe-grouped-gemm
   --moe-permute-fusion
   --moe-ffn-hidden-size $MOE_FFN_HIDDEN
   --moe-router-score-function sigmoid
   --moe-router-pre-softmax
   --moe-router-enable-expert-bias
   --moe-router-bias-update-rate 0
   --moe-router-load-balancing-type seq_aux_loss
   --moe-router-topk-scaling-factor 2.5
   --moe-aux-loss-coeff 0
   --moe-router-dtype fp32
   --make-vocab-size-divisible-by 16
   --num-layers $((N_DENSE_LAYERS + N_MOE_LAYERS))
   --hidden-size $NHIDDEN
   --ffn-hidden-size $FFN_HIDDEN
   --num-attention-heads $NHEADS
   --disable-bias-linear
   --swiglu
   --untie-embeddings-and-output-weights
   --position-embedding-type rope
   --no-position-embedding
   --normalization RMSNorm
   --qk-layernorm
   --multi-latent-attention
   --q-lora-rank 2048
   --kv-lora-rank 512
   --qk-head-dim 192
   --v-head-dim 256
   --kv-channels 192
   --qk-pos-emb-head-dim 64
   --vocab-size 154880
   --rotary-base 1000000
   --enable-experimental

   # slime specific args
   --allgather-cp
)
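The per-layer MoE pattern and derived sizes in the config above can be sanity-checked with a short sketch. This is an illustrative snippet, not part of slime; the variable names mirror the shell config, and the `[0]*N+[1]*M` expansion follows the `--moe-layer-freq` pattern syntax:

```python
# Mirror of the shell arithmetic above; names copied from the config.
N_DENSE_LAYERS = 3
N_MOE_LAYERS = 75
MOE_FFN_HIDDEN = 2048
MOE_SHARED_EXPERTS = 1

# --moe-layer-freq "[0]*3+[1]*75" expands to a per-layer mask:
# 0 = dense FFN layer, 1 = MoE layer.
moe_layer_freq = [0] * N_DENSE_LAYERS + [1] * N_MOE_LAYERS

num_layers = len(moe_layer_freq)                   # value passed to --num-layers
shared_size = MOE_FFN_HIDDEN * MOE_SHARED_EXPERTS  # --moe-shared-expert-intermediate-size

print(num_layers, sum(moe_layer_freq), shared_size)  # 78 75 2048
```

So the 78-layer stack is 3 dense layers followed by 75 MoE layers, matching `--num-layers $((N_DENSE_LAYERS + N_MOE_LAYERS))`.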

scripts/run-glm5-744B-A40B.sh

Lines changed: 184 additions & 0 deletions (new file)

#!/bin/bash

# for rerunning the task
pkill -9 sglang
sleep 3
ray stop --force
pkill -9 ray
pkill -9 python
sleep 3
pkill -9 ray
pkill -9 python

set -ex

# prevent python from buffering stdout/stderr so ray logs stream promptly
export PYTHONUNBUFFERED=1

NVLINK_COUNT=$(nvidia-smi topo -m 2>/dev/null | grep -o 'NV[0-9][0-9]*' | wc -l)
if [ "$NVLINK_COUNT" -gt 0 ]; then
    HAS_NVLINK=1
else
    HAS_NVLINK=0
fi
echo "HAS_NVLINK: $HAS_NVLINK (detected $NVLINK_COUNT NVLink references)"

SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
source "${SCRIPT_DIR}/models/glm5-744B-A40B.sh"

CKPT_ARGS=(
   --hf-checkpoint $BASE_DIR/GLM-5
   --ref-load $BASE_DIR/GLM-5_torch_dist/
   --load $BASE_DIR/GLM-5_slime/
   --save $BASE_DIR/GLM-5_slime/
   --save-interval 20
)

ROLLOUT_ARGS=(
   --prompt-data $BASE_DIR/dapo-math-17k/dapo-math-17k.jsonl
   --input-key prompt
   --label-key label
   --apply-chat-template
   --rollout-shuffle

   --rm-type deepscaler

   --num-rollout 3000
   --rollout-batch-size 8
   --n-samples-per-prompt 8
   --rollout-max-response-len 32768
   --rollout-temperature 1

   --global-batch-size 64
)

PERF_ARGS=(
   --tensor-model-parallel-size 4
   --sequence-parallel
   --pipeline-model-parallel-size 4
   --decoder-last-pipeline-num-layers 18
   --expert-model-parallel-size 32
   --expert-tensor-parallel-size 1
   --context-parallel-size 2

   --recompute-granularity full
   --recompute-method uniform
   --recompute-num-layers 1

   --use-dynamic-batch-size
   --max-tokens-per-gpu 16384
   --data-pad-size-multiplier 4096
   --log-probs-chunk-size 1024
)

GRPO_ARGS=(
   --advantage-estimator grpo
   # --use-kl-loss
   --kl-loss-coef 0.00
   --kl-loss-type low_var_kl
   --kl-coef 0.00
   --entropy-coef 0.00
   --eps-clip 0.2
   --eps-clip-high 0.28
)

OPTIMIZER_ARGS=(
   --optimizer adam
   --lr 1e-6

   --lr-decay-style constant
   --weight-decay 0.1
   --adam-beta1 0.9
   --adam-beta2 0.98

   --optimizer-cpu-offload
   --overlap-cpu-optimizer-d2h-h2d
   --use-precision-aware-optimizer
)

WANDB_ARGS=(
   # --use-wandb
   # --wandb-project slime-dev
   # --wandb-group glm5-test
   # --wandb-key ${WANDB_KEY}
)

SGLANG_ARGS=(
   --rollout-num-gpus-per-engine 64
   --sglang-mem-fraction-static 0.70
   --sglang-enable-dp-attention
   --sglang-ep-size 64
   --sglang-dp-size 64
   --sglang-moe-dense-tp-size 1
   --sglang-enable-dp-lm-head

   --sglang-moe-a2a-backend deepep
   --sglang-deepep-mode auto

   --prefill-num-servers 1

   # mtp
   --sglang-speculative-algorithm EAGLE
   --sglang-speculative-num-steps 3
   --sglang-speculative-eagle-topk 1
   --sglang-speculative-num-draft-tokens 4

   # dsa
   --sglang-page-size 64
   --sglang-nsa-decode-backend flashmla_sparse
   --sglang-nsa-prefill-backend flashmla_sparse
   --sglang-attention-backend nsa
   --sglang-cuda-graph-max-bs 8

   --sglang-max-running-requests 512
   --sglang-chunked-prefill-size 131072

   --sglang-watchdog-timeout 3600
)

MISC_ARGS=(
   # default dropout in megatron is 0.1
   --attention-dropout 0.0
   --hidden-dropout 0.0
   # should be good for model performance
   --accumulate-allreduce-grads-in-fp32
   --attention-softmax-in-fp32
   # comment this out when using a model with MLA
   --attention-backend flash

   # use deepep for megatron
   --moe-enable-deepep
   --moe-token-dispatcher-type flex
)

# Build the runtime environment JSON with proper variable substitution
RUNTIME_ENV_JSON="{
  \"env_vars\": {
    \"PYTHONPATH\": \"/root/Megatron-LM/\",
    \"CUDA_DEVICE_MAX_CONNECTIONS\": \"1\",
    \"NCCL_NVLS_ENABLE\": \"${HAS_NVLINK}\",
    \"no_proxy\": \"${no_proxy}\",
    \"MASTER_ADDR\": \"${MASTER_ADDR}\",
    \"INDEXER_ROPE_NEOX_STYLE\": \"0\",
    \"SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK\": \"32\",
    \"NVSHMEM_DISABLE_NCCL\": \"1\"
  }
}"

ray job submit --address="http://127.0.0.1:8265" \
   --runtime-env-json="${RUNTIME_ENV_JSON}" \
   -- python3 train.py \
   --actor-num-nodes 32 \
   --actor-num-gpus-per-node 8 \
   --colocate \
   --update-weight-buffer-size $(( 1024 * 1024 * 1024 * 2 )) \
   ${MODEL_ARGS[@]} \
   ${CKPT_ARGS[@]} \
   ${ROLLOUT_ARGS[@]} \
   ${OPTIMIZER_ARGS[@]} \
   ${GRPO_ARGS[@]} \
   ${WANDB_ARGS[@]} \
   ${PERF_ARGS[@]} \
   ${EVAL_ARGS[@]} \
   ${SGLANG_ARGS[@]} \
   ${MISC_ARGS[@]}
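The NVLink probe near the top of the run script counts `NV<k>` link entries in the `nvidia-smi topo -m` matrix. A pure-Python equivalent of that grep pipeline, shown here only to illustrate the detection logic (the sample topology string is made up):

```python
import re

def count_nvlink_refs(topo_output: str) -> int:
    # Equivalent of: nvidia-smi topo -m | grep -o 'NV[0-9][0-9]*' | wc -l
    return len(re.findall(r"NV[0-9][0-9]*", topo_output))

# Hypothetical two-GPU topology matrix, for illustration only.
sample = (
    "      GPU0   GPU1\n"
    "GPU0   X     NV18\n"
    "GPU1   NV18   X\n"
)
has_nvlink = 1 if count_nvlink_refs(sample) > 0 else 0
print(has_nvlink)  # the script exports this flag as NCCL_NVLS_ENABLE
```

On a PCIe-only box every cell reads `X`, `PIX`, `PHB`, etc., no `NV<k>` entry matches, and `HAS_NVLINK` falls back to 0.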

slime/backends/megatron_utils/actor.py

Lines changed: 6 additions & 8 deletions

@@ -223,14 +223,12 @@ def _get_rollout_data(self, rollout_data_ref: Box) -> RolloutBatch:
                 continue
             rollout_data[key] = [
                 torch.tensor(
-                    (
-                        slice_log_prob_with_cp(
-                            log_prob,
-                            total_length,
-                            response_length,
-                            self.args.qkv_format,
-                            rollout_data["max_seq_lens"][i] if self.args.qkv_format == "bshd" else None,
-                        )
+                    slice_log_prob_with_cp(
+                        log_prob,
+                        total_length,
+                        response_length,
+                        self.args.qkv_format,
+                        rollout_data["max_seq_lens"][i] if self.args.qkv_format == "bshd" else None,
                     ),
                     device=torch.cuda.current_device(),
                     dtype=torch.float32,

slime/backends/megatron_utils/data.py

Lines changed: 45 additions & 13 deletions

@@ -27,6 +27,7 @@ def get_batch(
     keys: Sequence[str],
     pad_multiplier: int = 128,
     qkv_format: str = "thd",
+    allgather_cp: bool = False,
 ) -> dict[str, torch.Tensor | PackedSeqParams | list[torch.Tensor] | None]:
     """
     Generate a CP-ready micro-batch with packed sequence parameters.

@@ -64,31 +65,53 @@ def get_batch(
     batch["unconcat_tokens"] = tokens
 
     cp_size = mpu.get_context_parallel_world_size()
+    cp_rank = mpu.get_context_parallel_rank()
 
     if qkv_format == "bshd":
         max_seqlen = batch["max_seq_lens"][0]
         assert max([t.size(0) for t in tokens]) <= max_seqlen
         tokens = [slice_with_cp(t, pad_token_id, qkv_format, max_seqlen) for t in tokens]
         tokens = torch.stack(tokens)
+
     elif qkv_format == "thd":
-        tokens = [slice_with_cp(t, pad_token_id, qkv_format) for t in tokens]
+        if allgather_cp:
+            # DSA mode: concatenate all sequences first, then slice once with CP.
+            # We also pad the *global* concatenated stream to make per-rank chunks equal.
+            cu_seqlens_list: list[int] = [0]
+            for t in tokens:
+                cu_seqlens_list.append(cu_seqlens_list[-1] + t.size(0))
+
+            tokens = torch.cat(tokens, dim=0)
+
+            # Pad global stream so (1) divisible by cp_size (equal chunks),
+            # (2) divisible by pad_size (reduce fragmentation).
+            global_pad_size = cp_size * pad_size
+            pad = (global_pad_size - tokens.size(0) % global_pad_size) % global_pad_size
+            if pad != 0:
+                tokens = F.pad(tokens, (0, pad), value=pad_token_id)
+                cu_seqlens_list.append(cu_seqlens_list[-1] + pad)
+
+            cu_seqlens = torch.tensor(cu_seqlens_list, dtype=torch.int, device=torch.cuda.current_device())
+            tokens = tokens.chunk(cp_size, dim=0)[cp_rank]
+        else:
+            tokens = [slice_with_cp(t, pad_token_id, qkv_format) for t in tokens]
 
-        cu_seqlens = [0]
-        for t in tokens:
-            cu_seqlens.append(cu_seqlens[-1] + t.size(0))
+            cu_seqlens = [0]
+            for t in tokens:
+                cu_seqlens.append(cu_seqlens[-1] + t.size(0))
 
-        tokens = torch.cat(tokens)
+            tokens = torch.cat(tokens)
 
-        # Always pad to reduce memory fragmentation and maybe make the computation faster
-        pad = (pad_size - tokens.size(0) % pad_size) % pad_size
-        if pad != 0:
-            tokens = F.pad(tokens, (0, pad), value=pad_token_id)
-            cu_seqlens.append(cu_seqlens[-1] + pad)
+            # Always pad to reduce memory fragmentation and maybe make the computation faster
+            pad = (pad_size - tokens.size(0) % pad_size) % pad_size
+            if pad != 0:
+                tokens = F.pad(tokens, (0, pad), value=pad_token_id)
+                cu_seqlens.append(cu_seqlens[-1] + pad)
 
-        # thd requires the cu_seqlens to be of the origin length
-        cu_seqlens = torch.tensor(cu_seqlens, dtype=torch.int).cuda() * cp_size
-        max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
+            # thd requires the cu_seqlens to be of the origin length
+            cu_seqlens = torch.tensor(cu_seqlens, dtype=torch.int).cuda() * cp_size
 
+        max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
     packed_seq_params = PackedSeqParams(
         cu_seqlens_q=cu_seqlens,
         cu_seqlens_kv=cu_seqlens,

@@ -115,11 +138,20 @@ def get_batch(
         prompt_length = total_length - response_length
         # Align mask to token stream positions (prompt_length-1 left pad, 1 right pad)
         loss_mask = F.pad(loss_mask, (prompt_length - 1, 1), value=0)
+        if allgather_cp:
+            loss_masks.append(loss_mask)
+            continue
         loss_mask = slice_with_cp(loss_mask, 0, qkv_format, max_seqlen)
         loss_masks.append(loss_mask)
 
     if qkv_format == "bshd":
         loss_masks = torch.stack(loss_masks)
+    elif qkv_format == "thd" and allgather_cp:
+        # DSA: concatenate first (same as tokens), pad globally (same pad as above), then slice once.
+        loss_masks = torch.cat(loss_masks, dim=0)
+        if pad != 0:
+            loss_masks = F.pad(loss_masks, (0, pad), value=0)
+        loss_masks = loss_masks.chunk(cp_size, dim=0)[cp_rank].unsqueeze(0)
     elif qkv_format == "thd":
         loss_masks = torch.cat(loss_masks)
         loss_masks = F.pad(loss_masks, (0, pad), value=0).unsqueeze(0)
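The `allgather_cp` branch above packs every sequence into a single token stream, pads it so the stream splits into equal per-rank chunks, and records `cu_seqlens` boundaries before padding. A minimal pure-Python sketch of that packing arithmetic (the helper name `pack_for_allgather_cp` is invented for illustration; the real code does the same on CUDA tensors with `torch.cat`, `F.pad`, and `chunk`):

```python
def pack_for_allgather_cp(seqs, cp_size, pad_size, pad_id=0):
    """Concatenate sequences, pad globally, split into equal CP chunks."""
    # Record sequence boundaries before concatenation.
    cu_seqlens = [0]
    for s in seqs:
        cu_seqlens.append(cu_seqlens[-1] + len(s))
    stream = [tok for s in seqs for tok in s]

    # Pad so the stream length is divisible by cp_size (equal chunks)
    # and by pad_size (less fragmentation), as in the diff above.
    g = cp_size * pad_size
    pad = (g - len(stream) % g) % g
    if pad:
        stream += [pad_id] * pad
        cu_seqlens.append(cu_seqlens[-1] + pad)

    chunk_len = len(stream) // cp_size
    chunks = [stream[i * chunk_len:(i + 1) * chunk_len] for i in range(cp_size)]
    return chunks, cu_seqlens

chunks, cu = pack_for_allgather_cp([[1] * 5, [2] * 7], cp_size=2, pad_size=4)
print(cu)                        # [0, 5, 12, 16]
print([len(c) for c in chunks])  # [8, 8]
```

Because the padding is appended as one extra pseudo-sequence in `cu_seqlens`, attention kernels still see correct per-sequence boundaries, and each CP rank receives an identical-length chunk of the global stream.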
