mod doc

ZRandomize · ZRandomize · commit 8ef61f1b8eb9 · 2026-03-17T03:14:17.000+08:00
diff --git a/README.md b/README.md
@@ -180,6 +180,6 @@ set_optimization(
 
 - [x] SFT exps
 - [x] Reference configs: Qwen3 8B `playground/pretrain/qwen3/qwen3_8.py`, Step3.5 Flash `playground/pretrain/step3p5/step3p5_flash.py`
-- [ ] Eval
+- [x] Eval
 - [ ] RLVR implementation
-- [ ] Triton kernel implementation
+- [x] Triton kernel implementation
diff --git a/README_ZH.md b/README_ZH.md
@@ -175,6 +175,6 @@ set_optimization(
 
 - [x] SFT exps
 - [x] Reference configs: Qwen3 8B `playground/pretrain/qwen3/qwen3_8.py`, Step3.5 Flash `playground/pretrain/step3p5/step3p5_flash.py`
-- [ ] Eval
+- [x] Eval
 - [ ] RLVR 实现
-- [ ] Triton kernel 实现
+- [x] Triton kernel 实现
diff --git a/playground/sft/step3/step3p5_flash_sft_step3_data_muon.py b/playground/sft/step3/step3p5_flash_sft_step3_data_muon.py
@@ -219,6 +219,7 @@ def configure_optimizable(self):
             moe_weighted_gather="triton",
             TokenDispatcher="deep_ep",
             grouped_gemm="nv_grouped_gemm",
+            # grouped_gemm="function_imple", # slower fallback
             AttentionCore="flash-attn-3",
         )
 

Original file line number	Diff line number	Diff line change
`@@ -219,6 +219,7 @@ def configure_optimizable(self):`
`219`	`219`	`moe_weighted_gather="triton",`
`220`	`220`	`TokenDispatcher="deep_ep",`
`221`	`221`	`grouped_gemm="nv_grouped_gemm",`
	`222`	`+ # grouped_gemm="function_imple", # slower fallback`
`222`	`223`	`AttentionCore="flash-attn-3",`
`223`	`224`	`)`
`224`	`225`