Use cutlass fp4 gemm by default (sgl-project#11813)

Qiaolin-Yu · web-flow · commit ebda73dc723c · 2025-10-18T14:10:15.000-07:00
diff --git a/python/sglang/srt/layers/quantization/modelopt_quant.py b/python/sglang/srt/layers/quantization/modelopt_quant.py
@@ -79,7 +79,7 @@
     "SGLANG_CUTEDSL_MOE_SCALAR_INPUT_SCALE", "true"
 )
 USE_CUTLASS_BACKEND_FOR_FP4_GEMM = get_bool_env_var(
-    "SGLANG_USE_CUTLASS_BACKEND_FOR_FP4_GEMM"
+    "SGLANG_USE_CUTLASS_BACKEND_FOR_FP4_GEMM", "true"
 )
 # TODO make it true by default when the DeepEP PR is merged
 CUTEDSL_MOE_NVFP4_DISPATCH = get_bool_env_var(

Original file line number	Diff line number	Diff line change
`@@ -79,7 +79,7 @@`
`79`	`79`	`"SGLANG_CUTEDSL_MOE_SCALAR_INPUT_SCALE", "true"`
`80`	`80`	`)`
`81`	`81`	`USE_CUTLASS_BACKEND_FOR_FP4_GEMM = get_bool_env_var(`
`82`		`- "SGLANG_USE_CUTLASS_BACKEND_FOR_FP4_GEMM"`
	`82`	`+ "SGLANG_USE_CUTLASS_BACKEND_FOR_FP4_GEMM", "true"`
`83`	`83`	`)`
`84`	`84`	`# TODO make it true by default when the DeepEP PR is merged`
`85`	`85`	`CUTEDSL_MOE_NVFP4_DISPATCH = get_bool_env_var(`