4 files changed: +34, -39 lines

File 1 of 4:

mode_concentrated = IS_CI or (os.environ.get("SGLANG_BENCH_MODE", "") == "concentrated")

-if int(os.environ.get("SGLANG_NSYS_PROFILING", "0")):
-    configs = [
-        [
-            768 * 8,
-            2048,
-            128,
-            48,
-            fp8_type_,
-            dict(
-                column_major_scales=True,
-                scale_tma_aligned=True,
-                scale_ue8m0=True,
-                fuse_silu_and_mul=True,
-                # masked_layout_mode=None,
-                masked_layout_mode="balanced",
-                # masked_layout_mode="extreme",
-            ),
-        ]
-    ]
-elif mode_concentrated:
+if mode_concentrated:
    configs = list(
        itertools.product(
            [768],
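
For reference, a minimal sketch (not part of the diff) of how the surviving itertools.product branch expands into benchmark configs; only the [768] axis is visible in the hunk above, so the other axes and values below are assumptions for illustration:

import itertools

num_tokens = [768]          # visible in the hunk above
hidden_dims = [2048, 4096]  # assumed axis, for illustration only
group_sizes = [128]         # assumed axis, for illustration only

# Each combination becomes one benchmark configuration.
configs = list(itertools.product(num_tokens, hidden_dims, group_sizes))
# -> [(768, 2048, 128), (768, 4096, 128)]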

File 2 of 4:

if TYPE_CHECKING:
    from tvm_ffi.module import Module

-_OUTPUT_DTYPE_MAP = {
-    torch.float8_e4m3fn: "fp8_e4m3_t",
-    torch.int8: "int8_t",
-}
+from sglang.jit_kernel.utils import CPP_DTYPE_MAP as OUTPUT_DTYPE_MAP


@cache_once
def _jit_per_token_group_quant_8bit_module(
    dtype: torch.dtype, output_type: torch.dtype
) -> Module:
    input_args = make_cpp_args(dtype)
-    out_cpp = _OUTPUT_DTYPE_MAP[output_type]
+    out_cpp = OUTPUT_DTYPE_MAP[output_type]
    return load_jit(
        "per_token_group_quant_8bit",
        cuda_files=["gemm/per_token_group_quant_8bit.cuh"],
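
Usage note, as a minimal sketch assuming an sglang checkout with this change applied: the module-local _OUTPUT_DTYPE_MAP is gone, and both quantization output dtypes now resolve through the shared table imported above.

import torch
from sglang.jit_kernel.utils import CPP_DTYPE_MAP as OUTPUT_DTYPE_MAP  # import as in the hunk above

out_cpp = OUTPUT_DTYPE_MAP[torch.float8_e4m3fn]  # "fp8_e4m3_t"
int8_cpp = OUTPUT_DTYPE_MAP[torch.int8]          # "int8_t"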

File 3 of 4:

if TYPE_CHECKING:
    from tvm_ffi import Module

-
F = TypeVar("F", bound=Callable[..., Any])


@@ -73,7 +72,9 @@ def __str__(self) -> str:
CPP_DTYPE_MAP = {
    torch.float: "fp32_t",
    torch.float16: "fp16_t",
+    torch.float8_e4m3fn: "fp8_e4m3_t",
    torch.bfloat16: "bf16_t",
+    torch.int8: "int8_t",
}

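For illustration, a rough self-contained sketch of why the two new entries matter to the JIT module builder in sglang.jit_kernel.per_token_group_quant_8bit: the cached factory resolves the C++ output type through this shared map, so an unsupported output dtype now fails at lookup time. Here cache_once is approximated with functools.lru_cache and the JIT loader is stubbed out; both are assumptions of this sketch, not the real APIs.

import functools

import torch

# Subset of CPP_DTYPE_MAP relevant to quantization outputs (see the hunk above).
OUTPUT_CPP_NAMES = {
    torch.float8_e4m3fn: "fp8_e4m3_t",
    torch.int8: "int8_t",
}

def load_jit_stub(kernel_name: str, out_cpp: str) -> str:
    # Stand-in for the real JIT loader; returns a tag instead of a compiled module.
    return f"{kernel_name}<{out_cpp}>"

@functools.lru_cache(maxsize=None)  # rough stand-in for @cache_once
def quant_module(output_type: torch.dtype) -> str:
    out_cpp = OUTPUT_CPP_NAMES[output_type]  # KeyError for unsupported output dtypes
    return load_jit_stub("per_token_group_quant_8bit", out_cpp)

print(quant_module(torch.float8_e4m3fn))  # built once, then served from the cache
print(quant_module(torch.int8))           # separate specialization per output dtype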

File 4 of 4:

    enable_sgl_per_token_group_quant_8bit = False

+from sglang.jit_kernel.per_token_group_quant_8bit import (
+    per_token_group_quant_8bit as sgl_per_token_group_quant_8bit_jit,
+)
+
if _is_hip:
    _has_vllm = False
    if _use_aiter:
@@ -501,19 +505,31 @@ def sglang_per_token_group_quant_fp8(
    if x.shape[0] > 0:
        # Temporary
        if enable_sgl_per_token_group_quant_8bit:
-            sgl_per_token_group_quant_8bit(
-                x,
-                x_q,
-                x_s,
-                group_size,
-                eps,
-                fp8_min,
-                fp8_max,
-                scale_ue8m0,
-                fuse_silu_and_mul,
-                masked_m,
-                enable_v2=enable_v2,
-            )
+            if enable_v2:
+                sgl_per_token_group_quant_8bit(
+                    x,
+                    x_q,
+                    x_s,
+                    group_size,
+                    eps,
+                    fp8_min,
+                    fp8_max,
+                    scale_ue8m0,
+                    fuse_silu_and_mul,
+                    masked_m,
+                    enable_v2=True,
+                )
+            else:
+                sgl_per_token_group_quant_8bit_jit(
+                    input=x,
+                    output_q=x_q,
+                    output_s=x_s,
+                    group_size=group_size,
+                    eps=eps,
+                    fp8_min=fp8_min,
+                    fp8_max=fp8_max,
+                    scale_ue8m0=scale_ue8m0,
+                )
        else:
            assert not enable_v2
            sgl_per_token_group_quant_fp8(
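
To make the new control flow easier to scan, here is a condensed self-contained sketch of the dispatch this hunk introduces, with both kernels replaced by stubs; the keyword names on the JIT path are taken from the hunk, while the stub bodies and sample arguments are schematic assumptions, not the real signatures.

# Stubs standing in for the two kernels, only to show the shape of the dispatch.
def sgl_per_token_group_quant_8bit(*args, enable_v2):
    return f"v2 kernel path (enable_v2={enable_v2})"

def sgl_per_token_group_quant_8bit_jit(**kwargs):
    return "JIT kernel path: " + ", ".join(sorted(kwargs))

def quantize_group_8bit(x, x_q, x_s, group_size, eps, fp8_min, fp8_max,
                        scale_ue8m0, fuse_silu_and_mul, masked_m, enable_v2):
    if enable_v2:
        # v2 keeps the original positional call, including fuse_silu_and_mul and masked_m.
        return sgl_per_token_group_quant_8bit(
            x, x_q, x_s, group_size, eps, fp8_min, fp8_max,
            scale_ue8m0, fuse_silu_and_mul, masked_m, enable_v2=True,
        )
    # Non-v2 now routes to the JIT kernel, which is called with keyword arguments
    # and (per the hunk) without fuse_silu_and_mul / masked_m.
    return sgl_per_token_group_quant_8bit_jit(
        input=x, output_q=x_q, output_s=x_s, group_size=group_size,
        eps=eps, fp8_min=fp8_min, fp8_max=fp8_max, scale_ue8m0=scale_ue8m0,
    )

print(quantize_group_8bit("x", "x_q", "x_s", 128, 1e-10, -448.0, 448.0,
                          False, False, None, enable_v2=False))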