sgl-project
diff --git a/‎python/sgl_jax/srt/configs/quantization_config.py‎
Lines changed: 30 additions & 3 deletions b/‎python/sgl_jax/srt/configs/quantization_config.py‎
Lines changed: 30 additions & 3 deletions
diff --git a/‎python/sgl_jax/srt/kernels/quantized_matmul/3rd_quantized_matmul/blockwise_kernel.py‎
Lines changed: 6 additions & 5 deletions b/‎python/sgl_jax/srt/kernels/quantized_matmul/3rd_quantized_matmul/blockwise_kernel.py‎
Lines changed: 6 additions & 5 deletions
diff --git a/‎python/sgl_jax/srt/kernels/quantized_matmul/3rd_quantized_matmul/util.py‎
Lines changed: 23 additions & 19 deletions b/‎python/sgl_jax/srt/kernels/quantized_matmul/3rd_quantized_matmul/util.py‎
Lines changed: 23 additions & 19 deletions
diff --git a/‎python/sgl_jax/srt/kernels/quantized_matmul/kernel.py‎
Lines changed: 25 additions & 0 deletions b/‎python/sgl_jax/srt/kernels/quantized_matmul/kernel.py‎
Lines changed: 25 additions & 0 deletions
diff --git a/‎python/sgl_jax/srt/layers/linear.py‎
Lines changed: 9 additions & 13 deletions b/‎python/sgl_jax/srt/layers/linear.py‎
Lines changed: 9 additions & 13 deletions
diff --git a/‎python/sgl_jax/srt/layers/moe.py‎
Lines changed: 19 additions & 0 deletions b/‎python/sgl_jax/srt/layers/moe.py‎
Lines changed: 19 additions & 0 deletions
diff --git a/‎python/sgl_jax/srt/utils/quantization/quantization_utils.py‎
Lines changed: 8 additions & 4 deletions b/‎python/sgl_jax/srt/utils/quantization/quantization_utils.py‎
Lines changed: 8 additions & 4 deletions
@@ -7,6 +7,7 @@
 
 import os
 from dataclasses import dataclass
+from numbers import Integral
 
 import jax.numpy as jnp
 import yaml
@@ -55,6 +56,32 @@ def _resolve_config_path(config_path: str) -> str:
     )
 
 
+def _normalize_weight_block_size(
+    weight_block_size: list[int] | tuple[int, int] | None,
+) -> tuple[int, int] | None:
+    """Validate a ``[block_n, block_k]`` pair and return it as a tuple of ints.
+
+    Args:
+        weight_block_size: ``None``, or a 2-element list/tuple of positive
+            integers ``[block_n, block_k]``.
+
+    Returns:
+        ``None`` when the input is ``None``; otherwise ``(block_n, block_k)``
+        with both entries converted to built-in ``int``.
+
+    Raises:
+        ValueError: If the value is not a 2-element list/tuple, if either
+            entry is not an integer (booleans are rejected), or if either
+            entry is not strictly positive.
+    """
+    if weight_block_size is None:
+        return None
+    if not isinstance(weight_block_size, (list, tuple)) or len(weight_block_size) != 2:
+        raise ValueError(
+            "quantization.weight_block_size must be a 2-element list/tuple "
+            f"[block_n, block_k], got {weight_block_size!r}"
+        )
+    for value in weight_block_size:
+        # bool is a subclass of int and therefore passes the Integral check;
+        # reject it explicitly so a config value such as `true` is not
+        # silently interpreted as a block size of 1.
+        if isinstance(value, bool) or not isinstance(value, Integral):
+            raise ValueError(
+                "quantization.weight_block_size values must be integers, "
+                f"got {weight_block_size!r}"
+            )
+    # Convert to built-in int so Integral subclasses do not leak into the config.
+    block_n, block_k = (int(value) for value in weight_block_size)
+    if block_n <= 0 or block_k <= 0:
+        raise ValueError(
+            "quantization.weight_block_size values must be > 0, "
+            f"got {weight_block_size!r}"
+        )
+    return (block_n, block_k)
+
+
 @dataclass
 class QuantizationConfig:
     """Quantization configuration with explicit settings (no fallbacks).
@@ -65,15 +92,15 @@ class QuantizationConfig:
         moe_activation_dtype: Dtype for MoE activation quantization (None = no quantization)
         is_static_checkpoint: Whether the checkpoint is static (true for checkpoints quantized offline, false for on-the-fly quantization)
         ignored_layers: Optional list of layer name patterns to exclude from quantization
-        weight_block_size: Optional block sizes for static checkpoints (e.g., [128, 128])
+        weight_block_size: Optional block sizes for block quantization (e.g., [128, 128])
     """
 
     linear_rules: list[dict] | None = None
     moe_weight_dtype: jnp.dtype | None = None
     moe_activation_dtype: jnp.dtype | None = None
     is_static_checkpoint: bool = False
     ignored_layers: list[str] | None = None
-    weight_block_size: list[int] | None = None
+    weight_block_size: tuple[int, int] | None = None
 
     @classmethod
     def from_yaml(cls, yaml_path: str) -> "QuantizationConfig":
@@ -127,7 +154,7 @@ def from_yaml(cls, yaml_path: str) -> "QuantizationConfig":
         moe_weight_dtype = _str_to_dtype(moe_section.get("weight_dtype"))
         moe_activation_dtype = _str_to_dtype(moe_section.get("activation_dtype"))
         is_static_checkpoint = quant.get("is_static_checkpoint", False)
-        weight_block_size = quant.get("weight_block_size")
+        weight_block_size = _normalize_weight_block_size(quant.get("weight_block_size"))
 
         return cls(
             linear_rules=linear_rules,
 
@@ -9,10 +9,11 @@
 
 from . import util
 from .tuned_block_sizes import (
-    TunedValue, get_device_vmem_limit, get_tuned_block_sizes)
-from .util import (get_kernel_name,
-                                                         next_multiple,
-                                                         unfold_args)
+    TunedValue,
+    get_device_vmem_limit,
+    get_tuned_block_sizes,
+)
+from .util import get_kernel_name, next_multiple, unfold_args
 
 quantize_tensor = util.quantize_tensor
 MXU_SIZE = 256
@@ -215,7 +216,7 @@ def accum(is_first_step, is_last_step):
             out_specs=pl.BlockSpec((batch_block_size, out_block_size),
                                    lambda b, o, i: (b, o)),
             scratch_shapes=[
-                pltpu.VMEM((batch_block_size, out_block_size), jnp.bfloat16)
+                pltpu.VMEM((batch_block_size, out_block_size), acc_dtype)
             ],
             grid=(n_batch, n_out, n_in),
         ),
 
@@ -5,11 +5,14 @@
 
 import jax
 import jax.numpy as jnp
-from jax._src import dtypes
 
 from .tuned_block_sizes import TunedValue
 
 
+def _dtype_bits(dtype: jnp.dtype) -> int:
+    """Return the width of one element of ``dtype`` in bits.
+
+    ``itemsize`` is a whole number of bytes, so ``itemsize * 8`` cannot
+    express sub-byte widths (a 4-bit type would be counted as 8 bits). For
+    integer and floating dtypes the width is therefore taken from
+    ``jnp.iinfo`` / ``jnp.finfo``; any other dtype (e.g. bool) falls back to
+    ``itemsize * 8``.
+
+    Args:
+        dtype: Anything accepted by ``jnp.dtype``.
+
+    Returns:
+        Number of bits per element.
+    """
+    dtype = jnp.dtype(dtype)
+    if jnp.issubdtype(dtype, jnp.integer):
+        return int(jnp.iinfo(dtype).bits)
+    if jnp.issubdtype(dtype, jnp.floating):
+        return int(jnp.finfo(dtype).bits)
+    return dtype.itemsize * 8
+
+
 def unfold_args(
     conditions: tuple[jax.Array | bool, ...],
     fn_conditions: tuple[bool, ...],
@@ -191,7 +194,8 @@ def quantize_array(
 
     # TODO(kyuyeunk): Investigate performance gain from non xlu transpose.
     scale = jnp.transpose(x_abs_max / dtype_max)
-    scale_inv = jnp.nan_to_num(1 / scale, dtype_max)
+    scale = jnp.where(scale == 0, 1.0, scale)
+    scale_inv = jnp.nan_to_num(1 / scale, nan=dtype_max, posinf=dtype_max, neginf=-dtype_max)
     return (x * scale_inv).astype(quant_dtype), scale.astype(jnp.float32)
 
 
@@ -215,13 +219,11 @@ def get_vmem_limit(
     """Calculate VMEM limit for the kernel."""
 
     # Calculate in/out VMEM size.
-    x_size = (batch_block_size * in_block_size * dtypes.itemsize_bits(x_dtype))
-    x_abs_max_size = batch_block_size * dtypes.itemsize_bits(scale_dtype)
-    w_q_size = (out_block_size * in_block_size *
-                dtypes.itemsize_bits(w_q_dtype))
-    w_scale_size = out_block_size * dtypes.itemsize_bits(scale_dtype)
-    out_size = (batch_block_size * out_block_size *
-                dtypes.itemsize_bits(out_dtype))
+    x_size = batch_block_size * in_block_size * _dtype_bits(x_dtype)
+    x_abs_max_size = batch_block_size * _dtype_bits(scale_dtype)
+    w_q_size = out_block_size * in_block_size * _dtype_bits(w_q_dtype)
+    w_scale_size = out_block_size * _dtype_bits(scale_dtype)
+    out_size = batch_block_size * out_block_size * _dtype_bits(out_dtype)
 
     vmem_in_out = x_size + x_abs_max_size + w_q_size + w_scale_size + out_size
     vmem_in_out *= 2  # Account for compute and vreg spills.
@@ -235,11 +237,9 @@ def get_vmem_limit(
     vmem_in_out += out_size if (n_batch > 1 or n_out > 1) else 0
 
     # Calculate scratch VMEM size.
-    acc_size = (batch_block_size * out_block_size *
-                dtypes.itemsize_bits(acc_dtype))
-    x_q_size = (batch_block_size * in_block_size *
-                dtypes.itemsize_bits(x_q_dtype))
-    x_scale_size = batch_block_size * dtypes.itemsize_bits(scale_dtype)
+    acc_size = batch_block_size * out_block_size * _dtype_bits(acc_dtype)
+    x_q_size = batch_block_size * in_block_size * _dtype_bits(x_q_dtype)
+    x_scale_size = batch_block_size * _dtype_bits(scale_dtype)
 
     vmem_scratch = acc_size if save_acc else 0
     vmem_scratch += x_q_size + x_scale_size if save_x_q else 0
@@ -277,10 +277,14 @@ def validate_inputs(
     # Verify input shapes.
     if x.shape[1] != w_q.shape[1]:
         raise ValueError(f'{x.shape[1]=} must be equal to {w_q.shape[1]=}')
-    if w_q.shape[0] != w_scale.shape[1] and (w_scale.ndim == 3 and w_q.shape[0]
-                                             != w_scale.shape[2]):
-        raise ValueError(
-            f"{w_q.shape[0]=} must be equal to {w_scale.shape[1]=}")
+    if w_scale.ndim == 2:
+        if w_q.shape[0] != w_scale.shape[1]:
+            raise ValueError(f"{w_q.shape[0]=} must be equal to {w_scale.shape[1]=}")
+    elif w_scale.ndim == 3:
+        if w_q.shape[0] != w_scale.shape[2]:
+            raise ValueError(f"{w_q.shape[0]=} must be equal to {w_scale.shape[2]=}")
+    else:
+        raise ValueError(f"Unsupported {w_scale.ndim=} for quantized weight scale.")
     if x_abs_max is not None and x_abs_max.shape != (1, x.shape[0]):
         raise ValueError(
             f"{x_abs_max.shape=} must be equal to (1, {x.shape[0]=})")
@@ -317,5 +321,5 @@ def quantize_block(data, axis, target_dtype):
     if jnp.issubdtype(target_dtype, jnp.floating):
         data_q = (data / scale).clip(dtype_min, dtype_max).astype(target_dtype)
     else:
-        data_q = jnp.round(data / scale).astype(target_dtype)
+        data_q = jnp.clip(jnp.round(data / scale), dtype_min, dtype_max).astype(target_dtype)
     return data_q, scale
@@ -1,15 +1,20 @@
 # SPDX-License-Identifier: Apache-2.0
 """Quantized matmul kernel."""
 
+import functools
 import importlib
+import logging
 import math
+import re
 
 import jax
 import jax.numpy as jnp
 from jax import lax
 
 from sgl_jax.srt.utils.quantization.quantization_utils import quantize_tensor_simple
 
+logger = logging.getLogger(__name__)
+
 
 _BLOCKWISE_3RD_KERNEL = None
 _TRIED_LOADING_BLOCKWISE_3RD_KERNEL = False
@@ -32,6 +37,7 @@ def _get_blockwise_3rd_kernel():
         module = importlib.import_module(f"{package}.3rd_quantized_matmul")
         _BLOCKWISE_3RD_KERNEL = getattr(module, "quantized_matmul", None)
     except Exception:
+        logger.debug("Failed to import third-party blockwise quantized matmul kernel.", exc_info=True)
         _BLOCKWISE_3RD_KERNEL = None
     return _BLOCKWISE_3RD_KERNEL
 
@@ -58,6 +64,7 @@ def _get_blockwise_3rd_tuning_api():
         _BLOCKWISE_3RD_GET_TUNED_BLOCK_SIZES = getattr(module, "get_tuned_block_sizes", None)
         _BLOCKWISE_3RD_TUNED_BLOCK_SIZES = getattr(module, "TUNED_BLOCK_SIZES", None)
     except Exception:
+        logger.debug("Failed to import third-party blockwise tuning metadata.", exc_info=True)
         _BLOCKWISE_3RD_TUNED_VALUE_CLS = None
         _BLOCKWISE_3RD_GET_TUNED_BLOCK_SIZES = None
         _BLOCKWISE_3RD_TUNED_BLOCK_SIZES = None
@@ -102,13 +109,26 @@ def _candidate(units_value: int) -> int:
     return min(candidates, key=lambda value: (abs(value - x), -value))
 
 
+@functools.lru_cache(maxsize=1)
+def _get_current_tpu_version() -> int:
+    """Return the TPU generation parsed from the first JAX device, or -1.
+
+    The value is the first run of digits that follows a leading ``TPU`` in
+    ``jax.devices()[0].device_kind``. ``-1`` is returned when the device list
+    cannot be queried or when the device kind does not match that pattern.
+    The result (including ``-1``) is memoized by ``lru_cache``.
+    """
+    try:
+        device_kind = jax.devices()[0].device_kind
+    except Exception:
+        # Any failure while querying devices is treated as "unknown version".
+        return -1
+    found = re.match(r"^TPU[^\d]*(\d+)", device_kind)
+    return int(found.group(1)) if found is not None else -1
+
+
 def _iter_blockwise_tuned_candidates(
     tuned_block_sizes: dict | None,
     n_batch: int,
     n_out: int,
     n_in: int,
     x_q_dtype: jnp.dtype,
     w_q_dtype: jnp.dtype,
+    tpu_version: int,
 ):
     if not tuned_block_sizes:
         return []
@@ -121,6 +141,8 @@ def _iter_blockwise_tuned_candidates(
 
     candidates = []
     for key, value in tuned_block_sizes.items():
+        if getattr(key, "tpu_version", tpu_version) != tpu_version:
+            continue
         if key.w_q_dtype != w_q_dtype_name:
             continue
         if key.x_q_dtype not in compatible_x_dtype_names:
@@ -162,6 +184,7 @@ def _get_safe_blockwise_tuned_value(
         n_in=n_in,
         x_q_dtype=x_q_dtype,
         w_q_dtype=w_q_dtype,
+        tpu_version=_get_current_tpu_version(),
     )
     if compatible_candidates:
         tuned = compatible_candidates[0]
@@ -175,6 +198,7 @@ def _get_safe_blockwise_tuned_value(
                 w_q_dtype=jnp.dtype(w_q_dtype).name,
             )
         except Exception:
+            logger.debug("Failed to query tuned block sizes from third-party kernel.", exc_info=True)
             tuned = None
     if tuned is None:
         tuned = tuned_value_cls(128, 128, 128, 1)
@@ -356,6 +380,7 @@ def xla_quantized_matmul_local(
                     tuned_value=tuned_value,
                 )
             except Exception:
+                logger.debug("Falling back from third-party blockwise kernel to local dequant path.", exc_info=True)
                 out = None
 
         if out is None:
 
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 """Linear layers."""
 
-import math
 from collections.abc import Sequence
 from functools import partial
 
@@ -165,7 +164,13 @@ def from_linear(
                     )
                 bias = linear.bias.value if linear.bias is not None else None
             else:
-                weight_q = weight.T.astype(weight_dtype)
+                if weight.dtype != weight_dtype:
+                    raise ValueError(
+                        "QuantizedLinear.from_linear(..., is_static_input=True) requires "
+                        "pre-quantized concrete weights or abstract shapes. "
+                        f"Got weight.dtype={weight.dtype}, expected {weight_dtype}."
+                    )
+                weight_q = weight.T
                 if effective_weight_block_size is not None and len(effective_weight_block_size) == 2:
                     block_n, block_k = int(effective_weight_block_size[0]), int(effective_weight_block_size[1])
                     out_blocks = (weight_q.shape[0] + block_n - 1) // block_n
@@ -192,7 +197,7 @@ def from_linear(
             weight_q=weight_q, weight_scale=weight_scale, bias=bias,
             activation_dtype=activation_dtype, mesh=linear.mesh,
             kernel_axes=linear.kernel_axes,
-            skip_bias_add=linear.skip_bias_add or linear.bias is None,
+            skip_bias_add=linear.skip_bias_add,
             params_dtype=linear.params_dtype, weight_block_size=effective_weight_block_size,
             scope_name=f"quantized_{linear.name}",
         )
@@ -212,22 +217,13 @@ def __call__(self, x: jax.Array) -> tuple[jax.Array, jax.Array | None]:
         in_specs = (P(None, input_axis), P(output_axis, input_axis), w_scale_spec)
         out_specs = P(None, output_axis)
 
-        # Handle block size inference
-        effective_weight_block_size = self.weight_block_size
-        if scale_val.ndim == 2 and self.weight_block_size is not None:
-            global_out_size, global_in_size = self.weight_q.value.shape
-            inferred_bs_out = math.ceil(global_out_size / scale_val.shape[0])
-            inferred_bs_in = math.ceil(global_in_size / scale_val.shape[1])
-            if (inferred_bs_out != self.weight_block_size[0] or inferred_bs_in != self.weight_block_size[1]):
-                effective_weight_block_size = (inferred_bs_out, inferred_bs_in)
-
         output = shard_map(
             partial(
                 xla_quantized_matmul_local,
                 quantize_activation=quantize_activation,
                 reduce_axis=input_axis,
                 compute_dtype=self.compute_dtype,
-                weight_block_size=effective_weight_block_size,
+                weight_block_size=self.weight_block_size,
                 activation_quant_dtype=self.activation_dtype,
             ),
             mesh=self.mesh, in_specs=in_specs, out_specs=out_specs, check_vma=False,
 
@@ -329,6 +329,25 @@ def _normalize_scale_for_gmm(
         num_experts, out_dim, in_dim = weight.shape
 
         if scale.ndim == 4:
+            if scale.shape[0] != num_experts or scale.shape[2] != 1 or scale.shape[3] != out_dim:
+                raise ValueError(
+                    f"Unsupported {scale_name} shape {scale.shape} for weight shape {weight.shape}. "
+                    "Expected 4D GMM scale layout [E, k_blocks, 1, out_dim]."
+                )
+            if self.weight_block_size is None:
+                if scale.shape[1] != 1:
+                    raise ValueError(
+                        f"Unsupported {scale_name} shape {scale.shape} for weight shape {weight.shape}. "
+                        "Per-channel 4D GMM scales must have k_blocks=1."
+                    )
+            else:
+                block_size_k = int(self.weight_block_size[1])
+                expected_k_blocks = (in_dim + block_size_k - 1) // block_size_k
+                if scale.shape[1] not in (1, expected_k_blocks):
+                    raise ValueError(
+                        f"Unsupported {scale_name} shape {scale.shape} for weight shape {weight.shape}. "
+                        f"Expected k_blocks dimension to be 1 or {expected_k_blocks}."
+                    )
             return scale
 
         if scale.ndim == 2 and scale.shape == (num_experts, out_dim):
 
@@ -10,7 +10,7 @@
 from jax.sharding import NamedSharding, PartitionSpec as P
 
 from sgl_jax.srt.configs.model_config import ModelConfig
-from sgl_jax.srt.configs.quantization_config import DTYPE_MAP
+from sgl_jax.srt.configs.quantization_config import DTYPE_MAP, _normalize_weight_block_size
 
 logger = logging.getLogger(__name__)
 
@@ -103,6 +103,7 @@ def apply_linear_quantization(
             if "weight_block_size" in rule
             else getattr(quant_config, "weight_block_size", None)
         )
+        weight_block_size = _normalize_weight_block_size(weight_block_size)
 
         # Convert string dtypes to jnp dtypes
         weight_dtype = DTYPE_MAP.get(weight_dtype_str)
@@ -120,7 +121,7 @@ def apply_linear_quantization(
             }
         )
 
-    ignored_layers = getattr(quant_config, "ignored_layers", None) or []
+    ignored_layers = quant_config.ignored_layers or []
 
     def _find_matching_rule(path: str):
         """Find the first rule that matches the given module path."""
@@ -147,8 +148,11 @@ def _replace_linear_recursive(obj, path: str = "", visited: set | None = None):
                 if isinstance(attr_value, LinearBase):
                     # Check if this path matches any rule
                     dot_path = child_path.replace("/", ".")
-                    if any(dot_path.endswith(ignored) or ignored in dot_path for ignored in ignored_layers):
-                        logger.info("Skipping %s - in ignored_layers", child_path)
+                    if any(
+                        dot_path == ignored or dot_path.endswith(f".{ignored}")
+                        for ignored in ignored_layers
+                    ):
+                        logger.info("Skipping %s - in ignored_layers", dot_path)
                         continue
 
                     rule = _find_matching_rule(child_path)