sgl-project
diff --git a/python/sgl_jax/srt/kernels/quantized_matmul/3rd_quantized_matmul/blockwise_kernel.py b/python/sgl_jax/srt/kernels/quantized_matmul/3rd_quantized_matmul/blockwise_kernel.py
Lines changed: 12 additions & 6 deletions
diff --git a/python/sgl_jax/srt/kernels/quantized_matmul/3rd_quantized_matmul/kernel.py b/python/sgl_jax/srt/kernels/quantized_matmul/3rd_quantized_matmul/kernel.py
Lines changed: 2 additions & 1 deletion
diff --git a/python/sgl_jax/srt/kernels/quantized_matmul/3rd_quantized_matmul/tuned_block_sizes.py b/python/sgl_jax/srt/kernels/quantized_matmul/3rd_quantized_matmul/tuned_block_sizes.py
Lines changed: 1 addition & 0 deletions
diff --git a/python/sgl_jax/srt/kernels/quantized_matmul/3rd_quantized_matmul/util.py b/python/sgl_jax/srt/kernels/quantized_matmul/3rd_quantized_matmul/util.py
Lines changed: 1 addition & 0 deletions
diff --git a/python/sgl_jax/srt/kernels/quantized_matmul/kernel.py b/python/sgl_jax/srt/kernels/quantized_matmul/kernel.py
Lines changed: 100 additions & 5 deletions
diff --git a/python/sgl_jax/srt/layers/linear.py b/python/sgl_jax/srt/layers/linear.py
Lines changed: 32 additions & 7 deletions
@@ -1,3 +1,4 @@
+# Adapted from https://github.com/vllm-project/tpu-inference/blob/main/tpu_inference/kernels/quantized_matmul/blockwise_kernel.py
 # SPDX-License-Identifier: Apache-2.0
 """Quantized matmul kernel with blockwise quantization support."""
 
@@ -83,7 +84,7 @@ def quantized_matmul_kernel(
     padded_n_out = next_multiple(orig_n_out, out_block_size)
     if orig_n_out < padded_n_out:
         w_q = jnp.pad(w_q, ((0, padded_n_out - orig_n_out), (0, 0)))
-        w_scale = jnp.pad(w_scale, (0, padded_n_out - orig_n_out))
+        w_scale = jnp.pad(w_scale, ((0, 0), (0, 0), (0, padded_n_out - orig_n_out)))
     padded_n_in = next_multiple(orig_n_in, in_block_size)
     if orig_n_in < padded_n_in:
         x = jnp.pad(x, ((0, 0), (0, padded_n_in - orig_n_in)))
@@ -135,16 +136,20 @@ def quantized_matmul_kernel(
     def kernel(lhs_ref, rhs_ref, w_scales_ref, out_ref, acc_scratch):
         pid_k = pl.program_id(2)
         is_first_step = pid_k == 0
-        is_last_step = pid_k == (orig_n_in // in_block_size - 1)
+        is_last_step = pid_k == (n_in - 1)
 
         def accum(is_first_step, is_last_step):
             accumulators = [None] * steps_n
 
             for i in range(steps_k):
                 k_start, k_end = i * block_size, (i + 1) * block_size
-                lhs_sub = lhs_ref[:, k_start:k_end].astype(jnp.float32)
-                lhs_q, lhs_scale = util.quantize_block(lhs_sub, 1, x_q_dtype)
-                lhs_scale = lhs_scale.astype(acc_dtype)
+                if quantize_activation:
+                    lhs_sub = lhs_ref[:, k_start:k_end].astype(jnp.float32)
+                    lhs_q, lhs_scale = util.quantize_block(lhs_sub, 1, x_q_dtype)
+                    lhs_scale = lhs_scale.astype(acc_dtype)
+                else:
+                    lhs_q = lhs_ref[:, k_start:k_end]
+                    lhs_scale = None
 
                 rhs_q_full = rhs_ref[:, k_start:k_end]
                 rhs_scale_full = w_scales_ref[i, :, :].astype(acc_dtype)
@@ -166,7 +171,8 @@ def accum(is_first_step, is_last_step):
                         preferred_element_type=preferred_element_type,
                     )
                     res = dot_res.astype(acc_dtype)
-                    res = res * lhs_scale
+                    if lhs_scale is not None:
+                        res = res * lhs_scale
                     res = res * rhs_scale_slice
                     if i == 0:
                         accumulators[j] = res
 
@@ -1,3 +1,4 @@
+# Adapted from https://github.com/vllm-project/tpu-inference/blob/main/tpu_inference/kernels/quantized_matmul/kernel.py
 # SPDX-License-Identifier: Apache-2.0
 """Quantized matmul kernel."""
 
@@ -184,7 +185,7 @@ def quantized_matmul_kernel(
     padded_n_out = next_multiple(orig_n_out, out_block_size)
     if orig_n_out < padded_n_out:
         w_q = jnp.pad(w_q, ((0, padded_n_out - orig_n_out), (0, 0)))
-        w_scale = jnp.pad(w_scale, (0, padded_n_out - orig_n_out))
+        w_scale = jnp.pad(w_scale, ((0, 0), (0, padded_n_out - orig_n_out)))
     padded_n_in = next_multiple(orig_n_in, in_block_size)
     if orig_n_in < padded_n_in:
         x = jnp.pad(x, ((0, 0), (0, padded_n_in - orig_n_in)))
 
@@ -1,3 +1,4 @@
+# Adapted from https://github.com/vllm-project/tpu-inference/blob/main/tpu_inference/kernels/quantized_matmul/tuned_block_sizes.py
 # SPDX-License-Identifier: Apache-2.0
 """Tuned block sizes for quantized matmul kernel."""
 
 
@@ -1,3 +1,4 @@
+# Adapted from https://github.com/vllm-project/tpu-inference/blob/main/tpu_inference/kernels/quantized_matmul/util.py
 # SPDX-License-Identifier: Apache-2.0
 """Utility functions for quantized matmul kernel."""
 from typing import Any, Callable
 
@@ -15,6 +15,7 @@
 _TRIED_LOADING_BLOCKWISE_3RD_KERNEL = False
 _BLOCKWISE_3RD_TUNED_VALUE_CLS = None
 _BLOCKWISE_3RD_GET_TUNED_BLOCK_SIZES = None
+_BLOCKWISE_3RD_TUNED_BLOCK_SIZES = None
 _TRIED_LOADING_BLOCKWISE_3RD_TUNING = False
 
 
@@ -39,22 +40,33 @@ def _get_blockwise_3rd_tuning_api():
     """Lazily load third-party tuned-size helpers for blockwise kernel."""
     global _BLOCKWISE_3RD_TUNED_VALUE_CLS
     global _BLOCKWISE_3RD_GET_TUNED_BLOCK_SIZES
+    global _BLOCKWISE_3RD_TUNED_BLOCK_SIZES
     global _TRIED_LOADING_BLOCKWISE_3RD_TUNING
 
     if _TRIED_LOADING_BLOCKWISE_3RD_TUNING:
-        return _BLOCKWISE_3RD_TUNED_VALUE_CLS, _BLOCKWISE_3RD_GET_TUNED_BLOCK_SIZES
+        return (
+            _BLOCKWISE_3RD_TUNED_VALUE_CLS,
+            _BLOCKWISE_3RD_GET_TUNED_BLOCK_SIZES,
+            _BLOCKWISE_3RD_TUNED_BLOCK_SIZES,
+        )
     _TRIED_LOADING_BLOCKWISE_3RD_TUNING = True
 
     try:
         package = __package__ or "sgl_jax.srt.kernels.quantized_matmul"
         module = importlib.import_module(f"{package}.3rd_quantized_matmul.tuned_block_sizes")
         _BLOCKWISE_3RD_TUNED_VALUE_CLS = getattr(module, "TunedValue", None)
         _BLOCKWISE_3RD_GET_TUNED_BLOCK_SIZES = getattr(module, "get_tuned_block_sizes", None)
+        _BLOCKWISE_3RD_TUNED_BLOCK_SIZES = getattr(module, "TUNED_BLOCK_SIZES", None)
     except Exception:
         _BLOCKWISE_3RD_TUNED_VALUE_CLS = None
         _BLOCKWISE_3RD_GET_TUNED_BLOCK_SIZES = None
+        _BLOCKWISE_3RD_TUNED_BLOCK_SIZES = None
 
-    return _BLOCKWISE_3RD_TUNED_VALUE_CLS, _BLOCKWISE_3RD_GET_TUNED_BLOCK_SIZES
+    return (
+        _BLOCKWISE_3RD_TUNED_VALUE_CLS,
+        _BLOCKWISE_3RD_GET_TUNED_BLOCK_SIZES,
+        _BLOCKWISE_3RD_TUNED_BLOCK_SIZES,
+    )
 
 
 def _next_multiple(x: int, m: int) -> int:
@@ -63,6 +75,72 @@ def _next_multiple(x: int, m: int) -> int:
     return ((x + m - 1) // m) * m
 
 
+def _floor_multiple(x: int, m: int) -> int:  # Round x down to a multiple of m, but never below m itself.
+    if m <= 0:  # Non-positive step: no rounding is possible, so x is passed through unchanged.
+        return x
+    return max(m, (x // m) * m)  # The max() clamp yields m (not 0) when x < m.
+
+
+def _nearest_power_of_two_multiple(x: int, base: int, upper_bound: int) -> int:  # Snap x to the closest base * 2**k, restricted to values <= upper_bound when any qualifies.
+    if base <= 0:  # Degenerate base: return x untouched.
+        return x
+
+    x = max(base, x)  # Clamp so at least one whole unit of base is present.
+    units = max(1, x // base)  # Whole multiples of base contained in x (floor division).
+    lower_units = 1 << (units.bit_length() - 1)  # Largest power of two <= units.
+    upper_units = lower_units if lower_units == units else lower_units << 1  # Smallest power of two >= units.
+
+    def _candidate(units_value: int) -> int:  # Convert a unit count back into an absolute size.
+        return units_value * base
+
+    lower = _candidate(lower_units)
+    upper = _candidate(upper_units)
+    candidates = [value for value in (lower, upper) if value <= upper_bound]  # Keep only options that respect upper_bound.
+    if not candidates:  # Both options exceed upper_bound: fall back to the smaller one even though it is over the bound.
+        candidates = [lower]
+
+    return min(candidates, key=lambda value: (abs(value - x), -value))  # Closest to the clamped x wins; a distance tie goes to the larger value.
+
+
+def _iter_blockwise_tuned_candidates(  # Rank tuned-table values by how closely their keys match the requested problem.
+    tuned_block_sizes: dict | None,  # Keys are read below via .n_batch/.n_out/.n_in/.x_q_dtype/.w_q_dtype attributes.
+    n_batch: int,
+    n_out: int,
+    n_in: int,
+    x_q_dtype: jnp.dtype,
+    w_q_dtype: jnp.dtype,
+):
+    if not tuned_block_sizes:  # None or empty table: nothing to rank.
+        return []
+
+    x_q_dtype_name = jnp.dtype(x_q_dtype).name  # Keys are compared against dtype names (strings), not dtype objects.
+    w_q_dtype_name = jnp.dtype(w_q_dtype).name
+    compatible_x_dtype_names = [x_q_dtype_name]  # List order doubles as preference order (index 0 is the exact dtype).
+    if jnp.issubdtype(w_q_dtype, jnp.integer) and x_q_dtype_name != "int8":  # Integer weights may also reuse entries keyed on int8 activations.
+        compatible_x_dtype_names.append("int8")
+
+    candidates = []
+    for key, value in tuned_block_sizes.items():
+        if key.w_q_dtype != w_q_dtype_name:  # Weight dtype must match exactly.
+            continue
+        if key.x_q_dtype not in compatible_x_dtype_names:
+            continue
+
+        score = (  # Tuple compared lexicographically; smaller is better.
+            compatible_x_dtype_names.index(key.x_q_dtype),  # 0 for the exact activation dtype, 1 for the int8 fallback.
+            key.n_in != n_in,  # False sorts before True, so exact n_in matches come first...
+            abs(key.n_in - n_in),  # ...then the nearest n_in; the same pair repeats for n_batch and n_out.
+            key.n_batch != n_batch,
+            abs(key.n_batch - n_batch),
+            key.n_out != n_out,
+            abs(key.n_out - n_out),
+        )
+        candidates.append((score, value))
+
+    candidates.sort(key=lambda item: item[0])  # Sort on the score only, so the values never need to be comparable.
+    return [value for _, value in candidates]  # Best match first; a list (not a lazy iterator) is returned.
+
+
 def _get_safe_blockwise_tuned_value(
     n_batch: int,
     n_out: int,
@@ -72,12 +150,22 @@ def _get_safe_blockwise_tuned_value(
     block_size_in: int,
 ):
     """Build a safe tuned value for third-party blockwise kernel on TPU."""
-    tuned_value_cls, get_tuned_block_sizes = _get_blockwise_3rd_tuning_api()
+    tuned_value_cls, get_tuned_block_sizes, tuned_block_sizes = _get_blockwise_3rd_tuning_api()
     if tuned_value_cls is None:
         return None
 
     tuned = None
-    if get_tuned_block_sizes is not None:
+    compatible_candidates = _iter_blockwise_tuned_candidates(
+        tuned_block_sizes=tuned_block_sizes,
+        n_batch=n_batch,
+        n_out=n_out,
+        n_in=n_in,
+        x_q_dtype=x_q_dtype,
+        w_q_dtype=w_q_dtype,
+    )
+    if compatible_candidates:
+        tuned = compatible_candidates[0]
+    elif get_tuned_block_sizes is not None:
         try:
             tuned = get_tuned_block_sizes(
                 n_batch=n_batch,
@@ -94,10 +182,17 @@ def _get_safe_blockwise_tuned_value(
     n_lane_multiplier = max(1, int(tuned.n_lane_multiplier))
     compute_tile_n = 256 * n_lane_multiplier
 
-    batch_block_size = max(1, int(tuned.batch_block_size))
+    batch_block_size = max(1, min(int(tuned.batch_block_size), int(n_batch)))
     out_block_size = _next_multiple(max(int(tuned.out_block_size), compute_tile_n), compute_tile_n)
+    out_block_size = min(out_block_size, _floor_multiple(int(n_out), compute_tile_n))
+    out_block_size = _nearest_power_of_two_multiple(
+        out_block_size,
+        compute_tile_n,
+        _floor_multiple(int(n_out), compute_tile_n),
+    )
     in_block_size = max(int(tuned.in_block_size), int(block_size_in))
     in_block_size = _next_multiple(in_block_size, int(block_size_in))
+    in_block_size = min(in_block_size, _floor_multiple(int(n_in), int(block_size_in)))
 
     return tuned_value_cls(batch_block_size, out_block_size, in_block_size, n_lane_multiplier)
 
 
@@ -8,8 +8,8 @@
 import jax
 import jax.numpy as jnp
 from flax import nnx
+from jax import shard_map
 from jax.sharding import NamedSharding, PartitionSpec as P
-from jax.experimental.shard_map import shard_map
 
 from sgl_jax.srt.kernels.quantized_matmul.kernel import xla_quantized_matmul_local
 from sgl_jax.srt.utils.quantization.quantization_utils import quantize_tensor
@@ -61,12 +61,35 @@ def __init__(
 
     def __call__(self, x: jax.Array) -> jax.Array | tuple[jax.Array, jax.Array]:
         """Forward pass."""
-        out = jnp.dot(x, self.weight.value)
+        x_2d = x.reshape(-1, x.shape[-1]) if x.ndim > 2 else x  # Collapse leading dims so the matmul sees a 2-D input.
+
+        if self.mesh is not None and self.kernel_axes is not None:
+            input_axis, output_axis = self.kernel_axes[0], self.kernel_axes[1]
+
+            def _sharded_dot(lhs: jax.Array, rhs: jax.Array) -> jax.Array:
+                y = jnp.dot(lhs, rhs)  # Per-shard partial product.
+                if input_axis is not None:  # Contraction dim is split over input_axis (see in_specs), so partial sums are reduced across it.
+                    y = jax.lax.psum(y, input_axis)
+                return y
+
+            out = shard_map(
+                _sharded_dot,
+                mesh=self.mesh,
+                in_specs=(P(None, input_axis), P(input_axis, output_axis)),
+                out_specs=P(None, output_axis),
+                check_vma=False,  # NOTE(review): turns off shard_map's output-consistency check — confirm the installed JAX accepts check_vma.
+            )(x_2d, self.weight.value)
+        else:
+            out = jnp.dot(x_2d, self.weight.value)
+
+        if x.ndim > 2:
+            out = out.reshape(x.shape[:-1] + (out.shape[-1],))  # Restore the original leading dims around the new last dim.
+
+        if self.skip_bias_add:  # Hand the bias back to the caller instead of adding it (None when the layer has no bias).
+            return out, (self.bias.value if self.bias is not None else None)
         if self.bias is not None:
-            if self.skip_bias_add:
-                return out, self.bias.value
             return out + self.bias.value
-        return out
+        return out, None  # NOTE(review): tuple here, but the bias-add return just above yields a bare array — confirm callers expect both shapes.
 
 
 class QuantizedLinear(nnx.Module):
@@ -168,7 +191,8 @@ def from_linear(
         return cls(
             weight_q=weight_q, weight_scale=weight_scale, bias=bias,
             activation_dtype=activation_dtype, mesh=linear.mesh,
-            kernel_axes=linear.kernel_axes, skip_bias_add=linear.skip_bias_add,
+            kernel_axes=linear.kernel_axes,
+            skip_bias_add=linear.skip_bias_add or linear.bias is None,
             params_dtype=linear.params_dtype, weight_block_size=effective_weight_block_size,
             scope_name=f"quantized_{linear.name}",
         )
@@ -212,7 +236,8 @@ def __call__(self, x: jax.Array) -> jax.Array | tuple[jax.Array, jax.Array]:
         if x.ndim > 2:
             output = output.reshape(x.shape[:-1] + (output.shape[-1],))
 
+        if self.skip_bias_add:
+            return output, (self.bias.value if self.bias is not None else None)
         if self.bias is not None:
-            if self.skip_bias_add: return output, self.bias.value
             return output + self.bias.value
         return output
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,4 @@`
	`1`	`+# Adapted from https://github.com/vllm-project/tpu-inference/blob/main/tpu_inference/kernels/quantized_matmul/tuned_block_sizes.py`
`1`	`2`	`# SPDX-License-Identifier: Apache-2.0`
`2`	`3`	`"""Tuned block sizes for quantized matmul kernel."""`
`3`	`4`