134 changes: 134 additions & 0 deletions python/sglang/jit_kernel/benchmark/bench_packbit.py
@@ -0,0 +1,134 @@
"""
Benchmark: segment_packbits JIT vs AOT (sgl_kernel)

Measures per-call latency (µs) across typical batch sizes and segment lengths.

Run:
python python/sglang/jit_kernel/benchmark/bench_packbit.py
"""

import itertools

import torch
import triton
import triton.testing

from sglang.jit_kernel.benchmark.utils import get_benchmark_range, run_benchmark
from sglang.jit_kernel.packbit import segment_packbits as segment_packbits_jit

try:
from sgl_kernel import segment_packbits as segment_packbits_aot

AOT_AVAILABLE = True
except ImportError:
segment_packbits_aot = None
AOT_AVAILABLE = False

DEVICE = "cuda"

LINE_VALS = ["jit", "aot"] if AOT_AVAILABLE else ["jit"]
LINE_NAMES = ["JIT (new)", "AOT sgl_kernel"] if AOT_AVAILABLE else ["JIT (new)"]

# ---------------------------------------------------------------------------
# Benchmark configuration
# ---------------------------------------------------------------------------

BATCH_SIZE_RANGE = get_benchmark_range(
full_range=[1, 4, 16, 64],
ci_range=[4],
)

SEG_LEN_RANGE = get_benchmark_range(
full_range=[64, 256, 1024, 4096],
ci_range=[256],
)


# ---------------------------------------------------------------------------
# Input helpers
# ---------------------------------------------------------------------------


def make_inputs(bs, seg_len):
    torch.manual_seed(42)  # torch.randint draws from torch's RNG; random.seed(42) had no effect on it
    # All segments share the same length, so both indptr arrays are arithmetic
    # sequences; building them with arange avoids per-element host/device syncs.
    input_indptr = torch.arange(
        0, (bs + 1) * seg_len, seg_len, dtype=torch.int32, device=DEVICE
    )
    bytes_per_seg = (seg_len + 7) // 8
    output_indptr = torch.arange(
        0, (bs + 1) * bytes_per_seg, bytes_per_seg, dtype=torch.int32, device=DEVICE
    )

    total_in = input_indptr[-1].item()
    total_out = output_indptr[-1].item()
    x = torch.randint(0, 2, (total_in,), dtype=torch.bool, device=DEVICE)
    y = torch.zeros(total_out, dtype=torch.uint8, device=DEVICE)

    return x, input_indptr, output_indptr, y


# ---------------------------------------------------------------------------
# Benchmark
# ---------------------------------------------------------------------------


@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["bs", "seg_len"],
x_vals=list(itertools.product(BATCH_SIZE_RANGE, SEG_LEN_RANGE)),
line_arg="provider",
line_vals=LINE_VALS,
line_names=LINE_NAMES,
styles=[("blue", "--"), ("orange", "-")][: len(LINE_VALS)],
ylabel="us",
plot_name="segment-packbits-performance",
args={},
)
)
def bench_segment_packbits(bs: int, seg_len: int, provider: str):
x, input_indptr, output_indptr, y = make_inputs(bs, seg_len)
backup = y.clone()

if provider == "jit":

def fn():
y.copy_(backup)
segment_packbits_jit(x, input_indptr, output_indptr, y, batch_size=bs)

elif provider == "aot":

def fn():
y.copy_(backup)
segment_packbits_aot(x, input_indptr, output_indptr, y, batch_size=bs)

else:
raise ValueError(f"Unknown provider: {provider}")

return run_benchmark(fn)


# ---------------------------------------------------------------------------
# Quick correctness diff
# ---------------------------------------------------------------------------


def calculate_diff():
if not AOT_AVAILABLE:
print("sgl_kernel not available — skipping AOT diff check")
return

print("Correctness diff — segment_packbits (JIT vs AOT):")
for bs, seg_len in [(1, 64), (4, 256), (8, 1024)]:
x, input_indptr, output_indptr, y_jit = make_inputs(bs, seg_len)
y_aot = y_jit.clone()

segment_packbits_jit(x, input_indptr, output_indptr, y_jit, batch_size=bs)
segment_packbits_aot(x, input_indptr, output_indptr, y_aot, batch_size=bs)

status = "OK" if torch.equal(y_jit, y_aot) else "MISMATCH"
print(f" bs={bs:2d} seg_len={seg_len:5d} [{status}]")
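

# Pure-PyTorch reference for sanity-checking either backend. This is an
# assumption-based sketch mirroring the little-endian bit order documented in
# packbit.cuh (bit i of a segment lands in byte i // 8, bit position i % 8);
# it is not wired into the benchmark itself.
def segment_packbits_ref(x, input_indptr, output_indptr, y, batch_size):
    weights = torch.tensor(
        [1, 2, 4, 8, 16, 32, 64, 128], dtype=torch.int32, device=x.device
    )
    for s in range(batch_size):
        start, end = int(input_indptr[s]), int(input_indptr[s + 1])
        bits = x[start:end].to(torch.int32)
        pad = (-(end - start)) % 8  # pad the tail of the segment to a whole byte
        if pad:
            bits = torch.nn.functional.pad(bits, (0, pad))
        packed = (bits.view(-1, 8) * weights).sum(dim=1).to(torch.uint8)
        out = int(output_indptr[s])
        y[out : out + packed.numel()] = packed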


if __name__ == "__main__":
calculate_diff()
print()
bench_segment_packbits.run(print_data=True)
79 changes: 79 additions & 0 deletions python/sglang/jit_kernel/csrc/speculative/packbit.cuh
@@ -0,0 +1,79 @@
/*
* Copyright (c) 2025 by SGLang team.
* Copyright (c) 2025 by FlashInfer team.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Adapted from
// https://github.com/sgl-project/sglang/blob/main/sgl-kernel/csrc/speculative/packbit.cu

#include <sgl_kernel/tensor.h>
#include <sgl_kernel/utils.h>

#include <sgl_kernel/utils.cuh>

#include <flashinfer/quantization.cuh>
#include <tvm/ffi/container/tensor.h>

namespace {

// ---------------------------------------------------------------------------
// tvm-ffi entry point
// ---------------------------------------------------------------------------

// x:             [input_indptr[batch_size]] bool — input bits
// input_indptr:  [batch_size + 1] int32 — segment start offsets into x
// output_indptr: [batch_size + 1] int32 — segment start offsets into y
// y:             [output_indptr[batch_size]] uint8 — packed output (little-endian bit order)
// batch_size:    number of segments
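//
// Worked example (illustrative values, not taken from the original sources):
// batch_size = 2 with segments of 10 and 3 bits gives
//   input_indptr  = [0, 10, 13]  -> x holds 13 bools
//   output_indptr = [0, 2, 3]    -> y holds 3 bytes ((10+7)/8 = 2, (3+7)/8 = 1)
// Bit i of segment s, i.e. x[input_indptr[s] + i], is stored in byte
// y[output_indptr[s] + i / 8] at bit position i % 8 (little-endian).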
void segment_packbits(
tvm::ffi::TensorView x,
tvm::ffi::TensorView input_indptr,
tvm::ffi::TensorView output_indptr,
tvm::ffi::TensorView y,
int64_t batch_size) {
using namespace host;

RuntimeCheck(x.device().device_type == kDLCUDA, "x must be a CUDA tensor");
RuntimeCheck(x.ndim() == 1, "x must be 1D");
RuntimeCheck(x.is_contiguous(), "x must be contiguous");
RuntimeCheck(host::dtype_bytes(x.dtype()) == 1, "x element size must be 1 byte (bool or uint8)");

RuntimeCheck(input_indptr.ndim() == 1, "input_indptr must be 1D");
RuntimeCheck(input_indptr.is_contiguous(), "input_indptr must be contiguous");
RuntimeCheck(input_indptr.dtype().code == kDLInt && input_indptr.dtype().bits == 32, "input_indptr must be int32");
RuntimeCheck(input_indptr.size(0) >= batch_size + 1, "input_indptr size must be >= batch_size + 1");

RuntimeCheck(output_indptr.ndim() == 1, "output_indptr must be 1D");
RuntimeCheck(output_indptr.is_contiguous(), "output_indptr must be contiguous");
RuntimeCheck(output_indptr.dtype().code == kDLInt && output_indptr.dtype().bits == 32, "output_indptr must be int32");
RuntimeCheck(output_indptr.size(0) >= batch_size + 1, "output_indptr size must be >= batch_size + 1");

RuntimeCheck(y.ndim() == 1, "y must be 1D");
RuntimeCheck(y.is_contiguous(), "y must be contiguous");
RuntimeCheck(y.dtype().code == kDLUInt && y.dtype().bits == 8, "y must be uint8");

cudaStream_t stream = LaunchKernel::resolve_device(x.device());
cudaError_t status = flashinfer::quantization::SegmentPackBits(
static_cast<bool*>(x.data_ptr()),
static_cast<uint8_t*>(y.data_ptr()),
static_cast<int32_t*>(input_indptr.data_ptr()),
static_cast<int32_t*>(output_indptr.data_ptr()),
static_cast<uint32_t>(batch_size),
flashinfer::quantization::BitOrder::kLittle,
stream);

RuntimeCheck(status == cudaSuccess, "segment_packbits failed: ", cudaGetErrorString(status));
}

} // namespace
53 changes: 53 additions & 0 deletions python/sglang/jit_kernel/packbit.py
@@ -0,0 +1,53 @@
from __future__ import annotations

import pathlib
from typing import TYPE_CHECKING

import flashinfer
import torch

from sglang.jit_kernel.utils import cache_once, load_jit
from sglang.srt.utils.custom_op import register_custom_op

if TYPE_CHECKING:
from tvm_ffi.module import Module


@cache_once
def _jit_packbit_module() -> Module:
flashinfer_include_path = str(
(pathlib.Path(flashinfer.__file__).parent / "data" / "include").resolve()
)
return load_jit(
"packbit",
cuda_files=["speculative/packbit.cuh"],
cuda_wrappers=[
("segment_packbits", "segment_packbits"),
],
extra_include_paths=[flashinfer_include_path],
)


@register_custom_op(
op_name="segment_packbits_out",
mutates_args=["y"],
)
def segment_packbits(
x: torch.Tensor,
input_indptr: torch.Tensor,
output_indptr: torch.Tensor,
y: torch.Tensor,
batch_size: int,
) -> None:
"""
Pack boolean bits into bytes, segment by segment (little-endian bit order).

Args:
        x: [input_indptr[-1]] bool — input bits
input_indptr: [batch_size + 1] int32 — segment start offsets for input
output_indptr: [batch_size + 1] int32 — segment start offsets for output
        y: [output_indptr[-1]] uint8 — packed output (mutated in-place)
batch_size: number of segments
"""
module = _jit_packbit_module()
module.segment_packbits(x, input_indptr, output_indptr, y, batch_size)
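

# For reference, a minimal call sketch (shapes chosen for illustration; assumes
# a CUDA device and the sizing rule documented above: one output byte per 8
# input bits, rounded up per segment):
#
#     bits = torch.randint(0, 2, (24,), dtype=torch.bool, device="cuda")  # segments of 16 + 8 bits
#     input_indptr = torch.tensor([0, 16, 24], dtype=torch.int32, device="cuda")
#     output_indptr = torch.tensor([0, 2, 3], dtype=torch.int32, device="cuda")  # ceil(16/8), ceil(8/8)
#     packed = torch.zeros(3, dtype=torch.uint8, device="cuda")  # caller pre-allocates the output
#     segment_packbits(bits, input_indptr, output_indptr, packed, batch_size=2)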