diff --git a/.claude/skills/add-cuda-kernel/SKILL.md b/.claude/skills/add-cuda-kernel/SKILL.md
index ee8c74da22..8da3c7d7f2 100644
--- a/.claude/skills/add-cuda-kernel/SKILL.md
+++ b/.claude/skills/add-cuda-kernel/SKILL.md
@@ -625,7 +625,155 @@ Check functions must:
 3. Raise `ValueError` with descriptive message if validation fails
 4. Be decorated with `@supported_compute_capability` to specify supported architectures
 
-## Step 6: Write Tests in `tests/`
+## Step 6: Add a Trace Template
+
+Every new kernel **must** have a `TraceTemplate` so that flashinfer-bench can auto-generate
+benchmark definition files via `@flashinfer_api(trace=...)`.
+
+### 6a. Create the template in `flashinfer/trace/templates/`
+
+Add a file (or extend an existing one) in `flashinfer/trace/templates/`. The
+real `flashinfer/trace/templates/norm.py` is a good reference — it shows two
+variants that share an `op_type` but have distinct `name_prefix` values:
+
+```python
+# flashinfer/trace/templates/norm.py  (real file, simplified for illustration)
+from ..template import Const, Tensor, TraceTemplate, Var
+
+# op_type  – high-level operation category written to the JSON "op_type" field.
+#             Two templates can share the same op_type when they are variants of
+#             the same operation family.
+# name_prefix – base string for the auto-generated filename and JSON "name" field.
+#               Const axis values are appended, e.g. rmsnorm_h4096.json.
+#               Must be unique across templates that share an op_type.
+
+rmsnorm_trace = TraceTemplate(
+    op_type="rmsnorm",          # category: all RMSNorm variants share this
+    name_prefix="rmsnorm",      # specific variant → file: rmsnorm_h<hidden>.json
+    description="Root Mean Square Normalization. Epsilon is fixed at 1e-6.",
+    axes={
+        "batch_size": Var(),                   # runtime-variable: omitted from filename
+        "hidden_size": Const(abbrev="h"),      # baked into filename as "h<value>"
+    },
+    inputs={
+        # json_key "hidden_states" differs from the Python param name "input",
+        # so param= is set explicitly.
+        "hidden_states": Tensor(["batch_size", "hidden_size"], param="input"),
+        "weight": Tensor(["hidden_size"]),     # key == param, no param= needed
+    },
+    outputs={
+        "output": Tensor(["batch_size", "hidden_size"], dtype_from="input"),
+    },
+    tags=["status:verified"],
+    reference=_rmsnorm_reference,
+)
+
+fused_add_rmsnorm_trace = TraceTemplate(
+    op_type="rmsnorm",              # same category as rmsnorm_trace above
+    name_prefix="fused_add_rmsnorm",  # different variant → fused_add_rmsnorm_h<hidden>.json
+    description="Fused Add + RMSNorm. Epsilon is fixed at 1e-6.",
+    axes={
+        "batch_size": Var(),
+        "hidden_size": Const(abbrev="h"),
+    },
+    inputs={
+        "hidden_states": Tensor(["batch_size", "hidden_size"], param="input"),
+        "residual": Tensor(["batch_size", "hidden_size"]),
+        "weight": Tensor(["hidden_size"]),
+    },
+    outputs={
+        "output": Tensor(["batch_size", "hidden_size"], dtype_from="input"),
+        "residual": Tensor(
+            ["batch_size", "hidden_size"],
+            dtype_from="input",
+            description="Updated residual (in-place: residual += hidden_states).",
+        ),
+    },
+    tags=["status:verified", "fused"],
+    reference=_fused_add_rmsnorm_reference,
+)
+```
+
+Key rules:
+- `Var()` → value is NOT baked into the generated name or JSON `value`.
+- `Const(abbrev=...)` → value IS extracted and written to JSON.  `abbrev="h"` → `h4096`; `abbrev=""` → omit from filename.
+- Each `Tensor` key defaults to `param=key`; use `param="other_name"` when they differ.
+- `dtype_from="<input_key>"` copies the dtype from that input tensor (use the JSON key, not the param name).
+- For dispatch (one function, multiple templates depending on a kwarg), pass a
+  plain callable as `trace=`:
+  ```python
+  def _my_trace_dispatch(**kwargs):
+      if kwargs.get("mode") == "fast":
+          return fast_trace
+      return slow_trace
+
+  @flashinfer_api(trace=_my_trace_dispatch)
+  def my_op(..., mode="fast"):
+      ...
+  ```
+  See `flashinfer/fused_moe/core.py` + `flashinfer/trace/templates/moe.py` for a
+  real dispatch example keyed on `routing_method_type`.
+
+### 6b. Attach the template to the API
+
+```python
+# flashinfer/norm.py  (real file)
+from .trace.templates.norm import rmsnorm_trace
+
+@flashinfer_api(trace=rmsnorm_trace)
+def rmsnorm(input: torch.Tensor, weight: torch.Tensor, ...) -> torch.Tensor:
+    ...
+```
+
+The `fi_api` tag is derived automatically from `func.__module__ + "." + func.__qualname__`.
+
+### 6c. Register your module for auto-discovery
+
+Open `tests/trace/test_fi_trace_template_consistency.py` and add your module to
+the import list inside `_collect_template_func_pairs()`:
+
+```python
+import flashinfer.norm   # ← add your module here
+```
+
+That's it. `@flashinfer_api(trace=...)` automatically registers every
+`(func, template)` pair in `flashinfer.api_logging._TRACE_REGISTRY` at
+decoration time. Importing the module triggers the decorator, and the
+parameterized tests then check:
+
+1. **Signature consistency**: every non-optional `param=` reference exists in the function's signature.
+2. **Axis coverage**: every `Const` axis appears in at least one tensor's `dim_names` or the function's parameter list.
+3. **End-to-end**: `fi_trace` with auto-generated CPU tensors returns a complete dict
+   (no `"unknown"` dtypes for non-optional inputs, all `Const` axes have values).
+
+If your template uses tuple inputs or exotic dtypes (fp8 scale tensors, etc.),
+add a targeted end-to-end test at the bottom of the file and add your label to
+`_E2E_SKIP` (see the MoE example there).
+
+For **dispatch templates** (callable `trace=`), also set a `.templates`
+attribute on the dispatch function listing all possible return values:
+
+```python
+def _my_trace_dispatch(**kwargs): ...
+_my_trace_dispatch.templates = [fast_trace, slow_trace]
+```
+
+This lets the registry auto-discover and check all variants.
+
+### 6d. Run the consistency tests
+
+```bash
+pytest tests/trace/test_fi_trace_template_consistency.py -v
+```
+
+A failing structural test looks like:
+```
+AssertionError: [rmsnorm] Template 'rmsnorm' has param mismatches:
+  Input 'hidden_states' → param='x' not found in rmsnorm(['input', 'weight', 'eps'])
+```
+which tells you exactly which key is wrong and what names are available.
+
+## Step 7: Write Tests in `tests/`
 
 Create tests in an appropriate subdirectory (e.g., `tests/elementwise/test_scale.py` or create a new subdir if needed):
 
@@ -794,13 +942,15 @@ if __name__ == "__main__":
 ## Summary of Files Created/Modified
 
 ```
-include/flashinfer/scale.cuh              # NEW: CUDA kernel definition
-csrc/scale.cu                              # NEW: PyTorch launcher
-csrc/scale_jit_binding.cu                  # NEW: TVM-FFI binding
-flashinfer/jit/scale.py                    # NEW: JIT generator
-flashinfer/scale.py                        # NEW: Python API
-flashinfer/__init__.py                     # MODIFIED: Export API
-flashinfer/aot.py                          # MODIFIED: Register AOT
-tests/test_scale.py                        # NEW: Unit tests
-benchmarks/bench_scale.py                  # NEW: Benchmark script
+include/flashinfer/scale.cuh                          # NEW: CUDA kernel definition
+csrc/scale.cu                                          # NEW: PyTorch launcher
+csrc/scale_jit_binding.cu                              # NEW: TVM-FFI binding
+flashinfer/jit/scale.py                                # NEW: JIT generator
+flashinfer/scale.py                                    # NEW: Python API (with @flashinfer_api(trace=...))
+flashinfer/trace/templates/scale.py                    # NEW: TraceTemplate definition
+flashinfer/__init__.py                                 # MODIFIED: Export API
+flashinfer/aot.py                                      # MODIFIED: Register AOT
+tests/test_scale.py                                    # NEW: Kernel unit tests
+tests/trace/test_fi_trace_template_consistency.py      # MODIFIED: Add (func, template) pair
+benchmarks/bench_scale.py                              # NEW: Benchmark script
 ```
diff --git a/CLAUDE.md b/CLAUDE.md
index bbd055286a..e74821b306 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -344,6 +344,20 @@ flashinfer/
 7. Write tests in `tests/`
 8. Register in `flashinfer/aot.py` for AOT compilation
 9. Export in `flashinfer/__init__.py`
+10. Add a `TraceTemplate` in `flashinfer/trace/templates/` and wire it via `@flashinfer_api(trace=...)` (see below)
+11. Add an example call in `tests/trace/example.py`, re-run to regenerate `fi_trace_out/`, and commit the new JSON files
+
+### Trace Template Checklist (for new or updated APIs)
+
+Every public API decorated with `@flashinfer_api` should also carry a `trace=` argument so that `fi_trace()` works and auto-dump produces a benchmark definition JSON.
+
+1. **Create or update a `TraceTemplate`** in `flashinfer/trace/templates/<category>.py` (e.g., `norm.py`, `activation.py`, `cascade.py`, `gdn.py`). Define `axes`, `inputs`, `outputs`, and optionally a `reference` function.
+2. **Wire the template** to the API: `@flashinfer_api(trace=my_trace)` on the Python function (or class method's `run()`).
+3. **Add an example call** in `tests/trace/example.py` that exercises the new trace with realistic shapes.
+4. **Regenerate examples**: `rm -rf tests/trace/fi_trace_out && python tests/trace/example.py` — verify the expected JSON appears.
+5. **Update the docstring** in `tests/trace/example.py` to list the new file(s).
+6. **Run tests**: `pytest tests/trace/ -v` — all template-consistency and end-to-end tests must pass.
+7. **Commit the new JSON files** under `tests/trace/fi_trace_out/` alongside the code changes.
 
 **Example implementations:**
 - **Simple**: `flashinfer/norm.py` (RMSNorm) - no Jinja, good starting point
diff --git a/docs/fi_trace.rst b/docs/fi_trace.rst
new file mode 100644
index 0000000000..4002283ada
--- /dev/null
+++ b/docs/fi_trace.rst
@@ -0,0 +1,321 @@
+.. _fi_trace:
+
+fi_trace — Operation Schema Extraction
+=======================================
+
+``fi_trace`` is FlashInfer's operation schema extraction system.  Every
+``@flashinfer_api``-decorated function automatically grows a ``.fi_trace()``
+method that captures the *shape*, *dtype*, and *axis structure* of a call as a
+portable JSON file — without running the GPU kernel.
+
+These JSON files are the input format for `flashinfer-bench
+<https://github.com/flashinfer-ai/flashinfer-bench>`_, the companion benchmark
+toolkit.  Collecting them while running your production workload gives you a
+precise benchmark suite that reflects your actual model and serving scenario.
+
+Quick Start
+-----------
+
+Set two environment variables **before** importing FlashInfer:
+
+.. code-block:: bash
+
+    export FLASHINFER_TRACE_DUMP=1
+    export FLASHINFER_TRACE_DUMP_DIR=./fi_trace_out   # default: ./fi_trace_out
+
+    python my_inference_script.py
+
+FlashInfer writes one ``.json`` file per unique (op, shape) combination.
+Subsequent calls with the same shapes are deduplicated — no duplicate files.
+
+.. code-block:: text
+
+    fi_trace_out/
+    ├── rmsnorm_h7168.json
+    ├── gqa_paged_decode_h32_kv8_d128_ps16.json
+    ├── moe_fp8_block_scale_ds_routing_topk8_ng8_kg4_e32_h7168_i2048.json
+    └── ...
+
+Environment Variables
+---------------------
+
+.. list-table::
+   :header-rows: 1
+   :widths: 35 12 20 33
+
+   * - Variable
+     - Type
+     - Default
+     - Description
+   * - ``FLASHINFER_TRACE_DUMP``
+     - int
+     - ``0``
+     - Set to ``1`` to enable automatic JSON dumping on every API call.
+   * - ``FLASHINFER_TRACE_DUMP_DIR``
+     - str
+     - ``./fi_trace_out``
+     - Directory where JSON files are written.
+
+Both variables are read **lazily at call time**, so they can be set after
+``import flashinfer`` (e.g. when using ``python -m``).
+
+JSON File Format
+----------------
+
+Each file describes one operation instance.  Here is an annotated example for
+``rmsnorm`` with ``hidden_size=7168``:
+
+.. code-block:: json
+
+    {
+      "name": "rmsnorm_h7168",
+      "description": "Root Mean Square Normalization. Epsilon is fixed at 1e-6.",
+      "op_type": "rmsnorm",
+      "tags": [
+        "fi_api:flashinfer.norm.rmsnorm",
+        "status:verified"
+      ],
+      "axes": {
+        "batch_size": { "type": "var" },
+        "hidden_size": { "type": "const", "value": 7168 }
+      },
+      "inputs": {
+        "hidden_states": { "shape": ["batch_size", "hidden_size"], "dtype": "bfloat16" },
+        "weight":        { "shape": ["hidden_size"],               "dtype": "bfloat16" }
+      },
+      "outputs": {
+        "output": { "shape": ["batch_size", "hidden_size"], "dtype": "bfloat16" }
+      },
+      "reference": "..."
+    }
+
+Key fields:
+
+.. list-table::
+   :header-rows: 1
+   :widths: 20 80
+
+   * - Field
+     - Meaning
+   * - ``name``
+     - Auto-generated from ``op_type`` / ``name_prefix`` + const-axis values.
+       Becomes the benchmark name in flashinfer-bench.
+   * - ``op_type``
+     - Identifies the kernel class (``rmsnorm``, ``gqa_paged``, ``moe``, …).
+   * - ``tags``
+     - List of key:value tags.  Always includes ``fi_api:<qualified.name>``
+       and optional metadata like ``status:verified``.
+   * - ``axes``
+     - Symbolic dimensions.  ``"var"`` axes vary at runtime (batch size,
+       sequence length).  ``"const"`` axes are fixed by model config (head
+       dimension, hidden size) and carry a ``"value"``.
+   * - ``inputs`` / ``outputs``
+     - Each entry has ``"shape"`` (list of axis names) and a resolved
+       ``"dtype"``.  Optional inputs carry ``"optional": true``.
+   * - ``reference``
+     - Source of a pure-PyTorch reference implementation for correctness
+       checking (present on ``status:verified`` ops).
+
+Calling ``.fi_trace()`` Directly
+---------------------------------
+
+Every decorated function exposes a ``.fi_trace()`` method.
+You can call it without running the kernel:
+
+.. code-block:: python
+
+    import torch
+    import flashinfer
+
+    q = torch.zeros(32, 32, 128, dtype=torch.bfloat16, device="cuda")
+    k = torch.zeros(64, 16, 8, 128, dtype=torch.bfloat16, device="cuda")
+    v = torch.zeros(64, 16, 8, 128, dtype=torch.bfloat16, device="cuda")
+
+    schema = flashinfer.norm.rmsnorm.fi_trace(
+        hidden_states=torch.zeros(32, 7168, dtype=torch.bfloat16),
+        weight=torch.ones(7168, dtype=torch.bfloat16),
+    )
+    print(schema["name"])   # rmsnorm_h7168
+    print(schema["axes"])   # {'batch_size': {'type': 'var'}, 'hidden_size': {'type': 'const', 'value': 7168}}
+
+To write to a specific directory, pass ``save_dir``:
+
+.. code-block:: python
+
+    schema = flashinfer.norm.rmsnorm.fi_trace(
+        hidden_states=...,
+        weight=...,
+        save_dir="./my_traces",
+    )
+
+Covered Operations
+------------------
+
+The following FlashInfer operations have trace templates and will emit JSON
+files when ``FLASHINFER_TRACE_DUMP=1``:
+
+.. list-table::
+   :header-rows: 1
+   :widths: 25 35 40
+
+   * - Module
+     - Operation
+     - ``op_type``
+   * - ``flashinfer.norm``
+     - ``rmsnorm``, ``fused_add_rmsnorm``
+     - ``rmsnorm``
+   * - ``flashinfer.sampling``
+     - ``top_k_sampling_from_probs``,
+       ``top_p_sampling_from_probs``,
+       ``top_k_top_p_sampling_from_probs``
+     - ``sampling``
+   * - ``flashinfer.gemm``
+     - ``mm_bf16``, ``mm_fp8``, ``mm_mxfp8``, ``mm_fp4``
+     - ``gemm_bf16`` / ``gemm_fp8`` / ``gemm_mxfp8`` / ``gemm_fp4``
+   * - ``flashinfer.decode``
+     - ``BatchDecodeWithPagedKVCacheWrapper.run``
+     - ``gqa_paged``
+   * - ``flashinfer.prefill``
+     - ``BatchPrefillWithPagedKVCacheWrapper.run``,
+       ``BatchPrefillWithRaggedKVCacheWrapper.run``
+     - ``gqa_paged`` / ``gqa_ragged``
+   * - ``flashinfer.mla``
+     - ``BatchMLAPagedAttentionWrapper.run``
+     - ``mla_paged``
+   * - ``flashinfer.gdn_decode``
+     - ``gated_delta_rule_decode``, ``gated_delta_rule_mtp``
+     - ``gdn``
+   * - ``flashinfer.gdn_prefill``
+     - ``chunk_gated_delta_rule``
+     - ``gdn``
+   * - ``flashinfer.fused_moe``
+     - ``trtllm_fp8_block_scale_moe`` (6 routing types)
+     - ``moe``
+   * - ``flashinfer.fused_moe``
+     - ``trtllm_fp4_block_scale_moe`` (6 routing types)
+     - ``moe``
+
+MoE Routing Types
+-----------------
+
+MoE operations dispatch to per-routing-type templates.  The output filename
+encodes the routing method:
+
+.. list-table::
+   :header-rows: 1
+   :widths: 10 25 65
+
+   * - Value
+     - Name
+     - Filename pattern (FP8 example)
+   * - 0
+     - Default (Softmax → TopK)
+     - ``moe_fp8_block_scale_default_routing_topk8_e32_h7168_i2048.json``
+   * - 1
+     - Renormalize (TopK → Softmax)
+     - ``moe_fp8_block_scale_renormalize_routing_topk8_e32_h7168_i2048.json``
+   * - 2
+     - DeepSeekV3 (Sigmoid + group selection)
+     - ``moe_fp8_block_scale_ds_routing_topk8_ng8_kg4_e32_h7168_i2048.json``
+   * - 3
+     - Llama4 (Top1 → Sigmoid)
+     - ``moe_fp8_block_scale_llama4_routing_topk1_e32_h7168_i2048.json``
+   * - 4
+     - RenormalizeNaive (Softmax → TopK → Renormalize)
+     - ``moe_fp8_block_scale_renormalize_naive_routing_topk8_e32_h7168_i2048.json``
+   * - 5
+     - TopK (no normalisation)
+     - ``moe_fp8_block_scale_topk_routing_topk8_e32_h7168_i2048.json``
+
+Example: Collecting Traces from a Real Workload
+------------------------------------------------
+
+The script below runs a representative set of FlashInfer ops and collects all
+trace JSON files in one pass.  It covers the shapes used in DeepSeek-V3-style
+models with expert-parallel MoE serving.
+
+.. code-block:: bash
+
+    python tests/trace/example.py
+
+The generated files can be passed directly to ``flashinfer-bench``:
+
+.. code-block:: bash
+
+    flashinfer-bench --trace-dir fi_trace_out/ --backends fa2 cudnn cutlass
+
+Adding Trace Support to a New Kernel
+--------------------------------------
+
+When adding a new kernel (see ``CLAUDE.md`` and ``.claude/skills/add-cuda-kernel/SKILL.md``
+for the full tutorial), attach a ``TraceTemplate`` to the ``@flashinfer_api`` decorator:
+
+.. code-block:: python
+
+    from flashinfer.trace.template import Const, Tensor, TraceTemplate, Var
+    from flashinfer.api_logging import flashinfer_api
+
+    rmsnorm_trace = TraceTemplate(
+        op_type="rmsnorm",
+        name_prefix="rmsnorm",
+        description="Root Mean Square Normalization.",
+        axes={
+            "batch_size":  Var(),
+            "hidden_size": Const(abbrev="h"),
+        },
+        inputs={
+            "hidden_states": Tensor(["batch_size", "hidden_size"]),
+            "weight":        Tensor(["hidden_size"]),
+        },
+        outputs={
+            "output": Tensor(["batch_size", "hidden_size"], dtype_from="hidden_states"),
+        },
+        tags=["status:verified"],
+    )
+
+    @flashinfer_api(trace=rmsnorm_trace)
+    def rmsnorm(hidden_states, weight, eps=1e-6):
+        ...
+
+The template is registered automatically in ``_TRACE_REGISTRY`` at decoration
+time and picked up by the consistency tests without any manual registration.
+
+For operations whose template depends on a runtime parameter (e.g.
+``routing_method_type`` for MoE), write a dispatch callable and attach a
+``.templates`` attribute so the registry discovers all variants:
+
+.. code-block:: python
+
+    _TEMPLATES = {0: default_trace, 1: renorm_trace, ...}
+
+    def my_dispatch(**kwargs):
+        return _TEMPLATES.get(int(kwargs.get("routing_method_type", 0)))
+
+    my_dispatch.templates = list(_TEMPLATES.values())
+
+    @flashinfer_api(trace=my_dispatch)
+    def my_moe_op(...):
+        ...
+
+Consistency Tests
+-----------------
+
+FlashInfer ships automated **linter-style tests** that validate every trace
+template without running GPU kernels:
+
+.. code-block:: bash
+
+    pytest tests/trace/test_fi_trace_template_consistency.py -v
+
+The tests check three properties for every registered template:
+
+1. **Signature consistency** — every ``param=`` reference in the template
+   matches a real parameter of the decorated function.
+2. **Axes coverage** — every ``Const`` axis can be resolved from at least one
+   tensor's shape or from a scalar kwarg.
+3. **End-to-end completeness** — calling ``.fi_trace()`` with auto-generated
+   minimal tensors returns a dict where all ``Const`` axes have values and
+   no input/output has ``dtype == "unknown"``.
+
+When you add a template, these tests run automatically with no manual
+registration required.
diff --git a/docs/index.rst b/docs/index.rst
index 028ed54a59..55f4e0a991 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -17,6 +17,7 @@ FlashInfer is a library and kernel generator for Large Language Models that prov
    installation
    cli
    logging
+   fi_trace
    autotuning
 
 .. toctree::
diff --git a/flashinfer/__init__.py b/flashinfer/__init__.py
index 58187ca85c..bbf548aeef 100644
--- a/flashinfer/__init__.py
+++ b/flashinfer/__init__.py
@@ -187,3 +187,4 @@
 from .xqa import xqa as xqa
 from .xqa import xqa_mla as xqa_mla
 from . import mamba as mamba
+from .fi_trace import fi_trace as fi_trace
diff --git a/flashinfer/activation.py b/flashinfer/activation.py
index 3bdd3df769..c1f4e4dc79 100644
--- a/flashinfer/activation.py
+++ b/flashinfer/activation.py
@@ -22,6 +22,11 @@
 
 from .api_logging import flashinfer_api
 from .jit import gen_act_and_mul_module
+from .trace.templates.activation import (
+    gelu_and_mul_trace,
+    gelu_tanh_and_mul_trace,
+    silu_and_mul_trace,
+)
 from .utils import (
     device_support_pdl,
     register_custom_op,
@@ -67,7 +72,7 @@ def _check_shape(input: torch.Tensor, output: torch.Tensor) -> None:
     )
 
 
-@flashinfer_api
+@flashinfer_api(trace=silu_and_mul_trace)
 def silu_and_mul(
     input: torch.Tensor, out: torch.Tensor = None, enable_pdl: Optional[bool] = None
 ) -> torch.Tensor:
@@ -112,7 +117,7 @@ def silu_and_mul(
     return out
 
 
-@flashinfer_api
+@flashinfer_api(trace=gelu_tanh_and_mul_trace)
 def gelu_tanh_and_mul(
     input: torch.Tensor, out: torch.Tensor = None, enable_pdl: Optional[bool] = None
 ) -> torch.Tensor:
@@ -153,7 +158,7 @@ def gelu_tanh_and_mul(
     return out
 
 
-@flashinfer_api
+@flashinfer_api(trace=gelu_and_mul_trace)
 def gelu_and_mul(
     input: torch.Tensor, out: torch.Tensor = None, enable_pdl: Optional[bool] = None
 ) -> torch.Tensor:
diff --git a/flashinfer/api_logging.py b/flashinfer/api_logging.py
index e88bd7d3cf..0213b3da80 100644
--- a/flashinfer/api_logging.py
+++ b/flashinfer/api_logging.py
@@ -24,7 +24,7 @@
 import sys
 from datetime import datetime
 from pathlib import Path
-from typing import Any, Callable, Dict, Tuple, Optional
+from typing import Any, Callable, Dict, List, Tuple, Optional
 import contextlib
 import importlib
 import torch
@@ -1417,7 +1417,162 @@ def _log_function_outputs(func_name: str, result: Any, level: int) -> None:
     _logger.debug("\n".join(lines))
 
 
-def flashinfer_api(func: Callable = None) -> Callable:
+# ---------------------------------------------------------------------------
+# Trace template registry
+# ---------------------------------------------------------------------------
+# Populated automatically by _attach_fi_trace whenever @flashinfer_api is
+# given a trace= argument.  Each entry is (original_func, template, label)
+# where label is the template's name_prefix (or op_type as fallback).
+#
+# For dispatch callables (trace=some_fn), every template listed in
+# some_fn.templates is registered if that attribute exists.
+#
+# Read by tests/trace/test_fi_trace_template_consistency.py to auto-discover
+# all registered templates without requiring manual maintenance.
+_TRACE_REGISTRY: List[Tuple[Callable, Any, str]] = []
+
+
+def _attach_fi_trace(
+    wrapped: Callable,
+    original: Callable,
+    trace_template=None,
+) -> Callable:
+    """Attach a ``fi_trace`` callable to *wrapped*.
+
+    Three resolution strategies, tried in order:
+
+    1. **Dispatch callable** (new interface): if *trace_template* is a
+       plain callable (not a ``TraceTemplate``), it is called at trace time
+       with the bound kwargs and must return the appropriate
+       :class:`~flashinfer.trace.TraceTemplate` for that invocation.  Use
+       this when a single API function needs different templates depending on
+       a runtime parameter (e.g. ``routing_method_type``).
+    2. **Explicit template** (new interface): if *trace_template* is a
+       :class:`~flashinfer.trace.TraceTemplate`, use it directly.
+    3. **Registry lookup** (legacy interface): look up the qualname of
+       *original* in the old ``_REGISTRY`` dict in ``flashinfer.fi_trace``.
+
+    When ``FLASHINFER_TRACE_DUMP=1`` is set and a template is provided, the
+    returned callable also auto-dumps a trace JSON on every invocation
+    (deduplication: same-named files are written only once per process).
+
+    The attachment is a no-op when neither strategy finds a spec.
+    """
+    try:
+        if trace_template is not None:
+            from flashinfer.trace.template import (  # noqa: PLC0415
+                TraceTemplate,
+                _is_trace_dump_enabled,
+            )
+
+            # New interface: derive fi_api from the function's module + qualname.
+            module = getattr(original, "__module__", "") or ""
+            qualname = getattr(original, "__qualname__", "") or ""
+            fi_api = f"{module}.{qualname}" if module else qualname
+
+            if isinstance(trace_template, TraceTemplate):
+                # Static template: pre-build the fi_trace callable once.
+                fi_trace_fn = trace_template.build_fi_trace_fn(fi_api)
+                # Register for auto-discovery by consistency tests.
+                label = trace_template.name_prefix or trace_template.op_type
+                _TRACE_REGISTRY.append((original, trace_template, label))
+            else:
+                # Dispatch callable: *trace_template* is a function
+                # ``(save_dir=None, name=None, **kwargs) -> TraceTemplate``.
+                # Resolve the template at call time and cache per template
+                # instance to avoid rebuilding extractors on every call.
+                # If the dispatch function exposes a .templates iterable,
+                # register each template for auto-discovery.
+                for tpl in getattr(trace_template, "templates", ()):
+                    if isinstance(tpl, TraceTemplate):
+                        _label = tpl.name_prefix or tpl.op_type
+                        _TRACE_REGISTRY.append((original, tpl, _label))
+                _dispatch_fn = trace_template
+                _fi_trace_cache: Dict[int, Callable] = {}
+
+                def fi_trace_fn(
+                    save_dir=None,
+                    name=None,
+                    **kwargs: Any,
+                ) -> Dict[str, Any]:
+                    tpl = _dispatch_fn(**kwargs)
+                    if tpl is None:
+                        return {}
+                    tpl_id = id(tpl)
+                    if tpl_id not in _fi_trace_cache:
+                        _fi_trace_cache[tpl_id] = tpl.build_fi_trace_fn(fi_api)
+                    return _fi_trace_cache[tpl_id](
+                        save_dir=save_dir, name=name, **kwargs
+                    )
+
+            wrapped.fi_trace = fi_trace_fn  # type: ignore[attr-defined]
+
+            # Auto-dump wrapper: checked lazily at call time so that callers
+            # can set FLASHINFER_TRACE_DUMP after importing flashinfer (e.g.
+            # when running via ``python -m``).
+            _inner = wrapped
+            _sig = inspect.signature(original)
+
+            # Track which (function, error-type) pairs have already been warned
+            # about so we emit at most one diagnostic per failure class per process.
+            _autodump_warned: set = set()
+
+            @functools.wraps(_inner)
+            def _auto_dump_wrapper(*args, **kwargs):
+                # Generate trace BEFORE the actual call (crash-safe: schema
+                # depends only on input shapes/dtypes, not on whether the
+                # computation succeeds).
+                if _is_trace_dump_enabled():
+                    try:
+                        bound = _sig.bind(*args, **kwargs)
+                        bound.apply_defaults()
+                        fi_trace_fn(**dict(bound.arguments))
+                    except Exception as _exc:
+                        # Non-fatal: the API call still runs. Warn once per
+                        # (function, error-type) so users get a diagnostic
+                        # instead of silently missing a trace JSON.
+                        _key = (fi_api, type(_exc).__name__)
+                        if _key not in _autodump_warned:
+                            _autodump_warned.add(_key)
+                            import warnings as _warnings  # noqa: PLC0415
+
+                            _warnings.warn(
+                                f"[flashinfer] fi_trace auto-dump failed for "
+                                f"'{fi_api}': {type(_exc).__name__}: {_exc}. "
+                                f"Further occurrences of this error for this API "
+                                f"will be suppressed.",
+                                stacklevel=2,
+                            )
+                return _inner(*args, **kwargs)
+
+            _auto_dump_wrapper.fi_trace = fi_trace_fn  # type: ignore[attr-defined]
+            return _auto_dump_wrapper
+        else:
+            # Legacy registry lookup (kept for backwards compatibility).
+            from flashinfer.fi_trace import _REGISTRY, build_fi_trace_fn  # noqa: PLC0415
+
+            qualname = getattr(original, "__qualname__", "")
+            spec = _REGISTRY.get(qualname)
+            if spec is not None:
+                wrapped.fi_trace = build_fi_trace_fn(spec)  # type: ignore[attr-defined]
+    except Exception as _exc:
+        # Warn instead of silently swallowing: a broken trace template should
+        # be visible to the developer during import, not discovered later as a
+        # confusing AttributeError when calling func.fi_trace(...).
+        _func_name = getattr(original, "__qualname__", repr(original))
+        import warnings  # noqa: PLC0415
+
+        warnings.warn(
+            f"[flashinfer] Failed to attach fi_trace to '{_func_name}': "
+            f"{type(_exc).__name__}: {_exc}\n"
+            f"The function will work normally but fi_trace will be unavailable. "
+            f"Fix the TraceTemplate passed to @flashinfer_api(trace=...).",
+            stacklevel=3,
+        )
+    return wrapped
+
+
+def flashinfer_api(func: Callable = None, *, trace=None) -> Callable:
     """
     Decorator to FlashInfer's APIs.
 
@@ -1489,11 +1644,12 @@ def flashinfer_api(func: Callable = None) -> Callable:
     - The %i pattern is automatically replaced with the process ID for multi-process environments.
     - The logger does not propagate to the root logger to avoid duplicate logs.
     """
-    # If logging is disabled, return original function with zero overhead
+    # If logging is disabled, return original function with zero overhead.
+    # We still attach fi_trace so it is always available regardless of log level.
     if _API_LOG_LEVEL == 0:
         if func is None:
-            return lambda f: f
-        return func
+            return lambda f: _attach_fi_trace(f, f, trace_template=trace)
+        return _attach_fi_trace(func, func, trace_template=trace)
 
     def decorator(f: Callable) -> Callable:
         @functools.wraps(f)
@@ -1561,7 +1717,7 @@ def wrapper(*args, **kwargs):
 
             return result
 
-        return wrapper
+        return _attach_fi_trace(wrapper, f, trace_template=trace)
 
     if func is None:
         return decorator
diff --git a/flashinfer/attention.py b/flashinfer/attention.py
index c4bc4f27dc..5ce30409cc 100644
--- a/flashinfer/attention.py
+++ b/flashinfer/attention.py
@@ -21,6 +21,7 @@
 import torch
 
 from .api_logging import flashinfer_api
+from .trace.templates.attention import batch_attention_run_trace
 from .jit import gen_batch_attention_module
 from .utils import (
     MaskMode,
@@ -135,7 +136,7 @@ def plan(
             causal,
         )
 
-    @flashinfer_api
+    @flashinfer_api(trace=batch_attention_run_trace)
     def run(
         self,
         q: torch.Tensor,
@@ -209,6 +210,8 @@ class BatchAttentionWithAttentionSinkWrapper(BatchPrefillWithPagedKVCacheWrapper
     a convenient interface for using attention sinks during prefill or decode attention.
     """
 
+    # No @flashinfer_api here: parent class BatchPrefillWithPagedKVCacheWrapper
+    # already decorates __init__, so decorating again produces double log entries.
     def __init__(
         self,
         float_workspace_buffer: torch.Tensor,
diff --git a/flashinfer/cascade.py b/flashinfer/cascade.py
index 1de363bb37..bdaaa6234e 100644
--- a/flashinfer/cascade.py
+++ b/flashinfer/cascade.py
@@ -23,6 +23,12 @@
 from .decode import BatchDecodeWithPagedKVCacheWrapper
 from .jit.cascade import gen_cascade_module
 from .prefill import BatchPrefillWithPagedKVCacheWrapper, single_prefill_with_kv_cache
+from .trace.templates.attention import multi_level_cascade_run_trace
+from .trace.templates.cascade import (
+    merge_state_in_place_trace,
+    merge_state_trace,
+    merge_states_trace,
+)
 from .utils import register_custom_op, register_fake_op
 
 
@@ -31,7 +37,7 @@ def get_cascade_module():
     return gen_cascade_module().build_and_load()
 
 
-@flashinfer_api
+@flashinfer_api(trace=merge_state_trace)
 @register_custom_op("flashinfer::merge_state", mutates_args=())
 def merge_state(
     v_a: torch.Tensor, s_a: torch.Tensor, v_b: torch.Tensor, s_b: torch.Tensor
@@ -98,7 +104,7 @@ def _fake_merge_state(
     return v, s
 
 
-@flashinfer_api
+@flashinfer_api(trace=merge_state_in_place_trace)
 @register_custom_op("flashinfer::merge_state_in_place", mutates_args=("v", "s"))
 def merge_state_in_place(
     v: torch.Tensor,
@@ -159,7 +165,7 @@ def _fake_merge_state_in_place(
     pass
 
 
-@flashinfer_api
+@flashinfer_api(trace=merge_states_trace)
 @register_custom_op("flashinfer::merge_states", mutates_args=())
 def merge_states(v: torch.Tensor, s: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
     r"""Merge multiple attention states (v, s).
@@ -512,7 +518,7 @@ def plan(
 
     begin_forward = plan
 
-    @flashinfer_api
+    @flashinfer_api(trace=multi_level_cascade_run_trace)
     def run(
         self,
         q: torch.Tensor,
diff --git a/flashinfer/cudnn/decode.py b/flashinfer/cudnn/decode.py
index 195ca2d49d..9b59309534 100644
--- a/flashinfer/cudnn/decode.py
+++ b/flashinfer/cudnn/decode.py
@@ -4,6 +4,7 @@
 import torch
 
 from ..api_logging import flashinfer_api
+from ..trace.templates.attention import cudnn_batch_decode_trace
 from .utils import get_cudnn_fmha_gen_module
 
 try:
@@ -253,7 +254,7 @@ def _batch_decode_with_kv_cache(
     return out
 
 
-@flashinfer_api
+@flashinfer_api(trace=cudnn_batch_decode_trace)
 def cudnn_batch_decode_with_kv_cache(
     q: torch.Tensor,
     k_cache: torch.Tensor,
diff --git a/flashinfer/cudnn/prefill.py b/flashinfer/cudnn/prefill.py
index fc1bbb5f4c..b16d604305 100644
--- a/flashinfer/cudnn/prefill.py
+++ b/flashinfer/cudnn/prefill.py
@@ -4,6 +4,7 @@
 import torch
 
 from ..api_logging import flashinfer_api
+from ..trace.templates.attention import cudnn_batch_prefill_trace
 from .utils import get_cudnn_fmha_gen_module
 
 try:
@@ -558,7 +559,7 @@ def _batch_prefill_with_kv_cache(
         return out, None
 
 
-@flashinfer_api
+@flashinfer_api(trace=cudnn_batch_prefill_trace)
 def cudnn_batch_prefill_with_kv_cache(
     q: torch.Tensor,
     k_cache: torch.Tensor,
diff --git a/flashinfer/decode.py b/flashinfer/decode.py
index 822aca407c..c0daa6859d 100644
--- a/flashinfer/decode.py
+++ b/flashinfer/decode.py
@@ -22,6 +22,11 @@
 import torch
 
 from .api_logging import flashinfer_api
+from .trace.templates.attention import (
+    gqa_paged_decode_trace,
+    single_decode_with_kv_cache_trace,
+    trtllm_batch_decode_trace,
+)
 
 ## NOTE: MLA functions have been moved to mla.py, but we keep the aliases here for backward compatibility.
 from .mla import (
@@ -400,7 +405,7 @@ def single_decode_with_kv_cache(
 ) -> Tuple[torch.Tensor, torch.Tensor]: ...
 
 
-@flashinfer_api
+@flashinfer_api(trace=single_decode_with_kv_cache_trace)
 def single_decode_with_kv_cache(
     q: torch.Tensor,
     k: torch.Tensor,
@@ -1215,7 +1220,7 @@ def run(
         kv_cache_sf: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]: ...
 
-    @flashinfer_api
+    @flashinfer_api(trace=gqa_paged_decode_trace)
     def run(
         self,
         q: torch.Tensor,
@@ -1577,6 +1582,8 @@ class CUDAGraphBatchDecodeWithPagedKVCacheWrapper(BatchDecodeWithPagedKVCacheWra
     :class:`BatchDecodeWithPagedKVCacheWrapper`
     """
 
+    # No @flashinfer_api here: parent class BatchDecodeWithPagedKVCacheWrapper
+    # already decorates __init__, so decorating again produces double log entries.
     def __init__(
         self,
         workspace_buffer: torch.Tensor,
@@ -2232,7 +2239,7 @@ def _fake_paged_run(
     )
 
 
-@flashinfer_api
+@flashinfer_api(trace=trtllm_batch_decode_trace)
 def trtllm_batch_decode_with_kv_cache(
     query: torch.Tensor,
     kv_cache: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
diff --git a/flashinfer/fi_trace.py b/flashinfer/fi_trace.py
new file mode 100644
index 0000000000..1104eb6f07
--- /dev/null
+++ b/flashinfer/fi_trace.py
@@ -0,0 +1,285 @@
+# Copyright (c) 2025 by FlashInfer team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+fi_trace: Generate `flashinfer-bench <https://github.com/flashinfer-ai/flashinfer-bench>`_
+compatible definition JSON for FlashInfer APIs.
+
+Every ``@flashinfer_api(trace=<template>)``-decorated function supports two
+usage modes:
+
+Auto-dump (recommended)
+-----------------------
+Set environment variables **before** importing flashinfer, then run your
+workload normally.  No explicit ``fi_trace`` call is needed.
+
+.. code-block:: bash
+
+    FLASHINFER_TRACE_DUMP=1 \\
+    FLASHINFER_TRACE_DUMP_DIR=./fi_trace_out \\
+    python my_script.py
+
+Every decorated function writes a ``<name>.json`` file on its **first** call
+for each unique set of const-axis values (e.g. head dimensions, vocab size).
+Subsequent calls with the same shape are deduplicated — the file is written
+only once per process.  The output directory is created automatically.
+
+Explicit call (for selective or programmatic use)
+-------------------------------------------------
+Each decorated function also has a ``.fi_trace(**kwargs)`` attribute.  Pass
+the same tensor arguments you would pass to the real function; fi_trace
+introspects their shapes / dtypes and returns the definition dict.
+
+.. code-block:: python
+
+    import flashinfer, torch
+
+    hidden = torch.randn(32, 4096, dtype=torch.bfloat16, device="cuda")
+    weight = torch.ones(4096, dtype=torch.bfloat16, device="cuda")
+
+    defn = flashinfer.rmsnorm.fi_trace(input=hidden, weight=weight)
+
+    import json
+    print(json.dumps(defn, indent=2))
+
+For class-method APIs use the unbound (class-level) form, or the module-level
+helper:
+
+.. code-block:: python
+
+    from flashinfer.decode import BatchDecodeWithPagedKVCacheWrapper
+    from flashinfer.fi_trace import fi_trace
+
+    defn = BatchDecodeWithPagedKVCacheWrapper.run.fi_trace(
+        q=q_tensor, paged_kv_cache=(k_cache, v_cache)
+    )
+    # or with a live instance:
+    defn = fi_trace(wrapper.run, q=q_tensor, paged_kv_cache=(k, v))
+
+Both modes support an optional ``save_dir`` argument / env-var to control
+where the JSON file is written.  Explicit ``save_dir`` always writes; the
+auto-dump path deduplicates.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any, Callable, Dict, Optional, Union
+
+# ---------------------------------------------------------------------------
+# Legacy registry — kept for backwards compatibility.
+# New code should use @flashinfer_api(trace=TraceTemplate(...)) instead.
+# ---------------------------------------------------------------------------
+
+_REGISTRY: Dict[str, Any] = {}
+
+
+def register_fi_trace(qualname: str, spec: Any) -> None:
+    """Register a legacy FiTraceSpec for the function with the given qualname.
+
+    .. deprecated::
+        Use ``@flashinfer_api(trace=TraceTemplate(...))`` instead.
+    """
+    _REGISTRY[qualname] = spec
+
+
+def build_fi_trace_fn(spec: Any) -> Callable[..., Dict[str, Any]]:
+    """Build a fi_trace callable from a legacy FiTraceSpec.
+
+    .. deprecated::
+        Use ``TraceTemplate.build_fi_trace_fn`` instead.
+    """
+    # Import the old implementation from the trace package for backwards compat.
+    from .trace.template import (  # noqa: PLC0415,F401
+        Const,
+        Scalar,
+        Tensor,
+        TraceTemplate,
+        Var,
+    )
+    import json  # noqa: PLC0415
+    import os  # noqa: PLC0415
+    from pathlib import Path  # noqa: PLC0415
+    import torch  # noqa: PLC0415
+
+    _DTYPE_MAP = {
+        torch.float32: "float32",
+        torch.float16: "float16",
+        torch.bfloat16: "bfloat16",
+        torch.int32: "int32",
+        torch.int64: "int64",
+        torch.int8: "int8",
+        torch.uint8: "uint8",
+    }
+    try:
+        _DTYPE_MAP[torch.float8_e4m3fn] = "float8_e4m3fn"
+        _DTYPE_MAP[torch.float8_e5m2] = "float8_e5m2"
+    except AttributeError:
+        pass
+
+    def _dtype_str(dtype):
+        return _DTYPE_MAP.get(dtype, str(dtype).replace("torch.", ""))
+
+    def _get_tensor(kwargs, param, tuple_idx=None):
+        val = kwargs.get(param)
+        if val is None:
+            return None
+        if tuple_idx is not None:
+            if isinstance(val, (tuple, list)) and len(val) > tuple_idx:
+                val = val[tuple_idx]
+            else:
+                return None
+        return val if isinstance(val, torch.Tensor) else None
+
+    def fi_trace(save_dir=None, **kwargs):
+        axis_values: Dict[str, int] = {}
+        for axis_name, axis_def in spec.axes.items():
+            if axis_def.extract is not None:
+                try:
+                    val = axis_def.extract(kwargs)
+                    if val is not None:
+                        axis_values[axis_name] = int(val)
+                except Exception:
+                    pass
+
+        axes_json: Dict[str, Any] = {}
+        for axis_name, axis_def in spec.axes.items():
+            entry: Dict[str, Any] = {"type": "var" if axis_def.is_var else "const"}
+            if not axis_def.is_var and axis_name in axis_values:
+                entry["value"] = axis_values[axis_name]
+            if axis_def.description:
+                entry["description"] = axis_def.description
+            axes_json[axis_name] = entry
+
+        inputs_json: Dict[str, Any] = {}
+        for inp in spec.inputs:
+            if inp.is_scalar:
+                val = kwargs.get(inp.func_param)
+                dtype = (
+                    _dtype_str(val.dtype)
+                    if isinstance(val, torch.Tensor)
+                    else "float32"
+                )
+                entry = {"shape": None, "dtype": dtype}
+            else:
+                t = _get_tensor(kwargs, inp.func_param, inp.tuple_idx)
+                entry = {
+                    "shape": inp.dim_names,
+                    "dtype": _dtype_str(t.dtype) if t is not None else "unknown",
+                }
+            if inp.optional:
+                entry["optional"] = True
+            if inp.description:
+                entry["description"] = inp.description
+            inputs_json[inp.json_name] = entry
+
+        outputs_json: Dict[str, Any] = {}
+        for out in spec.outputs:
+            dtype = out.dtype
+            if dtype.startswith("from_input:"):
+                src_param = dtype[len("from_input:") :]
+                t = _get_tensor(kwargs, src_param)
+                dtype = _dtype_str(t.dtype) if t is not None else "unknown"
+            entry = {"shape": out.dim_names, "dtype": dtype}
+            if out.description:
+                entry["description"] = out.description
+            outputs_json[out.json_name] = entry
+
+        const_parts = [
+            f"{n}{v}"
+            for n, a in spec.axes.items()
+            if not a.is_var and n in axis_values
+            for v in (axis_values[n],)
+        ]
+        name = spec.op_type + ("_" + "_".join(const_parts) if const_parts else "")
+
+        tags = [f"fi_api:{spec.fi_api}"] + spec.extra_tags
+        result: Dict[str, Any] = {
+            "name": name,
+            "description": spec.description,
+            "op_type": spec.op_type,
+            "tags": tags,
+            "axes": axes_json,
+        }
+        if spec.constraints:
+            result["constraints"] = spec.constraints
+        result["inputs"] = inputs_json
+        result["outputs"] = outputs_json
+
+        _trace_dir = os.environ.get("FLASHINFER_TRACE_DUMP_DIR")
+        effective_dir = save_dir if save_dir is not None else _trace_dir
+        if effective_dir is not None:
+            out_dir = Path(effective_dir)
+            out_dir.mkdir(parents=True, exist_ok=True)
+            out_path = out_dir / f"{name}.json"
+            out_path.write_text(json.dumps(result, indent=2))
+
+        return result
+
+    return fi_trace
+
+
+# ---------------------------------------------------------------------------
+# Public helper: fi_trace(func_or_method, **kwargs)
+# ---------------------------------------------------------------------------
+
+
+def fi_trace(
+    func_or_method: Callable,
+    save_dir: Optional[Union[str, Path]] = None,
+    **kwargs: Any,
+) -> Dict[str, Any]:
+    """Generate a flashinfer-bench definition JSON for any FlashInfer API call.
+
+    Parameters
+    ----------
+    func_or_method:
+        A ``@flashinfer_api``-decorated function or (bound) method.
+    save_dir:
+        Directory where the JSON definition file should be written.
+        Falls back to ``FLASHINFER_TRACE_DUMP_DIR`` env-var when *None*.
+    **kwargs:
+        The same tensor arguments you would pass to the real API.
+
+    Returns
+    -------
+    dict
+        A flashinfer-bench compatible definition dictionary.
+
+    Examples
+    --------
+    Standalone function::
+
+        defn = fi_trace(flashinfer.norm.rmsnorm, input=hidden, weight=weight)
+
+    Bound method (instance.run)::
+
+        defn = fi_trace(wrapper.run, q=q_tensor, paged_kv_cache=(k, v))
+
+    Class-level (unbound)::
+
+        defn = fi_trace(
+            flashinfer.decode.BatchDecodeWithPagedKVCacheWrapper.run,
+            q=q_tensor, paged_kv_cache=(k, v),
+        )
+    """
+    actual_func = getattr(func_or_method, "__func__", func_or_method)
+    trace_fn = getattr(actual_func, "fi_trace", None)
+    if trace_fn is None:
+        qualname = getattr(actual_func, "__qualname__", repr(actual_func))
+        raise ValueError(
+            f"No fi_trace spec is registered for '{qualname}'. "
+            "Only @flashinfer_api(trace=...)-decorated functions support fi_trace."
+        )
+    return trace_fn(save_dir=save_dir, **kwargs)
diff --git a/flashinfer/fused_moe/core.py b/flashinfer/fused_moe/core.py
index 5a0814e6aa..0585604b3c 100644
--- a/flashinfer/fused_moe/core.py
+++ b/flashinfer/fused_moe/core.py
@@ -21,6 +21,17 @@
 import torch
 
 from ..api_logging import flashinfer_api
+from ..trace.templates.moe import (
+    cutlass_fused_moe_trace,
+    trtllm_bf16_moe_trace,
+    trtllm_bf16_routed_moe_trace,
+    trtllm_fp4_block_scale_moe_trace_dispatch,
+    trtllm_fp4_block_scale_routed_moe_trace,
+    trtllm_fp8_block_scale_moe_trace_dispatch,
+    trtllm_fp8_block_scale_routed_moe_trace,
+    trtllm_fp8_per_tensor_scale_moe_trace,
+    trtllm_mxint4_block_scale_moe_trace,
+)
 from ..autotuner import (
     AutoTuner,
     DynamicTensorSpec,
@@ -626,7 +637,7 @@ def _fake_cutlass_fused_moe(
 
 
 # ref: https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py#L121
-@flashinfer_api
+@flashinfer_api(trace=cutlass_fused_moe_trace)
 def cutlass_fused_moe(
     input: torch.Tensor,
     token_selected_experts: torch.Tensor,
@@ -2344,7 +2355,7 @@ def _validate_routing_replay_out(
         raise ValueError("routing_replay_out must be contiguous (packed row-major)")
 
 
-@flashinfer_api
+@flashinfer_api(trace=trtllm_bf16_moe_trace)
 def trtllm_bf16_moe(
     routing_logits: torch.Tensor,
     routing_bias: Optional[torch.Tensor],
@@ -2452,7 +2463,7 @@ def trtllm_bf16_moe(
         return result
 
 
-@flashinfer_api
+@flashinfer_api(trace=trtllm_bf16_routed_moe_trace)
 def trtllm_bf16_routed_moe(
     topk_ids: torch.Tensor,
     hidden_states: torch.Tensor,
@@ -2557,7 +2568,7 @@ def trtllm_bf16_routed_moe(
         return result
 
 
-@flashinfer_api
+@flashinfer_api(trace=trtllm_fp8_per_tensor_scale_moe_trace)
 def trtllm_fp8_per_tensor_scale_moe(
     routing_logits: torch.Tensor,
     routing_bias: Optional[torch.Tensor],
@@ -2658,7 +2669,7 @@ def trtllm_fp8_per_tensor_scale_moe(
         return result
 
 
-@flashinfer_api
+@flashinfer_api(trace=trtllm_fp8_block_scale_moe_trace_dispatch)
 def trtllm_fp8_block_scale_moe(
     routing_logits: torch.Tensor,
     routing_bias: Optional[torch.Tensor],
@@ -2779,7 +2790,7 @@ def trtllm_fp8_block_scale_moe(
         return result
 
 
-@flashinfer_api
+@flashinfer_api(trace=trtllm_fp8_block_scale_routed_moe_trace)
 def trtllm_fp8_block_scale_routed_moe(
     topk_ids: torch.Tensor,
     routing_bias: Optional[torch.Tensor],
@@ -2893,7 +2904,7 @@ def trtllm_fp8_block_scale_routed_moe(
         return result
 
 
-@flashinfer_api
+@flashinfer_api(trace=trtllm_fp4_block_scale_moe_trace_dispatch)
 def trtllm_fp4_block_scale_moe(
     routing_logits: torch.Tensor,
     routing_bias: Optional[torch.Tensor],
@@ -3030,7 +3041,7 @@ def trtllm_fp4_block_scale_moe(
     )
 
 
-@flashinfer_api
+@flashinfer_api(trace=trtllm_fp4_block_scale_routed_moe_trace)
 def trtllm_fp4_block_scale_routed_moe(
     topk_ids: torch.Tensor,
     routing_bias: Optional[torch.Tensor],
@@ -3165,7 +3176,7 @@ def trtllm_fp4_block_scale_routed_moe(
     )
 
 
-@flashinfer_api
+@flashinfer_api(trace=trtllm_mxint4_block_scale_moe_trace)
 def trtllm_mxint4_block_scale_moe(
     routing_logits: torch.Tensor,
     routing_bias: Optional[torch.Tensor],
diff --git a/flashinfer/fused_moe/cute_dsl/b12x_moe.py b/flashinfer/fused_moe/cute_dsl/b12x_moe.py
index d2cbc8b05b..34916df533 100644
--- a/flashinfer/fused_moe/cute_dsl/b12x_moe.py
+++ b/flashinfer/fused_moe/cute_dsl/b12x_moe.py
@@ -42,11 +42,12 @@
 import torch
 
 from ...api_logging import flashinfer_api
+from ...trace.templates.moe import b12x_fused_moe_trace, b12x_moe_wrapper_run_trace
 from ...utils import supported_compute_capability
 
 
 @supported_compute_capability([120, 121])
-@flashinfer_api
+@flashinfer_api(trace=b12x_fused_moe_trace)
 def b12x_fused_moe(
     x: torch.Tensor,
     w1_weight: torch.Tensor,
@@ -293,7 +294,7 @@ def _allocate_buffers(self) -> None:
             device=self.device,
         )
 
-    @flashinfer_api
+    @flashinfer_api(trace=b12x_moe_wrapper_run_trace)
     def run(
         self,
         x: torch.Tensor,
diff --git a/flashinfer/fused_moe/cute_dsl/fused_moe.py b/flashinfer/fused_moe/cute_dsl/fused_moe.py
index e9d6ed4bed..74af0d5f84 100644
--- a/flashinfer/fused_moe/cute_dsl/fused_moe.py
+++ b/flashinfer/fused_moe/cute_dsl/fused_moe.py
@@ -54,6 +54,10 @@
 import torch
 
 from ...api_logging import flashinfer_api
+from ...trace.templates.moe import (
+    cute_dsl_fused_moe_nvfp4_trace,
+    cute_dsl_moe_wrapper_run_trace,
+)
 from ...autotuner import AutoTuner
 from ...utils import supported_compute_capability
 from .moe_utils import (
@@ -530,7 +534,7 @@ def _forward_with_tactic(
             enable_pdl=enable_pdl,
         )
 
-    @flashinfer_api
+    @flashinfer_api(trace=cute_dsl_moe_wrapper_run_trace)
     def run(
         self,
         x: torch.Tensor,
@@ -686,7 +690,7 @@ def _cute_dsl_fused_moe_nvfp4_impl(
 
 
 @supported_compute_capability([100, 103])
-@flashinfer_api
+@flashinfer_api(trace=cute_dsl_fused_moe_nvfp4_trace)
 def cute_dsl_fused_moe_nvfp4(
     x: torch.Tensor,
     x_sf: torch.Tensor,
diff --git a/flashinfer/gdn_decode.py b/flashinfer/gdn_decode.py
index 08969a73c4..ed270b8582 100644
--- a/flashinfer/gdn_decode.py
+++ b/flashinfer/gdn_decode.py
@@ -35,13 +35,21 @@
 
 try:
     from .api_logging import flashinfer_api
+    from .trace.templates.gdn import (
+        gated_delta_rule_decode_trace,
+        gdn_mtp_trace,
+    )
 
     _FLASHINFER_AVAILABLE = True
 except ImportError:
     _FLASHINFER_AVAILABLE = False
+    gated_delta_rule_decode_trace = None  # type: ignore[assignment]
+    gdn_mtp_trace = None  # type: ignore[assignment]
 
-    # Fallback decorator for standalone usage
-    def flashinfer_api(func):  # type: ignore[misc]
+    # Fallback decorator for standalone usage (accepts trace= kwarg)
+    def flashinfer_api(func=None, *, trace=None):  # type: ignore[misc]
+        if func is None:
+            return lambda f: f
         return func
 
 
@@ -394,7 +402,7 @@ def gated_delta_rule_decode_pretranspose(
 # ============================================================================
 
 
-@flashinfer_api
+@flashinfer_api(trace=gated_delta_rule_decode_trace)
 def gated_delta_rule_decode(
     q: torch.Tensor,
     k: torch.Tensor,
@@ -535,7 +543,7 @@ def gated_delta_rule_decode(
 # ============================================================================
 
 
-@flashinfer_api
+@flashinfer_api(trace=gdn_mtp_trace)
 def gated_delta_rule_mtp(
     q: torch.Tensor,
     k: torch.Tensor,
diff --git a/flashinfer/gdn_prefill.py b/flashinfer/gdn_prefill.py
index 124784ff22..9fae71640d 100644
--- a/flashinfer/gdn_prefill.py
+++ b/flashinfer/gdn_prefill.py
@@ -21,6 +21,7 @@
 import torch
 
 from .api_logging import flashinfer_api
+from .trace.templates.gdn import gdn_prefill_trace
 from .jit.gdn import gen_gdn_prefill_sm90_module
 from .utils import (
     register_custom_op,
@@ -95,7 +96,7 @@ def _fake_gdn_prefill(
     return SimpleNamespace(gdn_prefill=gdn_prefill)
 
 
-@flashinfer_api
+@flashinfer_api(trace=gdn_prefill_trace)
 def chunk_gated_delta_rule(
     q: torch.Tensor,
     k: torch.Tensor,
diff --git a/flashinfer/gemm/gemm_base.py b/flashinfer/gemm/gemm_base.py
index decc213e6f..bc626f005d 100755
--- a/flashinfer/gemm/gemm_base.py
+++ b/flashinfer/gemm/gemm_base.py
@@ -23,6 +23,14 @@
 import torch
 
 from ..api_logging import flashinfer_api
+from ..trace.templates.gemm import (
+    mm_bf16_trace,
+    mm_fp8_trace,
+    mm_mxfp8_trace,
+    mm_fp4_trace,
+)
+from ..trace.templates.attention import segment_gemm_run_trace
+from ..trace.templates.page import tgv_gemm_sm100_trace
 from ..autotuner import (
     AutoTuner,
     ConstraintSpec,
@@ -325,7 +333,7 @@ def _heuristic_func_mm_bf16(
     common_check=_check_mm_bf16_problem_size,
     heuristic_func=_heuristic_func_mm_bf16,
 )
-@flashinfer_api
+@flashinfer_api(trace=mm_bf16_trace)
 def mm_bf16(
     a: torch.Tensor,
     b: torch.Tensor,
@@ -1095,7 +1103,7 @@ def forward(
     )
 
 
-@flashinfer_api
+@flashinfer_api(trace=tgv_gemm_sm100_trace)
 def tgv_gemm_sm100(
     a: torch.Tensor,
     b: torch.Tensor,
@@ -1437,6 +1445,7 @@ class SegmentGEMMWrapper:
     True
     """
 
+    @flashinfer_api
     def __init__(
         self, float_workspace_buffer: torch.Tensor, backend: str = "auto"
     ) -> None:
@@ -1469,7 +1478,7 @@ def reset_workspace_buffer(
         self._float_workspace_buffer = float_workspace_buffer
         self._int_workspace_buffer = int_workspace_buffer
 
-    @flashinfer_api
+    @flashinfer_api(trace=segment_gemm_run_trace)
     def run(
         self,
         x: torch.Tensor,
@@ -2084,6 +2093,8 @@ def build_cudnn_gemm_fp4_graph_override_shape(
     return graph
 
 
+# Internal helper called from mm_fp4; the user-facing mm_fp4 is already
+# decorated, so decorating here would double-log the same invocation.
 def execute_cudnn_gemm_fp4_graph_override_shape(
     graph,
     a,
@@ -2319,6 +2330,8 @@ def build_cudnn_gemm_mxfp8_graph_override_shape(
     return graph
 
 
+# Internal helper called from mm_mxfp8; the user-facing mm_mxfp8 is already
+# decorated, so decorating here would double-log the same invocation.
 def execute_cudnn_gemm_mxfp8_graph_override_shape(
     graph,
     a,
@@ -2565,6 +2578,8 @@ def build_cudnn_gemm_with_per_tensor_q_graph_override_shape(
     return graph
 
 
+# Internal helper called from mm_fp8 per-tensor path; the user-facing mm_fp8
+# is already decorated, so decorating here would double-log the same invocation.
 def execute_cudnn_gemm_with_per_tensor_q_graph_override_shape(
     graph, a, b, a_scale, b_scale, c_final, workspace, tactic: int = 0
 ):
@@ -2893,6 +2908,8 @@ def build_cudnn_gemm_bf16_graph_override_shape(
     return graph
 
 
+# Internal helper called from mm_bf16; the user-facing mm_bf16 is already
+# decorated, so decorating here would double-log the same invocation.
 def execute_cudnn_gemm_bf16_graph_override_shape(
     graph, a, b, bias, c_final, workspace, tactic: int = 0
 ):
@@ -3161,7 +3178,7 @@ def _expand_block_scale_tensor_shape(block_scale_tensor, batch_size):
     return (tuple(block_scale_shape), tuple(block_scale_stride))
 
 
-@flashinfer_api
+@flashinfer_api(trace=mm_fp8_trace)
 def mm_fp8(
     a: torch.Tensor,
     b: torch.Tensor,
@@ -3990,7 +4007,7 @@ def _heuristic_func_mm_mxfp8(
     common_check=_check_mm_mxfp8_problem_size,
     heuristic_func=_heuristic_func_mm_mxfp8,  # result stored in mm_mxfp8.suitable_auto_backends
 )
-@flashinfer_api
+@flashinfer_api(trace=mm_mxfp8_trace)
 def mm_mxfp8(
     a: torch.Tensor,
     b: torch.Tensor,
@@ -5195,7 +5212,7 @@ def _mxfp8_swizzled_scale_len(m: int, k: int, swizzle_layout: SfLayout) -> int:
     common_check=_check_mm_fp4_problem_size,
     heuristic_func=_heuristic_func_mm_fp4,  # result stored in mm_fp4.suitable_auto_backends
 )
-@flashinfer_api
+@flashinfer_api(trace=mm_fp4_trace)
 def mm_fp4(
     a: torch.Tensor,
     b: torch.Tensor,
diff --git a/flashinfer/mla/_core.py b/flashinfer/mla/_core.py
index 4e8bdd7212..5c9fe22b2f 100644
--- a/flashinfer/mla/_core.py
+++ b/flashinfer/mla/_core.py
@@ -21,6 +21,7 @@
 import torch
 
 from ..api_logging import flashinfer_api
+from ..trace.templates.attention import mla_paged_decode_trace
 from ..jit import gen_batch_mla_module, gen_trtllm_gen_fmha_module, setup_cubin_loader
 from ..jit.mla import gen_mla_module
 from ..utils import (
@@ -469,7 +470,7 @@ def run(
         return_lse_base_on_e: bool = False,
     ) -> Tuple[torch.Tensor, torch.Tensor]: ...
 
-    @flashinfer_api
+    @flashinfer_api(trace=mla_paged_decode_trace)
     def run(
         self,
         q_nope: torch.Tensor,
diff --git a/flashinfer/norm/__init__.py b/flashinfer/norm/__init__.py
index 0f9911a6ed..818376d595 100644
--- a/flashinfer/norm/__init__.py
+++ b/flashinfer/norm/__init__.py
@@ -32,6 +32,15 @@
 import torch
 
 from ..api_logging import flashinfer_api
+from ..trace.templates.norm import (
+    fused_add_rmsnorm_quant_trace,
+    fused_add_rmsnorm_trace,
+    gemma_fused_add_rmsnorm_trace,
+    gemma_rmsnorm_trace,
+    layernorm_trace,
+    rmsnorm_quant_trace,
+    rmsnorm_trace,
+)
 from ..utils import (
     device_support_pdl,
     get_compute_capability,
@@ -94,7 +103,7 @@ def _normalize_scale_tensor(
     return scale.contiguous()
 
 
-@flashinfer_api
+@flashinfer_api(trace=rmsnorm_trace)
 def rmsnorm(
     input: torch.Tensor,
     weight: torch.Tensor,
@@ -165,7 +174,7 @@ def _rmsnorm_impl_fake(
     pass
 
 
-@flashinfer_api
+@flashinfer_api(trace=rmsnorm_quant_trace)
 @register_custom_op("flashinfer::rmsnorm_quant", mutates_args=("out",))
 def rmsnorm_quant(
     out: torch.Tensor,
@@ -219,7 +228,7 @@ def _rmsnorm_quant_fake(
     pass
 
 
-@flashinfer_api
+@flashinfer_api(trace=fused_add_rmsnorm_trace)
 @register_custom_op("flashinfer::fused_add_rmsnorm", mutates_args=("input", "residual"))
 def fused_add_rmsnorm(
     input: torch.Tensor,
@@ -271,7 +280,7 @@ def _fused_add_rmsnorm_fake(
     pass
 
 
-@flashinfer_api
+@flashinfer_api(trace=fused_add_rmsnorm_quant_trace)
 @register_custom_op(
     "flashinfer::fused_add_rmsnorm_quant", mutates_args=("out", "residual")
 )
@@ -343,7 +352,7 @@ def _fused_add_rmsnorm_quant_fake(
     pass
 
 
-@flashinfer_api
+@flashinfer_api(trace=gemma_rmsnorm_trace)
 def gemma_rmsnorm(
     input: torch.Tensor,
     weight: torch.Tensor,
@@ -414,7 +423,7 @@ def _gemma_rmsnorm_impl_fake(
     pass
 
 
-@flashinfer_api
+@flashinfer_api(trace=gemma_fused_add_rmsnorm_trace)
 @register_custom_op(
     "flashinfer::gemma_fused_add_rmsnorm", mutates_args=("input", "residual")
 )
@@ -470,7 +479,7 @@ def _gemma_fused_add_rmsnorm_fake(
     pass
 
 
-@flashinfer_api
+@flashinfer_api(trace=layernorm_trace)
 @register_custom_op("flashinfer::layernorm", mutates_args=())
 def layernorm(
     input: torch.Tensor,
diff --git a/flashinfer/page.py b/flashinfer/page.py
index 12ea36137f..7fb33cf342 100644
--- a/flashinfer/page.py
+++ b/flashinfer/page.py
@@ -20,6 +20,10 @@
 import torch
 
 from .api_logging import flashinfer_api
+from .trace.templates.page import (
+    append_paged_kv_cache_trace,
+    append_paged_mla_kv_cache_trace,
+)
 from .jit.page import gen_page_module
 from .utils import (
     TensorLayout,
@@ -222,7 +226,7 @@ def get_seq_lens(
     )
 
 
-@flashinfer_api
+@flashinfer_api(trace=append_paged_mla_kv_cache_trace)
 def append_paged_mla_kv_cache(
     append_ckv: torch.Tensor,
     append_kpe: torch.Tensor,
@@ -272,7 +276,7 @@ def append_paged_mla_kv_cache(
     )
 
 
-@flashinfer_api
+@flashinfer_api(trace=append_paged_kv_cache_trace)
 def append_paged_kv_cache(
     append_key: torch.Tensor,
     append_value: torch.Tensor,
diff --git a/flashinfer/pod.py b/flashinfer/pod.py
index fe2e36c1ef..4fa2d9bf0d 100644
--- a/flashinfer/pod.py
+++ b/flashinfer/pod.py
@@ -22,6 +22,10 @@
 import torch
 
 from .api_logging import flashinfer_api
+from .trace.templates.attention import (
+    batch_pod_with_paged_kv_cache_run_trace,
+    pod_with_paged_kv_cache_run_trace,
+)
 from .jit import gen_pod_module, gen_batch_pod_module
 from .page import get_seq_lens
 from .prefill import get_batch_prefill_module
@@ -435,7 +439,7 @@ def plan(
 
     begin_forward = plan
 
-    @flashinfer_api
+    @flashinfer_api(trace=pod_with_paged_kv_cache_run_trace)
     def run(
         self,
         # Main params (prefill and decode)
@@ -1015,7 +1019,7 @@ def plan(
 
     begin_forward = plan
 
-    @flashinfer_api
+    @flashinfer_api(trace=batch_pod_with_paged_kv_cache_run_trace)
     def run(
         self,
         # Main params (prefill and decode)
diff --git a/flashinfer/prefill.py b/flashinfer/prefill.py
index 4ec6a29e7d..24887b1cab 100755
--- a/flashinfer/prefill.py
+++ b/flashinfer/prefill.py
@@ -23,6 +23,13 @@
 import torch
 
 from .api_logging import flashinfer_api
+from .trace.templates.attention import (
+    gqa_paged_prefill_trace,
+    gqa_ragged_prefill_trace,
+    single_prefill_with_kv_cache_trace,
+    trtllm_batch_context_trace,
+)
+from .trace.templates.page import trtllm_fmha_v2_prefill_trace
 from .jit import (
     gen_batch_prefill_module,
     gen_customize_batch_prefill_module,
@@ -1099,7 +1106,7 @@ def single_prefill_with_kv_cache(
 ) -> Tuple[torch.Tensor, torch.Tensor]: ...
 
 
-@flashinfer_api
+@flashinfer_api(trace=single_prefill_with_kv_cache_trace)
 def single_prefill_with_kv_cache(
     q: torch.Tensor,
     k: torch.Tensor,
@@ -2132,7 +2139,7 @@ def run(
         skip_softmax_threshold_scale_factor: Optional[float] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]: ...
 
-    @flashinfer_api
+    @flashinfer_api(trace=gqa_paged_prefill_trace)
     def run(
         self,
         q: torch.Tensor,
@@ -3186,7 +3193,7 @@ def run(
         enable_pdl: Optional[bool] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]: ...
 
-    @flashinfer_api
+    @flashinfer_api(trace=gqa_ragged_prefill_trace)
     def run(
         self,
         q: torch.Tensor,
@@ -3839,7 +3846,7 @@ def trtllm_ragged_attention_deepseek(
         return out
 
 
-@flashinfer_api
+@flashinfer_api(trace=trtllm_batch_context_trace)
 def trtllm_batch_context_with_kv_cache(
     query: torch.Tensor,
     kv_cache: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
@@ -4228,7 +4235,7 @@ def get_trtllm_fmha_v2_module(
     return gen_fmha_v2_module(input_layout, input_dtype, output_dtype).build_and_load()
 
 
-@flashinfer_api
+@flashinfer_api(trace=trtllm_fmha_v2_prefill_trace)
 def trtllm_fmha_v2_prefill(
     qkv: Union[
         torch.Tensor,
diff --git a/flashinfer/quantization/fp4_quantization.py b/flashinfer/quantization/fp4_quantization.py
index 4cd5cd34f3..5bde56e57f 100644
--- a/flashinfer/quantization/fp4_quantization.py
+++ b/flashinfer/quantization/fp4_quantization.py
@@ -21,6 +21,11 @@
 import torch
 
 from ..api_logging import flashinfer_api
+from ..trace.templates.quantize import (
+    fp4_quantize_trace,
+    mxfp4_quantize_trace,
+    nvfp4_quantize_trace,
+)
 from ..jit import JitSpec
 from ..jit import env as jit_env
 from ..jit import (
@@ -648,7 +653,7 @@ def _fake_e2m1_and_ufp8sf_scale_to_float_sm100(
     )
 
 
-@flashinfer_api
+@flashinfer_api(trace=fp4_quantize_trace)
 def fp4_quantize(
     input: torch.Tensor,
     global_scale: Optional[torch.Tensor] = None,
@@ -923,7 +928,7 @@ def shuffle_matrix_sf_a(
     return block_scale_interleave(w_shuffled)
 
 
-@flashinfer_api
+@flashinfer_api(trace=nvfp4_quantize_trace)
 def nvfp4_quantize(
     a,
     a_global_sf,
@@ -1024,7 +1029,7 @@ def nvfp4_quantize(
     return a_fp4, a_sf
 
 
-@flashinfer_api
+@flashinfer_api(trace=mxfp4_quantize_trace)
 def mxfp4_quantize(
     a: torch.Tensor,
     backend: str = "cuda",
diff --git a/flashinfer/quantization/fp8_quantization.py b/flashinfer/quantization/fp8_quantization.py
index f2c9f41249..49e13a8b31 100644
--- a/flashinfer/quantization/fp8_quantization.py
+++ b/flashinfer/quantization/fp8_quantization.py
@@ -5,6 +5,7 @@
 import torch
 
 from ..api_logging import flashinfer_api
+from ..trace.templates.quantize import mxfp8_quantize_trace
 from ..jit.fp8_quantization import gen_mxfp8_quantization_sm100_module
 from ..utils import (
     device_support_pdl,
@@ -158,7 +159,7 @@ def _fake_mxfp8_dequantize_host_sm100(
     )
 
 
-@flashinfer_api
+@flashinfer_api(trace=mxfp8_quantize_trace)
 def mxfp8_quantize(
     input: torch.Tensor,
     is_sf_swizzled_layout: bool = True,
diff --git a/flashinfer/rope.py b/flashinfer/rope.py
index d39d2e07e6..d8387a0229 100644
--- a/flashinfer/rope.py
+++ b/flashinfer/rope.py
@@ -20,6 +20,18 @@
 import torch
 
 from .api_logging import flashinfer_api
+from .trace.templates.rope import (
+    apply_llama31_rope_inplace_trace,
+    apply_llama31_rope_pos_ids_inplace_trace,
+    apply_llama31_rope_pos_ids_trace,
+    apply_llama31_rope_trace,
+    apply_rope_inplace_trace,
+    apply_rope_pos_ids_inplace_trace,
+    apply_rope_pos_ids_trace,
+    apply_rope_trace,
+    apply_rope_with_cos_sin_cache_inplace_trace,
+    apply_rope_with_cos_sin_cache_trace,
+)
 from .jit.rope import gen_rope_module
 from .utils import register_custom_op, register_fake_op
 
@@ -414,7 +426,7 @@ def _fake_apply_llama31_rope_pos_ids(
     pass
 
 
-@flashinfer_api
+@flashinfer_api(trace=apply_rope_inplace_trace)
 def apply_rope_inplace(
     q: torch.Tensor,
     k: torch.Tensor,
@@ -502,7 +514,7 @@ def apply_rope_inplace(
     )
 
 
-@flashinfer_api
+@flashinfer_api(trace=apply_rope_pos_ids_inplace_trace)
 def apply_rope_pos_ids_inplace(
     q: torch.Tensor,
     k: torch.Tensor,
@@ -561,7 +573,7 @@ def apply_rope_pos_ids_inplace(
     )
 
 
-@flashinfer_api
+@flashinfer_api(trace=apply_llama31_rope_inplace_trace)
 def apply_llama31_rope_inplace(
     q: torch.Tensor,
     k: torch.Tensor,
@@ -670,7 +682,7 @@ def apply_llama31_rope_inplace(
     )
 
 
-@flashinfer_api
+@flashinfer_api(trace=apply_llama31_rope_pos_ids_inplace_trace)
 def apply_llama31_rope_pos_ids_inplace(
     q: torch.Tensor,
     k: torch.Tensor,
@@ -749,7 +761,7 @@ def apply_llama31_rope_pos_ids_inplace(
     )
 
 
-@flashinfer_api
+@flashinfer_api(trace=apply_rope_trace)
 def apply_rope(
     q: torch.Tensor,
     k: torch.Tensor,
@@ -860,7 +872,7 @@ def apply_rope(
     return q_rope, k_rope
 
 
-@flashinfer_api
+@flashinfer_api(trace=apply_rope_pos_ids_trace)
 def apply_rope_pos_ids(
     q: torch.Tensor,
     k: torch.Tensor,
@@ -929,7 +941,7 @@ def apply_rope_pos_ids(
     return q_rope, k_rope
 
 
-@flashinfer_api
+@flashinfer_api(trace=apply_llama31_rope_trace)
 def apply_llama31_rope(
     q: torch.Tensor,
     k: torch.Tensor,
@@ -1052,7 +1064,7 @@ def apply_llama31_rope(
     return q_rope, k_rope
 
 
-@flashinfer_api
+@flashinfer_api(trace=apply_llama31_rope_pos_ids_trace)
 def apply_llama31_rope_pos_ids(
     q: torch.Tensor,
     k: torch.Tensor,
@@ -1140,7 +1152,7 @@ def apply_llama31_rope_pos_ids(
     return q_rope, k_rope
 
 
-@flashinfer_api
+@flashinfer_api(trace=apply_rope_with_cos_sin_cache_trace)
 def apply_rope_with_cos_sin_cache(
     positions: torch.Tensor,
     query: torch.Tensor,
@@ -1204,7 +1216,7 @@ def apply_rope_with_cos_sin_cache(
     return query_out, key_out
 
 
-@flashinfer_api
+@flashinfer_api(trace=apply_rope_with_cos_sin_cache_inplace_trace)
 def apply_rope_with_cos_sin_cache_inplace(
     positions: torch.Tensor,
     query: torch.Tensor,
diff --git a/flashinfer/sampling.py b/flashinfer/sampling.py
index 7f7d573679..3ffcf39a3a 100644
--- a/flashinfer/sampling.py
+++ b/flashinfer/sampling.py
@@ -21,6 +21,20 @@
 
 from .api_logging import flashinfer_api
 from .jit.sampling import gen_sampling_module
+from .trace.templates.sampling import (
+    chain_speculative_sampling_trace,
+    min_p_sampling_trace,
+    sampling_from_logits_trace,
+    sampling_from_probs_trace,
+    softmax_trace,
+    top_k_mask_logits_trace,
+    top_k_renorm_probs_trace,
+    top_k_sampling_trace,
+    top_k_top_p_sampling_from_logits_trace,
+    top_k_top_p_sampling_trace,
+    top_p_renorm_probs_trace,
+    top_p_sampling_trace,
+)
 from .utils import (
     _get_cache_buf,
     device_support_pdl,
@@ -719,7 +733,7 @@ def _validate_and_convert_seed_offset(
     return maybe_seed_arr, seed_val, maybe_offset_arr, offset_val
 
 
-@flashinfer_api
+@flashinfer_api(trace=softmax_trace)
 def softmax(
     logits: torch.Tensor,
     temperature: Optional[Union[torch.Tensor, float]] = None,
@@ -777,7 +791,7 @@ def softmax(
     )
 
 
-@flashinfer_api
+@flashinfer_api(trace=sampling_from_logits_trace)
 def sampling_from_logits(
     logits: torch.Tensor,
     indices: Optional[torch.Tensor] = None,
@@ -857,7 +871,7 @@ def sampling_from_logits(
     )
 
 
-@flashinfer_api
+@flashinfer_api(trace=sampling_from_probs_trace)
 def sampling_from_probs(
     probs: torch.Tensor,
     indices: Optional[torch.Tensor] = None,
@@ -950,7 +964,7 @@ def sampling_from_probs(
     )
 
 
-@flashinfer_api
+@flashinfer_api(trace=top_p_sampling_trace)
 def top_p_sampling_from_probs(
     probs: torch.Tensor,
     top_p: Union[torch.Tensor, float],
@@ -1062,7 +1076,7 @@ def top_p_sampling_from_probs(
     )
 
 
-@flashinfer_api
+@flashinfer_api(trace=top_k_sampling_trace)
 def top_k_sampling_from_probs(
     probs: torch.Tensor,
     top_k: Union[torch.Tensor, int],
@@ -1174,7 +1188,7 @@ def top_k_sampling_from_probs(
     )
 
 
-@flashinfer_api
+@flashinfer_api(trace=min_p_sampling_trace)
 def min_p_sampling_from_probs(
     probs: torch.Tensor,
     min_p: Union[torch.Tensor, float],
@@ -1282,7 +1296,7 @@ def min_p_sampling_from_probs(
     )
 
 
-@flashinfer_api
+@flashinfer_api(trace=top_k_top_p_sampling_from_logits_trace)
 def top_k_top_p_sampling_from_logits(
     logits: torch.Tensor,
     top_k: Union[torch.Tensor, int],
@@ -1428,7 +1442,7 @@ def top_k_top_p_sampling_from_logits(
         raise ValueError(f"Invalid filter_apply_order: {filter_apply_order}")
 
 
-@flashinfer_api
+@flashinfer_api(trace=top_k_top_p_sampling_trace)
 def top_k_top_p_sampling_from_probs(
     probs: torch.Tensor,
     top_k: Union[torch.Tensor, int],
@@ -1570,7 +1584,7 @@ def top_k_top_p_sampling_from_probs(
         raise ValueError(f"Invalid filter_apply_order: {filter_apply_order}")
 
 
-@flashinfer_api
+@flashinfer_api(trace=top_p_renorm_probs_trace)
 def top_p_renorm_probs(
     probs: torch.Tensor,
     top_p: Union[torch.Tensor, float],
@@ -1659,7 +1673,7 @@ def top_p_renorm_probs(
 top_p_renorm_prob = top_p_renorm_probs
 
 
-@flashinfer_api
+@flashinfer_api(trace=top_k_renorm_probs_trace)
 def top_k_renorm_probs(
     probs: torch.Tensor,
     top_k: Union[torch.Tensor, int],
@@ -1736,7 +1750,7 @@ def top_k_renorm_probs(
 top_k_renorm_prob = top_k_renorm_probs
 
 
-@flashinfer_api
+@flashinfer_api(trace=top_k_mask_logits_trace)
 def top_k_mask_logits(
     logits: torch.Tensor, top_k: Union[torch.Tensor, int]
 ) -> torch.Tensor:
@@ -1808,7 +1822,7 @@ def top_k_mask_logits(
     )
 
 
-@flashinfer_api
+@flashinfer_api(trace=chain_speculative_sampling_trace)
 def chain_speculative_sampling(
     draft_probs,
     draft_token_ids,
diff --git a/flashinfer/sparse.py b/flashinfer/sparse.py
index ed847d5cd9..7e0f3d90cb 100644
--- a/flashinfer/sparse.py
+++ b/flashinfer/sparse.py
@@ -20,6 +20,10 @@
 import torch
 
 from .api_logging import flashinfer_api
+from .trace.templates.attention import (
+    block_sparse_attention_run_trace,
+    variable_block_sparse_attention_run_trace,
+)
 from .decode import get_batch_decode_module
 from .prefill import _compute_page_mask_indptr, get_batch_prefill_module
 from .quantization import segment_packbits
@@ -486,7 +490,7 @@ def forward(
         self._rope_theta = rope_theta
         return self.run(q, k, v, scale_q, scale_k, scale_v)
 
-    @flashinfer_api
+    @flashinfer_api(trace=block_sparse_attention_run_trace)
     def run(
         self,
         q: torch.Tensor,
@@ -1031,7 +1035,7 @@ def forward(
         self._rope_theta = rope_theta
         return self.run(q, k, v)
 
-    @flashinfer_api
+    @flashinfer_api(trace=variable_block_sparse_attention_run_trace)
     def run(
         self,
         q: torch.Tensor,
diff --git a/flashinfer/trace/__init__.py b/flashinfer/trace/__init__.py
new file mode 100644
index 0000000000..308235d5b4
--- /dev/null
+++ b/flashinfer/trace/__init__.py
@@ -0,0 +1,25 @@
+# Copyright (c) 2025 by FlashInfer team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+flashinfer.trace — TraceTemplate system for fi_trace.
+
+Usage::
+
+    from flashinfer.trace import TraceTemplate, Var, Const, Tensor, Scalar
+"""
+
+from .template import Const, Scalar, Tensor, TraceTemplate, Var, _TRACE_DUMP_DIR
+
+__all__ = ["TraceTemplate", "Var", "Const", "Tensor", "Scalar", "_TRACE_DUMP_DIR"]
diff --git a/flashinfer/trace/template.py b/flashinfer/trace/template.py
new file mode 100644
index 0000000000..184e558721
--- /dev/null
+++ b/flashinfer/trace/template.py
@@ -0,0 +1,515 @@
+# Copyright (c) 2025 by FlashInfer team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+TraceTemplate and associated classes for the fi_trace system.
+
+Design
+------
+A :class:`TraceTemplate` describes the schema of a FlashInfer operation
+independently from any specific Python function.  Templates live in
+``flashinfer/trace/templates/`` and are referenced by the
+``@flashinfer_api(trace=<template>)`` decorator.
+
+Axis extraction is **automatic**: the extraction logic is derived from the
+``dim_names`` of the ``Tensor`` inputs — no lambda functions required.
+
+Example::
+
+    from flashinfer.trace.template import TraceTemplate, Var, Const, Tensor, Scalar
+
+    rmsnorm_trace = TraceTemplate(
+        op_type="rmsnorm",
+        axes={"num_tokens": Var(), "hidden_size": Const()},
+        inputs={
+            "input":  Tensor(["num_tokens", "hidden_size"]),
+            "weight": Tensor(["hidden_size"]),
+            "eps":    Scalar("float32"),
+        },
+        outputs={"output": Tensor(["num_tokens", "hidden_size"])},
+    )
+"""
+
+from __future__ import annotations
+
+import json
+import os
+from pathlib import Path
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import torch
+
+# These are read lazily at each call so that the caller can set them after
+# importing flashinfer (e.g. in scripts run with ``python -m``).
+
+
+def _get_trace_dump_dir() -> Optional[str]:
+    """Return the current FLASHINFER_TRACE_DUMP_DIR value (may be None)."""
+    return os.environ.get("FLASHINFER_TRACE_DUMP_DIR")
+
+
+def _is_trace_dump_enabled() -> bool:
+    """Return True if auto-dump is currently enabled via FLASHINFER_TRACE_DUMP."""
+    return os.environ.get("FLASHINFER_TRACE_DUMP", "0") not in ("0", "")
+
+
+# Keep these module-level names for backwards compatibility with any code that
+# imports them directly; they reflect the value at module-load time and are
+# NOT updated if the env var changes later.
+_TRACE_DUMP_DIR: Optional[str] = os.environ.get("FLASHINFER_TRACE_DUMP_DIR")
+_TRACE_DUMP_ENABLED: bool = _is_trace_dump_enabled()
+
+# In-memory deduplication: names of traces already written this process.
+_DUMPED_NAMES: set = set()
+
+# ---------------------------------------------------------------------------
+# Dtype helpers
+# ---------------------------------------------------------------------------
+
+_DTYPE_MAP: Dict[torch.dtype, str] = {
+    torch.float32: "float32",
+    torch.float16: "float16",
+    torch.bfloat16: "bfloat16",
+    torch.float8_e4m3fn: "float8_e4m3fn",
+    torch.float8_e5m2: "float8_e5m2",
+    torch.int32: "int32",
+    torch.int64: "int64",
+    torch.int8: "int8",
+    torch.uint8: "uint8",
+}
+
+
+def _dtype_str(dtype: torch.dtype) -> str:
+    return _DTYPE_MAP.get(dtype, str(dtype).replace("torch.", ""))
+
+
+def _get_tensor(
+    kwargs: Dict[str, Any],
+    param: str,
+    tuple_idx: Optional[int] = None,
+) -> Optional[torch.Tensor]:
+    val = kwargs.get(param)
+    if val is None:
+        return None
+    if tuple_idx is not None:
+        if isinstance(val, (tuple, list)) and len(val) > tuple_idx:
+            val = val[tuple_idx]
+        else:
+            return None
+    return val if isinstance(val, torch.Tensor) else None
+
+
+# ---------------------------------------------------------------------------
+# Axis markers
+# ---------------------------------------------------------------------------
+
+
+class Var:
+    """Runtime-variable axis (e.g., ``batch_size``, ``seq_len``)."""
+
+    def __init__(self, description: str = "") -> None:
+        self.description = description
+
+
+class Const:
+    """Compile-time-constant axis (e.g., ``hidden_size``, ``num_heads``).
+
+    Parameters
+    ----------
+    description:
+        Human-readable description included in the JSON.
+    abbrev:
+        Short prefix used in the auto-generated file name.
+
+        * ``None`` (default) — use the axis name as-is (backwards compatible).
+        * ``""`` — omit this axis from the file name entirely.
+        * Any other string — use that as the prefix, e.g. ``"h"`` produces
+          ``h32`` for ``num_qo_heads=32``.
+    """
+
+    def __init__(self, description: str = "", abbrev: Optional[str] = None) -> None:
+        self.description = description
+        self.abbrev = abbrev
+
+
+# ---------------------------------------------------------------------------
+# Input / Output descriptors
+# ---------------------------------------------------------------------------
+
+
+class Tensor:
+    """Descriptor for a tensor input or output.
+
+    Parameters
+    ----------
+    dim_names:
+        Ordered list of axis names for each tensor dimension.
+    param:
+        Python parameter name to look up in ``kwargs``.  Defaults to the
+        key name in the ``inputs``/``outputs`` dict.
+    tuple_idx:
+        When the parameter is a tuple (e.g. ``paged_kv_cache=(k, v)``),
+        the index into that tuple.
+    dtype:
+        For *outputs*: explicit dtype string such as ``"float32"``.
+        For *inputs*: ignored — dtype is read from the actual tensor.
+    dtype_from:
+        For *outputs*: name of an input ``param`` whose dtype to copy.
+        Takes precedence over ``dtype`` when both are set.
+    optional:
+        Whether the tensor may be absent.
+    description:
+        Human-readable description (included in the JSON).
+    """
+
+    def __init__(
+        self,
+        dim_names: List[str],
+        *,
+        param: Optional[str] = None,
+        tuple_idx: Optional[int] = None,
+        dtype: Optional[str] = None,
+        dtype_from: Optional[str] = None,
+        optional: bool = False,
+        description: str = "",
+    ) -> None:
+        self.dim_names = dim_names
+        self.param = param
+        self.tuple_idx = tuple_idx
+        self.dtype = dtype
+        self.dtype_from = dtype_from
+        self.optional = optional
+        self.description = description
+
+
+class Scalar:
+    """Descriptor for a scalar (non-tensor) input.
+
+    Parameters
+    ----------
+    dtype:
+        Fixed dtype string (e.g. ``"float32"``).
+    param:
+        Python parameter name. Defaults to the key name in the dict.
+    optional:
+        Whether the scalar may be absent.
+    description:
+        Human-readable description.
+    """
+
+    def __init__(
+        self,
+        dtype: str = "float32",
+        *,
+        param: Optional[str] = None,
+        optional: bool = False,
+        description: str = "",
+    ) -> None:
+        self.dtype = dtype
+        self.param = param
+        self.optional = optional
+        self.description = description
+
+
+# ---------------------------------------------------------------------------
+# TraceTemplate
+# ---------------------------------------------------------------------------
+
+
+class TraceTemplate:
+    """Complete schema for generating a flashinfer-bench definition JSON.
+
+    Parameters
+    ----------
+    op_type:
+        Operation type string (e.g. ``"rmsnorm"``, ``"gqa_paged"``).
+    name_prefix:
+        Short, human-readable prefix used in the generated file name and the
+        ``name`` field of the JSON.  When *None* (default) the prefix falls
+        back to ``op_type``.  Set this explicitly when two templates share the
+        same ``op_type`` and would otherwise produce identical file names
+        (e.g. ``"gqa_paged_decode"`` vs ``"gqa_paged_prefill"`` both have
+        ``op_type="gqa_paged"``).
+    axes:
+        Ordered ``dict`` of ``axis_name → Var() | Const()``.
+    inputs:
+        Ordered ``dict`` of ``json_name → Tensor | Scalar``.
+    outputs:
+        Ordered ``dict`` of ``json_name → Tensor | Scalar``.
+    reference:
+        Optional Python callable that implements the reference computation.
+    constraints:
+        Optional list of Python-expression strings (flashinfer-bench schema).
+    tags:
+        Additional tags (beyond the mandatory ``fi_api:...`` tag).
+    description:
+        Description field for the output JSON.
+    """
+
+    def __init__(
+        self,
+        op_type: str,
+        axes: Dict[str, Union[Var, Const]],
+        inputs: Dict[str, Union[Tensor, Scalar]],
+        outputs: Dict[str, Union[Tensor, Scalar]],
+        *,
+        name_prefix: Optional[str] = None,
+        reference: Optional[Callable] = None,
+        constraints: Optional[List[str]] = None,
+        tags: Optional[List[str]] = None,
+        description: str = "",
+    ) -> None:
+        self.op_type = op_type
+        self.name_prefix = name_prefix
+        self.axes = axes
+        self.inputs = inputs
+        self.outputs = outputs
+        self.reference = reference
+        self.constraints = constraints or []
+        self.tags = tags or []
+        self.description = description
+
+    # ------------------------------------------------------------------
+    # Axis extraction (automatic)
+    # ------------------------------------------------------------------
+
+    def _build_axis_extractors(
+        self,
+    ) -> Dict[str, Callable[[Dict[str, Any]], Optional[int]]]:
+        """Build per-axis extraction callables from tensor dim_names.
+
+        For each axis in ``self.axes``, scan all ``Tensor`` inputs to find
+        which tensor contains that axis and at which dimension index.  The
+        resulting callable reads ``kwargs[param][tuple_idx].shape[dim_idx]``
+        at call time.
+        """
+        extractors: Dict[str, Callable[[Dict[str, Any]], Optional[int]]] = {}
+        for axis_name in self.axes:
+            # Strategy 1: find the first Tensor input whose dim_names mention
+            # this axis and read the corresponding shape dimension.
+            for json_key, descriptor in self.inputs.items():
+                if not isinstance(descriptor, Tensor):
+                    continue
+                if axis_name not in descriptor.dim_names:
+                    continue
+                param = descriptor.param if descriptor.param is not None else json_key
+                tidx = descriptor.tuple_idx
+                dim_idx = descriptor.dim_names.index(axis_name)
+
+                def _make_extractor(
+                    p: str, ti: Optional[int], di: int
+                ) -> Callable[[Dict[str, Any]], Optional[int]]:
+                    def extractor(kw: Dict[str, Any]) -> Optional[int]:
+                        t = _get_tensor(kw, p, ti)
+                        if t is None or di >= t.ndim:
+                            return None
+                        return int(t.shape[di])
+
+                    return extractor
+
+                extractors[axis_name] = _make_extractor(param, tidx, dim_idx)
+                break  # Use first match only.
+
+            if axis_name in extractors:
+                continue
+
+            # Strategy 2: fall back to reading the axis value directly from a
+            # scalar kwarg whose name matches the axis name.  This handles
+            # integer arguments like ``top_k``, ``n_group``, ``topk_group``.
+            def _make_scalar_extractor(
+                name: str,
+            ) -> Callable[[Dict[str, Any]], Optional[int]]:
+                def extractor(kw: Dict[str, Any]) -> Optional[int]:
+                    val = kw.get(name)
+                    if val is None:
+                        return None
+                    try:
+                        return int(val)
+                    except (TypeError, ValueError):
+                        return None
+
+                return extractor
+
+            extractors[axis_name] = _make_scalar_extractor(axis_name)
+
+        return extractors
+
+    # ------------------------------------------------------------------
+    # fi_trace callable factory
+    # ------------------------------------------------------------------
+
+    def build_fi_trace_fn(self, fi_api: str) -> Callable[..., Dict[str, Any]]:
+        """Return a ``fi_trace(save_dir=None, **kwargs)`` callable.
+
+        Parameters
+        ----------
+        fi_api:
+            Fully qualified Python name of the decorated function
+            (e.g. ``"flashinfer.norm.rmsnorm"``).
+        """
+        axis_extractors = self._build_axis_extractors()
+        template = self  # capture in closure
+
+        def fi_trace(
+            save_dir: Optional[Union[str, Path]] = None,
+            name: Optional[str] = None,
+            **kwargs: Any,
+        ) -> Dict[str, Any]:
+            # ── 1. Extract axis values ─────────────────────────────────────
+            axis_values: Dict[str, int] = {}
+            for axis_name, extractor in axis_extractors.items():
+                try:
+                    val = extractor(kwargs)
+                    if val is not None:
+                        axis_values[axis_name] = val
+                except Exception:
+                    pass
+
+            # ── 3. Build "axes" section ────────────────────────────────────
+            axes_json: Dict[str, Any] = {}
+            for axis_name, marker in template.axes.items():
+                is_var = isinstance(marker, Var)
+                entry: Dict[str, Any] = {"type": "var" if is_var else "const"}
+                if not is_var and axis_name in axis_values:
+                    entry["value"] = axis_values[axis_name]
+                if marker.description:
+                    entry["description"] = marker.description
+                axes_json[axis_name] = entry
+
+            # ── 4. Build "inputs" section ──────────────────────────────────
+            inputs_json: Dict[str, Any] = {}
+            for json_key, descriptor in template.inputs.items():
+                if isinstance(descriptor, Scalar):
+                    entry = {"shape": None, "dtype": descriptor.dtype}
+                else:
+                    param = (
+                        descriptor.param if descriptor.param is not None else json_key
+                    )
+                    t = _get_tensor(kwargs, param, descriptor.tuple_idx)
+                    entry = {
+                        "shape": descriptor.dim_names,
+                        "dtype": _dtype_str(t.dtype) if t is not None else "unknown",
+                    }
+                if descriptor.optional:
+                    entry["optional"] = True
+                if descriptor.description:
+                    entry["description"] = descriptor.description
+                inputs_json[json_key] = entry
+
+            # ── 5. Build "outputs" section ─────────────────────────────────
+            outputs_json: Dict[str, Any] = {}
+            for json_key, descriptor in template.outputs.items():
+                if isinstance(descriptor, Scalar):
+                    entry = {"shape": None, "dtype": descriptor.dtype}
+                else:
+                    # Resolve dtype for outputs
+                    dtype: str
+                    if descriptor.dtype_from is not None:
+                        ref_param = descriptor.dtype_from
+                        ref_t = _get_tensor(kwargs, ref_param)
+                        dtype = (
+                            _dtype_str(ref_t.dtype) if ref_t is not None else "unknown"
+                        )
+                    elif descriptor.dtype is not None:
+                        dtype = descriptor.dtype
+                    else:
+                        # Auto-infer: find first input tensor with overlapping dims
+                        dtype = "unknown"
+                        for in_key, in_desc in template.inputs.items():
+                            if not isinstance(in_desc, Tensor):
+                                continue
+                            if any(
+                                d in in_desc.dim_names for d in descriptor.dim_names
+                            ):
+                                in_param = (
+                                    in_desc.param
+                                    if in_desc.param is not None
+                                    else in_key
+                                )
+                                ref_t = _get_tensor(kwargs, in_param, in_desc.tuple_idx)
+                                if ref_t is not None:
+                                    dtype = _dtype_str(ref_t.dtype)
+                                    break
+                    entry = {"shape": descriptor.dim_names, "dtype": dtype}
+                if descriptor.optional:
+                    entry["optional"] = True
+                if descriptor.description:
+                    entry["description"] = descriptor.description
+                outputs_json[json_key] = entry
+
+            # ── 6. Resolve name (explicit override or auto-generate) ──────
+            if name is None:
+                # Use name_prefix from the template when set (preferred: short,
+                # semantic names like "gqa_paged_decode", "gdn_mtp").
+                # Fall back to op_type otherwise.
+                prefix = (
+                    template.name_prefix
+                    if template.name_prefix is not None
+                    else template.op_type
+                )
+                const_parts = []
+                for n, marker in template.axes.items():
+                    if not isinstance(marker, Const) or n not in axis_values:
+                        continue
+                    # abbrev="" → omit from name; abbrev=None → use axis name
+                    pfx = marker.abbrev if marker.abbrev is not None else n
+                    if pfx == "":
+                        continue
+                    const_parts.append(f"{pfx}{axis_values[n]}")
+                name = prefix + ("_" + "_".join(const_parts) if const_parts else "")
+
+            # ── 7. Assemble definition ─────────────────────────────────────
+            all_tags = [f"fi_api:{fi_api}"] + template.tags
+            result: Dict[str, Any] = {
+                "name": name,
+                "description": template.description,
+                "op_type": template.op_type,
+                "tags": all_tags,
+                "axes": axes_json,
+            }
+            if template.constraints:
+                result["constraints"] = template.constraints
+            result["inputs"] = inputs_json
+            result["outputs"] = outputs_json
+            if template.reference is not None:
+                try:
+                    import inspect  # noqa: PLC0415
+
+                    result["reference"] = inspect.getsource(template.reference)
+                except (OSError, TypeError):
+                    pass
+
+            # ── 8. Write JSON file if requested ───────────────────────────
+            # Deduplication only applies to auto-dump (save_dir=None): once a
+            # named trace has been auto-dumped this process, skip re-writing it.
+            # Explicit save_dir= calls always write (no dedup).
+            effective_dir = save_dir if save_dir is not None else _get_trace_dump_dir()
+            _is_auto_dump = save_dir is None
+            if effective_dir is not None and (
+                not _is_auto_dump or name not in _DUMPED_NAMES
+            ):
+                out_dir = Path(effective_dir)
+                out_dir.mkdir(parents=True, exist_ok=True)
+                out_path = out_dir / f"{name}.json"
+                out_path.write_text(json.dumps(result, indent=2))
+                if _is_auto_dump:
+                    _DUMPED_NAMES.add(name)
+
+            return result
+
+        fi_trace.__doc__ = (
+            f"Generate a flashinfer-bench definition JSON for op_type='{self.op_type}'.\n\n"
+            f"FlashInfer API: {fi_api}\n"
+        )
+        return fi_trace
diff --git a/flashinfer/trace/templates/__init__.py b/flashinfer/trace/templates/__init__.py
new file mode 100644
index 0000000000..9cf7020299
--- /dev/null
+++ b/flashinfer/trace/templates/__init__.py
@@ -0,0 +1,80 @@
+# Copyright (c) 2025 by FlashInfer team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Per-op TraceTemplate definitions for FlashInfer APIs.
+
+How to add a new template
+-------------------------
+1. **Choose or create a file.**
+   Group templates by op_type. Existing files:
+   - ``norm.py``       — rmsnorm, fused_add_rmsnorm
+   - ``sampling.py``   — top-k / top-p sampling
+   - ``gemm.py``       — bf16 / fp8 GEMM
+   - ``attention.py``  — gqa_paged, gqa_ragged, mla_paged, dsa_paged
+   - ``gdn.py``        — gated delta-net decode
+   - ``moe.py``        — mixture-of-experts
+   Create a new file for a genuinely new op_type (e.g. ``conv.py``).
+
+2. **Define the template.**  Example::
+
+       from ..template import Const, Scalar, Tensor, TraceTemplate, Var
+
+       my_op_trace = TraceTemplate(
+           op_type="my_op",
+           description="One-line description.",
+           axes={
+               "batch_size": Var(),           # runtime-variable
+               "hidden_size": Const(),         # fixed by model config
+           },
+           inputs={
+               # Key = JSON name = Python param name (override with param=)
+               "x": Tensor(["batch_size", "hidden_size"]),
+               "weight": Tensor(["hidden_size"]),
+               "eps": Scalar("float32"),
+           },
+           outputs={
+               "out": Tensor(["batch_size", "hidden_size"], dtype_from="x"),
+           },
+           tags=["status:verified"],
+       )
+
+   Key rules:
+   - ``Var()``   → axis value is NOT baked into the generated name or JSON value.
+   - ``Const()`` → axis value IS extracted from a tensor and written to JSON.
+   - Axis values are extracted **automatically** from the first ``Tensor`` input
+     whose ``dim_names`` list contains that axis name.
+   - For tuple parameters (e.g. ``paged_kv_cache=(k, v)``), set
+     ``param="paged_kv_cache"`` and ``tuple_idx=0`` / ``tuple_idx=1``.
+   - For output dtype, prefer ``dtype_from="<input_param>"`` to copy from an
+     input tensor, or set ``dtype="float32"`` for a fixed dtype.
+
+3. **Attach to the API.**  In the API file::
+
+       from .trace.templates.my_file import my_op_trace
+
+       @flashinfer_api(trace=my_op_trace)
+       def my_op(x, weight, eps=1e-6):
+           ...
+
+   The ``fi_api`` tag is derived automatically from
+   ``func.__module__ + "." + func.__qualname__``.
+
+4. **Test it.**  Add a test to ``tests/test_fi_trace.py``::
+
+       def test_my_op_fi_trace():
+           defn = flashinfer.my_module.my_op.fi_trace(x=x_tensor, weight=w_tensor)
+           assert defn["op_type"] == "my_op"
+           assert defn["axes"]["hidden_size"]["value"] == 4096
+"""
diff --git a/flashinfer/trace/templates/activation.py b/flashinfer/trace/templates/activation.py
new file mode 100644
index 0000000000..89ba279992
--- /dev/null
+++ b/flashinfer/trace/templates/activation.py
@@ -0,0 +1,122 @@
+# Copyright (c) 2025 by FlashInfer team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""TraceTemplates for activation functions."""
+
+import torch
+import torch.nn.functional as F
+
+from ..template import Const, Tensor, TraceTemplate, Var
+
+# ── SiLU and Mul ─────────────────────────────────────────────────────────────
+
+
+@torch.no_grad()
+def _silu_and_mul_reference(input):
+    """Fused SiLU + Mul: silu(input[..., :H]) * input[..., H:]"""
+    half = input.shape[-1] // 2
+    return F.silu(input[..., :half]) * input[..., half:]
+
+
+silu_and_mul_trace = TraceTemplate(
+    op_type="activation",
+    name_prefix="silu_and_mul",
+    description="Fused SiLU + Mul: silu(x[:H]) * x[H:]. Used in LLaMA/Mistral FFN.",
+    axes={
+        "num_tokens": Var(description="Total number of tokens (batch_size * seq_len)."),
+        "hidden_size": Const(
+            abbrev="h", description="Output hidden size (input is 2*h)."
+        ),
+    },
+    inputs={
+        "input": Tensor(
+            ["num_tokens", "hidden_size"],
+            param="input",
+            description="Gated input tensor of shape [num_tokens, 2*hidden_size].",
+        ),
+    },
+    outputs={
+        "output": Tensor(["num_tokens", "hidden_size"], dtype_from="input"),
+    },
+    tags=["status:verified", "fused"],
+    reference=_silu_and_mul_reference,
+)
+
+# ── GeLU Tanh and Mul ────────────────────────────────────────────────────────
+
+
+@torch.no_grad()
+def _gelu_tanh_and_mul_reference(input):
+    """Fused GeLU (tanh approx) + Mul: gelu_tanh(x[:H]) * x[H:]"""
+    half = input.shape[-1] // 2
+    return F.gelu(input[..., :half], approximate="tanh") * input[..., half:]
+
+
+gelu_tanh_and_mul_trace = TraceTemplate(
+    op_type="activation",
+    name_prefix="gelu_tanh_and_mul",
+    description="Fused GeLU (tanh approx) + Mul: gelu_tanh(x[:H]) * x[H:]. Used in BERT/GPT FFN.",
+    axes={
+        "num_tokens": Var(description="Total number of tokens."),
+        "hidden_size": Const(
+            abbrev="h", description="Output hidden size (input is 2*h)."
+        ),
+    },
+    inputs={
+        "input": Tensor(
+            ["num_tokens", "hidden_size"],
+            param="input",
+            description="Gated input tensor of shape [num_tokens, 2*hidden_size].",
+        ),
+    },
+    outputs={
+        "output": Tensor(["num_tokens", "hidden_size"], dtype_from="input"),
+    },
+    tags=["status:verified", "fused"],
+    reference=_gelu_tanh_and_mul_reference,
+)
+
+# ── GeLU and Mul ─────────────────────────────────────────────────────────────
+
+
+@torch.no_grad()
+def _gelu_and_mul_reference(input):
+    """Fused GeLU (exact) + Mul: gelu(x[:H]) * x[H:]"""
+    half = input.shape[-1] // 2
+    return F.gelu(input[..., :half]) * input[..., half:]
+
+
+gelu_and_mul_trace = TraceTemplate(
+    op_type="activation",
+    name_prefix="gelu_and_mul",
+    description="Fused GeLU (exact) + Mul: gelu(x[:H]) * x[H:].",
+    axes={
+        "num_tokens": Var(description="Total number of tokens."),
+        "hidden_size": Const(
+            abbrev="h", description="Output hidden size (input is 2*h)."
+        ),
+    },
+    inputs={
+        "input": Tensor(
+            ["num_tokens", "hidden_size"],
+            param="input",
+            description="Gated input tensor of shape [num_tokens, 2*hidden_size].",
+        ),
+    },
+    outputs={
+        "output": Tensor(["num_tokens", "hidden_size"], dtype_from="input"),
+    },
+    tags=["status:verified", "fused"],
+    reference=_gelu_and_mul_reference,
+)
diff --git a/flashinfer/trace/templates/attention.py b/flashinfer/trace/templates/attention.py
new file mode 100644
index 0000000000..6fe489eaec
--- /dev/null
+++ b/flashinfer/trace/templates/attention.py
@@ -0,0 +1,1685 @@
+# Copyright (c) 2025 by FlashInfer team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""TraceTemplates for attention operations.
+
+Pick the template whose input schema matches your call site. Rows that share
+KV layout / indexing / stage are interchangeable from a consumer's viewpoint;
+the backend column indicates which kernel the API wraps.
+
++---------------------------+-------------------+---------------------------+-------------------------+---------+-----------------+
+| Template                  | Batching          | KV layout                 | Indexing                | Stage   | Backend         |
++===========================+===================+===========================+=========================+=========+=================+
+| ``single_decode``         | single request    | contiguous                | none                    | decode  | any (no plan)   |
+| ``single_prefill``        | single request    | contiguous                | none                    | prefill | any (no plan)   |
+| ``gqa_paged_decode``      | batched, ragged   | paged tuple (k, v)        | kv_indptr + kv_indices  | decode  | FA2/FA3/cuDNN   |
+| ``gqa_paged_prefill``     | batched, ragged   | paged tuple (k, v)        | +qo_indptr              | prefill | FA2/FA3/cuDNN   |
+| ``gqa_ragged``            | batched, ragged   | contiguous                | qo_indptr + kv_indptr   | prefill | FA2/FA3         |
+| ``mla_paged_decode``      | batched, ragged   | paged MLA (ckv + kpe)     | kv_indptr + kv_indices  | decode  | DeepSeek MLA    |
+| ``mla_paged_prefill``     | batched, ragged   | paged MLA (ckv + kpe)     | +qo_indptr              | prefill | DeepSeek MLA    |
+| ``dsa_paged``             | batched           | paged MLA                 | sparse_indices (top-K)  | both    | sparse DSA      |
+| ``trtllm_batch_decode``   | batched           | paged, interleaved single | block_tables + seq_lens | decode  | TRT-LLM SM100+  |
+| ``trtllm_batch_context``  | batched           | paged, interleaved single | block_tables + cum_*    | prefill | TRT-LLM SM100+  |
+| ``cudnn_batch_decode``    | batched           | paged, separate k/v       | block_tables            | decode  | cuDNN (no plan) |
+| ``cudnn_batch_prefill``   | batched, var-len  | paged or contiguous       | actual_seq_lens_*       | prefill | cuDNN (no plan) |
++---------------------------+-------------------+---------------------------+-------------------------+---------+-----------------+
+"""
+
+import math
+
+import torch
+
+from ..template import Const, Scalar, Tensor, TraceTemplate, Var
+
+
+# ── GQA paged decode ─────────────────────────────────────────────────────────
+
+
+@torch.no_grad()
+def _gqa_paged_decode_reference(q, k_cache, v_cache, kv_indptr, kv_indices, sm_scale):
+    batch_size, num_qo_heads, head_dim = q.shape
+    _, page_size, num_kv_heads, _ = k_cache.shape
+
+    output = torch.zeros(
+        (batch_size, num_qo_heads, head_dim), dtype=torch.bfloat16, device=q.device
+    )
+    lse = torch.full(
+        (batch_size, num_qo_heads), -float("inf"), dtype=torch.float32, device=q.device
+    )
+
+    gqa_ratio = num_qo_heads // num_kv_heads
+    k_cache_f32 = k_cache.to(torch.float32)
+    v_cache_f32 = v_cache.to(torch.float32)
+
+    for b in range(batch_size):
+        page_start = int(kv_indptr[b].item())
+        page_end = int(kv_indptr[b + 1].item())
+        if page_start >= page_end:
+            output[b].zero_()
+            continue
+        # kv_indices are page IDs. Gather pages first, then flatten the
+        # [num_selected_pages, page_size] axis into a single token axis.
+        page_ids = kv_indices[page_start:page_end].to(torch.long)
+        k_b = k_cache_f32[page_ids].reshape(-1, num_kv_heads, head_dim)
+        v_b = v_cache_f32[page_ids].reshape(-1, num_kv_heads, head_dim)
+        q_b = q[b].to(torch.float32)  # [num_qo_heads, head_dim]
+        for h in range(num_qo_heads):
+            kv_h = h // gqa_ratio
+            logits = torch.matmul(q_b[h], k_b[:, kv_h].T) * sm_scale
+            lse[b, h] = torch.logsumexp(logits, dim=-1) / math.log(2.0)
+            attn = torch.softmax(logits, dim=-1)
+            output[b, h] = torch.matmul(attn, v_b[:, kv_h]).to(torch.bfloat16)
+
+    return output, lse
+
+
+gqa_paged_decode_trace = TraceTemplate(
+    op_type="gqa_paged",
+    name_prefix="gqa_paged_decode",
+    description=(
+        "Batched GQA decode (1 query per seq) with a paged KV cache as a "
+        "(k_cache, v_cache) tuple and ragged kv_indptr+kv_indices baked in at "
+        "plan() time. Wraps BatchDecodeWithPagedKVCacheWrapper.run()."
+    ),
+    axes={
+        "batch_size": Var(description="Total number of query tokens."),
+        "num_qo_heads": Const(abbrev="h"),
+        "num_kv_heads": Const(abbrev="kv"),
+        "head_dim": Const(abbrev="d"),
+        "num_pages": Var(),
+        "page_size": Const(abbrev="ps"),
+        "len_indptr": Var(description="Length of kv_indptr array."),
+        "num_kv_indices": Var(description="Total number of KV page indices."),
+    },
+    inputs={
+        "q": Tensor(["batch_size", "num_qo_heads", "head_dim"]),
+        # k_cache / v_cache come from paged_kv_cache=(k, v)
+        "k_cache": Tensor(
+            ["num_pages", "page_size", "num_kv_heads", "head_dim"],
+            param="paged_kv_cache",
+            tuple_idx=0,
+        ),
+        "v_cache": Tensor(
+            ["num_pages", "page_size", "num_kv_heads", "head_dim"],
+            param="paged_kv_cache",
+            tuple_idx=1,
+        ),
+        "kv_indptr": Tensor(
+            ["len_indptr"],
+            optional=True,
+            description="KV page offsets for each sequence. Set during plan(), not run().",
+        ),
+        "kv_indices": Tensor(
+            ["num_kv_indices"],
+            optional=True,
+            description="Page IDs for KV cache lookups. Set during plan(), not run().",
+        ),
+        "sm_scale": Scalar(
+            "float32",
+            optional=True,
+            description="Softmax scale. Default is (1/sqrt(head_dim)). Set during plan(), not run().",
+        ),
+    },
+    outputs={
+        "output": Tensor(["batch_size", "num_qo_heads", "head_dim"], dtype_from="q"),
+        "lse": Tensor(
+            ["batch_size", "num_qo_heads"],
+            dtype="float32",
+            description="The 2-based log-sum-exp of attention logits.",
+        ),
+    },
+    constraints=[
+        "len_indptr == batch_size + 1",
+        "num_kv_indices == kv_indptr[-1].item()",
+    ],
+    tags=["stage:decode", "status:verified"],
+    reference=_gqa_paged_decode_reference,
+)
+
+# ── GQA paged prefill ────────────────────────────────────────────────────────
+
+
+@torch.no_grad()
+def _gqa_paged_prefill_reference(
+    q, k_cache, v_cache, qo_indptr, kv_indptr, kv_indices, sm_scale
+):
+    total_q, num_qo_heads, head_dim = q.shape
+    num_pages, page_size, num_kv_heads, _ = k_cache.shape
+    len_indptr = qo_indptr.shape[0]
+
+    output = torch.zeros(
+        (total_q, num_qo_heads, head_dim), dtype=torch.bfloat16, device=q.device
+    )
+    lse = torch.full(
+        (total_q, num_qo_heads), -float("inf"), dtype=torch.float32, device=q.device
+    )
+
+    gqa_ratio = num_qo_heads // num_kv_heads
+    q_f32 = q.to(torch.float32)
+    k_cache_f32 = k_cache.to(torch.float32)
+    v_cache_f32 = v_cache.to(torch.float32)
+
+    for b in range(len_indptr - 1):
+        q_start = int(qo_indptr[b].item())
+        q_end = int(qo_indptr[b + 1].item())
+        kv_start = int(kv_indptr[b].item())
+        kv_end = int(kv_indptr[b + 1].item())
+        if q_start >= q_end or kv_start >= kv_end:
+            continue
+        # kv_indices are page IDs. Gather pages and flatten to a token axis.
+        page_ids = kv_indices[kv_start:kv_end].to(torch.long)
+        k_b = k_cache_f32[page_ids].reshape(-1, num_kv_heads, head_dim)
+        v_b = v_cache_f32[page_ids].reshape(-1, num_kv_heads, head_dim)
+        num_kv_tokens = k_b.shape[0]
+        q_b = q_f32[q_start:q_end]
+        delta = num_kv_tokens - q_b.shape[0]
+        for q_idx in range(q_b.shape[0]):
+            max_kv = min(q_idx + 1 + delta, num_kv_tokens)
+            if max_kv <= 0:
+                continue
+            global_q = q_start + q_idx
+            for h in range(num_qo_heads):
+                kv_h = h // gqa_ratio
+                logits = torch.matmul(q_b[q_idx, h], k_b[:max_kv, kv_h].T) * sm_scale
+                lse[global_q, h] = torch.logsumexp(logits, dim=-1) / math.log(2.0)
+                attn = torch.softmax(logits, dim=-1)
+                output[global_q, h] = torch.matmul(attn, v_b[:max_kv, kv_h]).to(
+                    torch.bfloat16
+                )
+
+    return output, lse
+
+
+gqa_paged_prefill_trace = TraceTemplate(
+    op_type="gqa_paged",
+    name_prefix="gqa_paged_prefill",
+    description=(
+        "Batched GQA prefill (multi-token per seq, causal) with a paged KV "
+        "cache. Adds qo_indptr to gqa_paged_decode's indptr/indices. Wraps "
+        "BatchPrefillWithPagedKVCacheWrapper.run()."
+    ),
+    axes={
+        "num_qo_heads": Const(abbrev="h"),
+        "num_kv_heads": Const(abbrev="kv"),
+        "head_dim": Const(abbrev="d"),
+        "page_size": Const(abbrev="ps"),
+        "len_indptr": Var(description="Length of indptr arrays (batch_size + 1)."),
+        "total_q": Var(description="Total number of query tokens."),
+        "num_kv_indices": Var(description="Total number of KV page indices."),
+        "num_pages": Var(),
+    },
+    inputs={
+        "q": Tensor(["total_q", "num_qo_heads", "head_dim"]),
+        "k_cache": Tensor(
+            ["num_pages", "page_size", "num_kv_heads", "head_dim"],
+            param="paged_kv_cache",
+            tuple_idx=0,
+        ),
+        "v_cache": Tensor(
+            ["num_pages", "page_size", "num_kv_heads", "head_dim"],
+            param="paged_kv_cache",
+            tuple_idx=1,
+        ),
+        "qo_indptr": Tensor(
+            ["len_indptr"],
+            optional=True,
+            description="Query offsets for each sequence. Set during plan(), not run().",
+        ),
+        "kv_indptr": Tensor(
+            ["len_indptr"],
+            optional=True,
+            description="KV page offsets for each sequence. Set during plan(), not run().",
+        ),
+        "kv_indices": Tensor(
+            ["num_kv_indices"],
+            optional=True,
+            description="Page IDs for KV cache lookups. Set during plan(), not run().",
+        ),
+        "sm_scale": Scalar(
+            "float32",
+            optional=True,
+            description="Softmax scale. Default is (1/sqrt(head_dim)). Set during plan(), not run().",
+        ),
+    },
+    outputs={
+        "output": Tensor(["total_q", "num_qo_heads", "head_dim"], dtype_from="q"),
+        "lse": Tensor(
+            ["total_q", "num_qo_heads"],
+            dtype="float32",
+            description="The 2-based log-sum-exp of attention logits.",
+        ),
+    },
+    constraints=[
+        "total_q == qo_indptr[-1].item()",
+        "num_kv_indices == kv_indptr[-1].item()",
+    ],
+    tags=["stage:prefill", "status:verified"],
+    reference=_gqa_paged_prefill_reference,
+)
+
+# ── GQA ragged prefill ───────────────────────────────────────────────────────
+
+
+@torch.no_grad()
+def _gqa_ragged_prefill_reference(q, k, v, qo_indptr, kv_indptr, sm_scale):
+    total_q, num_qo_heads, head_dim = q.shape
+    total_kv, num_kv_heads, _ = k.shape
+    len_indptr = qo_indptr.shape[0]
+
+    output = torch.zeros(
+        (total_q, num_qo_heads, head_dim), dtype=torch.bfloat16, device=q.device
+    )
+    lse = torch.full(
+        (total_q, num_qo_heads), -float("inf"), dtype=torch.float32, device=q.device
+    )
+
+    gqa_ratio = num_qo_heads // num_kv_heads
+    q_f32 = q.to(torch.float32)
+    k_f32 = k.to(torch.float32)
+    v_f32 = v.to(torch.float32)
+
+    for b in range(len_indptr - 1):
+        q_start = int(qo_indptr[b].item())
+        q_end = int(qo_indptr[b + 1].item())
+        kv_start = int(kv_indptr[b].item())
+        kv_end = int(kv_indptr[b + 1].item())
+        if q_start >= q_end or kv_start >= kv_end:
+            continue
+        q_b = q_f32[q_start:q_end]  # [S, num_qo_heads, head_dim]
+        k_b = k_f32[kv_start:kv_end]  # [T, num_kv_heads, head_dim]
+        v_b = v_f32[kv_start:kv_end]
+        num_q_tokens = q_b.shape[0]
+        num_kv_tokens = k_b.shape[0]
+        delta = num_kv_tokens - num_q_tokens
+        for q_idx in range(num_q_tokens):
+            max_kv = min(q_idx + 1 + delta, num_kv_tokens)
+            if max_kv <= 0:
+                continue
+            global_q = q_start + q_idx
+            for h in range(num_qo_heads):
+                kv_h = h // gqa_ratio
+                logits = torch.matmul(q_b[q_idx, h], k_b[:max_kv, kv_h].T) * sm_scale
+                lse[global_q, h] = torch.logsumexp(logits, dim=-1) / math.log(2.0)
+                attn = torch.softmax(logits, dim=-1)
+                output[global_q, h] = torch.matmul(attn, v_b[:max_kv, kv_h]).to(
+                    torch.bfloat16
+                )
+
+    return output, lse
+
+
+gqa_ragged_prefill_trace = TraceTemplate(
+    op_type="gqa_ragged",
+    name_prefix="gqa_ragged",
+    description=(
+        "Batched GQA prefill (causal) with contiguous (non-paged) K/V tensors "
+        "and qo_indptr/kv_indptr offsets baked in at plan() time. Wraps "
+        "BatchPrefillWithRaggedKVCacheWrapper.run()."
+    ),
+    axes={
+        "num_qo_heads": Const(abbrev="h"),
+        "num_kv_heads": Const(abbrev="kv"),
+        "head_dim": Const(abbrev="d"),
+        "len_indptr": Var(description="Length of indptr arrays (batch_size + 1)."),
+        "total_q": Var(description="Total number of query tokens."),
+        "total_kv": Var(description="Total key-value tokens across all sequences."),
+    },
+    inputs={
+        "q": Tensor(["total_q", "num_qo_heads", "head_dim"]),
+        "k": Tensor(["total_kv", "num_kv_heads", "head_dim"]),
+        "v": Tensor(["total_kv", "num_kv_heads", "head_dim"]),
+        "qo_indptr": Tensor(
+            ["len_indptr"],
+            optional=True,
+            description="Query offsets for each sequence. Set during plan(), not run().",
+        ),
+        "kv_indptr": Tensor(
+            ["len_indptr"],
+            optional=True,
+            description="Key-value offsets for each sequence. Set during plan(), not run().",
+        ),
+        "sm_scale": Scalar(
+            "float32",
+            optional=True,
+            description="Softmax scale. Default is (1/sqrt(head_dim)). Set during plan(), not run().",
+        ),
+    },
+    outputs={
+        "output": Tensor(
+            ["total_q", "num_qo_heads", "head_dim"],
+            dtype_from="q",
+            description="Attention output tensor.",
+        ),
+        "lse": Tensor(
+            ["total_q", "num_qo_heads"],
+            dtype="float32",
+            description="The 2-based log-sum-exp of attention logits.",
+        ),
+    },
+    constraints=[
+        "total_q == qo_indptr[-1].item()",
+        "total_kv == kv_indptr[-1].item()",
+    ],
+    tags=["stage:prefill", "status:verified"],
+    reference=_gqa_ragged_prefill_reference,
+)
+
+# ── MLA paged decode (DeepSeek-V3 style) ─────────────────────────────────────
+
+
+@torch.no_grad()
+def _mla_paged_decode_reference(
+    q_nope, q_pe, ckv_cache, kpe_cache, kv_indptr, kv_indices, sm_scale
+):
+    batch_size, num_qo_heads, head_dim_ckv = q_nope.shape
+    _, _, head_dim_kpe = q_pe.shape
+
+    # [num_pages, page_size, head_dim_*] — keep the page dim; flatten after gather.
+    Kc_all = ckv_cache.to(torch.float32)
+    Kp_all = kpe_cache.to(torch.float32)
+
+    output = torch.zeros(
+        (batch_size, num_qo_heads, head_dim_ckv),
+        dtype=torch.bfloat16,
+        device=q_nope.device,
+    )
+    lse = torch.full(
+        (batch_size, num_qo_heads),
+        -float("inf"),
+        dtype=torch.float32,
+        device=q_nope.device,
+    )
+
+    for b in range(batch_size):
+        page_beg = int(kv_indptr[b].item())
+        page_end = int(kv_indptr[b + 1].item())
+        if page_beg >= page_end:
+            output[b].zero_()
+            continue
+        # kv_indices are page IDs; gather pages then flatten to a token axis.
+        page_ids = kv_indices[page_beg:page_end].to(torch.long)
+        Kc = Kc_all[page_ids].reshape(-1, head_dim_ckv)  # [L, head_dim_ckv]
+        Kp = Kp_all[page_ids].reshape(-1, head_dim_kpe)  # [L, head_dim_kpe]
+        qn = q_nope[b].to(torch.float32)  # [num_qo_heads, head_dim_ckv]
+        qp = q_pe[b].to(torch.float32)  # [num_qo_heads, head_dim_kpe]
+        logits = ((qn @ Kc.T) + (qp @ Kp.T)) * sm_scale  # [num_qo_heads, L]
+        lse[b] = torch.logsumexp(logits, dim=-1) / math.log(2.0)
+        output[b] = (torch.softmax(logits, dim=-1) @ Kc).to(torch.bfloat16)
+
+    return output, lse
+
+
+mla_paged_decode_trace = TraceTemplate(
+    op_type="mla_paged",
+    name_prefix="mla_paged_decode",
+    description=(
+        "Batched MLA decode (DeepSeek-V2/V3/R1). Query and KV are split into "
+        "NoPE (ckv, head_dim_ckv=512) and RoPE (kpe, head_dim_kpe=64) parts: "
+        "inputs are (q_nope, q_pe) and (ckv_cache, kpe_cache). "
+        "Wraps BatchMLAPagedAttentionWrapper.run() post matrix-absorption."
+    ),
+    axes={
+        "batch_size": Var(),
+        "num_qo_heads": Const(
+            description="Number of query heads after tensor parallel split.",
+            abbrev="h",
+        ),
+        "head_dim_ckv": Const(abbrev="ckv"),
+        "head_dim_kpe": Const(abbrev="kpe"),
+        "page_size": Const(abbrev="ps"),
+        "num_pages": Var(
+            description="Total number of allocated pages in the KV cache."
+        ),
+        "len_indptr": Var(description="Length of kv_indptr array."),
+        "num_kv_indices": Var(description="Total number of KV page indices."),
+    },
+    inputs={
+        "q_nope": Tensor(
+            ["batch_size", "num_qo_heads", "head_dim_ckv"],
+            description="Query tensor without positional encoding component.",
+        ),
+        "q_pe": Tensor(
+            ["batch_size", "num_qo_heads", "head_dim_kpe"],
+            description="Query positional encoding component.",
+        ),
+        "ckv_cache": Tensor(
+            ["num_pages", "page_size", "head_dim_ckv"],
+            description="Compressed key-value cache.",
+        ),
+        "kpe_cache": Tensor(
+            ["num_pages", "page_size", "head_dim_kpe"],
+            description="Key positional encoding cache.",
+        ),
+        "kv_indptr": Tensor(
+            ["len_indptr"],
+            optional=True,
+            description="KV page offsets for each sequence. Set during plan(), not run().",
+        ),
+        "kv_indices": Tensor(
+            ["num_kv_indices"],
+            optional=True,
+            description="Page indices for KV cache lookups. Set during plan(), not run().",
+        ),
+        "sm_scale": Scalar(
+            "float32",
+            optional=True,
+            description=(
+                "Softmax scale. Default is (1/sqrt(128 + 64) = 1/sqrt(192)), "
+                "based on head dimensions before matrix absorption. Set during plan(), not run()."
+            ),
+        ),
+    },
+    outputs={
+        "output": Tensor(
+            ["batch_size", "num_qo_heads", "head_dim_ckv"], dtype_from="q_nope"
+        ),
+        "lse": Tensor(
+            ["batch_size", "num_qo_heads"],
+            dtype="float32",
+            description="The 2-based log-sum-exp of attention logits.",
+        ),
+    },
+    constraints=[
+        "len_indptr == batch_size + 1",
+        "num_kv_indices == kv_indptr[-1].item()",
+    ],
+    tags=["stage:decode", "status:verified"],
+    reference=_mla_paged_decode_reference,
+)
+
+# ── MLA paged prefill (DeepSeek-V3 style, causal) ────────────────────────────
+
+
+@torch.no_grad()
+def _mla_paged_prefill_reference(
+    q_nope, q_pe, ckv_cache, kpe_cache, qo_indptr, kv_indptr, kv_indices, sm_scale
+):
+    total_q, num_qo_heads, head_dim_ckv = q_nope.shape
+    _, _, head_dim_kpe = q_pe.shape
+    len_indptr = qo_indptr.shape[0]
+
+    # [num_pages, page_size, head_dim_*] — keep the page dim; flatten after gather.
+    Kc_all = ckv_cache.to(torch.float32)
+    Kp_all = kpe_cache.to(torch.float32)
+
+    output = torch.zeros(
+        (total_q, num_qo_heads, head_dim_ckv),
+        dtype=torch.bfloat16,
+        device=q_nope.device,
+    )
+    lse = torch.full(
+        (total_q, num_qo_heads),
+        -float("inf"),
+        dtype=torch.float32,
+        device=q_nope.device,
+    )
+
+    for b in range(len_indptr - 1):
+        q_start = int(qo_indptr[b].item())
+        q_end = int(qo_indptr[b + 1].item())
+        kv_start = int(kv_indptr[b].item())
+        kv_end = int(kv_indptr[b + 1].item())
+        if q_start >= q_end or kv_start >= kv_end:
+            continue
+        # kv_indices are page IDs; gather pages then flatten to a token axis.
+        page_ids = kv_indices[kv_start:kv_end].to(torch.long)
+        Kc = Kc_all[page_ids].reshape(-1, head_dim_ckv)  # [L, head_dim_ckv]
+        Kp = Kp_all[page_ids].reshape(-1, head_dim_kpe)  # [L, head_dim_kpe]
+        num_kv_tokens = Kc.shape[0]
+        qn_b = q_nope[q_start:q_end].to(
+            torch.float32
+        )  # [S, num_qo_heads, head_dim_ckv]
+        qp_b = q_pe[q_start:q_end].to(torch.float32)  # [S, num_qo_heads, head_dim_kpe]
+        seq_len = q_end - q_start
+        delta = num_kv_tokens - seq_len
+        for q_idx in range(seq_len):
+            max_kv = min(q_idx + 1 + delta, num_kv_tokens)
+            if max_kv <= 0:
+                continue
+            global_q = q_start + q_idx
+            qn = qn_b[q_idx]  # [num_qo_heads, head_dim_ckv]
+            qp = qp_b[q_idx]  # [num_qo_heads, head_dim_kpe]
+            logits = ((qn @ Kc[:max_kv].T) + (qp @ Kp[:max_kv].T)) * sm_scale
+            lse[global_q] = torch.logsumexp(logits, dim=-1) / math.log(2.0)
+            output[global_q] = (torch.softmax(logits, dim=-1) @ Kc[:max_kv]).to(
+                torch.bfloat16
+            )
+
+    return output, lse
+
+
+mla_paged_prefill_trace = TraceTemplate(
+    op_type="mla_paged",
+    name_prefix="mla_paged_prefill",
+    description=(
+        "Batched MLA prefill (multi-token per seq, causal). Same "
+        "(q_nope, q_pe) / (ckv_cache, kpe_cache) split as mla_paged_decode "
+        "plus qo_indptr for variable query lengths."
+    ),
+    axes={
+        "num_qo_heads": Const(
+            description="Number of query heads after tensor parallel split.",
+            abbrev="h",
+        ),
+        "head_dim_ckv": Const(abbrev="ckv"),
+        "head_dim_kpe": Const(abbrev="kpe"),
+        "page_size": Const(abbrev="ps"),
+        "total_q": Var(description="Total number of query tokens."),
+        "num_pages": Var(
+            description="Total number of allocated pages in the KV cache."
+        ),
+        "len_indptr": Var(description="Length of indptr arrays (batch_size + 1)."),
+        "num_kv_indices": Var(description="Total number of KV page indices."),
+    },
+    inputs={
+        "q_nope": Tensor(
+            ["total_q", "num_qo_heads", "head_dim_ckv"],
+            description="Query tensor without positional encoding component.",
+        ),
+        "q_pe": Tensor(
+            ["total_q", "num_qo_heads", "head_dim_kpe"],
+            description="Query positional encoding component.",
+        ),
+        "ckv_cache": Tensor(
+            ["num_pages", "page_size", "head_dim_ckv"],
+            description="Compressed key-value cache.",
+        ),
+        "kpe_cache": Tensor(
+            ["num_pages", "page_size", "head_dim_kpe"],
+            description="Key positional encoding cache.",
+        ),
+        "qo_indptr": Tensor(
+            ["len_indptr"],
+            description="Query token offsets for each sequence.",
+        ),
+        "kv_indptr": Tensor(
+            ["len_indptr"],
+            description="KV page offsets for each sequence.",
+        ),
+        "kv_indices": Tensor(
+            ["num_kv_indices"],
+            description="Page indices for KV cache lookups.",
+        ),
+        "sm_scale": Scalar(
+            "float32",
+            description=(
+                "Softmax scale. Default is (1/sqrt(128 + 64) = 1/sqrt(192)), "
+                "based on head dimensions before matrix absorption."
+            ),
+        ),
+    },
+    outputs={
+        "output": Tensor(
+            ["total_q", "num_qo_heads", "head_dim_ckv"], dtype_from="q_nope"
+        ),
+        "lse": Tensor(
+            ["total_q", "num_qo_heads"],
+            dtype="float32",
+            description="The 2-based log-sum-exp of attention logits.",
+        ),
+    },
+    constraints=[
+        "total_q == qo_indptr[-1].item()",
+        "num_kv_indices == kv_indptr[-1].item()",
+    ],
+    tags=["stage:prefill", "status:verified"],
+    reference=_mla_paged_prefill_reference,
+)
+
+# ── DSA (Dense Sparse Attention) paged ────────────────────────────────────────
+
+
+@torch.no_grad()
+def _dsa_paged_reference(q_nope, q_pe, ckv_cache, kpe_cache, sparse_indices, sm_scale):
+    """
+    Batched Native Sparse Attention (DSA) reference implementation.
+
+    Uses sparse_indices to select top-K KV cache entries per token.
+    Values of -1 in sparse_indices indicate padding (ignored).
+    """
+    num_tokens, num_qo_heads, head_dim_ckv = q_nope.shape
+    head_dim_kpe = q_pe.shape[-1]
+    device = q_nope.device
+
+    # Squeeze page dimension when page_size=1; otherwise flatten pages.
+    Kc_all = ckv_cache.reshape(-1, head_dim_ckv).to(torch.float32)
+    Kp_all = kpe_cache.reshape(-1, head_dim_kpe).to(torch.float32)
+
+    output = torch.zeros(
+        (num_tokens, num_qo_heads, head_dim_ckv), dtype=torch.bfloat16, device=device
+    )
+    lse = torch.full(
+        (num_tokens, num_qo_heads), -float("inf"), dtype=torch.float32, device=device
+    )
+
+    for t in range(num_tokens):
+        indices = sparse_indices[t]
+        valid_mask = indices != -1
+        valid_indices = indices[valid_mask]
+        if valid_indices.numel() == 0:
+            output[t].zero_()
+            continue
+        tok_idx = valid_indices.to(torch.long)
+        Kc = Kc_all[tok_idx]
+        Kp = Kp_all[tok_idx]
+        qn = q_nope[t].to(torch.float32)
+        qp = q_pe[t].to(torch.float32)
+        logits = (qn @ Kc.T) + (qp @ Kp.T)
+        logits_scaled = logits * sm_scale
+        lse[t] = torch.logsumexp(logits_scaled, dim=-1) / math.log(2.0)
+        attn = torch.softmax(logits_scaled, dim=-1)
+        output[t] = (attn @ Kc).to(torch.bfloat16)
+
+    return output, lse
+
+
+dsa_paged_trace = TraceTemplate(
+    op_type="dsa_paged",
+    name_prefix="dsa_sparse_attention",
+    description=(
+        "DSA (Dense Sparse Attention): MLA latent layout + per-query top-K "
+        "selection via sparse_indices (-1 = padding). Covers decode and "
+        "prefill; no kv_indptr/indices."
+    ),
+    axes={
+        "num_tokens": Var(
+            description="Number of tokens (batch_size for decode, total_num_tokens for prefill)."
+        ),
+        "num_qo_heads": Const(
+            description="Number of query heads after tensor parallel split.",
+            abbrev="h",
+        ),
+        "head_dim_ckv": Const(
+            description="Compressed KV head dimension.",
+            abbrev="ckv",
+        ),
+        "head_dim_kpe": Const(
+            description="Key positional encoding dimension.",
+            abbrev="kpe",
+        ),
+        "topk": Const(
+            description="Number of top-K KV cache entries selected for sparse attention.",
+            abbrev="topk",
+        ),
+        "page_size": Const(
+            description="Page size for KV cache.",
+            abbrev="ps",
+        ),
+        "num_pages": Var(
+            description="Total number of allocated pages in the KV cache."
+        ),
+    },
+    inputs={
+        "q_nope": Tensor(
+            ["num_tokens", "num_qo_heads", "head_dim_ckv"],
+            description="Query tensor without positional encoding component.",
+        ),
+        "q_pe": Tensor(
+            ["num_tokens", "num_qo_heads", "head_dim_kpe"],
+            description="Query positional encoding component.",
+        ),
+        "ckv_cache": Tensor(
+            ["num_pages", "page_size", "head_dim_ckv"],
+            description="Compressed key-value cache.",
+        ),
+        "kpe_cache": Tensor(
+            ["num_pages", "page_size", "head_dim_kpe"],
+            description="Key positional encoding cache.",
+        ),
+        "sparse_indices": Tensor(
+            ["num_tokens", "topk"],
+            description="Sparse indices selecting top-K KV cache entries per token. -1 = padding.",
+        ),
+        "sm_scale": Scalar(
+            "float32",
+            description=(
+                "Softmax scale. For MLA pre-absorption: 1/sqrt(head_dim_qk + head_dim_kpe)."
+            ),
+        ),
+    },
+    outputs={
+        "output": Tensor(
+            ["num_tokens", "num_qo_heads", "head_dim_ckv"],
+            dtype_from="q_nope",
+            description="Attention output tensor.",
+        ),
+        "lse": Tensor(
+            ["num_tokens", "num_qo_heads"],
+            dtype="float32",
+            description="The 2-based log-sum-exp of attention logits.",
+        ),
+    },
+    constraints=[
+        "sparse_indices.shape[0] == num_tokens",
+        "sparse_indices.shape[-1] == topk",
+        "ckv_cache.shape[1] == page_size",
+    ],
+    tags=["status:verified", "sparse:topk"],
+    reference=_dsa_paged_reference,
+)
+
+# ── Single prefill / single decode (non-batched) ──────────────────────────────
+
+
+@torch.no_grad()
+def _single_decode_reference(q, k, v, **kwargs):
+    """Single-request decode: q @ K.T → softmax → @ V, broadcasting GQA."""
+    num_qo_heads, head_dim = q.shape
+    kv_len, num_kv_heads, _ = k.shape
+    gqa_ratio = num_qo_heads // num_kv_heads
+    sm_scale = kwargs.get("sm_scale")
+    if sm_scale is None:
+        sm_scale = 1.0 / math.sqrt(head_dim)
+    output = torch.zeros_like(q, dtype=torch.float32)
+    for h in range(num_qo_heads):
+        kv_h = h // gqa_ratio
+        logits = (
+            torch.matmul(q[h].to(torch.float32), k[:, kv_h].to(torch.float32).T)
+            * sm_scale
+        )
+        attn = torch.softmax(logits, dim=-1)
+        output[h] = torch.matmul(attn, v[:, kv_h].to(torch.float32))
+    return output.to(q.dtype)
+
+
+@torch.no_grad()
+def _single_prefill_reference(q, k, v, **kwargs):
+    """Single-request prefill: standard SDPA with optional causal mask."""
+    qo_len, num_qo_heads, head_dim = q.shape
+    kv_len, num_kv_heads, _ = k.shape
+    gqa_ratio = num_qo_heads // num_kv_heads
+    causal = bool(kwargs.get("causal", False))
+    sm_scale = kwargs.get("sm_scale")
+    if sm_scale is None:
+        sm_scale = 1.0 / math.sqrt(head_dim)
+    output = torch.zeros_like(q, dtype=torch.float32)
+    delta = kv_len - qo_len
+    for h in range(num_qo_heads):
+        kv_h = h // gqa_ratio
+        logits = (
+            torch.matmul(q[:, h].to(torch.float32), k[:, kv_h].to(torch.float32).T)
+            * sm_scale
+        )
+        if causal:
+            mask = torch.full_like(logits, float("-inf"))
+            for qi in range(qo_len):
+                mask[qi, : qi + 1 + max(0, delta)] = 0.0
+            logits = logits + mask
+        attn = torch.softmax(logits, dim=-1)
+        output[:, h] = torch.matmul(attn, v[:, kv_h].to(torch.float32))
+    return output.to(q.dtype)
+
+
+single_decode_with_kv_cache_trace = TraceTemplate(
+    op_type="single_decode",
+    name_prefix="single_decode",
+    description=(
+        "Single-request decode. Q has no batch dim "
+        "([num_qo_heads, head_dim]); K and V are contiguous "
+        "([kv_len, num_kv_heads, head_dim]). No paging, no plan()."
+    ),
+    axes={
+        "num_qo_heads": Const(abbrev="h"),
+        "num_kv_heads": Const(abbrev="kv"),
+        "head_dim": Const(abbrev="d"),
+        "kv_len": Var(description="Length of the K/V context."),
+    },
+    inputs={
+        "q": Tensor(["num_qo_heads", "head_dim"]),
+        "k": Tensor(
+            ["kv_len", "num_kv_heads", "head_dim"],
+            description="Key cache, shape varies with kv_layout (default NHD).",
+        ),
+        "v": Tensor(
+            ["kv_len", "num_kv_heads", "head_dim"],
+            description="Value cache, shape varies with kv_layout (default NHD).",
+        ),
+    },
+    outputs={
+        "output": Tensor(["num_qo_heads", "head_dim"], dtype_from="q"),
+    },
+    tags=["status:verified", "stage:decode"],
+    reference=_single_decode_reference,
+)
+
+single_prefill_with_kv_cache_trace = TraceTemplate(
+    op_type="single_prefill",
+    name_prefix="single_prefill",
+    description=(
+        "Single-request prefill. Q is [qo_len, H, D]; K, V are contiguous "
+        "[kv_len, Hkv, D]. No paging, no plan(). Optional causal mask and "
+        "custom_mask."
+    ),
+    axes={
+        "num_qo_heads": Const(abbrev="h"),
+        "num_kv_heads": Const(abbrev="kv"),
+        "head_dim": Const(abbrev="d"),
+        "qo_len": Var(description="Length of the query sequence."),
+        "kv_len": Var(description="Length of the K/V sequence."),
+    },
+    inputs={
+        "q": Tensor(["qo_len", "num_qo_heads", "head_dim"]),
+        "k": Tensor(["kv_len", "num_kv_heads", "head_dim"]),
+        "v": Tensor(["kv_len", "num_kv_heads", "head_dim"]),
+    },
+    outputs={
+        "output": Tensor(["qo_len", "num_qo_heads", "head_dim"], dtype_from="q"),
+    },
+    tags=["status:verified", "stage:prefill"],
+    reference=_single_prefill_reference,
+)
+
+# ── TRTLLM paged attention ────────────────────────────────────────────────────
+# kv_cache shape is [num_pages, 1 or 2, num_kv_heads, page_size, head_dim] in HND
+# (or NHD equivalents). The "1 or 2" axis is 1 for single-tensor interleaved
+# layout and 2 for [K, V] split; we model it as a separate dim "kv_cache_dim".
+
+_TRTLLM_AXES: dict[str, Var | Const] = {
+    "num_tokens": Var(description="Total query tokens across the batch."),
+    "num_heads": Const(abbrev="h"),
+    "num_kv_heads": Const(abbrev="kv"),
+    "head_dim": Const(abbrev="d"),
+    "page_size": Const(abbrev="ps"),
+    "num_pages": Var(),
+    "kv_cache_dim": Const(
+        abbrev="",
+        description="1 for interleaved (K,V) single tensor; 2 for separate K/V halves.",
+    ),
+    "batch_size": Var(),
+}
+
+
+@torch.no_grad()
+def _trtllm_kv_from_cache(kv_cache, kv_cache_dim, num_heads, side):
+    """Split a TRT-LLM paged kv_cache tensor into either K or V slice.
+
+    kv_cache: [num_pages, kv_cache_dim, num_kv_heads, page_size, head_dim]
+    kv_cache_dim == 1: K/V interleaved head-wise along num_kv_heads
+    kv_cache_dim == 2: kv_cache[:, 0] is K, kv_cache[:, 1] is V
+    """
+    if kv_cache_dim == 2:
+        return kv_cache[:, 0] if side == "k" else kv_cache[:, 1]
+    # Interleaved along heads: even = K, odd = V.
+    sel = 0 if side == "k" else 1
+    return kv_cache[:, 0, sel::2]
+
+
+@torch.no_grad()
+def _trtllm_paged_attention_reference(
+    query, kv_cache, block_tables, seq_lens, causal=False, **kwargs
+):
+    """Shared reference for trtllm_batch_{decode, context}.
+
+    Treats query as [num_tokens, num_heads, head_dim]; expands each batch's
+    variable-length query tokens against its paged KV slice and applies
+    optional causal mask.
+
+    ``kv_layout`` selects the per-page memory layout:
+      * ``"HND"`` (default): ``[num_pages, kv_cache_dim, num_kv_heads, page_size, head_dim]``
+      * ``"NHD"``           : ``[num_pages, kv_cache_dim, page_size, num_kv_heads, head_dim]``
+    """
+    kv_layout = kwargs.get("kv_layout", "HND")
+    num_tokens, num_heads, head_dim = query.shape
+    if kv_layout == "HND":
+        num_pages, kv_cache_dim, num_kv_heads, page_size, _ = kv_cache.shape
+    else:
+        num_pages, kv_cache_dim, page_size, num_kv_heads, _ = kv_cache.shape
+    gqa_ratio = num_heads // num_kv_heads
+    bmm1_scale = float(kwargs.get("bmm1_scale", 1.0 / math.sqrt(head_dim)) or 1.0)
+    bmm2_scale = float(kwargs.get("bmm2_scale", 1.0) or 1.0)
+    cum_seq_lens_q = kwargs.get("cum_seq_lens_q")
+    batch_size = block_tables.shape[0]
+    output = torch.zeros_like(query, dtype=torch.float32)
+    for b in range(batch_size):
+        n_pages_used = (int(seq_lens[b].item()) + page_size - 1) // page_size
+        pages = block_tables[b, :n_pages_used].to(torch.long)
+        kv_len = int(seq_lens[b].item())
+        k_b = _trtllm_kv_from_cache(kv_cache[pages], kv_cache_dim, num_heads, "k")
+        v_b = _trtllm_kv_from_cache(kv_cache[pages], kv_cache_dim, num_heads, "v")
+        if kv_layout == "HND":
+            # [n_pages, Hk, PS, D] → [Hk, n_pages * PS, D] (per-head flatten).
+            k_flat = k_b.transpose(1, 2).reshape(-1, num_kv_heads, head_dim)[:kv_len]
+            v_flat = v_b.transpose(1, 2).reshape(-1, num_kv_heads, head_dim)[:kv_len]
+        else:
+            # NHD: [n_pages, PS, Hk, D] reshapes directly.
+            k_flat = k_b.reshape(-1, num_kv_heads, head_dim)[:kv_len]
+            v_flat = v_b.reshape(-1, num_kv_heads, head_dim)[:kv_len]
+        # Figure out which query tokens belong to this batch.
+        if cum_seq_lens_q is not None:
+            q_start = int(cum_seq_lens_q[b].item())
+            q_end = int(cum_seq_lens_q[b + 1].item())
+        else:
+            q_start = b * (num_tokens // batch_size)
+            q_end = q_start + (num_tokens // batch_size)
+        q_b = query[q_start:q_end].to(torch.float32)
+        for h in range(num_heads):
+            kv_h = h // gqa_ratio
+            logits = (
+                torch.matmul(q_b[:, h], k_flat[:, kv_h].to(torch.float32).T)
+                * bmm1_scale
+            )
+            if causal:
+                qi = q_end - q_start
+                delta = kv_len - qi
+                mask = torch.full_like(logits, float("-inf"))
+                for i in range(qi):
+                    mask[i, : i + 1 + max(0, delta)] = 0.0
+                logits = logits + mask
+            attn = torch.softmax(logits, dim=-1)
+            output[q_start:q_end, h] = (
+                torch.matmul(attn, v_flat[:, kv_h].to(torch.float32)) * bmm2_scale
+            )
+    return output.to(query.dtype)
+
+
+@torch.no_grad()
+def _trtllm_batch_decode_reference(
+    query, kv_cache, workspace_buffer, block_tables, seq_lens, max_seq_len, **kwargs
+):
+    return _trtllm_paged_attention_reference(
+        query, kv_cache, block_tables, seq_lens, causal=False, **kwargs
+    )
+
+
+@torch.no_grad()
+def _trtllm_batch_context_reference(
+    query,
+    kv_cache,
+    workspace_buffer,
+    block_tables,
+    seq_lens,
+    max_q_len,
+    max_kv_len,
+    bmm1_scale,
+    bmm2_scale,
+    batch_size,
+    cum_seq_lens_q,
+    cum_seq_lens_kv,
+    **kwargs,
+):
+    return _trtllm_paged_attention_reference(
+        query,
+        kv_cache,
+        block_tables,
+        seq_lens,
+        causal=True,
+        bmm1_scale=bmm1_scale,
+        bmm2_scale=bmm2_scale,
+        cum_seq_lens_q=cum_seq_lens_q,
+    )
+
+
+trtllm_batch_decode_trace = TraceTemplate(
+    op_type="trtllm_paged",
+    name_prefix="trtllm_batch_decode",
+    description=(
+        "SM100+ TRT-LLM paged decode. Single interleaved kv_cache "
+        "[num_pages, 1 or 2, Hkv, page_size, D], rectangular block_tables, "
+        "two scales (bmm1_scale post-QK, bmm2_scale post-softmax·V) for "
+        "FP8/FP4 numerics. Supports q_len_per_req > 1 for spec decoding."
+    ),
+    axes=_TRTLLM_AXES,
+    inputs={
+        "query": Tensor(["num_tokens", "num_heads", "head_dim"]),
+        "kv_cache": Tensor(
+            ["num_pages", "kv_cache_dim", "num_kv_heads", "page_size", "head_dim"],
+            description="Paged KV cache; kv_cache_dim is 1 (interleaved) or 2 (K+V).",
+        ),
+        "block_tables": Tensor(
+            ["batch_size", "max_pages_per_seq"],
+            dtype="int32",
+            description="Page table mapping per sequence.",
+        ),
+        "seq_lens": Tensor(
+            ["batch_size"],
+            dtype="int32",
+            description="Actual KV sequence length per batch entry.",
+        ),
+        "max_seq_len": Scalar(
+            "int32", description="Maximum K/V sequence length in the batch."
+        ),
+        "bmm1_scale": Scalar(
+            "float32", optional=True, description="Scale applied after Q @ K^T."
+        ),
+        "bmm2_scale": Scalar(
+            "float32", optional=True, description="Scale applied after softmax @ V."
+        ),
+    },
+    outputs={
+        "output": Tensor(["num_tokens", "num_heads", "head_dim"], dtype_from="query"),
+    },
+    tags=["status:verified", "stage:decode", "backend:trtllm"],
+    reference=_trtllm_batch_decode_reference,
+)
+
+# Add max_pages_per_seq axis used above
+trtllm_batch_decode_trace.axes["max_pages_per_seq"] = Var(
+    description="Maximum number of pages per sequence (block_tables width)."
+)
+
+trtllm_batch_context_trace = TraceTemplate(
+    op_type="trtllm_paged",
+    name_prefix="trtllm_batch_context",
+    description=(
+        "SM100+ TRT-LLM paged context/prefill. Prefill twin of "
+        "trtllm_batch_decode: same interleaved kv_cache and block_tables, "
+        "but adds cum_seq_lens_q/cum_seq_lens_kv for variable-length "
+        "queries."
+    ),
+    axes={
+        **_TRTLLM_AXES,
+        "max_pages_per_seq": Var(
+            description="Maximum number of pages per sequence (block_tables width)."
+        ),
+    },
+    inputs={
+        "query": Tensor(["num_tokens", "num_heads", "head_dim"]),
+        "kv_cache": Tensor(
+            ["num_pages", "kv_cache_dim", "num_kv_heads", "page_size", "head_dim"],
+            description="Paged KV cache; kv_cache_dim is 1 or 2.",
+        ),
+        "block_tables": Tensor(
+            ["batch_size", "max_pages_per_seq"],
+            dtype="int32",
+            description="Page table mapping per sequence.",
+        ),
+        "seq_lens": Tensor(
+            ["batch_size"],
+            dtype="int32",
+            description="Actual KV sequence length per batch entry.",
+        ),
+        "max_q_len": Scalar(
+            "int32", description="Maximum query sequence length in the batch."
+        ),
+        "max_kv_len": Scalar(
+            "int32", description="Maximum K/V sequence length in the batch."
+        ),
+        "bmm1_scale": Scalar("float32", description="Scale applied after Q @ K^T."),
+        "bmm2_scale": Scalar("float32", description="Scale applied after softmax @ V."),
+        "batch_size_scalar": Scalar("int32", param="batch_size"),
+        "cum_seq_lens_q": Tensor(
+            ["batch_size_plus_1_q"],
+            dtype="int32",
+            description="Cumulative Q sequence lengths, shape batch_size + 1.",
+        ),
+        "cum_seq_lens_kv": Tensor(
+            ["batch_size_plus_1_kv"],
+            dtype="int32",
+            description="Cumulative KV sequence lengths, shape batch_size + 1.",
+        ),
+    },
+    outputs={
+        "output": Tensor(["num_tokens", "num_heads", "head_dim"], dtype_from="query"),
+    },
+    tags=["status:verified", "stage:prefill", "backend:trtllm"],
+    reference=_trtllm_batch_context_reference,
+)
+trtllm_batch_context_trace.axes["batch_size_plus_1_q"] = Var(
+    description="batch_size + 1."
+)
+trtllm_batch_context_trace.axes["batch_size_plus_1_kv"] = Var(
+    description="batch_size + 1."
+)
+
+# ── cuDNN paged attention ─────────────────────────────────────────────────────
+
+_CUDNN_PAGED_AXES: dict[str, Var | Const] = {
+    "batch_size": Var(),
+    "total_num_pages": Var(),
+    "num_pages_per_seq": Var(
+        description="block_tables.shape[-1]; max pages used by any seq."
+    ),
+    "num_heads_qo": Const(abbrev="h"),
+    "num_heads_kv": Const(abbrev="kv"),
+    "head_dim": Const(abbrev="d"),
+    "page_size": Const(abbrev="ps"),
+}
+
+
+@torch.no_grad()
+def _cudnn_batch_decode_reference(
+    q, k_cache, v_cache, scale, workspace_buffer, max_sequence_kv, **kwargs
+):
+    """Reference for cudnn_batch_decode_with_kv_cache.
+
+    K/V layout: [total_num_pages, num_heads_kv, page_size, head_dim] (HND).
+    block_tables: [batch_size, num_pages_per_seq] gathers per-sequence pages.
+    actual_seq_lens_kv (optional) gives the true length of each sequence.
+    """
+    batch_size, num_heads_qo, head_dim = q.shape
+    _, num_heads_kv, page_size, _ = k_cache.shape
+    gqa_ratio = num_heads_qo // num_heads_kv
+    block_tables = kwargs.get("block_tables")
+    actual_seq_lens_kv = kwargs.get("actual_seq_lens_kv")
+    output = torch.zeros_like(q, dtype=torch.float32)
+    for b in range(batch_size):
+        if block_tables is None:
+            pages = torch.tensor([b], device=q.device, dtype=torch.long)
+        else:
+            row = block_tables[b]
+            pages = row[row >= 0].to(torch.long)
+        kv_len = (
+            int(actual_seq_lens_kv[b].item())
+            if actual_seq_lens_kv is not None
+            else int(max_sequence_kv)
+        )
+        # Gather + flatten: [num_heads_kv, L, head_dim] after permute.
+        k_b = (
+            k_cache[pages]
+            .permute(1, 0, 2, 3)
+            .reshape(num_heads_kv, -1, head_dim)[:, :kv_len]
+        )
+        v_b = (
+            v_cache[pages]
+            .permute(1, 0, 2, 3)
+            .reshape(num_heads_kv, -1, head_dim)[:, :kv_len]
+        )
+        for h in range(num_heads_qo):
+            kv_h = h // gqa_ratio
+            logits = torch.matmul(
+                q[b, h].to(torch.float32), k_b[kv_h].to(torch.float32).T
+            ) * float(scale)
+            attn = torch.softmax(logits, dim=-1)
+            output[b, h] = torch.matmul(attn, v_b[kv_h].to(torch.float32))
+    return output.to(q.dtype)
+
+
+@torch.no_grad()
+def _cudnn_batch_prefill_reference(
+    q,
+    k_cache,
+    v_cache,
+    scale,
+    workspace_buffer,
+    max_token_per_sequence,
+    max_sequence_kv,
+    actual_seq_lens_q,
+    actual_seq_lens_kv,
+    causal,
+    return_lse,
+    **kwargs,
+):
+    """Reference for cudnn_batch_prefill_with_kv_cache (variable-length)."""
+    num_tokens, num_heads_qo, head_dim = q.shape
+    _, num_heads_kv, page_size, _ = k_cache.shape
+    gqa_ratio = num_heads_qo // num_heads_kv
+    block_tables = kwargs.get("block_tables")
+    batch_size = actual_seq_lens_q.shape[0]
+    q_offsets = torch.cat(
+        [
+            torch.zeros(1, dtype=torch.int64, device=q.device),
+            actual_seq_lens_q.to(torch.int64).cumsum(0),
+        ]
+    )
+    output = torch.zeros_like(q, dtype=torch.float32)
+    lse = torch.full(
+        (num_tokens, num_heads_qo),
+        -float("inf"),
+        dtype=torch.float32,
+        device=q.device,
+    )
+    for b in range(batch_size):
+        q_start = int(q_offsets[b].item())
+        q_end = int(q_offsets[b + 1].item())
+        if q_end <= q_start:
+            continue
+        kv_len = int(actual_seq_lens_kv[b].item())
+        if block_tables is None:
+            pages = torch.tensor([b], device=q.device, dtype=torch.long)
+        else:
+            row = block_tables[b]
+            pages = row[row >= 0].to(torch.long)
+        k_b = (
+            k_cache[pages]
+            .permute(1, 0, 2, 3)
+            .reshape(num_heads_kv, -1, head_dim)[:, :kv_len]
+        )
+        v_b = (
+            v_cache[pages]
+            .permute(1, 0, 2, 3)
+            .reshape(num_heads_kv, -1, head_dim)[:, :kv_len]
+        )
+        qi = q_end - q_start
+        delta = kv_len - qi
+        for h in range(num_heads_qo):
+            kv_h = h // gqa_ratio
+            qh = q[q_start:q_end, h].to(torch.float32)
+            logits = torch.matmul(qh, k_b[kv_h].to(torch.float32).T) * float(scale)
+            if causal:
+                mask = torch.full_like(logits, float("-inf"))
+                for i in range(qi):
+                    mask[i, : i + 1 + max(0, delta)] = 0.0
+                logits = logits + mask
+            lse[q_start:q_end, h] = torch.logsumexp(logits, dim=-1) / math.log(2.0)
+            attn = torch.softmax(logits, dim=-1)
+            output[q_start:q_end, h] = torch.matmul(attn, v_b[kv_h].to(torch.float32))
+    return (output.to(q.dtype), lse if return_lse else None)
+
+
+cudnn_batch_decode_trace = TraceTemplate(
+    op_type="cudnn_paged",
+    name_prefix="cudnn_batch_decode",
+    description=(
+        "Standalone cuDNN paged decode. Separate k_cache/v_cache "
+        "[total_num_pages, Hkv, page_size, D], rectangular block_tables, "
+        "single sm_scale. No plan() — block_tables passed at call time."
+    ),
+    axes=_CUDNN_PAGED_AXES,
+    inputs={
+        "q": Tensor(["batch_size", "num_heads_qo", "head_dim"]),
+        "k_cache": Tensor(["total_num_pages", "num_heads_kv", "page_size", "head_dim"]),
+        "v_cache": Tensor(["total_num_pages", "num_heads_kv", "page_size", "head_dim"]),
+        "scale": Scalar("float32", description="Softmax scale, typically 1/sqrt(d)."),
+        "max_sequence_kv": Scalar(
+            "int32", description="Maximum K/V sequence length (s_kv_max)."
+        ),
+        "block_tables": Tensor(
+            ["batch_size", "num_pages_per_seq"],
+            dtype="int32",
+            optional=True,
+            description="Per-sequence page-id mapping.",
+        ),
+    },
+    outputs={
+        "output": Tensor(["batch_size", "num_heads_qo", "head_dim"], dtype_from="q"),
+    },
+    tags=["status:verified", "stage:decode", "backend:cudnn"],
+    reference=_cudnn_batch_decode_reference,
+)
+
+cudnn_batch_prefill_trace = TraceTemplate(
+    op_type="cudnn_paged",
+    name_prefix="cudnn_batch_prefill",
+    description=(
+        "Standalone cuDNN paged prefill with variable-length sequences. "
+        "Per-seq lengths via actual_seq_lens_q/kv (not indptr); accepts "
+        "paged (block_tables) or contiguous K/V. No plan()."
+    ),
+    axes={
+        **_CUDNN_PAGED_AXES,
+        "num_tokens": Var(description="Total query tokens across the batch."),
+    },
+    inputs={
+        "q": Tensor(["num_tokens", "num_heads_qo", "head_dim"]),
+        "k_cache": Tensor(["total_num_pages", "num_heads_kv", "page_size", "head_dim"]),
+        "v_cache": Tensor(["total_num_pages", "num_heads_kv", "page_size", "head_dim"]),
+        "scale": Scalar("float32", description="Softmax scale."),
+        "max_token_per_sequence": Scalar(
+            "int32", description="Maximum query tokens per sequence."
+        ),
+        "max_sequence_kv": Scalar("int32", description="Maximum K/V sequence length."),
+        "actual_seq_lens_q": Tensor(
+            ["batch_size"],
+            dtype="int32",
+            description="Actual query sequence length per batch entry.",
+        ),
+        "actual_seq_lens_kv": Tensor(
+            ["batch_size"],
+            dtype="int32",
+            description="Actual KV sequence length per batch entry.",
+        ),
+        "block_tables": Tensor(
+            ["batch_size", "num_pages_per_seq"],
+            dtype="int32",
+            optional=True,
+        ),
+        "causal": Scalar("int32", description="Bool: apply causal mask."),
+        "return_lse": Scalar("int32", description="Bool: also return LSE."),
+    },
+    outputs={
+        "output": Tensor(["num_tokens", "num_heads_qo", "head_dim"], dtype_from="q"),
+        "lse": Tensor(
+            ["num_tokens", "num_heads_qo"],
+            dtype="float32",
+            optional=True,
+            description="Only produced when return_lse=True.",
+        ),
+    },
+    tags=["status:verified", "stage:prefill", "backend:cudnn"],
+    reference=_cudnn_batch_prefill_reference,
+)
+
+
+# ── Misc wrapper .run() templates ────────────────────────────────────────────
+# These six wrappers live on top of existing kernels; their trace schemas
+# follow their Python-level run() signatures.
+
+batch_attention_run_trace = TraceTemplate(
+    op_type="gqa_paged",
+    name_prefix="batch_attention_run",
+    description=(
+        "BatchAttention.run(): unified decode+prefill wrapper with paged KV "
+        "cache (tuple or interleaved tensor). plan() bakes in routing; run() "
+        "takes q and paged kv_cache."
+    ),
+    axes={
+        "num_qo_tokens": Var(description="Total query tokens."),
+        "num_qo_heads": Const(abbrev="h"),
+        "num_kv_heads": Var(
+            description="Set during plan(); not a dim of the run() signature."
+        ),
+        "head_dim": Const(abbrev="d"),
+    },
+    inputs={
+        "q": Tensor(["num_qo_tokens", "num_qo_heads", "head_dim"]),
+        "kv_cache": Tensor(
+            ["num_qo_tokens", "num_qo_heads", "head_dim"],
+            description="Paged KV cache tensor or tuple (layout varies).",
+        ),
+    },
+    outputs={
+        "output": Tensor(["num_qo_tokens", "num_qo_heads", "head_dim"], dtype_from="q"),
+        "lse": Tensor(
+            ["num_qo_tokens", "num_qo_heads"],
+            dtype="float32",
+            description="The 2-based log-sum-exp of attention logits.",
+        ),
+    },
+    tags=["status:verified"],
+)
+
+
+_POD_AXES: dict[str, Var | Const] = {
+    "num_qo_heads": Const(abbrev="h"),
+    "num_kv_heads": Const(abbrev="kv"),
+    "head_dim": Const(abbrev="d"),
+    "prefill_len": Var(description="Total prefill query tokens."),
+    "decode_batch_size": Var(description="Number of decode queries."),
+    "num_pages": Var(),
+    "page_size": Const(abbrev="ps"),
+}
+
+pod_with_paged_kv_cache_run_trace = TraceTemplate(
+    op_type="pod",
+    name_prefix="pod_run",
+    description=(
+        "PODWithPagedKVCacheWrapper.run(): Prefill-On-Decode fused attention. "
+        "Takes separate prefill (q_p, k_p, v_p) + decode (q_d, "
+        "paged_kv_cache_d) workloads and fuses them into a single call."
+    ),
+    axes=_POD_AXES,
+    inputs={
+        "q_p": Tensor(["prefill_len", "num_qo_heads", "head_dim"]),
+        "k_p": Tensor(["prefill_len", "num_kv_heads", "head_dim"]),
+        "v_p": Tensor(["prefill_len", "num_kv_heads", "head_dim"]),
+        "q_d": Tensor(["decode_batch_size", "num_qo_heads", "head_dim"]),
+        "paged_kv_cache_d": Tensor(
+            ["num_pages", "page_size", "num_kv_heads", "head_dim"],
+            description="Paged KV cache for the decode branch.",
+        ),
+    },
+    outputs={
+        "output_p": Tensor(
+            ["prefill_len", "num_qo_heads", "head_dim"], dtype_from="q_p"
+        ),
+        "output_d": Tensor(
+            ["decode_batch_size", "num_qo_heads", "head_dim"], dtype_from="q_d"
+        ),
+    },
+    tags=["status:verified", "stage:pod"],
+)
+
+
+batch_pod_with_paged_kv_cache_run_trace = TraceTemplate(
+    op_type="pod",
+    name_prefix="batch_pod_run",
+    description=(
+        "BatchPODWithPagedKVCacheWrapper.run(): batched Prefill-On-Decode. "
+        "Both prefill and decode use paged KV caches."
+    ),
+    axes=_POD_AXES,
+    inputs={
+        "q_p": Tensor(["prefill_len", "num_qo_heads", "head_dim"]),
+        "paged_kv_cache_p": Tensor(
+            ["num_pages", "page_size", "num_kv_heads", "head_dim"],
+            description="Paged KV cache for the prefill branch.",
+        ),
+        "q_d": Tensor(["decode_batch_size", "num_qo_heads", "head_dim"]),
+        "paged_kv_cache_d": Tensor(
+            ["num_pages", "page_size", "num_kv_heads", "head_dim"],
+            description="Paged KV cache for the decode branch.",
+        ),
+    },
+    outputs={
+        "output_p": Tensor(
+            ["prefill_len", "num_qo_heads", "head_dim"], dtype_from="q_p"
+        ),
+        "output_d": Tensor(
+            ["decode_batch_size", "num_qo_heads", "head_dim"], dtype_from="q_d"
+        ),
+    },
+    tags=["status:verified", "stage:pod"],
+)
+
+
+block_sparse_attention_run_trace = TraceTemplate(
+    op_type="block_sparse",
+    name_prefix="block_sparse_run",
+    description=(
+        "BlockSparseAttentionWrapper.run(): block-sparse attention over "
+        "q/k/v with a block-level mask baked in at plan() time."
+    ),
+    axes={
+        "num_qo_heads": Const(abbrev="h"),
+        "num_kv_heads": Const(abbrev="kv"),
+        "head_dim": Const(abbrev="d"),
+        "qo_len": Var(description="Query sequence length."),
+        "kv_len": Var(description="Key/value sequence length."),
+    },
+    inputs={
+        "q": Tensor(["qo_len", "num_qo_heads", "head_dim"]),
+        "k": Tensor(["kv_len", "num_kv_heads", "head_dim"]),
+        "v": Tensor(["kv_len", "num_kv_heads", "head_dim"]),
+    },
+    outputs={
+        "output": Tensor(["qo_len", "num_qo_heads", "head_dim"], dtype_from="q"),
+    },
+    tags=["status:verified", "sparse:block"],
+)
+
+
+variable_block_sparse_attention_run_trace = TraceTemplate(
+    op_type="block_sparse",
+    name_prefix="var_block_sparse_run",
+    description=(
+        "VariableBlockSparseAttentionWrapper.run(): variable-length block-"
+        "sparse attention. Same q/k/v layout as block_sparse but sequence "
+        "lengths vary across the batch and the block mask is per-row."
+    ),
+    axes={
+        "num_qo_heads": Const(abbrev="h"),
+        "num_kv_heads": Const(abbrev="kv"),
+        "head_dim": Const(abbrev="d"),
+        "qo_len": Var(description="Query sequence length (variable)."),
+        "kv_len": Var(description="Key/value sequence length (variable)."),
+    },
+    inputs={
+        "q": Tensor(["qo_len", "num_qo_heads", "head_dim"]),
+        "k": Tensor(["kv_len", "num_kv_heads", "head_dim"]),
+        "v": Tensor(["kv_len", "num_kv_heads", "head_dim"]),
+    },
+    outputs={
+        "output": Tensor(["qo_len", "num_qo_heads", "head_dim"], dtype_from="q"),
+    },
+    tags=["status:verified", "sparse:block"],
+)
+
+
+multi_level_cascade_run_trace = TraceTemplate(
+    op_type="cascade_attention",
+    name_prefix="multi_level_cascade_run",
+    description=(
+        "MultiLevelCascadeAttentionWrapper.run(): cascade attention across "
+        "multiple shared-prefix levels. Internally merges per-level "
+        "attention states with logsumexp."
+    ),
+    axes={
+        "batch_size": Var(),
+        "num_qo_heads": Const(abbrev="h"),
+        "num_kv_heads": Const(abbrev="kv"),
+        "head_dim": Const(abbrev="d"),
+        "num_pages": Var(),
+        "page_size": Const(abbrev="ps"),
+    },
+    inputs={
+        "q": Tensor(["batch_size", "num_qo_heads", "head_dim"]),
+        "paged_kv_cache": Tensor(
+            ["num_pages", "page_size", "num_kv_heads", "head_dim"],
+            description="Paged KV cache (tuple or single tensor).",
+        ),
+    },
+    outputs={
+        "output": Tensor(["batch_size", "num_qo_heads", "head_dim"], dtype_from="q"),
+    },
+    tags=["status:verified", "cascade"],
+)
+
+
+@torch.no_grad()
+def _batch_attention_run_reference(q, kv_cache, **_unused):
+    """SDPA over q and a paged kv_cache tuple (k_cache, v_cache). Assumes
+    head_dim is the last axis and each sequence's K/V is the full cache."""
+    if isinstance(kv_cache, tuple):
+        k_cache, v_cache = kv_cache
+    else:
+        k_cache = kv_cache[:, 0]
+        v_cache = kv_cache[:, 1]
+    num_tokens, num_qo_heads, head_dim = q.shape
+    # Flatten paged cache; assume one sequence.
+    k_flat = k_cache.reshape(-1, k_cache.shape[-2], head_dim).to(torch.float32)
+    v_flat = v_cache.reshape(-1, v_cache.shape[-2], head_dim).to(torch.float32)
+    num_kv_heads = k_flat.shape[1]
+    gqa_ratio = num_qo_heads // num_kv_heads
+    sm_scale = 1.0 / math.sqrt(head_dim)
+    output = torch.zeros_like(q, dtype=torch.float32)
+    lse = torch.full(
+        (num_tokens, num_qo_heads),
+        -float("inf"),
+        dtype=torch.float32,
+        device=q.device,
+    )
+    for h in range(num_qo_heads):
+        kv_h = h // gqa_ratio
+        logits = (q[:, h].to(torch.float32) @ k_flat[:, kv_h].T) * sm_scale
+        lse[:, h] = torch.logsumexp(logits, dim=-1) / math.log(2.0)
+        attn = torch.softmax(logits, dim=-1)
+        output[:, h] = attn @ v_flat[:, kv_h]
+    return output.to(q.dtype), lse
+
+
+@torch.no_grad()
+def _pod_run_reference(q_p, k_p, v_p, q_d, paged_kv_cache_d, **_unused):
+    """POD reference: independent prefill + decode attention passes."""
+    p_out = _single_prefill_reference(q_p, k_p, v_p, causal=True)
+    dec_kv = (
+        paged_kv_cache_d
+        if isinstance(paged_kv_cache_d, tuple)
+        else (paged_kv_cache_d[:, 0], paged_kv_cache_d[:, 1])
+    )
+    d_out, _ = _batch_attention_run_reference(q_d, dec_kv)
+    return p_out, d_out
+
+
+@torch.no_grad()
+def _batch_pod_run_reference(q_p, paged_kv_cache_p, q_d, paged_kv_cache_d, **_unused):
+    """Batch POD: paged prefill + paged decode (both via batch_attention)."""
+    pkv_p = (
+        paged_kv_cache_p
+        if isinstance(paged_kv_cache_p, tuple)
+        else (paged_kv_cache_p[:, 0], paged_kv_cache_p[:, 1])
+    )
+    pkv_d = (
+        paged_kv_cache_d
+        if isinstance(paged_kv_cache_d, tuple)
+        else (paged_kv_cache_d[:, 0], paged_kv_cache_d[:, 1])
+    )
+    p_out, _ = _batch_attention_run_reference(q_p, pkv_p)
+    d_out, _ = _batch_attention_run_reference(q_d, pkv_d)
+    return p_out, d_out
+
+
+@torch.no_grad()
+def _block_sparse_run_reference(q, k, v, **_unused):
+    """Dense SDPA fallback for block-sparse attention (ignores block mask)."""
+    return _single_prefill_reference(q, k, v, causal=False)
+
+
+@torch.no_grad()
+def _multi_level_cascade_run_reference(q, paged_kv_cache, **_unused):
+    """Single-level cascade approximation: plain batched SDPA."""
+    out, _ = _batch_attention_run_reference(q, paged_kv_cache)
+    return out
+
+
+@torch.no_grad()
+def _segment_gemm_run_reference(x, weights, **_unused):
+    """Batched matmul: per-segment weights applied to stacked rows. Assumes
+    the caller passes a seg_indptr via kwargs; falls back to broadcasting
+    the first weight if unavailable."""
+    seg_indptr = _unused.get("seg_indptr")
+    if seg_indptr is None:
+        return torch.matmul(x.to(torch.float32), weights[0].to(torch.float32)).to(
+            x.dtype
+        )
+    out = torch.zeros(
+        (x.shape[0], weights.shape[-1]),
+        dtype=torch.float32,
+        device=x.device,
+    )
+    for i in range(weights.shape[0]):
+        start = int(seg_indptr[i].item())
+        end = int(seg_indptr[i + 1].item())
+        out[start:end] = x[start:end].to(torch.float32) @ weights[i].to(torch.float32)
+    return out.to(x.dtype)
+
+
+# Attach references to the templates declared above.
+batch_attention_run_trace.reference = _batch_attention_run_reference
+pod_with_paged_kv_cache_run_trace.reference = _pod_run_reference
+batch_pod_with_paged_kv_cache_run_trace.reference = _batch_pod_run_reference
+block_sparse_attention_run_trace.reference = _block_sparse_run_reference
+variable_block_sparse_attention_run_trace.reference = _block_sparse_run_reference
+multi_level_cascade_run_trace.reference = _multi_level_cascade_run_reference
+
+
+segment_gemm_run_trace = TraceTemplate(
+    op_type="segment_gemm",
+    name_prefix="segment_gemm_run",
+    description=(
+        "SegmentGEMMWrapper.run(): variable-size batched GEMM over "
+        "concatenated row segments. x is a ragged stack of per-segment "
+        "inputs; weights may be shared or per-segment."
+    ),
+    axes={
+        "total_rows": Var(description="Total rows across all segments."),
+        "K": Const(abbrev="k"),
+        "N": Const(abbrev="n"),
+        "batch_size": Var(description="Number of segments."),
+    },
+    inputs={
+        "x": Tensor(
+            ["total_rows", "K"],
+            description="Stacked segment inputs, row-concatenated.",
+        ),
+        "weights": Tensor(
+            ["batch_size", "K", "N"],
+            description="Per-segment weight tensors (may be shared across segments).",
+        ),
+    },
+    outputs={
+        "output": Tensor(["total_rows", "N"], dtype_from="x"),
+    },
+    tags=["status:verified"],
+)
+segment_gemm_run_trace.reference = _segment_gemm_run_reference
diff --git a/flashinfer/trace/templates/cascade.py b/flashinfer/trace/templates/cascade.py
new file mode 100644
index 0000000000..6bcf5aae82
--- /dev/null
+++ b/flashinfer/trace/templates/cascade.py
@@ -0,0 +1,208 @@
+# Copyright (c) 2025 by FlashInfer team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""TraceTemplates for cascade/merge attention state operations."""
+
+import math
+
+import torch
+
+from ..template import Const, Tensor, TraceTemplate, Var
+
+# ── Merge State ───────────────────────────────────────────────────────────────
+
+
+@torch.no_grad()
+def _merge_state_reference(v_a, s_a, v_b, s_b):
+    """Merge two attention (V, S) states via numerically stable log-sum-exp."""
+    # s_a, s_b are log2-scale logsumexp values; convert to natural scale
+    s_a = s_a.to(torch.float32) * math.log(2.0)
+    s_b = s_b.to(torch.float32) * math.log(2.0)
+    v_a = v_a.to(torch.float32)
+    v_b = v_b.to(torch.float32)
+    s_max = torch.maximum(s_a, s_b)
+    exp_a = torch.exp(s_a - s_max)
+    exp_b = torch.exp(s_b - s_max)
+    exp_sum = exp_a + exp_b
+    v_merged = (
+        v_a * exp_a.unsqueeze(-1) + v_b * exp_b.unsqueeze(-1)
+    ) / exp_sum.unsqueeze(-1)
+    s_merged = (s_max + torch.log(exp_sum)) / math.log(2.0)
+    return v_merged.to(v_a.dtype), s_merged.to(torch.float32)
+
+
+merge_state_trace = TraceTemplate(
+    op_type="cascade_merge",
+    name_prefix="merge_state",
+    description="Merge two attention (V, S) states for cascade/speculative attention.",
+    axes={
+        "seq_len": Var(description="Number of query tokens."),
+        "num_heads": Const(abbrev="h"),
+        "head_dim": Const(abbrev="d"),
+    },
+    inputs={
+        "v_a": Tensor(
+            ["seq_len", "num_heads", "head_dim"],
+            description="Attention output from KV segment A.",
+        ),
+        "s_a": Tensor(
+            ["seq_len", "num_heads"],
+            dtype="float32",
+            description="Logsumexp (base-2) from KV segment A.",
+        ),
+        "v_b": Tensor(
+            ["seq_len", "num_heads", "head_dim"],
+            description="Attention output from KV segment B.",
+        ),
+        "s_b": Tensor(
+            ["seq_len", "num_heads"],
+            dtype="float32",
+            description="Logsumexp (base-2) from KV segment B.",
+        ),
+    },
+    outputs={
+        "v_merged": Tensor(["seq_len", "num_heads", "head_dim"], dtype_from="v_a"),
+        "s_merged": Tensor(["seq_len", "num_heads"], dtype="float32"),
+    },
+    tags=["status:verified"],
+    reference=_merge_state_reference,
+)
+
+# ── Merge State In-Place ──────────────────────────────────────────────────────
+
+
+@torch.no_grad()
+def _merge_state_in_place_reference(v, s, v_other, s_other, mask=None):
+    """In-place LSE-weighted merge of (v, s) with (v_other, s_other).
+
+    When ``mask`` is provided, only rows where mask is True are merged;
+    other rows are returned unchanged. Scales are base-2 logsumexp as in
+    ``_merge_state_reference``.
+    """
+    s_a = s.to(torch.float32) * math.log(2.0)
+    s_b = s_other.to(torch.float32) * math.log(2.0)
+    v_a = v.to(torch.float32)
+    v_b = v_other.to(torch.float32)
+    s_max = torch.maximum(s_a, s_b)
+    exp_a = torch.exp(s_a - s_max)
+    exp_b = torch.exp(s_b - s_max)
+    exp_sum = exp_a + exp_b
+    v_merged = (
+        v_a * exp_a.unsqueeze(-1) + v_b * exp_b.unsqueeze(-1)
+    ) / exp_sum.unsqueeze(-1)
+    s_merged = (s_max + torch.log(exp_sum)) / math.log(2.0)
+    if mask is not None:
+        m = mask.to(torch.bool)
+        v_merged = torch.where(m[:, None, None], v_merged, v_a)
+        s_merged = torch.where(m[:, None], s_merged, s.to(torch.float32))
+    return v_merged.to(v.dtype), s_merged.to(torch.float32)
+
+
+merge_state_in_place_trace = TraceTemplate(
+    op_type="cascade_merge",
+    name_prefix="merge_state_in_place",
+    description="Merge attention (V, S) states in-place. v and s are updated with merged result.",
+    axes={
+        "seq_len": Var(description="Number of query tokens."),
+        "num_heads": Const(abbrev="h"),
+        "head_dim": Const(abbrev="d"),
+    },
+    inputs={
+        "v": Tensor(
+            ["seq_len", "num_heads", "head_dim"],
+            description="Attention output (updated in-place with merged result).",
+        ),
+        "s": Tensor(
+            ["seq_len", "num_heads"],
+            dtype="float32",
+            description="Logsumexp (base-2) (updated in-place).",
+        ),
+        "v_other": Tensor(
+            ["seq_len", "num_heads", "head_dim"],
+            description="Other attention output to merge in.",
+        ),
+        "s_other": Tensor(
+            ["seq_len", "num_heads"],
+            dtype="float32",
+            description="Other logsumexp (base-2) to merge in.",
+        ),
+        "mask": Tensor(
+            ["seq_len"],
+            optional=True,
+            description="Boolean mask; if set, only merge where mask is True.",
+        ),
+    },
+    outputs={
+        "v": Tensor(
+            ["seq_len", "num_heads", "head_dim"],
+            dtype_from="v",
+            description="Updated v (in-place).",
+        ),
+        "s": Tensor(
+            ["seq_len", "num_heads"],
+            dtype="float32",
+            description="Updated s (in-place).",
+        ),
+    },
+    tags=["status:verified"],
+    reference=_merge_state_in_place_reference,
+)
+
+# ── Merge States ──────────────────────────────────────────────────────────────
+
+
+@torch.no_grad()
+def _merge_states_reference(v, s):
+    """Merge num_states attention (V, S) states via numerically stable log-sum-exp."""
+    # v: [seq_len, num_states, num_heads, head_dim]
+    # s: [seq_len, num_states, num_heads]  (log2 scale)
+    s_nat = s.to(torch.float32) * math.log(2.0)
+    v_f32 = v.to(torch.float32)
+    s_max, _ = s_nat.max(dim=1, keepdim=True)
+    exp_s = torch.exp(s_nat - s_max)  # [seq_len, num_states, num_heads]
+    exp_sum = exp_s.sum(dim=1, keepdim=True)
+    weights = exp_s / exp_sum  # [seq_len, num_states, num_heads]
+    v_merged = (v_f32 * weights.unsqueeze(-1)).sum(dim=1)
+    s_merged = (s_max.squeeze(1) + torch.log(exp_sum.squeeze(1))) / math.log(2.0)
+    return v_merged.to(v.dtype), s_merged.to(torch.float32)
+
+
+merge_states_trace = TraceTemplate(
+    op_type="cascade_merge",
+    name_prefix="merge_states",
+    description="Merge multiple (num_states) attention (V, S) states.",
+    axes={
+        "seq_len": Var(description="Number of query tokens."),
+        "num_states": Var(description="Number of KV segments to merge."),
+        "num_heads": Const(abbrev="h"),
+        "head_dim": Const(abbrev="d"),
+    },
+    inputs={
+        "v": Tensor(
+            ["seq_len", "num_states", "num_heads", "head_dim"],
+            description="Attention outputs from all KV segments.",
+        ),
+        "s": Tensor(
+            ["seq_len", "num_states", "num_heads"],
+            dtype="float32",
+            description="Logsumexp (base-2) values from all KV segments.",
+        ),
+    },
+    outputs={
+        "v_merged": Tensor(["seq_len", "num_heads", "head_dim"], dtype_from="v"),
+        "s_merged": Tensor(["seq_len", "num_heads"], dtype="float32"),
+    },
+    tags=["status:verified"],
+    reference=_merge_states_reference,
+)
diff --git a/flashinfer/trace/templates/gdn.py b/flashinfer/trace/templates/gdn.py
new file mode 100644
index 0000000000..2cf68d6c98
--- /dev/null
+++ b/flashinfer/trace/templates/gdn.py
@@ -0,0 +1,565 @@
+# Copyright (c) 2025 by FlashInfer team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""TraceTemplates for Gated Delta Net (GDN) operations."""
+
+import math
+
+import torch
+import torch.nn.functional as F
+
+from ..template import Const, Scalar, Tensor, TraceTemplate, Var
+
+# ── GDN decode ────────────────────────────────────────────────────────────────
+
+
+@torch.no_grad()
+def _gdn_decode_reference(q, k, v, state, A_log, a, dt_bias, b, scale):
+    """
+    Gated Delta Net decode reference implementation (k-last layout).
+
+    State layout: [B, H, V, K] (k-last, K dimension at the end)
+
+    Gate computation:
+    g = exp(-exp(A_log) * softplus(a + dt_bias))
+    beta = sigmoid(b)
+
+    Delta rule update:
+    state_new = g * state_old + k^T @ (beta * v + (1-beta) * k @ state_old) - k^T @ (k @ state_old)
+    output = scale * q @ state_new
+    """
+    B, T, num_q_heads, K = q.shape
+    _, _, num_k_heads, _ = k.shape
+    _, _, num_v_heads, V = v.shape
+    num_heads = num_v_heads
+    device = q.device
+
+    if scale is None or scale == 0.0:
+        scale = 1.0 / math.sqrt(K)
+
+    x = a.float() + dt_bias.float()  # [B, 1, HV]
+    g = torch.exp(-torch.exp(A_log.float()) * F.softplus(x))  # [B, 1, HV]
+    beta = torch.sigmoid(b.float())  # [B, 1, HV]
+
+    q_f32 = q.squeeze(1).float()
+    k_f32 = k.squeeze(1).float()
+    v_f32 = v.squeeze(1).float()
+    g_f32 = g.squeeze(1).float()
+    beta_f32 = beta.squeeze(1).float()
+
+    if state is not None:
+        state_f32 = state.float()
+    else:
+        state_f32 = torch.zeros(B, num_heads, V, K, dtype=torch.float32, device=device)
+
+    q_exp = q_f32.repeat_interleave(num_v_heads // num_q_heads, dim=1)
+    k_exp = k_f32.repeat_interleave(num_v_heads // num_k_heads, dim=1)
+
+    new_state = torch.zeros_like(state_f32)
+    output = torch.zeros(B, num_heads, V, dtype=torch.float32, device=device)
+
+    for b_idx in range(B):
+        for h_idx in range(num_heads):
+            q_h = q_exp[b_idx, h_idx]
+            k_h = k_exp[b_idx, h_idx]
+            v_h = v_f32[b_idx, h_idx]
+            h_state = (
+                state_f32[b_idx, h_idx].clone().transpose(-1, -2)
+            )  # [V,K] -> [K,V]
+            g_val = g_f32[b_idx, h_idx]
+            beta_val = beta_f32[b_idx, h_idx]
+
+            old_state = g_val * h_state
+            old_v = k_h @ old_state
+            new_v = beta_val * v_h + (1 - beta_val) * old_v
+            state_remove = k_h.unsqueeze(1) @ old_v.unsqueeze(0)
+            state_update = k_h.unsqueeze(1) @ new_v.unsqueeze(0)
+            h_state = old_state - state_remove + state_update
+
+            output[b_idx, h_idx] = scale * (q_h @ h_state)
+            new_state[b_idx, h_idx] = h_state.transpose(-1, -2)  # [K,V] -> [V,K]
+
+    output = output.unsqueeze(1).to(torch.bfloat16)
+    return output, new_state
+
+
+gated_delta_rule_decode_trace = TraceTemplate(
+    op_type="gdn",
+    name_prefix="gdn_decode",
+    description=(
+        "Gated Delta Net decode with GVA configuration and k-last state layout. "
+        "Single-token generation with recurrent state update."
+    ),
+    axes={
+        "batch_size": Var(
+            description="Number of sequences being decoded concurrently."
+        ),
+        "seq_len": Const(
+            description="Sequence length (always 1 for single-token decode).", abbrev=""
+        ),
+        "num_q_heads": Const(
+            description="Number of query heads (same as key heads in GVA mode).",
+            abbrev="qk",
+        ),
+        "num_k_heads": Const(description="Number of key heads.", abbrev=""),
+        "num_v_heads": Const(
+            description="Number of value heads (GVA: more value heads than query heads).",
+            abbrev="v",
+        ),
+        "head_size": Const(
+            description="Dimension of each attention head (K dimension in query/key space, V dimension in value space).",
+            abbrev="d",
+        ),
+    },
+    inputs={
+        "q": Tensor(
+            ["batch_size", "seq_len", "num_q_heads", "head_size"],
+            description="Query tensor for single token decode.",
+        ),
+        "k": Tensor(
+            ["batch_size", "seq_len", "num_k_heads", "head_size"],
+            description="Key tensor for single token decode.",
+        ),
+        "v": Tensor(
+            ["batch_size", "seq_len", "num_v_heads", "head_size"],
+            description="Value tensor for single token decode.",
+        ),
+        "state": Tensor(
+            ["batch_size", "num_v_heads", "head_size", "head_size"],
+            optional=True,
+            description="Recurrent state in k-last layout [B, H, V, K].",
+        ),
+        "A_log": Tensor(
+            ["num_v_heads"],
+            description="Log decay parameter (learnable). Used to compute g = exp(-exp(A_log) * softplus(a + dt_bias)).",
+        ),
+        "a": Tensor(
+            ["batch_size", "seq_len", "num_v_heads"],
+            description="Input-dependent decay from projection.",
+        ),
+        "dt_bias": Tensor(
+            ["num_v_heads"],
+            description="Decay bias (learnable). Added to 'a' before softplus.",
+        ),
+        "b": Tensor(
+            ["batch_size", "seq_len", "num_v_heads"],
+            description="Update gate input from projection. beta = sigmoid(b).",
+        ),
+        "scale": Scalar(
+            "float32",
+            optional=True,
+            description="Scale factor. Default is 1/sqrt(head_size).",
+        ),
+    },
+    outputs={
+        "output": Tensor(
+            ["batch_size", "seq_len", "num_v_heads", "head_size"],
+            dtype="bfloat16",
+            description="Attention output. Shape follows num_v_heads in GVA mode.",
+        ),
+        "new_state": Tensor(
+            ["batch_size", "num_v_heads", "head_size", "head_size"],
+            dtype="float32",
+            description="Updated recurrent state in k-last layout [B, H, V, K].",
+        ),
+    },
+    constraints=[
+        "num_v_heads >= num_q_heads",
+        "num_v_heads % num_q_heads == 0",
+        "num_k_heads == num_q_heads",
+    ],
+    tags=["stage:decode", "status:verified"],
+    reference=_gdn_decode_reference,
+)
+
+# ── GDN prefill ───────────────────────────────────────────────────────────────
+
+
+@torch.no_grad()
+def _gdn_prefill_reference(q, k, v, state, A_log, a, dt_bias, b, cu_seqlens, scale):
+    """
+    Gated Delta Net prefill reference implementation (k-last layout).
+
+    State layout: [H, V, K] (k-last, K dimension at the end)
+
+    Gate computation:
+    g = exp(-exp(A_log) * softplus(a + dt_bias))
+    beta = sigmoid(b)
+
+    Delta rule update:
+    state_new = g * state_old + k^T @ (beta * v + (1-beta) * k @ state_old) - k^T @ (k @ state_old)
+    output = scale * q @ state_new
+    """
+    total_seq_len, num_q_heads, head_size = q.shape
+    num_v_heads = v.shape[1]
+    num_k_heads = k.shape[1]
+    num_sab_heads = max(num_q_heads, num_v_heads)
+    num_seqs = cu_seqlens.size(0) - 1
+    device = q.device
+
+    if scale is None or scale == 0.0:
+        scale = 1.0 / math.sqrt(head_size)
+
+    x = a.float() + dt_bias.float()  # [total_seq_len, HV]
+    g = torch.exp(-torch.exp(A_log.float()) * F.softplus(x))  # [total_seq_len, HV]
+    beta = torch.sigmoid(b.float())  # [total_seq_len, HV]
+
+    q_exp = q.repeat_interleave(num_v_heads // num_q_heads, dim=1)
+    k_exp = k.repeat_interleave(num_v_heads // num_k_heads, dim=1)
+
+    output = torch.zeros(
+        (total_seq_len, num_sab_heads, head_size), dtype=torch.bfloat16, device=device
+    )
+    new_state = torch.zeros(
+        (num_seqs, num_sab_heads, head_size, head_size),
+        dtype=torch.float32,
+        device=device,
+    )
+
+    for seq_idx in range(num_seqs):
+        seq_start = int(cu_seqlens[seq_idx].item())
+        seq_end = int(cu_seqlens[seq_idx + 1].item())
+        seq_len = seq_end - seq_start
+        if seq_len <= 0:
+            continue
+
+        if state is not None:
+            state_HKV = (
+                state[seq_idx].clone().float().transpose(-1, -2)
+            )  # [H,V,K] -> [H,K,V]
+        else:
+            state_HKV = torch.zeros(
+                (num_sab_heads, head_size, head_size),
+                dtype=torch.float32,
+                device=device,
+            )
+
+        for i in range(seq_len):
+            t = seq_start + i
+            q_H1K = q_exp[t].unsqueeze(1).float()
+            k_H1K = k_exp[t].unsqueeze(1).float()
+            v_H1V = v[t].unsqueeze(1).float()
+            g_H11 = g[t].unsqueeze(1).unsqueeze(2)
+            beta_H11 = beta[t].unsqueeze(1).unsqueeze(2)
+
+            old_state_HKV = g_H11 * state_HKV
+            old_v_H1V = q_H1K.float() @ old_state_HKV  # reuse shape pattern
+            old_v_H1V = k_H1K @ old_state_HKV
+            new_v_H1V = beta_H11 * v_H1V + (1 - beta_H11) * old_v_H1V
+            state_remove = torch.einsum(
+                "hkl,hlv->hkv", k_H1K.transpose(-1, -2), old_v_H1V
+            )
+            state_update = torch.einsum(
+                "hkl,hlv->hkv", k_H1K.transpose(-1, -2), new_v_H1V
+            )
+            state_HKV = old_state_HKV - state_remove + state_update
+
+            o_H1V = scale * (q_H1K @ state_HKV)
+            output[t] = o_H1V.squeeze(1).to(torch.bfloat16)
+
+        new_state[seq_idx] = state_HKV.transpose(-1, -2)  # [H,K,V] -> [H,V,K]
+
+    return output, new_state
+
+
+gdn_prefill_trace = TraceTemplate(
+    op_type="gdn",
+    name_prefix="gdn_prefill",
+    description=(
+        "Gated Delta Net prefill with GVA configuration and k-last state layout. "
+        "The state is in k-last layout [N, H, V, K]."
+    ),
+    axes={
+        "total_seq_len": Var(
+            description="Total number of tokens across all sequences in the batch."
+        ),
+        "num_seqs": Var(description="Number of sequences in the batch."),
+        "num_q_heads": Const(
+            description="Number of query heads (same as key heads in GVA mode).",
+            abbrev="qk",
+        ),
+        "num_k_heads": Const(description="Number of key heads.", abbrev=""),
+        "num_v_heads": Const(
+            description="Number of value heads (GVA: more value heads than query heads).",
+            abbrev="v",
+        ),
+        "head_size": Const(
+            description="Dimension of each attention head (K dimension in query/key space, V dimension in value space).",
+            abbrev="d",
+        ),
+        "len_cu_seqlens": Var(description="Length of cu_seqlens array (num_seqs + 1)."),
+    },
+    inputs={
+        "q": Tensor(
+            ["total_seq_len", "num_q_heads", "head_size"],
+            description="Query tensor.",
+        ),
+        "k": Tensor(
+            ["total_seq_len", "num_k_heads", "head_size"],
+            description="Key tensor.",
+        ),
+        "v": Tensor(
+            ["total_seq_len", "num_v_heads", "head_size"],
+            description="Value tensor.",
+        ),
+        "state": Tensor(
+            ["num_seqs", "num_v_heads", "head_size", "head_size"],
+            param="initial_state",
+            optional=True,
+            description="Recurrent state in k-last layout [N, H, V, K].",
+        ),
+        "A_log": Tensor(
+            ["num_v_heads"],
+            optional=True,
+            description="Log decay parameter (conceptual; not passed directly — precomputed into g).",
+        ),
+        "a": Tensor(
+            ["total_seq_len", "num_v_heads"],
+            param="g",
+            description="Precomputed gate values (g = exp(-exp(A_log) * softplus(a + dt_bias))).",
+        ),
+        "dt_bias": Tensor(
+            ["num_v_heads"],
+            optional=True,
+            description="Decay bias (conceptual; not passed directly — precomputed into g).",
+        ),
+        "b": Tensor(
+            ["total_seq_len", "num_v_heads"],
+            param="beta",
+            description="Update gate values (beta = sigmoid(b)).",
+        ),
+        "cu_seqlens": Tensor(
+            ["len_cu_seqlens"],
+            description="Cumulative sequence lengths for variable-length batching.",
+        ),
+        "scale": Scalar(
+            "float32",
+            optional=True,
+            description="Scale factor. Default is 1/sqrt(head_size).",
+        ),
+    },
+    outputs={
+        "output": Tensor(
+            ["total_seq_len", "num_v_heads", "head_size"],
+            dtype="bfloat16",
+            description="Attention output. Shape follows num_v_heads in GVA mode.",
+        ),
+        "new_state": Tensor(
+            ["num_seqs", "num_v_heads", "head_size", "head_size"],
+            dtype="float32",
+            description="Updated recurrent state in k-last layout [N, H, V, K].",
+        ),
+    },
+    constraints=[
+        "num_v_heads >= num_q_heads",
+        "num_v_heads % num_q_heads == 0",
+        "num_k_heads == num_q_heads",
+        "len_cu_seqlens == num_seqs + 1",
+        "total_seq_len == cu_seqlens[-1].item()",
+    ],
+    tags=["stage:prefill", "status:verified"],
+    reference=_gdn_prefill_reference,
+)
+
+# ── GDN MTP (Multi-Token Prediction) ─────────────────────────────────────────
+
+
+@torch.no_grad()
+def _gdn_mtp_reference(
+    q,
+    k,
+    v,
+    initial_state,
+    initial_state_indices,
+    A_log,
+    a,
+    dt_bias,
+    b,
+    scale,
+    intermediate_states_buffer=None,
+):
+    """
+    Gated Delta Net MTP (Multi-Token Prediction) reference implementation.
+
+    State layout: [pool_size, H, V, K] (k-last, K dimension at the end)
+
+    Gate computation:
+    g = exp(-exp(A_log) * softplus(a + dt_bias))
+    beta = sigmoid(b)
+
+    For each token t in sequence:
+        state_new = g_t * state_old + k_t^T @ (beta_t * v_t + (1-beta_t) * k_t @ state_old) - k_t^T @ (k_t @ state_old)
+        output_t = scale * q_t @ state_new
+        state_old = state_new  # Update for next token
+    """
+    B, T, num_q_heads, head_size = q.shape
+    _, _, num_k_heads, _ = k.shape
+    _, _, num_v_heads, _ = v.shape
+    device = q.device
+
+    if scale is None or scale == 0.0:
+        scale = 1.0 / math.sqrt(head_size)
+
+    x = a.float() + dt_bias.float()  # [B, T, HV]
+    g = torch.exp(-torch.exp(A_log.float()) * F.softplus(x))  # [B, T, HV]
+    beta = torch.sigmoid(b.float())  # [B, T, HV]
+
+    q_exp = q.repeat_interleave(num_v_heads // num_q_heads, dim=2)  # [B, T, HV, K]
+    k_exp = k.repeat_interleave(num_v_heads // num_k_heads, dim=2)  # [B, T, HV, K]
+
+    output = torch.zeros(
+        (B, T, num_v_heads, head_size), dtype=torch.bfloat16, device=device
+    )
+    cache_intermediate = intermediate_states_buffer is not None
+    final_state = initial_state.clone().float()
+
+    for b_idx in range(B):
+        state_idx = int(initial_state_indices[b_idx].item())
+        state_HVK = (
+            initial_state[state_idx].clone().float().transpose(-1, -2)
+        )  # [H,V,K] -> [H,K,V]
+
+        for t in range(T):
+            q_HK = q_exp[b_idx, t].float()  # [HV, K]
+            k_HK = k_exp[b_idx, t].float()  # [HV, K]
+            v_HV = v[b_idx, t].float()  # [HV, V]
+            g_H = g[b_idx, t]  # [HV]
+            beta_H = beta[b_idx, t]  # [HV]
+
+            for h_idx in range(num_v_heads):
+                q_h = q_HK[h_idx]
+                k_h = k_HK[h_idx]
+                v_h = v_HV[h_idx]
+                h_state = state_HVK[h_idx]
+                g_val = g_H[h_idx]
+                beta_val = beta_H[h_idx]
+
+                old_state = g_val * h_state
+                old_v = k_h @ old_state
+                new_v = beta_val * v_h + (1 - beta_val) * old_v
+                state_remove = k_h.unsqueeze(1) @ old_v.unsqueeze(0)
+                state_update = k_h.unsqueeze(1) @ new_v.unsqueeze(0)
+                h_state = old_state - state_remove + state_update
+
+                output[b_idx, t, h_idx] = (scale * (q_h @ h_state)).to(torch.bfloat16)
+                state_HVK[h_idx] = h_state
+
+            if cache_intermediate:
+                intermediate_states_buffer[state_idx, t] = state_HVK.transpose(
+                    -1, -2
+                )  # [H,K,V] -> [H,V,K]
+
+        # Commit accumulated state back to the pool slot [H,K,V] -> [H,V,K].
+        final_state[state_idx] = state_HVK.transpose(-1, -2)
+
+    return output, final_state
+
+
+gdn_mtp_trace = TraceTemplate(
+    op_type="gdn",
+    name_prefix="gdn_mtp",
+    description=(
+        "Gated Delta Net Multi-Token Prediction (MTP) with GVA configuration. "
+        "Used for speculative decoding verification where multiple tokens (T > 1) "
+        "need to be processed in sequence. State layout is k-last [pool_size, H, V, K]."
+    ),
+    axes={
+        "batch_size": Var(
+            description="Number of sequences being verified concurrently."
+        ),
+        "seq_len": Var(description="Number of tokens to process (T > 1 for MTP)."),
+        "num_q_heads": Const(
+            description="Number of query heads (same as key heads in GVA mode).",
+            abbrev="qk",
+        ),
+        "num_k_heads": Const(description="Number of key heads.", abbrev=""),
+        "num_v_heads": Const(
+            description="Number of value heads (GVA: more value heads than query heads).",
+            abbrev="v",
+        ),
+        "head_size": Const(
+            description="Dimension of each attention head (K dimension in query/key space, V dimension in value space).",
+            abbrev="d",
+        ),
+        "pool_size": Var(description="Size of the state pool for efficient batching."),
+    },
+    inputs={
+        "q": Tensor(
+            ["batch_size", "seq_len", "num_q_heads", "head_size"],
+            description="Query tensor for multiple tokens.",
+        ),
+        "k": Tensor(
+            ["batch_size", "seq_len", "num_k_heads", "head_size"],
+            description="Key tensor for multiple tokens.",
+        ),
+        "v": Tensor(
+            ["batch_size", "seq_len", "num_v_heads", "head_size"],
+            description="Value tensor for multiple tokens.",
+        ),
+        "initial_state": Tensor(
+            ["pool_size", "num_v_heads", "head_size", "head_size"],
+            description="Initial recurrent state pool in k-last layout [pool_size, H, V, K].",
+        ),
+        "initial_state_indices": Tensor(
+            ["batch_size"],
+            description="Indices mapping each batch to its initial state in the pool.",
+        ),
+        "A_log": Tensor(
+            ["num_v_heads"],
+            description="Log decay parameter (learnable). Used to compute g = exp(-exp(A_log) * softplus(a + dt_bias)).",
+        ),
+        "a": Tensor(
+            ["batch_size", "seq_len", "num_v_heads"],
+            description="Input-dependent decay from projection.",
+        ),
+        "dt_bias": Tensor(
+            ["num_v_heads"],
+            description="Decay bias (learnable). Added to 'a' before softplus.",
+        ),
+        "b": Tensor(
+            ["batch_size", "seq_len", "num_v_heads"],
+            description="Update gate input from projection. beta = sigmoid(b).",
+        ),
+        "scale": Scalar(
+            "float32",
+            optional=True,
+            description="Scale factor. Default is 1/sqrt(head_size).",
+        ),
+        "intermediate_states_buffer": Tensor(
+            ["pool_size", "seq_len", "num_v_heads", "head_size", "head_size"],
+            optional=True,
+            description="Optional buffer for caching intermediate states for potential rollback.",
+        ),
+    },
+    outputs={
+        "output": Tensor(
+            ["batch_size", "seq_len", "num_v_heads", "head_size"],
+            dtype="bfloat16",
+            description="Attention output for all T tokens. Shape follows num_v_heads in GVA mode.",
+        ),
+        "final_state": Tensor(
+            ["pool_size", "num_v_heads", "head_size", "head_size"],
+            dtype="float32",
+            description="Updated recurrent state pool in k-last layout [pool_size, H, V, K].",
+        ),
+    },
+    constraints=[
+        "num_v_heads >= num_q_heads",
+        "num_v_heads % num_q_heads == 0",
+        "num_k_heads == num_q_heads",
+        "seq_len > 1",
+    ],
+    tags=["stage:mtp", "status:verified"],
+    reference=_gdn_mtp_reference,
+)
diff --git a/flashinfer/trace/templates/gemm.py b/flashinfer/trace/templates/gemm.py
new file mode 100644
index 0000000000..0a0de70e3a
--- /dev/null
+++ b/flashinfer/trace/templates/gemm.py
@@ -0,0 +1,223 @@
+# Copyright (c) 2025 by FlashInfer team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""TraceTemplates for GEMM operations."""
+
+import torch
+
+from ..template import Const, Scalar, Tensor, TraceTemplate, Var
+
+
+def _mm_reference(A, B):
+    # B is physically [K, N] (column-major weight), so C = A @ B.
+    return torch.matmul(A, B)
+
+
+def _mm_fp8_reference(A, B):
+    """Dequantize FP8 block-scale inputs and compute C = A @ B.
+
+    B is in TRT-LLM block layout [K//block_size, N, block_size] and is
+    reshaped to [K, N] before the matmul.
+    """
+    K_div_bs, N, block_size = B.shape
+    B_fp32 = B.reshape(K_div_bs * block_size, N).to(torch.float32)
+    A_fp32 = A.to(torch.float32)
+    return torch.matmul(A_fp32, B_fp32).to(torch.bfloat16)
+
+
+def _mm_mxfp8_reference(A, B, a_descale, b_descale):
+    """Dequantize MXFP8 inputs (block size 32) and compute C = A @ B.
+
+    a_descale: [M, K//32] uint8 interpreted as float scale per block.
+    b_descale: [K//32, N] uint8 interpreted as float scale per block.
+    """
+    _, K = A.shape
+    block_size = 32
+    A_fp32 = A.to(torch.float32)
+    B_fp32 = B.to(torch.float32)
+    # Apply per-block scales along the K dimension.
+    a_scale = a_descale.to(torch.float32).repeat_interleave(block_size, dim=1)  # [M, K]
+    b_scale = b_descale.to(torch.float32).repeat_interleave(block_size, dim=0)  # [K, N]
+    A_scaled = A_fp32 * a_scale
+    B_scaled = B_fp32 * b_scale
+    return torch.matmul(A_scaled, B_scaled).to(torch.bfloat16)
+
+
+def _mm_fp4_reference(A, B, a_descale, b_descale, block_size=16):
+    """Dequantize FP4 inputs and compute C = A @ B.
+
+    A and B are fp4 e2m1fn values packed two-per-byte as uint8.
+    a_descale: [M, K//block_size], b_descale: [K, N//block_size].
+    The reference unpacks the nibbles and applies the block scales.
+    """
+
+    def _unpack_fp4(packed, rows, cols):
+        # Each byte holds two fp4 nibbles (low nibble = first element).
+        lo = (packed & 0x0F).to(torch.float32)
+        hi = ((packed >> 4) & 0x0F).to(torch.float32)
+        # Interleave low/high nibbles along the last dimension.
+        out = torch.stack([lo, hi], dim=-1).reshape(rows, cols)
+        return out
+
+    M, K_packed = A.shape
+    K = K_packed * 2
+    _, N_packed = B.shape
+    N = N_packed * 2
+
+    A_fp32 = _unpack_fp4(A, M, K)
+    B_fp32 = _unpack_fp4(B, K, N)
+
+    # Apply per-block scales.
+    a_scale = a_descale.to(torch.float32).repeat_interleave(block_size, dim=1)  # [M, K]
+    b_scale = b_descale.to(torch.float32).repeat_interleave(block_size, dim=1)  # [K, N]
+    A_scaled = A_fp32 * a_scale
+    B_scaled = B_fp32 * b_scale
+    return torch.matmul(A_scaled, B_scaled).to(torch.bfloat16)
+
+
+mm_bf16_trace = TraceTemplate(
+    op_type="gemm_bf16",
+    description="General matrix multiply (GEMM) C = A @ B (B is column-major [K, N]).",
+    axes={
+        "M": Var(),
+        "N": Const(),
+        "K": Const(),
+    },
+    inputs={
+        "A": Tensor(["M", "K"], param="a"),
+        "B": Tensor(
+            ["K", "N"],
+            param="b",
+            description="Weight matrix in column-major layout (physical shape [K, N]).",
+        ),
+    },
+    outputs={
+        "C": Tensor(["M", "N"], dtype_from="a"),
+    },
+    tags=["status:verified"],
+    reference=_mm_reference,
+)
+
+mm_fp8_trace = TraceTemplate(
+    op_type="gemm_fp8",
+    description=(
+        "FP8 block-scale GEMM C = A @ B (TRT-LLM layout). "
+        "A is [M, K] float8_e4m3fn; B is [K//block_size, N, block_size] float8_e4m3fn."
+    ),
+    axes={
+        "M": Var(),
+        "N": Const(),
+        "K": Const(),
+    },
+    inputs={
+        "A": Tensor(["M", "K"], param="a"),
+        "B": Tensor(
+            ["K_div_block_size", "N", "block_size"],
+            param="b",
+            description="FP8 weight in TRT-LLM block layout [K//block_size, N, block_size].",
+        ),
+    },
+    outputs={
+        "C": Tensor(["M", "N"], dtype="bfloat16"),
+    },
+    tags=["status:verified", "quantization:float8_e4m3fn"],
+    reference=_mm_fp8_reference,
+)
+
+# ── MXFP8 GEMM ───────────────────────────────────────────────────────────────
+
+mm_mxfp8_trace = TraceTemplate(
+    op_type="gemm_mxfp8",
+    description=(
+        "MXFP8 GEMM C = A @ B (MX block size 32). "
+        "A and B are float8_e4m3fn; scale tensors use block size 32."
+    ),
+    axes={
+        "M": Var(),
+        "N": Const(),
+        "K": Const(),
+    },
+    inputs={
+        "A": Tensor(
+            ["M", "K"],
+            param="a",
+            description="Input A tensor, float8_e4m3fn.",
+        ),
+        "B": Tensor(
+            ["K", "N"],
+            param="b",
+            description="Input B tensor, float8_e4m3fn, column-major.",
+        ),
+        "a_descale": Tensor(
+            ["M", "K_div_32"],
+            description="Block scale for A, shape [M, K//32], uint8.",
+        ),
+        "b_descale": Tensor(
+            ["K_div_32", "N"],
+            description="Block scale for B, shape [K//32, N], uint8.",
+        ),
+    },
+    outputs={
+        "C": Tensor(["M", "N"], dtype="bfloat16"),
+    },
+    tags=["status:verified", "quantization:mxfp8"],
+    reference=_mm_mxfp8_reference,
+)
+
+# ── FP4 GEMM ─────────────────────────────────────────────────────────────────
+
+mm_fp4_trace = TraceTemplate(
+    op_type="gemm_fp4",
+    description=(
+        "FP4 GEMM C = A @ B. "
+        "A and B are fp4 (e2m1fn_x2 packed as uint8); scale tensors use block_size."
+    ),
+    axes={
+        "M": Var(),
+        "N": Const(),
+        "K": Const(),
+        "block_size": Const(
+            description="FP4 quantization block size (16 for nvfp4, 32 for mxfp4)."
+        ),
+    },
+    inputs={
+        "A": Tensor(
+            ["M", "K"],
+            param="a",
+            description="Input A tensor, fp4 e2m1fn_x2 packed as uint8.",
+        ),
+        "B": Tensor(
+            ["K", "N"],
+            param="b",
+            description="Input B tensor, fp4 e2m1fn_x2 packed as uint8, column-major.",
+        ),
+        "a_descale": Tensor(
+            ["M", "K_div_block_size"],
+            description="Block scale for A, shape [M, K//block_size], float8_e4m3fn or uint8.",
+        ),
+        "b_descale": Tensor(
+            ["K", "N_div_block_size"],
+            description="Block scale for B, shape [K, N//block_size], float8_e4m3fn or uint8.",
+        ),
+        "block_size": Scalar(
+            "int32",
+            description="FP4 quantization block size (16 for nvfp4, 32 for mxfp4).",
+        ),
+    },
+    outputs={
+        "C": Tensor(["M", "N"], dtype="bfloat16"),
+    },
+    tags=["status:verified", "quantization:fp4"],
+    reference=_mm_fp4_reference,
+)
diff --git a/flashinfer/trace/templates/moe.py b/flashinfer/trace/templates/moe.py
new file mode 100644
index 0000000000..e93fb9f5d2
--- /dev/null
+++ b/flashinfer/trace/templates/moe.py
@@ -0,0 +1,2390 @@
+# Copyright (c) 2025 by FlashInfer team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""TraceTemplates for Mixture-of-Experts operations."""
+
+import torch
+
+from ..template import Const, Scalar, Tensor, TraceTemplate, Var
+
+# ---------------------------------------------------------------------------
+# Shared GEMM computation helper
+# ---------------------------------------------------------------------------
+
+
+@torch.no_grad()
+def _fp8_moe_run_experts(
+    hidden_states,
+    hidden_states_scale,
+    gemm1_weights,
+    gemm1_weights_scale,
+    gemm2_weights,
+    gemm2_weights_scale,
+    weights,
+    topk_idx,
+    local_expert_offset,
+    E_global,
+):
+    """FP8 block-scale dequantization + SwiGLU + GEMM for all routing types.
+
+    ``weights``   : [T, TOP_K] float32 — per-token expert weights (already normalised)
+    ``topk_idx``  : [T, TOP_K] int64   — selected global expert indices
+    """
+    T, H = hidden_states.shape
+    E_local, gemm1_out_size, _ = gemm1_weights.shape
+    I = gemm1_out_size // 2
+    BLOCK = 128
+    if gemm1_out_size != 2 * I:
+        raise ValueError(
+            f"gemm1_weights.shape[1]={gemm1_out_size} is not 2*intermediate_size; "
+            "SwiGLU requires gemm1_out_size == 2 * intermediate_size."
+        )
+    device = hidden_states.device
+
+    A_fp32 = hidden_states.to(torch.float32)
+    A_scale = hidden_states_scale.to(torch.float32)  # [H/128, T]
+    A_scale_TH = A_scale.permute(1, 0).contiguous()  # [T, H/128]
+    A_scale_expanded = (
+        A_scale_TH.unsqueeze(-1).repeat(1, 1, BLOCK).reshape(T, H).contiguous()
+    )
+    A = A_fp32 * A_scale_expanded
+
+    W13_fp32 = gemm1_weights.to(torch.float32)
+    S13 = gemm1_weights_scale.to(torch.float32)
+    S13_expanded = torch.repeat_interleave(S13, BLOCK, dim=1)
+    S13_expanded = torch.repeat_interleave(S13_expanded, BLOCK, dim=2)
+    W13 = W13_fp32 * S13_expanded
+
+    W2_fp32 = gemm2_weights.to(torch.float32)
+    S2 = gemm2_weights_scale.to(torch.float32)
+    S2_expanded = torch.repeat_interleave(S2, BLOCK, dim=1)
+    S2_expanded = torch.repeat_interleave(S2_expanded, BLOCK, dim=2)
+    W2 = W2_fp32 * S2_expanded
+
+    output = torch.zeros((T, H), dtype=torch.float32, device=device)
+    local_start = int(local_expert_offset)
+
+    for le in range(E_local):
+        ge = local_start + le
+        if ge < 0 or ge >= E_global:
+            continue
+        # tokens that selected this expert
+        sel_mask = (topk_idx == ge).any(dim=1)
+        if not sel_mask.any():
+            continue
+        token_idx = torch.nonzero(sel_mask, as_tuple=False).squeeze(1)
+        A_e = A.index_select(0, token_idx)
+        G1 = A_e.matmul(W13[le].t())
+        X1, X2 = G1[:, :I], G1[:, I:]
+        silu_X2 = X2 / (1.0 + torch.exp(-X2))
+        O = (silu_X2 * X1).matmul(W2[le].t())
+        # per-expert contribution weight for each token
+        w_tok = weights.index_select(0, token_idx)
+        # find which slot in topk_idx[token_idx] corresponds to ge
+        match = (topk_idx.index_select(0, token_idx) == ge).float()
+        w_e = (w_tok * match).sum(dim=1)
+        output.index_add_(0, token_idx, O * w_e.unsqueeze(1))
+
+    return output.to(torch.bfloat16)
+
+
+# ---------------------------------------------------------------------------
+# Per-routing-type reference implementations
+# ---------------------------------------------------------------------------
+
+
+@torch.no_grad()
+def _trtllm_fp8_block_scale_moe_ds_routing_reference(
+    routing_logits,
+    routing_bias,
+    hidden_states,
+    hidden_states_scale,
+    gemm1_weights,
+    gemm1_weights_scale,
+    gemm2_weights,
+    gemm2_weights_scale,
+    top_k,
+    n_group,
+    topk_group,
+    local_expert_offset,
+    routed_scaling_factor,
+):
+    """
+    FP8 block-scale MoE with DeepSeek-V3 routing:
+        s = sigmoid(logits)
+        s_with_bias = s + bias
+        group by n_group; per group take top-2 sum → pick topk_group groups
+        on the kept groups, take global top_k experts
+        combine with weights derived from s (without bias), normalised and
+        scaled by routed_scaling_factor
+    """
+    E_global = routing_logits.shape[1]
+    T = routing_logits.shape[0]
+    TOP_K = int(top_k)
+    N_GROUP = int(n_group)
+    TOPK_GROUP = int(topk_group)
+
+    logits = routing_logits.to(torch.float32)
+    bias = routing_bias.to(torch.float32).reshape(-1)
+
+    s = 1.0 / (1.0 + torch.exp(-logits))
+    s_with_bias = s + bias
+
+    group_size = E_global // N_GROUP
+    s_wb_grouped = s_with_bias.view(T, N_GROUP, group_size)
+    top2_vals, _ = torch.topk(s_wb_grouped, k=2, dim=2, largest=True, sorted=False)
+    group_scores = top2_vals.sum(dim=2)
+
+    _, group_idx = torch.topk(
+        group_scores, k=TOPK_GROUP, dim=1, largest=True, sorted=False
+    )
+    group_mask = torch.zeros_like(group_scores)
+    group_mask.scatter_(1, group_idx, 1.0)
+    score_mask = (
+        group_mask.unsqueeze(2).expand(T, N_GROUP, group_size).reshape(T, E_global)
+    )
+
+    neg_inf = torch.finfo(torch.float32).min
+    scores_pruned = s_with_bias.masked_fill(score_mask == 0, neg_inf)
+    _, topk_idx = torch.topk(scores_pruned, k=TOP_K, dim=1, largest=True, sorted=False)
+
+    M = torch.zeros_like(s)
+    M.scatter_(1, topk_idx, 1.0)
+    raw_w = s * M
+    weights_sum = raw_w.sum(dim=1, keepdim=True) + 1e-20
+    weights = (raw_w / weights_sum) * routed_scaling_factor
+
+    # Gather per-row weights into [T, TOP_K] for the shared GEMM helper
+    w_topk = weights.gather(1, topk_idx)
+
+    return _fp8_moe_run_experts(
+        hidden_states,
+        hidden_states_scale,
+        gemm1_weights,
+        gemm1_weights_scale,
+        gemm2_weights,
+        gemm2_weights_scale,
+        w_topk,
+        topk_idx,
+        local_expert_offset,
+        E_global,
+    )
+
+
+@torch.no_grad()
+def _trtllm_fp8_block_scale_moe_default_routing_reference(
+    routing_logits,
+    routing_bias,
+    hidden_states,
+    hidden_states_scale,
+    gemm1_weights,
+    gemm1_weights_scale,
+    gemm2_weights,
+    gemm2_weights_scale,
+    top_k,
+    local_expert_offset,
+    routed_scaling_factor,
+):
+    """
+    FP8 block-scale MoE with Default routing: Softmax → TopK.
+    routing_bias is added to logits before softmax when provided.
+    """
+    TOP_K = int(top_k)
+    E_global = routing_logits.shape[1]
+    logits = routing_logits.to(torch.float32)
+    if routing_bias is not None:
+        logits = logits + routing_bias.to(torch.float32).reshape(-1)
+    s = torch.softmax(logits, dim=-1)
+    _, topk_idx = torch.topk(s, k=TOP_K, dim=1, largest=True, sorted=False)
+    weights = s.gather(1, topk_idx) * routed_scaling_factor
+    return _fp8_moe_run_experts(
+        hidden_states,
+        hidden_states_scale,
+        gemm1_weights,
+        gemm1_weights_scale,
+        gemm2_weights,
+        gemm2_weights_scale,
+        weights,
+        topk_idx,
+        local_expert_offset,
+        E_global,
+    )
+
+
+@torch.no_grad()
+def _trtllm_fp8_block_scale_moe_renormalize_routing_reference(
+    routing_logits,
+    routing_bias,
+    hidden_states,
+    hidden_states_scale,
+    gemm1_weights,
+    gemm1_weights_scale,
+    gemm2_weights,
+    gemm2_weights_scale,
+    top_k,
+    local_expert_offset,
+    routed_scaling_factor,
+):
+    """
+    FP8 block-scale MoE with Renormalize routing: TopK → Softmax.
+    TopK is applied on raw logits; weights are then derived by softmax
+    over the selected logits.
+    """
+    TOP_K = int(top_k)
+    E_global = routing_logits.shape[1]
+    logits = routing_logits.to(torch.float32)
+    if routing_bias is not None:
+        logits = logits + routing_bias.to(torch.float32).reshape(-1)
+    _, topk_idx = torch.topk(logits, k=TOP_K, dim=1, largest=True, sorted=False)
+    gathered = logits.gather(1, topk_idx)
+    weights = torch.softmax(gathered, dim=-1) * routed_scaling_factor
+    return _fp8_moe_run_experts(
+        hidden_states,
+        hidden_states_scale,
+        gemm1_weights,
+        gemm1_weights_scale,
+        gemm2_weights,
+        gemm2_weights_scale,
+        weights,
+        topk_idx,
+        local_expert_offset,
+        E_global,
+    )
+
+
+@torch.no_grad()
+def _trtllm_fp8_block_scale_moe_llama4_routing_reference(
+    routing_logits,
+    routing_bias,
+    hidden_states,
+    hidden_states_scale,
+    gemm1_weights,
+    gemm1_weights_scale,
+    gemm2_weights,
+    gemm2_weights_scale,
+    top_k,
+    local_expert_offset,
+    routed_scaling_factor,
+):
+    """
+    FP8 block-scale MoE with Llama4 routing: Top1 → Sigmoid.
+    Single expert selected per token; weight derived from sigmoid of its logit.
+    By definition Llama4 routing uses top_k=1; the parameter is accepted for
+    schema consistency with the other routing methods.
+    """
+    E_global = routing_logits.shape[1]
+    logits = routing_logits.to(torch.float32)
+    if routing_bias is not None:
+        logits = logits + routing_bias.to(torch.float32).reshape(-1)
+    topk_idx = logits.argmax(dim=-1, keepdim=True)  # [T, 1]
+    top1_logit = logits.gather(1, topk_idx)
+    weights = (1.0 / (1.0 + torch.exp(-top1_logit))) * routed_scaling_factor
+    return _fp8_moe_run_experts(
+        hidden_states,
+        hidden_states_scale,
+        gemm1_weights,
+        gemm1_weights_scale,
+        gemm2_weights,
+        gemm2_weights_scale,
+        weights,
+        topk_idx,
+        local_expert_offset,
+        E_global,
+    )
+
+
+@torch.no_grad()
+def _trtllm_fp8_block_scale_moe_renormalize_naive_routing_reference(
+    routing_logits,
+    routing_bias,
+    hidden_states,
+    hidden_states_scale,
+    gemm1_weights,
+    gemm1_weights_scale,
+    gemm2_weights,
+    gemm2_weights_scale,
+    top_k,
+    local_expert_offset,
+    routed_scaling_factor,
+):
+    """
+    FP8 block-scale MoE with RenormalizeNaive routing: Softmax → TopK → Renormalize.
+    Same as Default but the selected weights are re-normalised to sum to 1.
+    """
+    TOP_K = int(top_k)
+    E_global = routing_logits.shape[1]
+    logits = routing_logits.to(torch.float32)
+    if routing_bias is not None:
+        logits = logits + routing_bias.to(torch.float32).reshape(-1)
+    s = torch.softmax(logits, dim=-1)
+    _, topk_idx = torch.topk(s, k=TOP_K, dim=1, largest=True, sorted=False)
+    gathered = s.gather(1, topk_idx)
+    weights = gathered / (gathered.sum(dim=1, keepdim=True) + 1e-20)
+    weights = weights * routed_scaling_factor
+    return _fp8_moe_run_experts(
+        hidden_states,
+        hidden_states_scale,
+        gemm1_weights,
+        gemm1_weights_scale,
+        gemm2_weights,
+        gemm2_weights_scale,
+        weights,
+        topk_idx,
+        local_expert_offset,
+        E_global,
+    )
+
+
+@torch.no_grad()
+def _trtllm_fp8_block_scale_moe_topk_routing_reference(
+    routing_logits,
+    routing_bias,
+    hidden_states,
+    hidden_states_scale,
+    gemm1_weights,
+    gemm1_weights_scale,
+    gemm2_weights,
+    gemm2_weights_scale,
+    top_k,
+    local_expert_offset,
+    routed_scaling_factor,
+):
+    """
+    FP8 block-scale MoE with TopK-only routing: TopK, uniform weights.
+    No softmax or sigmoid; all selected experts receive equal weight.
+    """
+    TOP_K = int(top_k)
+    E_global = routing_logits.shape[1]
+    logits = routing_logits.to(torch.float32)
+    if routing_bias is not None:
+        logits = logits + routing_bias.to(torch.float32).reshape(-1)
+    _, topk_idx = torch.topk(logits, k=TOP_K, dim=1, largest=True, sorted=False)
+    T = logits.shape[0]
+    weights = torch.full(
+        (T, TOP_K),
+        routed_scaling_factor / TOP_K,
+        dtype=torch.float32,
+        device=logits.device,
+    )
+    return _fp8_moe_run_experts(
+        hidden_states,
+        hidden_states_scale,
+        gemm1_weights,
+        gemm1_weights_scale,
+        gemm2_weights,
+        gemm2_weights_scale,
+        weights,
+        topk_idx,
+        local_expert_offset,
+        E_global,
+    )
+
+
+# ---------------------------------------------------------------------------
+# Template factory: shared axes/inputs/outputs for all routing types
+# ---------------------------------------------------------------------------
+
+_STANDARD_AXES = {
+    "seq_len": Var(description="Sequence length (number of tokens)"),
+    "num_experts": Const(description="Total number of experts.", abbrev=""),
+    "top_k": Const(
+        description="Number of experts to route to per token.", abbrev="topk"
+    ),
+    "num_local_experts": Const(description="Number of local experts.", abbrev="e"),
+    "hidden_size": Const(description="Hidden dimension size.", abbrev="h"),
+    "intermediate_size": Const(description="MoE intermediate layer size.", abbrev="i"),
+    "gemm1_out_size": Const(
+        description="Output size of the first GEMM (W13). Should be 2 * intermediate_size.",
+        abbrev="",
+    ),
+    "num_hidden_blocks": Const(
+        description="Number of quantized blocks along the hidden_size dimension (block_size=128).",
+        abbrev="",
+    ),
+    "num_intermediate_blocks": Const(
+        description="Number of quantized blocks along the intermediate_size dimension (block_size=128).",
+        abbrev="",
+    ),
+    "num_gemm1_out_blocks": Const(
+        description="Number of quantized blocks along the gemm1_out_size dimension (block_size=128).",
+        abbrev="",
+    ),
+}
+
+_STANDARD_INPUTS = {
+    "routing_logits": Tensor(
+        ["seq_len", "num_experts"],
+        description="Routing logits for expert selection.",
+    ),
+    "routing_bias": Tensor(
+        ["num_experts"],
+        description="Bias added to logits before routing. Pass None for no bias.",
+        optional=True,
+    ),
+    "hidden_states": Tensor(
+        ["seq_len", "hidden_size"],
+        description="Input hidden states tensor (FP8 quantized).",
+    ),
+    "hidden_states_scale": Tensor(
+        ["num_hidden_blocks", "seq_len"],
+        description="Block-wise scaling factors for hidden states.",
+    ),
+    "gemm1_weights": Tensor(
+        ["num_local_experts", "gemm1_out_size", "hidden_size"],
+        description="First GEMM weights for all local experts (gate and up projections).",
+    ),
+    "gemm1_weights_scale": Tensor(
+        ["num_local_experts", "num_gemm1_out_blocks", "num_hidden_blocks"],
+        description="Block-wise scaling factors for first GEMM weights.",
+    ),
+    "gemm2_weights": Tensor(
+        ["num_local_experts", "hidden_size", "intermediate_size"],
+        description="Second GEMM weights for all local experts (down projection).",
+    ),
+    "gemm2_weights_scale": Tensor(
+        ["num_local_experts", "num_hidden_blocks", "num_intermediate_blocks"],
+        description="Block-wise scaling factors for second GEMM weights.",
+    ),
+    "top_k": Scalar(
+        "int32",
+        description="Number of experts to route to per token.",
+    ),
+    "local_expert_offset": Scalar(
+        "int32",
+        description="Offset of local experts in global expert space.",
+    ),
+    "routed_scaling_factor": Scalar(
+        "float32",
+        description="Scaling factor applied to routing weights.",
+    ),
+}
+
+_STANDARD_OUTPUTS = {
+    "output": Tensor(
+        ["seq_len", "hidden_size"],
+        dtype="bfloat16",
+        description="Final MoE output tensor.",
+    ),
+}
+
+_STANDARD_TAGS = ["status:verified", "quantization:float8_e4m3fn"]
+
+
+def _make_standard_moe_trace(name_prefix, description, reference):
+    """Factory for standard (non-DS) routing templates (same inputs/axes)."""
+    return TraceTemplate(
+        op_type="moe",
+        name_prefix=name_prefix,
+        description=description,
+        axes=dict(_STANDARD_AXES),
+        inputs=dict(_STANDARD_INPUTS),
+        outputs=dict(_STANDARD_OUTPUTS),
+        tags=_STANDARD_TAGS,
+        reference=reference,
+    )
+
+
+# ---------------------------------------------------------------------------
+# Template instances — one per RoutingMethodType value
+# ---------------------------------------------------------------------------
+
+# RoutingMethodType.DeepSeekV3 = 2
+# Uses additional n_group / topk_group axes and requires routing_bias.
+trtllm_fp8_block_scale_moe_ds_routing_trace = TraceTemplate(
+    op_type="moe",
+    name_prefix="moe_fp8_block_scale_ds_routing",
+    description="FP8 block scale MoE with DeepSeek-V3 routing. Includes grouped sigmoid routing and two grouped-GEMM.",
+    axes={
+        "seq_len": Var(description="Sequence length (number of tokens)"),
+        "num_experts": Const(description="Total number of experts.", abbrev=""),
+        "top_k": Const(
+            description="Number of experts to route to per token.", abbrev="topk"
+        ),
+        "n_group": Const(
+            description="Number of expert groups for group routing.", abbrev="ng"
+        ),
+        "topk_group": Const(
+            description="Number of groups to select for top-k routing.", abbrev="kg"
+        ),
+        "num_local_experts": Const(description="Number of local experts.", abbrev="e"),
+        "hidden_size": Const(description="Hidden dimension size.", abbrev="h"),
+        "intermediate_size": Const(
+            description="MoE intermediate layer size.", abbrev="i"
+        ),
+        "gemm1_out_size": Const(
+            description="Output size of the first GEMM (W13). Should be 2 * intermediate_size.",
+            abbrev="",
+        ),
+        "num_hidden_blocks": Const(
+            description="Number of quantized blocks along the hidden_size dimension (block_size=128).",
+            abbrev="",
+        ),
+        "num_intermediate_blocks": Const(
+            description="Number of quantized blocks along the intermediate_size dimension (block_size=128).",
+            abbrev="",
+        ),
+        "num_gemm1_out_blocks": Const(
+            description="Number of quantized blocks along the gemm1_out_size dimension (block_size=128).",
+            abbrev="",
+        ),
+    },
+    inputs={
+        "routing_logits": Tensor(
+            ["seq_len", "num_experts"],
+            description="Routing logits for expert selection.",
+        ),
+        "routing_bias": Tensor(
+            ["num_experts"],
+            description="Bias tensor for routing. Pass all zeros for no bias.",
+        ),
+        "hidden_states": Tensor(
+            ["seq_len", "hidden_size"],
+            description="Input hidden states tensor (FP8 quantized).",
+        ),
+        "hidden_states_scale": Tensor(
+            ["num_hidden_blocks", "seq_len"],
+            description="Block-wise scaling factors for hidden states.",
+        ),
+        "gemm1_weights": Tensor(
+            ["num_local_experts", "gemm1_out_size", "hidden_size"],
+            description="First GEMM weights for all local experts (gate and up projections).",
+        ),
+        "gemm1_weights_scale": Tensor(
+            ["num_local_experts", "num_gemm1_out_blocks", "num_hidden_blocks"],
+            description="Block-wise scaling factors for first GEMM weights.",
+        ),
+        "gemm2_weights": Tensor(
+            ["num_local_experts", "hidden_size", "intermediate_size"],
+            description="Second GEMM weights for all local experts (down projection).",
+        ),
+        "gemm2_weights_scale": Tensor(
+            ["num_local_experts", "num_hidden_blocks", "num_intermediate_blocks"],
+            description="Block-wise scaling factors for second GEMM weights.",
+        ),
+        "top_k": Scalar(
+            "int32",
+            description="Number of experts to route to per token (DeepSeek-V3 uses 8).",
+        ),
+        "n_group": Scalar(
+            "int32",
+            description="Number of expert groups (DeepSeek-V3 uses 8).",
+        ),
+        "topk_group": Scalar(
+            "int32",
+            description="Number of groups to keep after group-level top-k (DeepSeek-V3 uses 4).",
+        ),
+        "local_expert_offset": Scalar(
+            "int32",
+            description="Offset of local experts in global expert space.",
+        ),
+        "routed_scaling_factor": Scalar(
+            "float32",
+            description="Scaling factor for routing weights.",
+        ),
+    },
+    outputs={
+        "output": Tensor(
+            ["seq_len", "hidden_size"],
+            dtype="bfloat16",
+            description="Final MoE output tensor.",
+        ),
+    },
+    tags=["status:verified", "quantization:float8_e4m3fn"],
+    reference=_trtllm_fp8_block_scale_moe_ds_routing_reference,
+)
+
+# Backward-compatible alias (the original name used in fused_moe/core.py import).
+trtllm_fp8_block_scale_moe_trace = trtllm_fp8_block_scale_moe_ds_routing_trace
+
+# RoutingMethodType.Default = 0 — Softmax → TopK
+trtllm_fp8_block_scale_moe_default_routing_trace = _make_standard_moe_trace(
+    name_prefix="moe_fp8_block_scale_default_routing",
+    description="FP8 block scale MoE with Default routing (Softmax → TopK).",
+    reference=_trtllm_fp8_block_scale_moe_default_routing_reference,
+)
+
+# RoutingMethodType.Renormalize = 1 — TopK → Softmax
+trtllm_fp8_block_scale_moe_renormalize_routing_trace = _make_standard_moe_trace(
+    name_prefix="moe_fp8_block_scale_renormalize_routing",
+    description="FP8 block scale MoE with Renormalize routing (TopK → Softmax).",
+    reference=_trtllm_fp8_block_scale_moe_renormalize_routing_reference,
+)
+
+# RoutingMethodType.Llama4 = 3 — Top1 → Sigmoid
+trtllm_fp8_block_scale_moe_llama4_routing_trace = _make_standard_moe_trace(
+    name_prefix="moe_fp8_block_scale_llama4_routing",
+    description="FP8 block scale MoE with Llama4 routing (Top1 → Sigmoid).",
+    reference=_trtllm_fp8_block_scale_moe_llama4_routing_reference,
+)
+
+# RoutingMethodType.RenormalizeNaive = 4 — Softmax → TopK → Renormalize
+trtllm_fp8_block_scale_moe_renormalize_naive_routing_trace = _make_standard_moe_trace(
+    name_prefix="moe_fp8_block_scale_renormalize_naive_routing",
+    description="FP8 block scale MoE with RenormalizeNaive routing (Softmax → TopK → Renormalize).",
+    reference=_trtllm_fp8_block_scale_moe_renormalize_naive_routing_reference,
+)
+
+# RoutingMethodType.TopK = 5 — TopK only (no softmax), uniform weights
+trtllm_fp8_block_scale_moe_topk_routing_trace = _make_standard_moe_trace(
+    name_prefix="moe_fp8_block_scale_topk_routing",
+    description="FP8 block scale MoE with TopK-only routing (no softmax, uniform weights).",
+    reference=_trtllm_fp8_block_scale_moe_topk_routing_reference,
+)
+
+# ---------------------------------------------------------------------------
+# Dispatch function — maps routing_method_type → TraceTemplate
+# ---------------------------------------------------------------------------
+
+_MOE_TRACE_BY_ROUTING_TYPE = {
+    0: trtllm_fp8_block_scale_moe_default_routing_trace,  # Default
+    1: trtllm_fp8_block_scale_moe_renormalize_routing_trace,  # Renormalize
+    2: trtllm_fp8_block_scale_moe_ds_routing_trace,  # DeepSeekV3
+    3: trtllm_fp8_block_scale_moe_llama4_routing_trace,  # Llama4
+    4: trtllm_fp8_block_scale_moe_renormalize_naive_routing_trace,  # RenormalizeNaive
+    5: trtllm_fp8_block_scale_moe_topk_routing_trace,  # TopK
+    # 6 = Unspecified: no trace
+}
+
+
+def trtllm_fp8_block_scale_moe_trace_dispatch(**kwargs):
+    """Return the appropriate TraceTemplate for the given ``routing_method_type``.
+
+    Pass this as ``trace=trtllm_fp8_block_scale_moe_trace_dispatch`` to
+    ``@flashinfer_api`` so the correct template is selected at call time::
+
+        @flashinfer_api(trace=trtllm_fp8_block_scale_moe_trace_dispatch)
+        def trtllm_fp8_block_scale_moe(..., routing_method_type: int = 0, ...):
+            ...
+
+    Returns ``None`` for ``RoutingMethodType.Unspecified`` (6), which
+    suppresses trace generation.
+    """
+    routing_method_type = int(kwargs.get("routing_method_type", 0))
+    return _MOE_TRACE_BY_ROUTING_TYPE.get(routing_method_type)
+
+
+# Expose all possible templates so _attach_fi_trace can auto-register them
+# in _TRACE_REGISTRY for consistency testing.
+trtllm_fp8_block_scale_moe_trace_dispatch.templates = list(  # type: ignore[attr-defined]
+    _MOE_TRACE_BY_ROUTING_TYPE.values()
+)
+
+
+# ---------------------------------------------------------------------------
+# FP4 block-scale MoE (trtllm_fp4_block_scale_moe)
+# ---------------------------------------------------------------------------
+# NvFP4: block_size=16, weights packed as uint8 (2 fp4 per byte).
+#   hidden_states       : [seq_len, hidden_size // 2]   uint8
+#   hidden_states_scale : [seq_len, hidden_size // 16]  float8  (optional for bf16 input)
+#   gemm1_weights       : [E_loc, 2*I, hidden_size // 2]         uint8
+#   gemm1_weights_scale : [E_loc, 2*I, hidden_size // 16]        float8
+#   gemm2_weights       : [E_loc, hidden_size, I // 2]            uint8
+#   gemm2_weights_scale : [E_loc, hidden_size, I // 16]           float8
+# ---------------------------------------------------------------------------
+
+
+# FP4 e2m1fn magnitudes. The 4-bit code is {sign(1), exponent(2), mantissa(1)};
+# this table maps the 16 possible nibble values to the corresponding float32
+# magnitude so dequantization is a single gather.
+_E2M1_LUT_VALUES = [
+    0.0,
+    0.5,
+    1.0,
+    1.5,
+    2.0,
+    3.0,
+    4.0,
+    6.0,
+    -0.0,
+    -0.5,
+    -1.0,
+    -1.5,
+    -2.0,
+    -3.0,
+    -4.0,
+    -6.0,
+]
+
+
+@torch.no_grad()
+def _unpack_fp4_e2m1(packed: torch.Tensor) -> torch.Tensor:
+    """Unpack a uint8 tensor of packed e2m1fn FP4 values into float32.
+
+    Each byte stores two 4-bit values (low nibble = first element along the
+    last axis). The returned tensor has twice the last-dim size of *packed*.
+    """
+    lut = torch.tensor(_E2M1_LUT_VALUES, dtype=torch.float32, device=packed.device)
+    p = packed.view(torch.uint8).to(torch.int64)
+    lo = lut[p & 0x0F]
+    hi = lut[(p >> 4) & 0x0F]
+    stacked = torch.stack([lo, hi], dim=-1)  # pairs along a new last axis
+    return stacked.reshape(*packed.shape[:-1], packed.shape[-1] * 2)
+
+
+@torch.no_grad()
+def _ue8m0_to_float32(scales: torch.Tensor) -> torch.Tensor:
+    """Decode UE8M0 (uint8, unsigned exponent-only) scales to float32."""
+    e = scales.view(torch.uint8).to(torch.int64)
+    return torch.pow(torch.tensor(2.0, device=scales.device), (e - 127).float())
+
+
+@torch.no_grad()
+def _decode_block_scales(scales: torch.Tensor, is_ue8m0: bool) -> torch.Tensor:
+    """Decode block scales: UE8M0 for MX formats, float8_e4m3fn otherwise."""
+    if is_ue8m0:
+        return _ue8m0_to_float32(scales)
+    # fp8_e4m3fn (or already float): plain cast.
+    return scales.to(torch.float32)
+
+
+@torch.no_grad()
+def _dequantize_fp4_tensor(
+    packed: torch.Tensor,
+    scales: torch.Tensor,
+    is_ue8m0_scales: bool,
+) -> torch.Tensor:
+    """Unpack an FP4 tensor and apply its per-block scales along the last dim.
+
+    The packed tensor has half the logical last-dim size of the output; the
+    scale tensor has last-dim size = (output last dim) / block_size.
+    block_size is inferred from the shape ratio.
+    """
+    unpacked = _unpack_fp4_e2m1(packed)  # float32, last dim = packed.last * 2
+    block_size = unpacked.shape[-1] // scales.shape[-1]
+    decoded_scales = _decode_block_scales(scales, is_ue8m0_scales)
+    expanded = decoded_scales.repeat_interleave(block_size, dim=-1)
+    return unpacked * expanded
+
+
+@torch.no_grad()
+def _dequantize_fp4_hidden_states(
+    hidden_states: torch.Tensor,
+    hidden_states_scale,
+    is_weights_mxfp4: bool,
+) -> torch.Tensor:
+    """Dequantize hidden_states to float32.
+
+    Three cases by dtype:
+      * bfloat16 — pass-through (no scale).
+      * float8_e4m3fn — MXFP8 activation with UE8M0 per-32 scales.
+      * uint8 — NvFP4/MXFP4 packed activation with per-block scales (fp8_e4m3fn
+        for NvFP4, UE8M0 for MXFP4; here both are treated as fp8_e4m3fn since
+        the runtime FP4 path uses fp8_e4m3fn scales for activations).
+    """
+    if hidden_states.dtype == torch.bfloat16:
+        return hidden_states.to(torch.float32)
+    if hidden_states.dtype == torch.float8_e4m3fn:
+        # MXFP8 hidden states: UE8M0 scales, block size 32.
+        scales = _ue8m0_to_float32(hidden_states_scale)
+        block_size = hidden_states.shape[-1] // scales.shape[-1]
+        expanded = scales.repeat_interleave(block_size, dim=-1)
+        return hidden_states.to(torch.float32) * expanded
+    # uint8-packed FP4. For NvFP4 activation + NvFP4 weights the scales are
+    # fp8_e4m3fn; for MXFP4 weights (and bf16-packed-as-fp4 corner cases) they
+    # are UE8M0. Use the weight mode as the tiebreaker since activation scale
+    # format tracks weight format in the trtllm-gen kernel.
+    return _dequantize_fp4_tensor(
+        hidden_states, hidden_states_scale, is_ue8m0_scales=is_weights_mxfp4
+    )
+
+
+@torch.no_grad()
+def _fp4_moe_run_experts(
+    hidden_states,
+    hidden_states_scale,
+    gemm1_weights,
+    gemm1_weights_scale,
+    gemm2_weights,
+    gemm2_weights_scale,
+    gemm1_bias,
+    gemm2_bias,
+    weights,
+    topk_idx,
+    local_expert_offset,
+    E_global,
+):
+    """FP4 dequantize + SwiGLU + GEMM for all routing types.
+
+    ``weights``   : [T, TOP_K] float32 — per-token expert weights (normalised)
+    ``topk_idx``  : [T, TOP_K] int64   — selected global expert indices
+
+    Detects MXFP4 vs NvFP4 weight format from whether gemm1_weights_scale is
+    fp8_e4m3fn (NvFP4) or uint8 (UE8M0, MXFP4). Block size is inferred from
+    the ratio of unpacked K to scale K.
+    """
+    is_mxfp4 = gemm1_weights_scale.dtype == torch.uint8
+    device = gemm1_weights.device
+
+    # Dequantize both expert-weight tensors in one shot.
+    W1 = _dequantize_fp4_tensor(
+        gemm1_weights, gemm1_weights_scale, is_ue8m0_scales=is_mxfp4
+    )  # [E_local, 2*I, H]
+    W2 = _dequantize_fp4_tensor(
+        gemm2_weights, gemm2_weights_scale, is_ue8m0_scales=is_mxfp4
+    )  # [E_local, H, I]
+
+    E_local, gemm1_out_size, H = W1.shape
+    I = gemm1_out_size // 2
+    if gemm1_out_size != 2 * I:
+        raise ValueError(
+            f"gemm1 output size {gemm1_out_size} is not 2*intermediate_size; "
+            "FP4 MoE requires SwiGLU (gate + up)."
+        )
+
+    A = _dequantize_fp4_hidden_states(hidden_states, hidden_states_scale, is_mxfp4)
+    T = A.shape[0]
+    output = torch.zeros((T, H), dtype=torch.float32, device=device)
+    local_start = int(local_expert_offset)
+
+    for le in range(E_local):
+        ge = local_start + le
+        if ge < 0 or ge >= E_global:
+            continue
+        sel_mask = (topk_idx == ge).any(dim=1)
+        if not sel_mask.any():
+            continue
+        token_idx = torch.nonzero(sel_mask, as_tuple=False).squeeze(1)
+        A_e = A.index_select(0, token_idx)  # [N, H]
+        G1 = A_e.matmul(W1[le].t())  # [N, 2*I]
+        if gemm1_bias is not None:
+            G1 = G1 + gemm1_bias[le].to(torch.float32)
+        # SwiGLU uses the trtllm-gen convention: silu(X2) * X1 with X1 first.
+        X1, X2 = G1[:, :I], G1[:, I:]
+        silu_X2 = X2 / (1.0 + torch.exp(-X2))
+        activated = silu_X2 * X1
+        O = activated.matmul(W2[le].t())  # [N, H]
+        if gemm2_bias is not None:
+            O = O + gemm2_bias[le].to(torch.float32)
+        # Fold per-token expert weight.
+        w_tok = weights.index_select(0, token_idx)
+        match = (topk_idx.index_select(0, token_idx) == ge).float()
+        w_e = (w_tok * match).sum(dim=1)
+        output.index_add_(0, token_idx, O * w_e.unsqueeze(1))
+
+    return output.to(torch.bfloat16)
+
+
+@torch.no_grad()
+def _trtllm_fp4_block_scale_moe_default_routing_reference(
+    routing_logits,
+    routing_bias,
+    hidden_states,
+    hidden_states_scale,
+    gemm1_weights,
+    gemm1_weights_scale,
+    gemm1_bias,
+    gemm2_weights,
+    gemm2_weights_scale,
+    gemm2_bias,
+    top_k,
+    local_expert_offset,
+    routed_scaling_factor,
+):
+    """FP4 MoE with Default routing (Softmax → TopK)."""
+    TOP_K = int(top_k)
+    E_global = routing_logits.shape[1]
+    logits = routing_logits.to(torch.float32)
+    if routing_bias is not None:
+        logits = logits + routing_bias.to(torch.float32).reshape(-1)
+    s = torch.softmax(logits, dim=-1)
+    _, topk_idx = torch.topk(s, k=TOP_K, dim=1, largest=True, sorted=False)
+    scale = 1.0 if routed_scaling_factor is None else float(routed_scaling_factor)
+    w_topk = s.gather(1, topk_idx) * scale
+    return _fp4_moe_run_experts(
+        hidden_states,
+        hidden_states_scale,
+        gemm1_weights,
+        gemm1_weights_scale,
+        gemm2_weights,
+        gemm2_weights_scale,
+        gemm1_bias,
+        gemm2_bias,
+        w_topk,
+        topk_idx,
+        local_expert_offset,
+        E_global,
+    )
+
+
+@torch.no_grad()
+def _trtllm_fp4_block_scale_moe_renormalize_routing_reference(
+    routing_logits,
+    routing_bias,
+    hidden_states,
+    hidden_states_scale,
+    gemm1_weights,
+    gemm1_weights_scale,
+    gemm1_bias,
+    gemm2_weights,
+    gemm2_weights_scale,
+    gemm2_bias,
+    top_k,
+    local_expert_offset,
+    routed_scaling_factor,
+):
+    """FP4 MoE with Renormalize routing (TopK on logits → Softmax)."""
+    TOP_K = int(top_k)
+    E_global = routing_logits.shape[1]
+    logits = routing_logits.to(torch.float32)
+    if routing_bias is not None:
+        logits = logits + routing_bias.to(torch.float32).reshape(-1)
+    _, topk_idx = torch.topk(logits, k=TOP_K, dim=1, largest=True, sorted=False)
+    gathered = logits.gather(1, topk_idx)
+    scale = 1.0 if routed_scaling_factor is None else float(routed_scaling_factor)
+    w_topk = torch.softmax(gathered, dim=-1) * scale
+    return _fp4_moe_run_experts(
+        hidden_states,
+        hidden_states_scale,
+        gemm1_weights,
+        gemm1_weights_scale,
+        gemm2_weights,
+        gemm2_weights_scale,
+        gemm1_bias,
+        gemm2_bias,
+        w_topk,
+        topk_idx,
+        local_expert_offset,
+        E_global,
+    )
+
+
+@torch.no_grad()
+def _trtllm_fp4_block_scale_moe_ds_routing_reference(
+    routing_logits,
+    routing_bias,
+    hidden_states,
+    hidden_states_scale,
+    gemm1_weights,
+    gemm1_weights_scale,
+    gemm1_bias,
+    gemm2_weights,
+    gemm2_weights_scale,
+    gemm2_bias,
+    top_k,
+    n_group,
+    topk_group,
+    local_expert_offset,
+    routed_scaling_factor,
+):
+    """FP4 MoE with DeepSeek-V3 routing: sigmoid + groups + top_k."""
+    TOP_K = int(top_k)
+    N_GROUP = int(n_group)
+    TOPK_GROUP = int(topk_group)
+    E_global = routing_logits.shape[1]
+    T = routing_logits.shape[0]
+
+    logits = routing_logits.to(torch.float32)
+    bias = routing_bias.to(torch.float32).reshape(-1)
+    s = 1.0 / (1.0 + torch.exp(-logits))
+    s_with_bias = s + bias
+
+    group_size = E_global // N_GROUP
+    s_wb_grouped = s_with_bias.view(T, N_GROUP, group_size)
+    top2_vals, _ = torch.topk(s_wb_grouped, k=2, dim=2, largest=True, sorted=False)
+    group_scores = top2_vals.sum(dim=2)
+
+    _, group_idx = torch.topk(
+        group_scores, k=TOPK_GROUP, dim=1, largest=True, sorted=False
+    )
+    group_mask = torch.zeros_like(group_scores)
+    group_mask.scatter_(1, group_idx, 1.0)
+    score_mask = (
+        group_mask.unsqueeze(2).expand(T, N_GROUP, group_size).reshape(T, E_global)
+    )
+
+    neg_inf = torch.finfo(torch.float32).min
+    scores_pruned = s_with_bias.masked_fill(score_mask == 0, neg_inf)
+    _, topk_idx = torch.topk(scores_pruned, k=TOP_K, dim=1, largest=True, sorted=False)
+
+    M = torch.zeros_like(s)
+    M.scatter_(1, topk_idx, 1.0)
+    raw_w = s * M
+    weights_sum = raw_w.sum(dim=1, keepdim=True) + 1e-20
+    scale = 1.0 if routed_scaling_factor is None else float(routed_scaling_factor)
+    full_weights = (raw_w / weights_sum) * scale
+    w_topk = full_weights.gather(1, topk_idx)
+
+    return _fp4_moe_run_experts(
+        hidden_states,
+        hidden_states_scale,
+        gemm1_weights,
+        gemm1_weights_scale,
+        gemm2_weights,
+        gemm2_weights_scale,
+        gemm1_bias,
+        gemm2_bias,
+        w_topk,
+        topk_idx,
+        local_expert_offset,
+        E_global,
+    )
+
+
+@torch.no_grad()
+def _trtllm_fp4_block_scale_moe_llama4_routing_reference(
+    routing_logits,
+    routing_bias,
+    hidden_states,
+    hidden_states_scale,
+    gemm1_weights,
+    gemm1_weights_scale,
+    gemm1_bias,
+    gemm2_weights,
+    gemm2_weights_scale,
+    gemm2_bias,
+    top_k,
+    local_expert_offset,
+    routed_scaling_factor,
+):
+    """FP4 MoE with Llama4 routing (Top1 → Sigmoid). top_k is fixed at 1."""
+    E_global = routing_logits.shape[1]
+    logits = routing_logits.to(torch.float32)
+    if routing_bias is not None:
+        logits = logits + routing_bias.to(torch.float32).reshape(-1)
+    topk_idx = logits.argmax(dim=-1, keepdim=True)
+    top1_logit = logits.gather(1, topk_idx)
+    scale = 1.0 if routed_scaling_factor is None else float(routed_scaling_factor)
+    w_topk = (1.0 / (1.0 + torch.exp(-top1_logit))) * scale
+    return _fp4_moe_run_experts(
+        hidden_states,
+        hidden_states_scale,
+        gemm1_weights,
+        gemm1_weights_scale,
+        gemm2_weights,
+        gemm2_weights_scale,
+        gemm1_bias,
+        gemm2_bias,
+        w_topk,
+        topk_idx,
+        local_expert_offset,
+        E_global,
+    )
+
+
+@torch.no_grad()
+def _trtllm_fp4_block_scale_moe_renormalize_naive_routing_reference(
+    routing_logits,
+    routing_bias,
+    hidden_states,
+    hidden_states_scale,
+    gemm1_weights,
+    gemm1_weights_scale,
+    gemm1_bias,
+    gemm2_weights,
+    gemm2_weights_scale,
+    gemm2_bias,
+    top_k,
+    local_expert_offset,
+    routed_scaling_factor,
+):
+    """FP4 MoE with RenormalizeNaive routing (Softmax → TopK → sum-to-1)."""
+    TOP_K = int(top_k)
+    E_global = routing_logits.shape[1]
+    logits = routing_logits.to(torch.float32)
+    if routing_bias is not None:
+        logits = logits + routing_bias.to(torch.float32).reshape(-1)
+    s = torch.softmax(logits, dim=-1)
+    _, topk_idx = torch.topk(s, k=TOP_K, dim=1, largest=True, sorted=False)
+    gathered = s.gather(1, topk_idx)
+    w_topk = gathered / (gathered.sum(dim=1, keepdim=True) + 1e-20)
+    scale = 1.0 if routed_scaling_factor is None else float(routed_scaling_factor)
+    w_topk = w_topk * scale
+    return _fp4_moe_run_experts(
+        hidden_states,
+        hidden_states_scale,
+        gemm1_weights,
+        gemm1_weights_scale,
+        gemm2_weights,
+        gemm2_weights_scale,
+        gemm1_bias,
+        gemm2_bias,
+        w_topk,
+        topk_idx,
+        local_expert_offset,
+        E_global,
+    )
+
+
+@torch.no_grad()
+def _trtllm_fp4_block_scale_moe_topk_routing_reference(
+    routing_logits,
+    routing_bias,
+    hidden_states,
+    hidden_states_scale,
+    gemm1_weights,
+    gemm1_weights_scale,
+    gemm1_bias,
+    gemm2_weights,
+    gemm2_weights_scale,
+    gemm2_bias,
+    top_k,
+    local_expert_offset,
+    routed_scaling_factor,
+):
+    """FP4 MoE with TopK-only routing (uniform weights)."""
+    TOP_K = int(top_k)
+    E_global = routing_logits.shape[1]
+    logits = routing_logits.to(torch.float32)
+    if routing_bias is not None:
+        logits = logits + routing_bias.to(torch.float32).reshape(-1)
+    _, topk_idx = torch.topk(logits, k=TOP_K, dim=1, largest=True, sorted=False)
+    T = logits.shape[0]
+    scale = 1.0 if routed_scaling_factor is None else float(routed_scaling_factor)
+    w_topk = torch.full(
+        (T, TOP_K), scale / TOP_K, dtype=torch.float32, device=logits.device
+    )
+    return _fp4_moe_run_experts(
+        hidden_states,
+        hidden_states_scale,
+        gemm1_weights,
+        gemm1_weights_scale,
+        gemm2_weights,
+        gemm2_weights_scale,
+        gemm1_bias,
+        gemm2_bias,
+        w_topk,
+        topk_idx,
+        local_expert_offset,
+        E_global,
+    )
+
+
+_FP4_STANDARD_AXES: dict[str, Var | Const] = {
+    "seq_len": Var(description="Number of tokens."),
+    "num_experts": Const(description="Total number of experts.", abbrev=""),
+    "top_k": Const(description="Number of experts selected per token.", abbrev="topk"),
+    "num_local_experts": Const(description="Number of local experts.", abbrev="e"),
+    "hidden_size": Const(description="Hidden dimension size.", abbrev="h"),
+    "intermediate_size": Const(description="MoE intermediate layer size.", abbrev="i"),
+    # Derived / block-count axes (abbrev="" → omitted from filename)
+    "gemm1_out_size": Const(
+        description="Output size of FC1 (2 × intermediate_size for SwiGLU).",
+        abbrev="",
+    ),
+    "num_packed_hidden": Const(
+        description="Packed hidden dimension (hidden_size // 2 for NvFP4).",
+        abbrev="",
+    ),
+    "num_fp4_hidden_blocks": Const(
+        description="Number of FP4 scale blocks along hidden_size (hidden_size // 16 for NvFP4).",
+        abbrev="",
+    ),
+    "num_packed_intermediate": Const(
+        description="Packed intermediate dimension (intermediate_size // 2 for NvFP4).",
+        abbrev="",
+    ),
+    "num_fp4_intermediate_blocks": Const(
+        description="Number of FP4 scale blocks along intermediate_size (intermediate_size // 16 for NvFP4).",
+        abbrev="",
+    ),
+}
+
+_FP4_STANDARD_INPUTS: dict[str, Tensor | Scalar] = {
+    "routing_logits": Tensor(
+        ["seq_len", "num_experts"],
+        description="Routing logits for expert selection.",
+    ),
+    "routing_bias": Tensor(
+        ["num_experts"],
+        description="Bias added to routing logits. Pass None when not used.",
+        optional=True,
+    ),
+    # Packed NvFP4 hidden states (2 values per uint8 byte).
+    "hidden_states": Tensor(
+        ["seq_len", "num_packed_hidden"],
+        description="Input hidden states, NvFP4-packed (uint8, 2 fp4 per byte).",
+    ),
+    "hidden_states_scale": Tensor(
+        ["seq_len", "num_fp4_hidden_blocks"],
+        description="Block-wise scale factors for hidden_states (float8). None for bf16 input.",
+        optional=True,
+    ),
+    "gemm1_weights": Tensor(
+        ["num_local_experts", "gemm1_out_size", "num_packed_hidden"],
+        description="FC1 weights, NvFP4-packed (uint8). Shape includes gate+up for SwiGLU.",
+    ),
+    "gemm1_weights_scale": Tensor(
+        ["num_local_experts", "gemm1_out_size", "num_fp4_hidden_blocks"],
+        description="Block-wise scale factors for gemm1_weights (float8).",
+    ),
+    "gemm1_bias": Tensor(
+        ["num_local_experts", "gemm1_out_size"],
+        description="FC1 bias (float32). Optional.",
+        optional=True,
+    ),
+    "gemm1_alpha": Tensor(
+        ["num_local_experts"],
+        description="Per-expert SwiGLU alpha (float32). Optional.",
+        optional=True,
+    ),
+    "gemm1_beta": Tensor(
+        ["num_local_experts"],
+        description="Per-expert SwiGLU beta (float32). Optional.",
+        optional=True,
+    ),
+    "gemm1_clamp_limit": Tensor(
+        ["num_local_experts"],
+        description="Per-expert SwiGLU clamp limit (float32). Optional.",
+        optional=True,
+    ),
+    "gemm2_weights": Tensor(
+        ["num_local_experts", "hidden_size", "num_packed_intermediate"],
+        description="FC2 weights, NvFP4-packed (uint8).",
+    ),
+    "gemm2_weights_scale": Tensor(
+        ["num_local_experts", "hidden_size", "num_fp4_intermediate_blocks"],
+        description="Block-wise scale factors for gemm2_weights (float8).",
+    ),
+    "gemm2_bias": Tensor(
+        ["num_local_experts", "hidden_size"],
+        description="FC2 bias (float32). Optional.",
+        optional=True,
+    ),
+    "output1_scale_scalar": Tensor(
+        ["num_local_experts"],
+        description="Per-expert output scale for FC1 activation (float32). Optional.",
+        optional=True,
+    ),
+    "output1_scale_gate_scalar": Tensor(
+        ["num_local_experts"],
+        description="Per-expert output scale for FC1 gate (float32). Optional.",
+        optional=True,
+    ),
+    "output2_scale_scalar": Tensor(
+        ["num_local_experts"],
+        description="Per-expert output scale for FC2 (float32). Optional.",
+        optional=True,
+    ),
+    "local_expert_offset": Scalar(
+        "int32",
+        description="Offset of local experts in the global expert array.",
+    ),
+    "routed_scaling_factor": Scalar(
+        "float32",
+        optional=True,
+        description="Scaling factor applied to routing weights. None for some routing methods.",
+    ),
+}
+
+_FP4_STANDARD_OUTPUTS = {
+    "output": Tensor(
+        ["seq_len", "hidden_size"],
+        dtype="bfloat16",
+        description="Final MoE output tensor.",
+    ),
+}
+
+_FP4_STANDARD_TAGS = ["status:experimental", "quantization:nvfp4"]
+
+
+def _make_standard_fp4_moe_trace(name_prefix, description, reference=None):
+    """Factory for FP4 MoE templates that share the standard (non-DS) axis set."""
+    return TraceTemplate(
+        op_type="moe",
+        name_prefix=name_prefix,
+        description=description,
+        axes=dict(_FP4_STANDARD_AXES),
+        inputs=dict(_FP4_STANDARD_INPUTS),
+        outputs=dict(_FP4_STANDARD_OUTPUTS),
+        tags=_FP4_STANDARD_TAGS,
+        reference=reference,
+    )
+
+
+# RoutingMethodType.Default = 0 — Softmax → TopK
+trtllm_fp4_block_scale_moe_default_routing_trace = _make_standard_fp4_moe_trace(
+    name_prefix="moe_fp4_block_scale_default_routing",
+    description="NvFP4 block-scale MoE with Default routing (Softmax → TopK).",
+    reference=_trtllm_fp4_block_scale_moe_default_routing_reference,
+)
+
+# RoutingMethodType.Renormalize = 1 — TopK → Softmax
+trtllm_fp4_block_scale_moe_renormalize_routing_trace = _make_standard_fp4_moe_trace(
+    name_prefix="moe_fp4_block_scale_renormalize_routing",
+    description="NvFP4 block-scale MoE with Renormalize routing (TopK → Softmax).",
+    reference=_trtllm_fp4_block_scale_moe_renormalize_routing_reference,
+)
+
+# RoutingMethodType.DeepSeekV3 = 2 — Sigmoid → group selection → TopK
+trtllm_fp4_block_scale_moe_ds_routing_trace = TraceTemplate(
+    op_type="moe",
+    name_prefix="moe_fp4_block_scale_ds_routing",
+    description="NvFP4 block-scale MoE with DeepSeekV3 routing (Sigmoid → group selection → top_k).",
+    axes={
+        **_FP4_STANDARD_AXES,
+        "n_group": Const(
+            description="Number of expert groups for group routing.", abbrev="ng"
+        ),
+        "topk_group": Const(
+            description="Number of groups selected in top-k routing.", abbrev="kg"
+        ),
+    },
+    inputs=dict(_FP4_STANDARD_INPUTS),
+    outputs=dict(_FP4_STANDARD_OUTPUTS),
+    tags=_FP4_STANDARD_TAGS,
+    reference=_trtllm_fp4_block_scale_moe_ds_routing_reference,
+)
+
+# RoutingMethodType.Llama4 = 3 — Top1 → Sigmoid
+trtllm_fp4_block_scale_moe_llama4_routing_trace = _make_standard_fp4_moe_trace(
+    name_prefix="moe_fp4_block_scale_llama4_routing",
+    description="NvFP4 block-scale MoE with Llama4 routing (Top1 → Sigmoid).",
+    reference=_trtllm_fp4_block_scale_moe_llama4_routing_reference,
+)
+
+# RoutingMethodType.RenormalizeNaive = 4 — Softmax → TopK → Renormalize
+trtllm_fp4_block_scale_moe_renormalize_naive_routing_trace = _make_standard_fp4_moe_trace(
+    name_prefix="moe_fp4_block_scale_renormalize_naive_routing",
+    description="NvFP4 block-scale MoE with RenormalizeNaive routing (Softmax → TopK → Renormalize).",
+    reference=_trtllm_fp4_block_scale_moe_renormalize_naive_routing_reference,
+)
+
+# RoutingMethodType.TopK = 5 — plain TopK, uniform weights
+trtllm_fp4_block_scale_moe_topk_routing_trace = _make_standard_fp4_moe_trace(
+    name_prefix="moe_fp4_block_scale_topk_routing",
+    description="NvFP4 block-scale MoE with TopK-only routing (no softmax, uniform weights).",
+    reference=_trtllm_fp4_block_scale_moe_topk_routing_reference,
+)
+
+_FP4_MOE_TRACE_BY_ROUTING_TYPE = {
+    0: trtllm_fp4_block_scale_moe_default_routing_trace,
+    1: trtllm_fp4_block_scale_moe_renormalize_routing_trace,
+    2: trtllm_fp4_block_scale_moe_ds_routing_trace,
+    3: trtllm_fp4_block_scale_moe_llama4_routing_trace,
+    4: trtllm_fp4_block_scale_moe_renormalize_naive_routing_trace,
+    5: trtllm_fp4_block_scale_moe_topk_routing_trace,
+    # 6 = Unspecified: no trace
+}
+
+
+def trtllm_fp4_block_scale_moe_trace_dispatch(**kwargs):
+    """Return the FP4 TraceTemplate for the given ``routing_method_type``.
+
+    Pass this as ``trace=trtllm_fp4_block_scale_moe_trace_dispatch`` to
+    ``@flashinfer_api`` so the correct template is selected at call time::
+
+        @flashinfer_api(trace=trtllm_fp4_block_scale_moe_trace_dispatch)
+        def trtllm_fp4_block_scale_moe(..., routing_method_type: int = 0, ...):
+            ...
+
+    Returns ``None`` for ``RoutingMethodType.Unspecified`` (6).
+    """
+    routing_method_type = int(kwargs.get("routing_method_type", 0))
+    return _FP4_MOE_TRACE_BY_ROUTING_TYPE.get(routing_method_type)
+
+
+trtllm_fp4_block_scale_moe_trace_dispatch.templates = list(  # type: ignore[attr-defined]
+    _FP4_MOE_TRACE_BY_ROUTING_TYPE.values()
+)
+
+
+# ---------------------------------------------------------------------------
+# Additional MoE variants (CUTLASS fused MoE, bf16, routed, per-tensor, mxint4)
+# ---------------------------------------------------------------------------
+
+_MOE_COMMON_AXES: dict[str, Var | Const] = {
+    "seq_len": Var(description="Number of input tokens."),
+    "num_experts": Const(abbrev="", description="Total number of experts."),
+    "top_k": Const(abbrev="topk"),
+    "num_local_experts": Const(abbrev="e", description="Number of local experts."),
+    "hidden_size": Const(abbrev="h"),
+    "intermediate_size": Const(abbrev="i"),
+}
+
+# ---------------------------------------------------------------------------
+# References for the additional MoE variants (bf16 / per-tensor FP8 / routed /
+# mxint4). Each reference assumes inputs are already in their declared dtypes.
+# ---------------------------------------------------------------------------
+
+
+@torch.no_grad()
+def _moe_bf16_run_experts(
+    hidden_states,
+    gemm1_weights,
+    gemm2_weights,
+    weights,
+    topk_idx,
+    local_expert_offset,
+    E_global,
+):
+    """Un-quantized (bf16) MoE expert computation with SwiGLU."""
+    T, H = hidden_states.shape
+    E_local, gemm1_out, _ = gemm1_weights.shape
+    I = gemm1_out // 2
+    device = hidden_states.device
+    A = hidden_states.to(torch.float32)
+    W1 = gemm1_weights.to(torch.float32)
+    W2 = gemm2_weights.to(torch.float32)
+    output = torch.zeros((T, H), dtype=torch.float32, device=device)
+    local_start = int(local_expert_offset)
+    for le in range(E_local):
+        ge = local_start + le
+        if ge < 0 or ge >= E_global:
+            continue
+        sel_mask = (topk_idx == ge).any(dim=1)
+        if not sel_mask.any():
+            continue
+        token_idx = torch.nonzero(sel_mask, as_tuple=False).squeeze(1)
+        A_e = A.index_select(0, token_idx)
+        G1 = A_e.matmul(W1[le].t())
+        X1, X2 = G1[:, :I], G1[:, I:]
+        silu_X2 = X2 / (1.0 + torch.exp(-X2))
+        O = (silu_X2 * X1).matmul(W2[le].t())
+        w_tok = weights.index_select(0, token_idx)
+        match = (topk_idx.index_select(0, token_idx) == ge).float()
+        w_e = (w_tok * match).sum(dim=1)
+        output.index_add_(0, token_idx, O * w_e.unsqueeze(1))
+    return output.to(torch.bfloat16)
+
+
+@torch.no_grad()
+def _default_routing_weights(routing_logits, routing_bias, top_k, scale):
+    logits = routing_logits.to(torch.float32)
+    if routing_bias is not None:
+        logits = logits + routing_bias.to(torch.float32).reshape(-1)
+    s = torch.softmax(logits, dim=-1)
+    _, topk_idx = torch.topk(s, k=int(top_k), dim=1, largest=True, sorted=False)
+    return s.gather(1, topk_idx) * float(scale or 1.0), topk_idx
+
+
+@torch.no_grad()
+def _cutlass_fused_moe_reference(
+    input,
+    token_selected_experts,
+    token_final_scales,
+    fc1_expert_weights,
+    fc2_expert_weights,
+    **_unused,
+):
+    """Reference for CUTLASS fused MoE with precomputed routing."""
+    E_global = fc1_expert_weights.shape[0]
+    return _moe_bf16_run_experts(
+        input,
+        fc1_expert_weights,
+        fc2_expert_weights,
+        token_final_scales,
+        token_selected_experts.to(torch.int64),
+        local_expert_offset=0,
+        E_global=E_global,
+    )
+
+
+@torch.no_grad()
+def _trtllm_bf16_moe_reference(
+    routing_logits,
+    routing_bias,
+    hidden_states,
+    gemm1_weights,
+    gemm2_weights,
+    num_experts,
+    top_k,
+    local_expert_offset,
+    routed_scaling_factor=None,
+    **_unused,
+):
+    """Reference for TRT-LLM BF16 MoE (Default routing)."""
+    w_topk, topk_idx = _default_routing_weights(
+        routing_logits, routing_bias, top_k, routed_scaling_factor
+    )
+    return _moe_bf16_run_experts(
+        hidden_states,
+        gemm1_weights,
+        gemm2_weights,
+        w_topk,
+        topk_idx,
+        local_expert_offset,
+        int(num_experts),
+    )
+
+
+@torch.no_grad()
+def _trtllm_bf16_routed_moe_reference(
+    topk_ids,
+    hidden_states,
+    gemm1_weights,
+    gemm2_weights,
+    num_experts,
+    top_k,
+    local_expert_offset,
+    routed_scaling_factor=None,
+    **_unused,
+):
+    """Reference for TRT-LLM BF16 MoE with precomputed topk_ids."""
+    T = topk_ids.shape[0]
+    scale = float(routed_scaling_factor or 1.0)
+    # Uniform weight per selected expert (real routing scales not available).
+    w_topk = torch.full(
+        (T, int(top_k)),
+        scale / float(top_k),
+        dtype=torch.float32,
+        device=hidden_states.device,
+    )
+    return _moe_bf16_run_experts(
+        hidden_states,
+        gemm1_weights,
+        gemm2_weights,
+        w_topk,
+        topk_ids.to(torch.int64),
+        local_expert_offset,
+        int(num_experts),
+    )
+
+
+@torch.no_grad()
+def _trtllm_fp8_per_tensor_scale_moe_reference(
+    routing_logits,
+    routing_bias,
+    hidden_states,
+    gemm1_weights,
+    output1_scales_scalar,
+    output1_scales_gate_scalar,
+    gemm2_weights,
+    output2_scales_scalar,
+    num_experts,
+    top_k,
+    local_expert_offset,
+    routed_scaling_factor=None,
+    **_unused,
+):
+    """Reference for TRT-LLM FP8 per-tensor scale MoE. Dequantizes per-expert."""
+    E_local = gemm1_weights.shape[0]
+    w_topk, topk_idx = _default_routing_weights(
+        routing_logits, routing_bias, top_k, routed_scaling_factor
+    )
+    # Per-expert dequant: each expert has its own scalar scale for FC1 gate,
+    # FC1 up, and FC2. Scale broadcasts over the non-expert dims.
+    W1 = gemm1_weights.to(torch.float32)
+    W2 = gemm2_weights.to(torch.float32)
+    s1 = output1_scales_scalar.to(torch.float32).view(E_local, 1, 1)
+    s1g = output1_scales_gate_scalar.to(torch.float32).view(E_local, 1, 1)
+    s2 = output2_scales_scalar.to(torch.float32).view(E_local, 1, 1)
+    I = W1.shape[1] // 2
+    # W1 is [E, 2I, H]: first half is gate, second half is up — apply scales.
+    W1 = torch.cat([W1[:, :I] * s1g, W1[:, I:] * s1], dim=1)
+    W2 = W2 * s2
+    return _moe_bf16_run_experts(
+        hidden_states.to(torch.float32),
+        W1,
+        W2,
+        w_topk,
+        topk_idx,
+        local_expert_offset,
+        int(num_experts),
+    )
+
+
+@torch.no_grad()
+def _trtllm_fp8_block_scale_routed_moe_reference(
+    topk_ids,
+    hidden_states,
+    hidden_states_scale,
+    gemm1_weights,
+    gemm1_weights_scale,
+    gemm2_weights,
+    gemm2_weights_scale,
+    num_experts,
+    top_k,
+    local_expert_offset,
+    routed_scaling_factor=None,
+    **_unused,
+):
+    """Reference for TRT-LLM FP8 block-scale routed MoE (precomputed topk_ids).
+
+    Reuses ``_fp8_moe_run_experts`` for the dequant + SwiGLU path, and builds
+    a uniform per-token weight tensor (real routing scales are not available
+    from topk_ids alone).
+    """
+    T = topk_ids.shape[0]
+    TOP_K = int(top_k)
+    scale = float(routed_scaling_factor or 1.0)
+    w_topk = torch.full(
+        (T, TOP_K),
+        scale / TOP_K,
+        dtype=torch.float32,
+        device=hidden_states.device,
+    )
+    return _fp8_moe_run_experts(
+        hidden_states,
+        hidden_states_scale,
+        gemm1_weights,
+        gemm1_weights_scale,
+        gemm2_weights,
+        gemm2_weights_scale,
+        w_topk,
+        topk_ids.to(torch.int64),
+        local_expert_offset,
+        int(num_experts),
+    )
+
+
+@torch.no_grad()
+def _trtllm_fp4_block_scale_routed_moe_reference(
+    topk_ids,
+    hidden_states,
+    hidden_states_scale,
+    gemm1_weights,
+    gemm1_weights_scale,
+    gemm1_bias,
+    gemm2_weights,
+    gemm2_weights_scale,
+    gemm2_bias,
+    num_experts,
+    top_k,
+    local_expert_offset,
+    routed_scaling_factor=None,
+    **_unused,
+):
+    """Reference for TRT-LLM FP4 block-scale routed MoE (precomputed topk_ids)."""
+    T = topk_ids.shape[0]
+    TOP_K = int(top_k)
+    scale = float(routed_scaling_factor or 1.0)
+    w_topk = torch.full(
+        (T, TOP_K),
+        scale / TOP_K,
+        dtype=torch.float32,
+        device=hidden_states.device,
+    )
+    return _fp4_moe_run_experts(
+        hidden_states,
+        hidden_states_scale,
+        gemm1_weights,
+        gemm1_weights_scale,
+        gemm2_weights,
+        gemm2_weights_scale,
+        gemm1_bias,
+        gemm2_bias,
+        w_topk,
+        topk_ids.to(torch.int64),
+        local_expert_offset,
+        int(num_experts),
+    )
+
+
+@torch.no_grad()
+def _trtllm_mxint4_block_scale_moe_reference(
+    routing_logits,
+    routing_bias,
+    hidden_states,
+    gemm1_weights,
+    gemm1_weights_scale,
+    gemm2_weights,
+    gemm2_weights_scale,
+    num_experts,
+    top_k,
+    local_expert_offset,
+    routed_scaling_factor=None,
+    **_unused,
+):
+    """Reference for TRT-LLM MxInt4 block-scale MoE.
+
+    Weights are int4 packed as uint8 with bf16 per-32 block scales. Hidden
+    states are bf16 (no activation quantization).
+    """
+
+    # Unpack int4: low nibble is first element, values are 4-bit signed (-8..7).
+    def _unpack_int4(packed):
+        lo = (packed & 0x0F).to(torch.int64)
+        hi = ((packed >> 4) & 0x0F).to(torch.int64)
+        # Sign-extend from 4-bit.
+        lo = torch.where(lo >= 8, lo - 16, lo)
+        hi = torch.where(hi >= 8, hi - 16, hi)
+        stacked = torch.stack([lo, hi], dim=-1)
+        return stacked.reshape(*packed.shape[:-1], packed.shape[-1] * 2).to(
+            torch.float32
+        )
+
+    W1 = _unpack_int4(gemm1_weights)  # [E, 2I, H]
+    W2 = _unpack_int4(gemm2_weights)  # [E, H, I]
+    # Scales are bf16, broadcast per-32 along last axis.
+    s1 = gemm1_weights_scale.to(torch.float32)
+    s2 = gemm2_weights_scale.to(torch.float32)
+    block1 = W1.shape[-1] // s1.shape[-1]
+    block2 = W2.shape[-1] // s2.shape[-1]
+    W1 = W1 * s1.repeat_interleave(block1, dim=-1)
+    W2 = W2 * s2.repeat_interleave(block2, dim=-1)
+
+    w_topk, topk_idx = _default_routing_weights(
+        routing_logits, routing_bias, top_k, routed_scaling_factor
+    )
+    return _moe_bf16_run_experts(
+        hidden_states,
+        W1,
+        W2,
+        w_topk,
+        topk_idx,
+        local_expert_offset,
+        int(num_experts),
+    )
+
+
+# CUTLASS fused MoE: precomputed token_selected_experts + token_final_scales
+cutlass_fused_moe_trace = TraceTemplate(
+    op_type="moe",
+    name_prefix="cutlass_fused_moe",
+    description="CUTLASS fused MoE. Accepts precomputed per-token expert selections.",
+    axes={
+        "seq_len": Var(description="Number of input tokens."),
+        "num_local_experts": Const(abbrev="e"),
+        "hidden_size": Const(abbrev="h"),
+        "intermediate_size": Const(abbrev="i"),
+        "top_k": Const(abbrev="topk"),
+    },
+    inputs={
+        "input": Tensor(
+            ["seq_len", "hidden_size"],
+            description="Input hidden states (bf16/fp8/fp4 depending on quant config).",
+        ),
+        "token_selected_experts": Tensor(
+            ["seq_len", "top_k"],
+            dtype="int32",
+            description="Precomputed top-k expert ids per token.",
+        ),
+        "token_final_scales": Tensor(
+            ["seq_len", "top_k"],
+            dtype="float32",
+            description="Precomputed per-token expert scales.",
+        ),
+        "fc1_expert_weights": Tensor(
+            ["num_local_experts", "gemm1_out_size", "hidden_size"],
+            description="FC1 weights per expert.",
+        ),
+        "fc2_expert_weights": Tensor(
+            ["num_local_experts", "hidden_size", "intermediate_size"],
+            description="FC2 weights per expert.",
+        ),
+    },
+    outputs={
+        "output": Tensor(["seq_len", "hidden_size"], dtype="bfloat16"),
+    },
+    tags=["status:verified", "backend:cutlass"],
+    reference=_cutlass_fused_moe_reference,
+)
+cutlass_fused_moe_trace.axes["gemm1_out_size"] = Const(
+    abbrev="", description="FC1 output size (typically 2 * intermediate_size)."
+)
+
+# Shared factory for the remaining trtllm_* variants
+_TRTLLM_MOE_COMMON_INPUTS: dict[str, Tensor | Scalar] = {
+    "routing_logits": Tensor(
+        ["seq_len", "num_experts"], description="Routing logits for expert selection."
+    ),
+    "routing_bias": Tensor(
+        ["num_experts"], optional=True, description="Optional routing bias."
+    ),
+    "hidden_states": Tensor(
+        ["seq_len", "hidden_size"],
+        description="Input hidden states (dtype depends on variant).",
+    ),
+    "gemm1_weights": Tensor(
+        ["num_local_experts", "gemm1_out_size", "hidden_size"],
+        description="FC1 weights (gate+up).",
+    ),
+    "gemm2_weights": Tensor(
+        ["num_local_experts", "hidden_size", "intermediate_size"],
+        description="FC2 weights (down).",
+    ),
+    "top_k": Scalar("int32", description="Number of experts to route per token."),
+    "n_group": Scalar(
+        "int32", optional=True, description="Expert groups (DeepSeek-V3)."
+    ),
+    "topk_group": Scalar(
+        "int32", optional=True, description="Groups to keep (DeepSeek-V3)."
+    ),
+    "local_expert_offset": Scalar(
+        "int32", description="Offset of local experts in global expert space."
+    ),
+    "routed_scaling_factor": Scalar(
+        "float32", optional=True, description="Scaling factor for routing weights."
+    ),
+    "routing_method_type": Scalar(
+        "int32",
+        optional=True,
+        description="0=Default, 1=Renormalize, 2=DeepSeekV3, 3=Llama4, 4=RenormalizeNaive, 5=TopK.",
+    ),
+}
+
+_TRTLLM_MOE_COMMON_AXES: dict[str, Var | Const] = {
+    **_MOE_COMMON_AXES,
+    "gemm1_out_size": Const(abbrev="", description="2 * intermediate_size."),
+}
+
+_TRTLLM_MOE_COMMON_OUTPUTS: dict[str, Tensor | Scalar] = {
+    "output": Tensor(
+        ["seq_len", "hidden_size"], dtype="bfloat16", description="MoE output."
+    ),
+}
+
+# BF16 MoE (no quantization)
+trtllm_bf16_moe_trace = TraceTemplate(
+    op_type="moe",
+    name_prefix="trtllm_bf16_moe",
+    description="TRT-LLM BF16 MoE (no quantization).",
+    axes=dict(_TRTLLM_MOE_COMMON_AXES),
+    inputs=dict(_TRTLLM_MOE_COMMON_INPUTS),
+    outputs=dict(_TRTLLM_MOE_COMMON_OUTPUTS),
+    tags=["status:verified", "backend:trtllm"],
+    reference=_trtllm_bf16_moe_reference,
+)
+
+# BF16 routed MoE (accepts precomputed topk_ids instead of routing_logits)
+# num_experts / intermediate_size become Var in routed variants because they
+# are passed as scalar kwargs (no routing_logits tensor to resolve from).
+_TRTLLM_MOE_ROUTED_AXES: dict[str, Var | Const] = {
+    **_TRTLLM_MOE_COMMON_AXES,
+    "num_experts": Var(description="Total number of experts (passed as kwarg)."),
+    "intermediate_size": Var(
+        description="MoE intermediate layer size (passed as kwarg)."
+    ),
+}
+trtllm_bf16_routed_moe_trace = TraceTemplate(
+    op_type="moe",
+    name_prefix="trtllm_bf16_routed_moe",
+    description="TRT-LLM BF16 MoE with precomputed topk_ids.",
+    axes=dict(_TRTLLM_MOE_ROUTED_AXES),
+    inputs={
+        "topk_ids": Tensor(
+            ["seq_len", "top_k"],
+            dtype="int32",
+            description="Precomputed top-k expert ids per token.",
+        ),
+        "hidden_states": _TRTLLM_MOE_COMMON_INPUTS["hidden_states"],
+        "gemm1_weights": _TRTLLM_MOE_COMMON_INPUTS["gemm1_weights"],
+        "gemm2_weights": _TRTLLM_MOE_COMMON_INPUTS["gemm2_weights"],
+        "num_experts": Scalar("int32", description="Total number of experts."),
+        "top_k": _TRTLLM_MOE_COMMON_INPUTS["top_k"],
+        "local_expert_offset": _TRTLLM_MOE_COMMON_INPUTS["local_expert_offset"],
+        "routed_scaling_factor": _TRTLLM_MOE_COMMON_INPUTS["routed_scaling_factor"],
+    },
+    outputs=dict(_TRTLLM_MOE_COMMON_OUTPUTS),
+    tags=["status:verified", "backend:trtllm"],
+    reference=_trtllm_bf16_routed_moe_reference,
+)
+
+# FP8 per-tensor scale MoE
+trtllm_fp8_per_tensor_scale_moe_trace = TraceTemplate(
+    op_type="moe",
+    name_prefix="trtllm_fp8_per_tensor_scale_moe",
+    description="TRT-LLM FP8 MoE with per-tensor activation/weight scales.",
+    axes=dict(_TRTLLM_MOE_COMMON_AXES),
+    inputs={
+        **_TRTLLM_MOE_COMMON_INPUTS,
+        "output1_scales_scalar": Tensor(
+            ["num_local_experts"],
+            dtype="float32",
+            description="Per-expert FC1 output scale.",
+        ),
+        "output1_scales_gate_scalar": Tensor(
+            ["num_local_experts"],
+            dtype="float32",
+            description="Per-expert FC1 gate scale.",
+        ),
+        "output2_scales_scalar": Tensor(
+            ["num_local_experts"],
+            dtype="float32",
+            description="Per-expert FC2 output scale.",
+        ),
+    },
+    outputs=dict(_TRTLLM_MOE_COMMON_OUTPUTS),
+    tags=["status:verified", "backend:trtllm", "quantization:float8_e4m3fn"],
+    reference=_trtllm_fp8_per_tensor_scale_moe_reference,
+)
+
+# FP8 block-scale routed (precomputed topk_ids)
+trtllm_fp8_block_scale_routed_moe_trace = TraceTemplate(
+    op_type="moe",
+    name_prefix="trtllm_fp8_block_scale_routed_moe",
+    description="TRT-LLM FP8 block-scale MoE with precomputed topk_ids.",
+    axes={
+        **_TRTLLM_MOE_ROUTED_AXES,
+        "num_hidden_blocks": Const(abbrev=""),
+        "num_intermediate_blocks": Const(abbrev=""),
+        "num_gemm1_out_blocks": Const(abbrev=""),
+    },
+    inputs={
+        "topk_ids": Tensor(
+            ["seq_len", "top_k"], dtype="int32", description="Precomputed top-k."
+        ),
+        "routing_bias": Tensor(
+            ["num_experts"], optional=True, description="Optional routing bias."
+        ),
+        "hidden_states": Tensor(
+            ["seq_len", "hidden_size"],
+            description="FP8-quantized hidden states.",
+        ),
+        "hidden_states_scale": Tensor(
+            ["num_hidden_blocks", "seq_len"],
+            description="Block-wise hidden_states scale.",
+        ),
+        "gemm1_weights": Tensor(
+            ["num_local_experts", "gemm1_out_size", "hidden_size"],
+            description="FC1 FP8 weights.",
+        ),
+        "gemm1_weights_scale": Tensor(
+            ["num_local_experts", "num_gemm1_out_blocks", "num_hidden_blocks"],
+            description="FC1 block-wise scale.",
+        ),
+        "gemm2_weights": Tensor(
+            ["num_local_experts", "hidden_size", "intermediate_size"],
+            description="FC2 FP8 weights.",
+        ),
+        "gemm2_weights_scale": Tensor(
+            ["num_local_experts", "num_hidden_blocks", "num_intermediate_blocks"],
+            description="FC2 block-wise scale.",
+        ),
+        "num_experts": Scalar("int32", description="Total number of experts."),
+        "top_k": Scalar("int32"),
+        "local_expert_offset": Scalar("int32"),
+        "routed_scaling_factor": Scalar("float32", optional=True),
+    },
+    outputs=dict(_TRTLLM_MOE_COMMON_OUTPUTS),
+    tags=["status:verified", "backend:trtllm", "quantization:float8_e4m3fn"],
+    reference=_trtllm_fp8_block_scale_routed_moe_reference,
+)
+
+# FP4 block-scale routed (precomputed topk_ids)
+trtllm_fp4_block_scale_routed_moe_trace = TraceTemplate(
+    op_type="moe",
+    name_prefix="trtllm_fp4_block_scale_routed_moe",
+    description="TRT-LLM NvFP4 block-scale MoE with precomputed topk_ids.",
+    axes={
+        **_TRTLLM_MOE_ROUTED_AXES,
+        "num_packed_hidden": Const(abbrev=""),
+        # Var rather than Const because hidden_states_scale is optional and the
+        # other tensors using this axis may have different shapes in routed mode.
+        "num_fp4_hidden_blocks": Var(
+            description="NvFP4 block count along hidden_size."
+        ),
+        "num_packed_intermediate": Const(abbrev=""),
+        "num_fp4_intermediate_blocks": Const(abbrev=""),
+    },
+    inputs={
+        "topk_ids": Tensor(
+            ["seq_len", "top_k"], dtype="int32", description="Precomputed top-k."
+        ),
+        "routing_bias": Tensor(
+            ["num_experts"], optional=True, description="Optional routing bias."
+        ),
+        "hidden_states": Tensor(
+            ["seq_len", "num_packed_hidden"],
+            description="NvFP4-packed hidden states.",
+        ),
+        "hidden_states_scale": Tensor(
+            ["seq_len", "num_fp4_hidden_blocks"],
+            optional=True,
+            description="NvFP4 hidden_states scale.",
+        ),
+        "gemm1_weights": Tensor(
+            ["num_local_experts", "gemm1_out_size", "num_packed_hidden"],
+            description="FC1 NvFP4 weights.",
+        ),
+        "gemm1_weights_scale": Tensor(
+            ["num_local_experts", "gemm1_out_size", "num_fp4_hidden_blocks"],
+            description="FC1 NvFP4 scale.",
+        ),
+        "gemm2_weights": Tensor(
+            ["num_local_experts", "hidden_size", "num_packed_intermediate"],
+            description="FC2 NvFP4 weights.",
+        ),
+        "gemm2_weights_scale": Tensor(
+            ["num_local_experts", "hidden_size", "num_fp4_intermediate_blocks"],
+            description="FC2 NvFP4 scale.",
+        ),
+        "num_experts": Scalar("int32", description="Total number of experts."),
+        "top_k": Scalar("int32"),
+        "local_expert_offset": Scalar("int32"),
+        "routed_scaling_factor": Scalar("float32", optional=True),
+    },
+    outputs=dict(_TRTLLM_MOE_COMMON_OUTPUTS),
+    tags=["status:experimental", "backend:trtllm", "quantization:nvfp4"],
+    reference=_trtllm_fp4_block_scale_routed_moe_reference,
+)
+
+# MxInt4 block-scale MoE
+trtllm_mxint4_block_scale_moe_trace = TraceTemplate(
+    op_type="moe",
+    name_prefix="trtllm_mxint4_block_scale_moe",
+    description="TRT-LLM MxInt4 block-scale MoE.",
+    axes={
+        **_TRTLLM_MOE_COMMON_AXES,
+        "intermediate_size": Var(description="MoE intermediate size (kwarg)."),
+        "num_packed_hidden": Const(abbrev=""),
+        "num_mxint4_hidden_blocks": Const(abbrev=""),
+        "num_packed_intermediate": Const(abbrev=""),
+        "num_mxint4_intermediate_blocks": Const(abbrev=""),
+    },
+    inputs={
+        "routing_logits": Tensor(
+            ["seq_len", "num_experts"], description="Routing logits."
+        ),
+        "routing_bias": Tensor(
+            ["num_experts"], optional=True, description="Optional routing bias."
+        ),
+        "hidden_states": Tensor(
+            ["seq_len", "hidden_size"],
+            description="BF16/FP16 hidden states (quantized internally).",
+        ),
+        "gemm1_weights": Tensor(
+            ["num_local_experts", "gemm1_out_size", "num_packed_hidden"],
+            description="FC1 MxInt4-packed weights.",
+        ),
+        "gemm1_weights_scale": Tensor(
+            ["num_local_experts", "gemm1_out_size", "num_mxint4_hidden_blocks"],
+            description="FC1 MxInt4 scales.",
+        ),
+        "gemm2_weights": Tensor(
+            ["num_local_experts", "hidden_size", "num_packed_intermediate"],
+            description="FC2 MxInt4-packed weights.",
+        ),
+        "gemm2_weights_scale": Tensor(
+            ["num_local_experts", "hidden_size", "num_mxint4_intermediate_blocks"],
+            description="FC2 MxInt4 scales.",
+        ),
+        "top_k": Scalar("int32"),
+        "n_group": Scalar("int32", optional=True),
+        "topk_group": Scalar("int32", optional=True),
+        "local_expert_offset": Scalar("int32"),
+        "routed_scaling_factor": Scalar("float32", optional=True),
+        "routing_method_type": Scalar("int32", optional=True),
+    },
+    outputs=dict(_TRTLLM_MOE_COMMON_OUTPUTS),
+    tags=["status:experimental", "backend:trtllm", "quantization:mxint4"],
+    reference=_trtllm_mxint4_block_scale_moe_reference,
+)
+
+
+# ---------------------------------------------------------------------------
+# CuteDSL MoE variants (precomputed routing, NvFP4 weights on SM100+)
+# ---------------------------------------------------------------------------
+
+cute_dsl_fused_moe_nvfp4_trace = TraceTemplate(
+    op_type="moe",
+    name_prefix="cute_dsl_fused_moe_nvfp4",
+    description=(
+        "CuteDSL NVFP4 fused MoE (SM100/SM103). Accepts NvFP4-packed input + "
+        "scales with precomputed top-k routing (token_selected_experts + "
+        "token_final_scales) and per-expert alpha scales."
+    ),
+    axes={
+        "num_tokens": Var(description="Total tokens across the batch."),
+        "num_experts": Const(abbrev="", description="Total number of experts."),
+        "top_k": Const(abbrev="topk"),
+        "num_local_experts": Const(abbrev="e"),
+        "hidden_size": Const(abbrev="h"),
+        "intermediate_size": Var(description="MoE intermediate size (kwarg)."),
+        "num_packed_hidden": Var(description="hidden_size // 2 (NvFP4 packed)."),
+        "num_packed_intermediate": Var(
+            description="intermediate_size // 2 (NvFP4 packed)."
+        ),
+        "num_fp4_hidden_blocks": Var(
+            description="NvFP4 scale-factor count along hidden_size."
+        ),
+        "num_fp4_intermediate_blocks": Var(
+            description="NvFP4 scale-factor count along intermediate_size."
+        ),
+        "gemm1_out_size": Const(abbrev="", description="2 * intermediate_size."),
+    },
+    inputs={
+        "x": Tensor(
+            ["num_tokens", "num_packed_hidden"],
+            description="NvFP4-packed input (uint8, 2 fp4 per byte).",
+        ),
+        "x_sf": Tensor(
+            ["num_tokens", "num_fp4_hidden_blocks"],
+            description="NvFP4 scale factors for x (float8_e4m3fn).",
+        ),
+        "token_selected_experts": Tensor(
+            ["num_tokens", "top_k"],
+            dtype="int32",
+            description="Precomputed top-k expert ids per token.",
+        ),
+        "token_final_scales": Tensor(
+            ["num_tokens", "top_k"],
+            dtype="float32",
+            description="Precomputed per-token routing scales.",
+        ),
+        "w1_weight": Tensor(
+            ["num_local_experts", "gemm1_out_size", "num_packed_hidden"],
+            description="FC1 weights, NvFP4-packed.",
+        ),
+        "w1_weight_sf": Tensor(
+            ["num_local_experts", "gemm1_out_size", "num_fp4_hidden_blocks"],
+            description="FC1 NvFP4 scales.",
+        ),
+        "w1_alpha": Tensor(
+            ["num_local_experts"],
+            dtype="float32",
+            description="Per-expert FC1 global scale.",
+        ),
+        "fc2_input_scale": Tensor(
+            ["one"],
+            dtype="float32",
+            description="Global scale for FC2 input quantization.",
+        ),
+        "w2_weight": Tensor(
+            ["num_local_experts", "hidden_size", "num_packed_intermediate"],
+            description="FC2 weights, NvFP4-packed.",
+        ),
+        "w2_weight_sf": Tensor(
+            ["num_local_experts", "hidden_size", "num_fp4_intermediate_blocks"],
+            description="FC2 NvFP4 scales.",
+        ),
+        "w2_alpha": Tensor(
+            ["num_local_experts"],
+            dtype="float32",
+            description="Per-expert FC2 global scale.",
+        ),
+        "num_experts": Scalar("int32", description="Total number of experts."),
+        "top_k": Scalar("int32", description="Number of experts per token."),
+        "local_expert_offset": Scalar(
+            "int32", optional=True, description="Offset of local experts."
+        ),
+    },
+    outputs={
+        "output": Tensor(
+            ["num_tokens", "hidden_size"],
+            dtype="bfloat16",
+            description="MoE output.",
+        ),
+    },
+    tags=["status:experimental", "backend:cute-dsl", "quantization:nvfp4"],
+)
+cute_dsl_fused_moe_nvfp4_trace.axes["one"] = Var(
+    description="Placeholder for shape [1] scalars."
+)
+
+_cute_dsl_wrapper_inputs = dict(cute_dsl_fused_moe_nvfp4_trace.inputs)
+# num_experts / top_k live on the wrapper instance (set in __init__), not on run().
+_cute_dsl_wrapper_inputs["num_experts"] = Scalar(
+    "int32",
+    optional=True,
+    description="Set at wrapper __init__, not passed to run().",
+)
+_cute_dsl_wrapper_inputs["top_k"] = Scalar(
+    "int32",
+    optional=True,
+    description="Set at wrapper __init__, not passed to run().",
+)
+
+_cute_dsl_wrapper_axes = dict(cute_dsl_fused_moe_nvfp4_trace.axes)
+# num_experts / top_k are set at __init__ time — no tensor on run() has a
+# num_experts dim, so the axis must be a Var here.
+_cute_dsl_wrapper_axes["num_experts"] = Var(description="Total number of experts.")
+_cute_dsl_wrapper_axes["top_k"] = Var(description="Experts per token.")
+
+cute_dsl_moe_wrapper_run_trace = TraceTemplate(
+    op_type="moe",
+    name_prefix="cute_dsl_moe_wrapper",
+    description=(
+        "CuteDslMoEWrapper.run(): stateful version of cute_dsl_fused_moe_nvfp4 "
+        "(same schema; wrapper persists autotuning state across calls)."
+    ),
+    axes=_cute_dsl_wrapper_axes,
+    inputs=_cute_dsl_wrapper_inputs,
+    outputs=dict(cute_dsl_fused_moe_nvfp4_trace.outputs),
+    tags=cute_dsl_fused_moe_nvfp4_trace.tags,
+)
+
+
+# ---------------------------------------------------------------------------
+# B12x MoE (SM120/SM121 CuTe-DSL, bf16 input + FP4 packed weights)
+# ---------------------------------------------------------------------------
+
+b12x_fused_moe_trace = TraceTemplate(
+    op_type="moe",
+    name_prefix="b12x_fused_moe",
+    description=(
+        "B12x CuTe-DSL fused MoE (SM120/SM121). BF16 input, FP4-packed "
+        "weights, precomputed top-k routing; fuses quant + FC1 + activation + "
+        "FC2 + scatter."
+    ),
+    axes={
+        "num_tokens": Var(),
+        "num_experts": Const(abbrev="", description="Total number of experts."),
+        "top_k": Const(abbrev="topk"),
+        "num_local_experts": Const(abbrev="e"),
+        "hidden_size": Const(abbrev="h"),
+        "intermediate_size": Var(description="MoE intermediate size (kwarg)."),
+        "num_packed_hidden": Var(description="hidden_size // 2."),
+        "num_packed_intermediate": Var(description="intermediate_size // 2."),
+        "num_fp4_hidden_blocks": Var(),
+        "num_fp4_intermediate_blocks": Var(),
+        "gemm1_out_size": Const(
+            abbrev="",
+            description="2*I (SwiGLU) or I (ReLU2).",
+        ),
+    },
+    inputs={
+        "x": Tensor(
+            ["num_tokens", "hidden_size"], description="BF16 input activations."
+        ),
+        "w1_weight": Tensor(
+            ["num_local_experts", "gemm1_out_size", "num_packed_hidden"],
+            description="FC1 weights, FP4-packed.",
+        ),
+        "w1_weight_sf": Tensor(
+            ["num_local_experts", "gemm1_out_size", "num_fp4_hidden_blocks"],
+            description="FC1 FP4 scales.",
+        ),
+        "w2_weight": Tensor(
+            ["num_local_experts", "hidden_size", "num_packed_intermediate"],
+            description="FC2 weights, FP4-packed.",
+        ),
+        "w2_weight_sf": Tensor(
+            ["num_local_experts", "hidden_size", "num_fp4_intermediate_blocks"],
+            description="FC2 FP4 scales.",
+        ),
+        "token_selected_experts": Tensor(
+            ["num_tokens", "top_k"],
+            dtype="int32",
+            description="Precomputed top-k expert ids per token.",
+        ),
+        "token_final_scales": Tensor(
+            ["num_tokens", "top_k"],
+            dtype="float32",
+            description="Precomputed per-token routing scales.",
+        ),
+        "num_experts": Scalar("int32", description="Total experts."),
+        "top_k": Scalar("int32"),
+        "w1_alpha": Tensor(
+            ["num_local_experts"],
+            dtype="float32",
+            description="Per-expert FC1 global scale.",
+        ),
+        "w2_alpha": Tensor(
+            ["num_local_experts"],
+            dtype="float32",
+            description="Per-expert FC2 global scale.",
+        ),
+        "fc2_input_scale": Tensor(
+            ["one"],
+            dtype="float32",
+            description="Global scale for FC2 input quantization.",
+        ),
+    },
+    outputs={
+        "output": Tensor(
+            ["num_tokens", "hidden_size"],
+            dtype="bfloat16",
+            description="MoE output.",
+        ),
+    },
+    tags=["status:experimental", "backend:cute-dsl", "quantization:fp4"],
+)
+b12x_fused_moe_trace.axes["one"] = Var(description="Placeholder for shape [1].")
+
+_b12x_wrapper_inputs = dict(b12x_fused_moe_trace.inputs)
+_b12x_wrapper_inputs["num_experts"] = Scalar(
+    "int32",
+    optional=True,
+    description="Set at wrapper __init__, not passed to run().",
+)
+_b12x_wrapper_inputs["top_k"] = Scalar(
+    "int32",
+    optional=True,
+    description="Set at wrapper __init__, not passed to run().",
+)
+
+_b12x_wrapper_axes = dict(b12x_fused_moe_trace.axes)
+_b12x_wrapper_axes["num_experts"] = Var(description="Total number of experts.")
+_b12x_wrapper_axes["top_k"] = Var(description="Experts per token.")
+
+
+@torch.no_grad()
+def _cute_dsl_fused_moe_nvfp4_reference(
+    x,
+    x_sf,
+    token_selected_experts,
+    token_final_scales,
+    w1_weight,
+    w1_weight_sf,
+    w1_alpha,
+    fc2_input_scale,
+    w2_weight,
+    w2_weight_sf,
+    w2_alpha,
+    num_experts,
+    top_k,
+    **_unused,
+):
+    """Reference for CuteDSL NvFP4 fused MoE — bridges to the FP4
+    block-scale kernel with alpha scales folded into the dequantized
+    weights."""
+    E_local = w1_weight.shape[0]
+    # Dequantize input and weights with alpha factors.
+    hs_deq = _dequantize_fp4_tensor(x, x_sf, is_ue8m0_scales=False)
+    W1 = _dequantize_fp4_tensor(w1_weight, w1_weight_sf, is_ue8m0_scales=False)
+    W2 = _dequantize_fp4_tensor(w2_weight, w2_weight_sf, is_ue8m0_scales=False)
+    W1 = W1 * w1_alpha.to(torch.float32).view(E_local, 1, 1)
+    W2 = W2 * w2_alpha.to(torch.float32).view(E_local, 1, 1)
+    return _moe_bf16_run_experts(
+        hs_deq,
+        W1,
+        W2,
+        token_final_scales,
+        token_selected_experts.to(torch.int64),
+        local_expert_offset=0,
+        E_global=int(num_experts),
+    )
+
+
+@torch.no_grad()
+def _b12x_fused_moe_reference(
+    x,
+    w1_weight,
+    w1_weight_sf,
+    w2_weight,
+    w2_weight_sf,
+    token_selected_experts,
+    token_final_scales,
+    num_experts,
+    top_k,
+    w1_alpha=None,
+    w2_alpha=None,
+    fc2_input_scale=None,
+    **_unused,
+):
+    """Reference for B12x CuTe-DSL fused MoE (bf16 input, FP4 weights)."""
+    E_local = w1_weight.shape[0]
+    W1 = _dequantize_fp4_tensor(w1_weight, w1_weight_sf, is_ue8m0_scales=False)
+    W2 = _dequantize_fp4_tensor(w2_weight, w2_weight_sf, is_ue8m0_scales=False)
+    if w1_alpha is not None:
+        W1 = W1 * w1_alpha.to(torch.float32).view(E_local, 1, 1)
+    if w2_alpha is not None:
+        W2 = W2 * w2_alpha.to(torch.float32).view(E_local, 1, 1)
+    return _moe_bf16_run_experts(
+        x,
+        W1,
+        W2,
+        token_final_scales,
+        token_selected_experts.to(torch.int64),
+        local_expert_offset=0,
+        E_global=int(num_experts),
+    )
+
+
+cute_dsl_fused_moe_nvfp4_trace.reference = _cute_dsl_fused_moe_nvfp4_reference
+cute_dsl_moe_wrapper_run_trace.reference = _cute_dsl_fused_moe_nvfp4_reference
+b12x_fused_moe_trace.reference = _b12x_fused_moe_reference
+
+
+b12x_moe_wrapper_run_trace = TraceTemplate(
+    op_type="moe",
+    name_prefix="b12x_moe_wrapper",
+    description="B12xMoEWrapper.run(): wrapper form of b12x_fused_moe.",
+    axes=_b12x_wrapper_axes,
+    inputs=_b12x_wrapper_inputs,
+    outputs=dict(b12x_fused_moe_trace.outputs),
+    tags=b12x_fused_moe_trace.tags,
+    reference=_b12x_fused_moe_reference,
+)
diff --git a/flashinfer/trace/templates/norm.py b/flashinfer/trace/templates/norm.py
new file mode 100644
index 0000000000..dabb5dcbd1
--- /dev/null
+++ b/flashinfer/trace/templates/norm.py
@@ -0,0 +1,305 @@
+# Copyright (c) 2025 by FlashInfer team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""TraceTemplates for normalization operations."""
+
+import torch
+
+from ..template import Const, Scalar, Tensor, TraceTemplate, Var
+
+# ── RMSNorm ───────────────────────────────────────────────────────────────────
+
+
+@torch.no_grad()
+def _rmsnorm_reference(hidden_states, weight):
+    """Root Mean Square Normalization. Epsilon is fixed at 1e-6."""
+    EPS = 1e-6
+    x = hidden_states.to(torch.float32)
+    inv_rms = torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + EPS)
+    y = (x * inv_rms) * weight.to(torch.float32)
+    return y.to(hidden_states.dtype)
+
+
+rmsnorm_trace = TraceTemplate(
+    op_type="rmsnorm",
+    name_prefix="rmsnorm",
+    description="Root Mean Square Normalization. Epsilon is fixed at 1e-6.",
+    axes={
+        "batch_size": Var(),
+        "hidden_size": Const(abbrev="h"),
+    },
+    inputs={
+        "hidden_states": Tensor(["batch_size", "hidden_size"], param="input"),
+        "weight": Tensor(["hidden_size"]),
+    },
+    outputs={
+        "output": Tensor(["batch_size", "hidden_size"], dtype_from="input"),
+    },
+    tags=["status:verified"],
+    reference=_rmsnorm_reference,
+)
+
+# ── Fused Add + RMSNorm ───────────────────────────────────────────────────────
+
+
+@torch.no_grad()
+def _fused_add_rmsnorm_reference(hidden_states, residual, weight):
+    """Fused Add + RMSNorm. Epsilon is fixed at 1e-6."""
+    EPS = 1e-6
+    x = hidden_states.to(torch.float32) + residual.to(torch.float32)
+    inv_rms = torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + EPS)
+    y = (x * inv_rms) * weight.to(torch.float32)
+    return y.to(hidden_states.dtype)
+
+
+fused_add_rmsnorm_trace = TraceTemplate(
+    op_type="rmsnorm",
+    name_prefix="fused_add_rmsnorm",
+    description="Fused Add + RMSNorm. Epsilon is fixed at 1e-6.",
+    axes={
+        "batch_size": Var(),
+        "hidden_size": Const(abbrev="h"),
+    },
+    inputs={
+        "hidden_states": Tensor(["batch_size", "hidden_size"], param="input"),
+        "residual": Tensor(["batch_size", "hidden_size"]),
+        "weight": Tensor(["hidden_size"]),
+    },
+    outputs={
+        "output": Tensor(["batch_size", "hidden_size"], dtype_from="input"),
+        "residual": Tensor(
+            ["batch_size", "hidden_size"],
+            dtype_from="input",
+            description="Updated residual (in-place: residual += hidden_states).",
+        ),
+    },
+    tags=["status:verified", "fused"],
+    reference=_fused_add_rmsnorm_reference,
+)
+
+# ── RMSNorm + FP8 Quantize ────────────────────────────────────────────────────
+
+
+@torch.no_grad()
+def _rmsnorm_quant_reference(hidden_states, weight, scale):
+    """RMSNorm followed by per-tensor FP8 (e4m3fn) quantization.
+
+    ``out = clamp(rmsnorm(input, weight) / scale, fp8_min, fp8_max).to(fp8_e4m3fn)``.
+    Epsilon is fixed at 1e-6.
+    """
+    EPS = 1e-6
+    x = hidden_states.to(torch.float32)
+    inv_rms = torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + EPS)
+    y = (x * inv_rms) * weight.to(torch.float32)
+    s = (
+        scale.to(torch.float32).reshape(())
+        if isinstance(scale, torch.Tensor)
+        else float(scale)
+    )
+    y = y / s
+    fp8_max = 448.0  # float8_e4m3fn max finite value
+    y = y.clamp(-fp8_max, fp8_max)
+    return y.to(torch.float8_e4m3fn)
+
+
+rmsnorm_quant_trace = TraceTemplate(
+    op_type="rmsnorm",
+    name_prefix="rmsnorm_quant",
+    description="RMSNorm + FP8 quantization. out = quantize(rmsnorm(input, weight), scale).",
+    axes={
+        "batch_size": Var(),
+        "hidden_size": Const(abbrev="h"),
+    },
+    inputs={
+        "hidden_states": Tensor(["batch_size", "hidden_size"], param="input"),
+        "weight": Tensor(["hidden_size"]),
+        "scale": Scalar(
+            "float32", description="Per-tensor quantization scale, shape (1,)."
+        ),
+    },
+    outputs={
+        "out": Tensor(
+            ["batch_size", "hidden_size"],
+            description="Quantized output (dtype matches pre-allocated out tensor).",
+        ),
+    },
+    tags=["status:verified", "quantization:fp8"],
+    reference=_rmsnorm_quant_reference,
+)
+
+# ── Fused Add + RMSNorm + FP8 Quantize ───────────────────────────────────────
+
+
+@torch.no_grad()
+def _fused_add_rmsnorm_quant_reference(hidden_states, residual, weight, scale):
+    """Fused Add + RMSNorm + FP8 quantize.
+
+    ``residual' = hidden_states + residual``
+    ``out = quantize(rmsnorm(residual', weight), scale)``
+    Returns ``(out, residual')``.
+    """
+    EPS = 1e-6
+    x = hidden_states.to(torch.float32) + residual.to(torch.float32)
+    inv_rms = torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + EPS)
+    y = (x * inv_rms) * weight.to(torch.float32)
+    s = (
+        scale.to(torch.float32).reshape(())
+        if isinstance(scale, torch.Tensor)
+        else float(scale)
+    )
+    y = y / s
+    fp8_max = 448.0
+    y = y.clamp(-fp8_max, fp8_max).to(torch.float8_e4m3fn)
+    return y, x.to(hidden_states.dtype)
+
+
+fused_add_rmsnorm_quant_trace = TraceTemplate(
+    op_type="rmsnorm",
+    name_prefix="fused_add_rmsnorm_quant",
+    description=(
+        "Fused Add + RMSNorm + FP8 quantization. "
+        "residual += input; out = quantize(rmsnorm(residual, weight), scale)."
+    ),
+    axes={
+        "batch_size": Var(),
+        "hidden_size": Const(abbrev="h"),
+    },
+    inputs={
+        "hidden_states": Tensor(["batch_size", "hidden_size"], param="input"),
+        "residual": Tensor(["batch_size", "hidden_size"]),
+        "weight": Tensor(["hidden_size"]),
+        "scale": Scalar(
+            "float32", description="Per-tensor quantization scale, shape (1,)."
+        ),
+    },
+    outputs={
+        "out": Tensor(
+            ["batch_size", "hidden_size"],
+            description="Quantized output (dtype matches pre-allocated out tensor).",
+        ),
+        "residual": Tensor(
+            ["batch_size", "hidden_size"],
+            dtype_from="input",
+            description="Updated residual (in-place: residual += input).",
+        ),
+    },
+    tags=["status:verified", "fused", "quantization:fp8"],
+    reference=_fused_add_rmsnorm_quant_reference,
+)
+
+# ── Gemma RMSNorm ─────────────────────────────────────────────────────────────
+
+
+@torch.no_grad()
+def _gemma_rmsnorm_reference(input, weight):
+    """Gemma-style RMSNorm: out = rmsnorm(input) * (weight + 1). Epsilon fixed at 1e-6."""
+    EPS = 1e-6
+    x = input.to(torch.float32)
+    inv_rms = torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + EPS)
+    return (x * inv_rms * (weight.to(torch.float32) + 1)).to(input.dtype)
+
+
+gemma_rmsnorm_trace = TraceTemplate(
+    op_type="rmsnorm",
+    name_prefix="gemma_rmsnorm",
+    description="Gemma-style RMSNorm: out = rmsnorm(x) * (weight + 1).",
+    axes={
+        "batch_size": Var(),
+        "hidden_size": Const(abbrev="h"),
+    },
+    inputs={
+        "hidden_states": Tensor(["batch_size", "hidden_size"], param="input"),
+        "weight": Tensor(["hidden_size"]),
+    },
+    outputs={
+        "output": Tensor(["batch_size", "hidden_size"], dtype_from="input"),
+    },
+    tags=["status:verified", "model:gemma"],
+    reference=_gemma_rmsnorm_reference,
+)
+
+# ── Gemma Fused Add + RMSNorm ─────────────────────────────────────────────────
+
+
+@torch.no_grad()
+def _gemma_fused_add_rmsnorm_reference(input, residual, weight):
+    """Gemma-style Fused Add + RMSNorm."""
+    EPS = 1e-6
+    x = input.to(torch.float32) + residual.to(torch.float32)
+    inv_rms = torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + EPS)
+    return (x * inv_rms * (weight.to(torch.float32) + 1)).to(input.dtype)
+
+
+gemma_fused_add_rmsnorm_trace = TraceTemplate(
+    op_type="rmsnorm",
+    name_prefix="gemma_fused_add_rmsnorm",
+    description="Gemma-style Fused Add + RMSNorm: residual += input; out = gemma_rmsnorm(residual).",
+    axes={
+        "batch_size": Var(),
+        "hidden_size": Const(abbrev="h"),
+    },
+    inputs={
+        "hidden_states": Tensor(["batch_size", "hidden_size"], param="input"),
+        "residual": Tensor(["batch_size", "hidden_size"]),
+        "weight": Tensor(["hidden_size"]),
+    },
+    outputs={
+        "output": Tensor(["batch_size", "hidden_size"], dtype_from="input"),
+        "residual": Tensor(
+            ["batch_size", "hidden_size"],
+            dtype_from="input",
+            description="Updated residual (in-place: residual += input).",
+        ),
+    },
+    tags=["status:verified", "fused", "model:gemma"],
+    reference=_gemma_fused_add_rmsnorm_reference,
+)
+
+# ── LayerNorm ─────────────────────────────────────────────────────────────────
+
+
+@torch.no_grad()
+def _layernorm_reference(input, weight, bias):
+    """Standard LayerNorm with gamma (weight) and beta (bias). Epsilon fixed at 1e-6."""
+    EPS = 1e-6
+    x = input.to(torch.float32)
+    mean = x.mean(dim=-1, keepdim=True)
+    var = ((x - mean) ** 2).mean(dim=-1, keepdim=True)
+    x_norm = (x - mean) / torch.sqrt(var + EPS)
+    return (x_norm * weight.to(torch.float32) + bias.to(torch.float32)).to(input.dtype)
+
+
+layernorm_trace = TraceTemplate(
+    op_type="layernorm",
+    name_prefix="layernorm",
+    description="Standard LayerNorm with gamma and beta. Epsilon fixed at 1e-6.",
+    axes={
+        "batch_size": Var(),
+        "hidden_size": Const(abbrev="h"),
+    },
+    inputs={
+        "hidden_states": Tensor(["batch_size", "hidden_size"], param="input"),
+        "weight": Tensor(
+            ["hidden_size"], param="gemma", description="Scale (gamma) tensor, float32."
+        ),
+        "bias": Tensor(
+            ["hidden_size"], param="beta", description="Bias (beta) tensor, float32."
+        ),
+    },
+    outputs={
+        "output": Tensor(["batch_size", "hidden_size"], dtype_from="input"),
+    },
+    tags=["status:verified"],
+    reference=_layernorm_reference,
+)
diff --git a/flashinfer/trace/templates/page.py b/flashinfer/trace/templates/page.py
new file mode 100644
index 0000000000..2080f481aa
--- /dev/null
+++ b/flashinfer/trace/templates/page.py
@@ -0,0 +1,507 @@
+# Copyright (c) 2025 by FlashInfer team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""TraceTemplates for paged-KV cache append operations."""
+
+import math
+
+import torch
+
+from ..template import Const, Scalar, Tensor, TraceTemplate, Var
+
+
+@torch.no_grad()
+def _append_paged_kv_cache_reference(
+    append_key,
+    append_value,
+    batch_indices,
+    positions,
+    paged_kv_cache,
+    kv_indices,
+    kv_indptr,
+    kv_last_page_len,
+    kv_layout="NHD",
+    **_unused,
+):
+    """Append (append_key, append_value) into the paged KV cache at the
+    specified (batch_indices, positions) offsets.
+
+    Mutates ``paged_kv_cache`` in place. Accepts both tuple ``(k, v)`` and
+    single-tensor interleaved layouts. Only the NHD layout is modelled here;
+    HND is a permutation of the same data.
+    """
+    if isinstance(paged_kv_cache, tuple):
+        k_cache, v_cache = paged_kv_cache
+    else:
+        # Single tensor: [num_pages, 2, page_size, num_kv_heads, head_dim] in NHD
+        k_cache = paged_kv_cache[:, 0]
+        v_cache = paged_kv_cache[:, 1]
+    N = int(batch_indices.shape[0])
+    page_size = k_cache.shape[1] if kv_layout == "NHD" else k_cache.shape[2]
+    for i in range(N):
+        b = int(batch_indices[i].item())
+        pos = int(positions[i].item())
+        page_offset = pos // page_size
+        in_page_offset = pos % page_size
+        # kv_indices maps to the global page id for this (batch, page_offset).
+        idx_base = int(kv_indptr[b].item())
+        page_id = int(kv_indices[idx_base + page_offset].item())
+        if kv_layout == "NHD":
+            k_cache[page_id, in_page_offset] = append_key[i]
+            v_cache[page_id, in_page_offset] = append_value[i]
+        else:  # HND
+            k_cache[page_id, :, in_page_offset] = append_key[i]
+            v_cache[page_id, :, in_page_offset] = append_value[i]
+    return paged_kv_cache
+
+
+append_paged_kv_cache_trace = TraceTemplate(
+    op_type="page_append",
+    name_prefix="append_paged_kv_cache",
+    description=(
+        "Append a batch of (key, value) rows into a paged KV cache at "
+        "positions determined by (batch_indices, positions) and the per-seq "
+        "kv_indptr/kv_indices/kv_last_page_len layout."
+    ),
+    axes={
+        "nnz_kv": Var(description="Total K/V tokens to append."),
+        "num_kv_heads": Const(abbrev="kv"),
+        "head_dim": Const(abbrev="d"),
+        "num_pages": Var(),
+        "page_size": Const(abbrev="ps"),
+        "batch_size": Var(),
+        "batch_size_plus_1": Var(description="batch_size + 1."),
+        "num_kv_indices": Var(description="Flat length of kv_indices."),
+    },
+    inputs={
+        "append_key": Tensor(["nnz_kv", "num_kv_heads", "head_dim"]),
+        "append_value": Tensor(["nnz_kv", "num_kv_heads", "head_dim"]),
+        "batch_indices": Tensor(
+            ["nnz_kv"],
+            dtype="int32",
+            description="Per-token batch index.",
+        ),
+        "positions": Tensor(
+            ["nnz_kv"],
+            dtype="int32",
+            description="Per-token absolute position.",
+        ),
+        "paged_kv_cache": Tensor(
+            ["num_pages", "page_size", "num_kv_heads", "head_dim"],
+            description="Paged KV cache (tuple or single tensor).",
+        ),
+        "kv_indices": Tensor(["num_kv_indices"], dtype="int32"),
+        "kv_indptr": Tensor(["batch_size_plus_1"], dtype="int32"),
+        "kv_last_page_len": Tensor(["batch_size"], dtype="int32"),
+    },
+    outputs={
+        "paged_kv_cache": Tensor(
+            ["num_pages", "page_size", "num_kv_heads", "head_dim"],
+            dtype_from="append_key",
+            description="Updated paged KV cache (in-place).",
+        ),
+    },
+    constraints=["batch_size_plus_1 == batch_size + 1"],
+    tags=["status:verified"],
+    reference=_append_paged_kv_cache_reference,
+)
+
+
+@torch.no_grad()
+def _append_paged_mla_kv_cache_reference(
+    append_ckv,
+    append_kpe,
+    batch_indices,
+    positions,
+    ckv_cache,
+    kpe_cache,
+    kv_indices,
+    kv_indptr,
+    kv_last_page_len,
+    **_unused,
+):
+    """Append (append_ckv, append_kpe) into the MLA paged KV cache."""
+    if ckv_cache is None or kpe_cache is None:
+        return ckv_cache, kpe_cache
+    N = int(batch_indices.shape[0])
+    page_size = ckv_cache.shape[1]
+    for i in range(N):
+        b = int(batch_indices[i].item())
+        pos = int(positions[i].item())
+        page_offset = pos // page_size
+        in_page_offset = pos % page_size
+        idx_base = int(kv_indptr[b].item())
+        page_id = int(kv_indices[idx_base + page_offset].item())
+        ckv_cache[page_id, in_page_offset] = append_ckv[i]
+        kpe_cache[page_id, in_page_offset] = append_kpe[i]
+    return ckv_cache, kpe_cache
+
+
+append_paged_mla_kv_cache_trace = TraceTemplate(
+    op_type="page_append",
+    name_prefix="append_paged_mla_kv_cache",
+    description=(
+        "Append MLA (ckv, kpe) rows into an MLA paged KV cache. Same "
+        "indexing scheme as append_paged_kv_cache but with the MLA latent "
+        "split (ckv ~ head_dim_ckv=512, kpe ~ head_dim_kpe=64)."
+    ),
+    axes={
+        "nnz_kv": Var(description="Total K/V tokens to append."),
+        "head_dim_ckv": Const(abbrev="ckv"),
+        "head_dim_kpe": Const(abbrev="kpe"),
+        "num_pages": Var(),
+        # page_size is Var because ckv_cache / kpe_cache are optional.
+        "page_size": Var(description="Size of each page (from optional cache)."),
+        "batch_size": Var(),
+        "batch_size_plus_1": Var(description="batch_size + 1."),
+        "num_kv_indices": Var(),
+    },
+    inputs={
+        "append_ckv": Tensor(["nnz_kv", "head_dim_ckv"]),
+        "append_kpe": Tensor(["nnz_kv", "head_dim_kpe"]),
+        "batch_indices": Tensor(["nnz_kv"], dtype="int32"),
+        "positions": Tensor(["nnz_kv"], dtype="int32"),
+        "ckv_cache": Tensor(
+            ["num_pages", "page_size", "head_dim_ckv"],
+            optional=True,
+        ),
+        "kpe_cache": Tensor(
+            ["num_pages", "page_size", "head_dim_kpe"],
+            optional=True,
+        ),
+        "kv_indices": Tensor(["num_kv_indices"], dtype="int32"),
+        "kv_indptr": Tensor(["batch_size_plus_1"], dtype="int32"),
+        "kv_last_page_len": Tensor(["batch_size"], dtype="int32"),
+    },
+    outputs={
+        "ckv_cache": Tensor(
+            ["num_pages", "page_size", "head_dim_ckv"],
+            dtype_from="append_ckv",
+            description="Updated compressed KV cache (in-place).",
+        ),
+        "kpe_cache": Tensor(
+            ["num_pages", "page_size", "head_dim_kpe"],
+            dtype_from="append_kpe",
+            description="Updated KPE cache (in-place).",
+        ),
+    },
+    constraints=["batch_size_plus_1 == batch_size + 1"],
+    tags=["status:verified"],
+    reference=_append_paged_mla_kv_cache_reference,
+)
+
+
+# ── XQA attention (paged KV + block-tables) ──────────────────────────────────
+
+_XQA_AXES: dict[str, Var | Const] = {
+    "num_tokens": Var(),
+    "num_heads_qo": Const(abbrev="h"),
+    "num_kv_heads": Const(abbrev="kv"),
+    "head_dim": Const(abbrev="d"),
+    "num_pages": Var(),
+    "page_size": Const(abbrev="ps"),
+    "batch_size": Var(),
+    "max_pages_per_seq": Var(),
+}
+
+
+@torch.no_grad()
+def _xqa_reference(
+    q,
+    k_cache,
+    v_cache,
+    page_table,
+    seq_lens,
+    output=None,
+    **_unused,
+):
+    """Reference XQA decode: page-gather + SDPA per batch item. kv_layout=NHD."""
+    _, num_heads_qo, head_dim = (
+        q.shape if q.dim() == 3 else q.reshape(-1, q.shape[-2], q.shape[-1]).shape
+    )
+    q_flat = q.reshape(-1, num_heads_qo, head_dim)
+    num_kv_heads = k_cache.shape[-2]
+    gqa_ratio = num_heads_qo // num_kv_heads
+    batch_size = page_table.shape[0]
+    page_size = k_cache.shape[1]
+    sm_scale = 1.0 / math.sqrt(head_dim)
+    out = torch.zeros_like(q_flat, dtype=torch.float32)
+    for b in range(batch_size):
+        kv_len = int(seq_lens[b].item())
+        n_pages_used = (kv_len + page_size - 1) // page_size
+        pages = page_table[b, :n_pages_used].to(torch.long)
+        k_b = k_cache[pages].reshape(-1, num_kv_heads, head_dim)[:kv_len]
+        v_b = v_cache[pages].reshape(-1, num_kv_heads, head_dim)[:kv_len]
+        for h in range(num_heads_qo):
+            kv_h = h // gqa_ratio
+            logits = (
+                q_flat[b, h].to(torch.float32) @ k_b[:, kv_h].to(torch.float32).T
+            ) * sm_scale
+            attn = torch.softmax(logits, dim=-1)
+            out[b, h] = attn @ v_b[:, kv_h].to(torch.float32)
+    result = out.reshape(*q.shape).to(q.dtype)
+    if output is not None:
+        output.copy_(result)
+    return result
+
+
+@torch.no_grad()
+def _xqa_mla_reference(
+    q,
+    k_cache,
+    v_cache,
+    page_table,
+    seq_lens,
+    output=None,
+    output_dtype=None,
+    **_unused,
+):
+    """Reference XQA MLA decode: page-gather + SDPA with ckv/kpe split.
+
+    In MLA the K cache and V "cache" share the latent representation: K is
+    [ckv ‖ rope]; V is the first ckv_len dims of that same tensor. This
+    reference models that by slicing the first ``v_head_dim`` columns of
+    ``v_cache`` (which the kernel treats as the V tensor) for the AV matmul.
+    The output has shape ``[..., num_heads_qo, v_head_dim]``.
+    """
+    head_dim_qk = q.shape[-1]
+    v_head_dim = v_cache.shape[-1]
+    batch_size = page_table.shape[0]
+    page_size = k_cache.shape[1]
+    num_heads_qo = q.shape[-2] if q.dim() >= 3 else 1
+    q_flat = q.reshape(-1, num_heads_qo, head_dim_qk)
+    sm_scale = 1.0 / math.sqrt(head_dim_qk)
+    out_shape = q.shape[:-1] + (v_head_dim,)
+    out = torch.zeros(
+        (q_flat.shape[0], num_heads_qo, v_head_dim),
+        dtype=torch.float32,
+        device=q.device,
+    )
+    for b in range(batch_size):
+        kv_len = int(seq_lens[b].item())
+        n_pages_used = (kv_len + page_size - 1) // page_size
+        pages = page_table[b, :n_pages_used].to(torch.long)
+        k_b = k_cache[pages].reshape(-1, head_dim_qk)[:kv_len].to(torch.float32)
+        # V shares the K latent — slice the first v_head_dim columns.
+        v_b = k_b[:, :v_head_dim]
+        for h in range(num_heads_qo):
+            logits = q_flat[b, h].to(torch.float32) @ k_b.T * sm_scale
+            attn = torch.softmax(logits, dim=-1)
+            out[b, h] = attn @ v_b
+    dtype = output_dtype or q.dtype
+    result = out.reshape(out_shape).to(dtype)
+    if output is not None:
+        output.copy_(result)
+    return result
+
+
+xqa_trace = TraceTemplate(
+    op_type="xqa",
+    name_prefix="xqa",
+    description=(
+        "XQA (Cross-Query Attention) paged decode kernel. Fast decode path "
+        "with separate k/v caches and rectangular page_table[batch_size, "
+        "num_pages_per_seq]."
+    ),
+    axes=_XQA_AXES,
+    inputs={
+        "q": Tensor(["num_tokens", "num_heads_qo", "head_dim"]),
+        "k_cache": Tensor(["num_pages", "num_kv_heads", "page_size", "head_dim"]),
+        "v_cache": Tensor(["num_pages", "num_kv_heads", "page_size", "head_dim"]),
+        "page_table": Tensor(
+            ["batch_size", "max_pages_per_seq"],
+            dtype="int32",
+        ),
+        "seq_lens": Tensor(["batch_size"], dtype="int32"),
+    },
+    outputs={
+        "output": Tensor(
+            ["num_tokens", "num_heads_qo", "head_dim"],
+            dtype_from="q",
+        ),
+    },
+    tags=["status:verified", "backend:xqa"],
+    reference=_xqa_reference,
+)
+
+
+xqa_mla_trace = TraceTemplate(
+    op_type="xqa",
+    name_prefix="xqa_mla",
+    description=(
+        "XQA MLA decode: MLA (ckv + kpe) latent split applied to the XQA "
+        "paged decode path."
+    ),
+    axes={
+        "num_tokens": Var(),
+        "num_heads_qo": Const(abbrev="h"),
+        "head_dim_ckv": Const(abbrev="ckv"),
+        "head_dim_kpe": Const(abbrev="kpe"),
+        "num_pages": Var(),
+        "page_size": Const(abbrev="ps"),
+        "batch_size": Var(),
+        "max_pages_per_seq": Var(),
+    },
+    inputs={
+        "q": Tensor(["num_tokens", "num_heads_qo", "head_dim_ckv"]),
+        "k_cache": Tensor(["num_pages", "page_size", "head_dim_ckv"]),
+        "v_cache": Tensor(["num_pages", "page_size", "head_dim_kpe"]),
+        "page_table": Tensor(
+            ["batch_size", "max_pages_per_seq"],
+            dtype="int32",
+        ),
+        "seq_lens": Tensor(["batch_size"], dtype="int32"),
+    },
+    outputs={
+        "output": Tensor(
+            ["num_tokens", "num_heads_qo", "head_dim_ckv"],
+            dtype_from="q",
+        ),
+    },
+    tags=["status:verified", "backend:xqa", "mla"],
+    reference=_xqa_mla_reference,
+)
+
+
+# ── TRTLLM FMHA v2 prefill ──────────────────────────────────────────────────
+
+
+@torch.no_grad()
+def _trtllm_fmha_v2_prefill_reference(
+    qkv,
+    seq_lens,
+    max_q_len,
+    max_kv_len,
+    bmm1_scale,
+    bmm2_scale,
+    batch_size,
+    cum_seq_lens_q,
+    cum_seq_lens_kv,
+    **_unused,
+):
+    """Reference for TRT-LLM FMHA v2 prefill.
+
+    Assumes qkv is either a single fused tensor [total_tokens, 3, H, D]
+    or a tuple (q, k, v). Treats the workload as causal SDPA per batch.
+    """
+    if isinstance(qkv, tuple):
+        q, k, v = qkv[0], qkv[1], qkv[2] if len(qkv) == 3 else qkv[1]
+    elif qkv.dim() == 4 and qkv.shape[1] == 3:
+        q, k, v = qkv[:, 0], qkv[:, 1], qkv[:, 2]
+    else:
+        q = qkv
+        k = qkv
+        v = qkv
+    out = torch.zeros_like(q, dtype=torch.float32)
+    num_heads = q.shape[-2]
+    for b in range(int(batch_size)):
+        q_start = int(cum_seq_lens_q[b].item())
+        q_end = int(cum_seq_lens_q[b + 1].item())
+        kv_start = int(cum_seq_lens_kv[b].item())
+        kv_end = int(cum_seq_lens_kv[b + 1].item())
+        q_b = q[q_start:q_end].to(torch.float32)
+        k_b = k[kv_start:kv_end].to(torch.float32)
+        v_b = v[kv_start:kv_end].to(torch.float32)
+        qi = q_end - q_start
+        kv_len = kv_end - kv_start
+        delta = kv_len - qi
+        for h in range(num_heads):
+            logits = (q_b[:, h] @ k_b[:, h].T) * float(bmm1_scale)
+            mask = torch.full_like(logits, float("-inf"))
+            for i in range(qi):
+                mask[i, : i + 1 + max(0, delta)] = 0.0
+            logits = logits + mask
+            attn = torch.softmax(logits, dim=-1)
+            out[q_start:q_end, h] = (attn @ v_b[:, h]) * float(bmm2_scale)
+    return out.to(q.dtype)
+
+
+@torch.no_grad()
+def _tgv_gemm_sm100_reference(a, b, bias, **_unused):
+    """TGV GEMM: C = A @ B + bias."""
+    return (a.to(torch.float32) @ b.to(torch.float32) + bias.to(torch.float32)).to(
+        a.dtype
+    )
+
+
+# ── TRTLLM FMHA v2 prefill (original) ──────────────────────────────────────
+
+trtllm_fmha_v2_prefill_trace = TraceTemplate(
+    op_type="trtllm_paged",
+    name_prefix="trtllm_fmha_v2_prefill",
+    description=(
+        "TRT-LLM FMHA v2 prefill. Accepts fused qkv or separate (q, kv), "
+        "variable-length sequences with cum_seq_lens_q/kv."
+    ),
+    axes={
+        "num_tokens": Var(),
+        "num_heads": Const(abbrev="h"),
+        "head_dim": Const(abbrev="d"),
+        "batch_size": Var(),
+        "batch_size_plus_1_q": Var(description="batch_size + 1 for cum_seq_lens_q."),
+        "batch_size_plus_1_kv": Var(description="batch_size + 1 for cum_seq_lens_kv."),
+    },
+    inputs={
+        "qkv": Tensor(
+            ["num_tokens", "num_heads", "head_dim"],
+            description="Fused qkv or q tensor (layout determined by input_layout).",
+        ),
+        "seq_lens": Tensor(["batch_size"], dtype="int32"),
+        "max_q_len": Scalar("int32"),
+        "max_kv_len": Scalar("int32"),
+        "bmm1_scale": Scalar("float32"),
+        "bmm2_scale": Scalar("float32"),
+        "batch_size_scalar": Scalar("int32", param="batch_size"),
+        "cum_seq_lens_q": Tensor(["batch_size_plus_1_q"], dtype="int32"),
+        "cum_seq_lens_kv": Tensor(["batch_size_plus_1_kv"], dtype="int32"),
+    },
+    outputs={
+        "output": Tensor(
+            ["num_tokens", "num_heads", "head_dim"],
+            dtype_from="qkv",
+        ),
+    },
+    tags=["status:verified", "stage:prefill", "backend:trtllm"],
+    reference=_trtllm_fmha_v2_prefill_reference,
+)
+
+
+# ── TGV GEMM SM100 ──────────────────────────────────────────────────────────
+
+tgv_gemm_sm100_trace = TraceTemplate(
+    op_type="gemm_bf16",
+    name_prefix="tgv_gemm_sm100",
+    description=(
+        "TGV GEMM on SM100: C = A @ B + bias. Automatic dtype detection "
+        "(bf16/fp16). Intended for the TRT-LLM TGV backend."
+    ),
+    axes={
+        "M": Var(),
+        "N": Const(),
+        "K": Const(),
+    },
+    inputs={
+        "a": Tensor(["M", "K"]),
+        "b": Tensor(
+            ["K", "N"],
+            description="Weight matrix in column-major layout.",
+        ),
+        "bias": Tensor(["N"], description="Bias tensor."),
+    },
+    outputs={
+        "output": Tensor(["M", "N"], dtype_from="a"),
+    },
+    tags=["status:verified", "backend:tgv"],
+    reference=_tgv_gemm_sm100_reference,
+)
diff --git a/flashinfer/trace/templates/quantize.py b/flashinfer/trace/templates/quantize.py
new file mode 100644
index 0000000000..767ea42d20
--- /dev/null
+++ b/flashinfer/trace/templates/quantize.py
@@ -0,0 +1,369 @@
+# Copyright (c) 2025 by FlashInfer team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""TraceTemplates for FP4 / FP8 quantization APIs."""
+
+from typing import Dict, Optional, Tuple, Union
+
+import torch
+
+from ..template import Const, Scalar, Tensor, TraceTemplate, Var
+
+_AxisT = Union[Var, Const]
+
+
+# ── Reference helpers ────────────────────────────────────────────────────────
+
+_E2M1_VALUES = [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0]  # FP4 e2m1fn magnitudes
+
+
+@torch.no_grad()
+def _fp4_e2m1_quantize_block(
+    block: torch.Tensor, amax_per_block: torch.Tensor
+) -> torch.Tensor:
+    """Round a float block to the nearest FP4 e2m1fn value and pack sign/magnitude.
+
+    Returns an int64 tensor with values in [0, 15] matching the nibble codes
+    used by ``_unpack_fp4_e2m1`` in moe.py: low 3 bits = magnitude index,
+    high bit = sign.
+    """
+    values = torch.tensor(_E2M1_VALUES, dtype=torch.float32, device=block.device)
+    sign_bit = (block < 0).to(torch.int64) << 3
+    mag = block.abs()
+    # Nearest-magnitude index among the 8 e2m1 values.
+    diffs = (mag.unsqueeze(-1) - values).abs()
+    idx = diffs.argmin(dim=-1)
+    return (idx | sign_bit) & 0x0F
+
+
+@torch.no_grad()
+def _pack_fp4_pairs(nibbles: torch.Tensor) -> torch.Tensor:
+    """Pack pairs of 4-bit codes along the last axis into uint8 bytes.
+
+    Low nibble = first element (matches _unpack_fp4_e2m1).
+    """
+    assert nibbles.shape[-1] % 2 == 0
+    lo = nibbles[..., 0::2]
+    hi = nibbles[..., 1::2]
+    packed = (lo | (hi << 4)).to(torch.uint8)
+    return packed
+
+
+@torch.no_grad()
+def _quantize_fp4_block_scale(
+    input_tensor: torch.Tensor,
+    block_size: int,
+    use_ue8m0: bool,
+    global_scale: Optional[torch.Tensor] = None,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Reference FP4 block-scale quantization.
+
+    Returns ``(packed_uint8, scales)`` where ``scales`` has dtype
+    ``float8_e4m3fn`` when ``use_ue8m0`` is False (NvFP4) and ``uint8``
+    (UE8M0) otherwise (MXFP4).
+    """
+    M, K = input_tensor.shape
+    assert K % block_size == 0
+    x = input_tensor.to(torch.float32)
+    blocks = x.reshape(M, K // block_size, block_size)
+    amax = blocks.abs().amax(dim=-1)  # [M, K/bs]
+    # Per-block scale that maps amax to FP4 max magnitude (6.0).
+    block_scale = amax / 6.0
+    # Optional global scale factor applied before block scaling (NvFP4 path).
+    if global_scale is not None:
+        gs = global_scale.to(torch.float32).reshape(())
+        block_scale = block_scale * gs
+    if use_ue8m0:
+        # Round scale to the nearest power of two and encode as UE8M0 (uint8).
+        safe = torch.where(block_scale > 0, block_scale, torch.ones_like(block_scale))
+        exp = torch.floor(torch.log2(safe)).to(torch.int64)
+        exp = exp.clamp(-127, 128) + 127
+        scales_raw = exp.to(torch.uint8)
+        # Reconstruct the actual scale we quantized with for the packed values.
+        actual_scale = torch.pow(
+            torch.tensor(2.0, device=x.device), (exp - 127).to(torch.float32)
+        )
+    else:
+        scales_raw = block_scale.to(torch.float8_e4m3fn)
+        actual_scale = scales_raw.to(torch.float32)
+    # Avoid division by zero for all-zero blocks.
+    actual_scale = torch.where(
+        actual_scale > 0,
+        actual_scale,
+        torch.ones_like(actual_scale),
+    )
+    # Broadcast block scale back to element granularity and quantize.
+    scaled = blocks / actual_scale.unsqueeze(-1)
+    nibbles = _fp4_e2m1_quantize_block(scaled, amax)
+    nibbles = nibbles.reshape(M, K)
+    packed = _pack_fp4_pairs(nibbles)
+    return packed, scales_raw
+
+
+@torch.no_grad()
+def _quantize_mxfp8(
+    input_tensor: torch.Tensor, block_size: int = 32
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Reference MXFP8 quantization: fp8_e4m3fn values with UE8M0 per-32 scales."""
+    M, K = input_tensor.shape
+    assert K % block_size == 0
+    x = input_tensor.to(torch.float32)
+    blocks = x.reshape(M, K // block_size, block_size)
+    amax = blocks.abs().amax(dim=-1)
+    # fp8_e4m3fn max finite value is 448.0.
+    block_scale = amax / 448.0
+    safe = torch.where(block_scale > 0, block_scale, torch.ones_like(block_scale))
+    exp = torch.floor(torch.log2(safe)).to(torch.int64)
+    exp = exp.clamp(-127, 128) + 127
+    scales_raw = exp.to(torch.uint8)
+    actual_scale = torch.pow(
+        torch.tensor(2.0, device=x.device), (exp - 127).to(torch.float32)
+    )
+    actual_scale = torch.where(
+        actual_scale > 0, actual_scale, torch.ones_like(actual_scale)
+    )
+    scaled = blocks / actual_scale.unsqueeze(-1)
+    quantized = scaled.clamp(-448.0, 448.0).to(torch.float8_e4m3fn).reshape(M, K)
+    return quantized, scales_raw
+
+
+@torch.no_grad()
+def _fp4_quantize_reference(
+    input: torch.Tensor,
+    global_scale: Optional[torch.Tensor] = None,
+    sf_vec_size: int = 16,
+    sf_use_ue8m0: bool = False,
+    is_sf_swizzled_layout: bool = True,
+    is_sf_8x4_layout: bool = False,
+    enable_pdl: Optional[bool] = None,
+    backend: str = "cuda",
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Reference FP4 quantize. Produces packed uint8 + scales in LINEAR layout.
+
+    The runtime API may return scales in a swizzled layout; consumers should
+    dequantize before comparing.
+    """
+    packed, scales = _quantize_fp4_block_scale(
+        input.reshape(-1, input.shape[-1]),
+        block_size=int(sf_vec_size),
+        use_ue8m0=bool(sf_use_ue8m0),
+        global_scale=global_scale,
+    )
+    packed = packed.reshape(*input.shape[:-1], input.shape[-1] // 2)
+    scales = scales.reshape(*input.shape[:-1], input.shape[-1] // int(sf_vec_size))
+    return packed, scales
+
+
+@torch.no_grad()
+def _nvfp4_quantize_reference(
+    a: torch.Tensor,
+    a_global_sf: torch.Tensor,
+    sfLayout=None,
+    do_shuffle: bool = False,
+    sf_vec_size: int = 16,
+    enable_pdl: Optional[bool] = None,
+    backend: str = "cuda",
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Reference NvFP4 quantize (block_size=16, fp8_e4m3fn scales)."""
+    return _fp4_quantize_reference(
+        a,
+        global_scale=a_global_sf,
+        sf_vec_size=sf_vec_size,
+        sf_use_ue8m0=False,
+    )
+
+
+@torch.no_grad()
+def _mxfp4_quantize_reference(
+    a: torch.Tensor,
+    backend: str = "cuda",
+    enable_pdl: Optional[bool] = None,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Reference MXFP4 quantize (block_size=32, UE8M0 scales)."""
+    return _fp4_quantize_reference(
+        a,
+        global_scale=None,
+        sf_vec_size=32,
+        sf_use_ue8m0=True,
+    )
+
+
+@torch.no_grad()
+def _mxfp8_quantize_reference(
+    input: torch.Tensor,
+    is_sf_swizzled_layout: bool = True,
+    alignment: int = 32,
+    enable_pdl: Optional[bool] = None,
+    backend: str = "cuda",
+    sf_swizzle_layout=None,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Reference MXFP8 quantize (block_size=32, UE8M0 scales)."""
+    return _quantize_mxfp8(
+        input.reshape(-1, input.shape[-1]),
+        block_size=int(alignment),
+    )
+
+
+# ── FP4 quantization (generic) ───────────────────────────────────────────────
+# input [M, K]  →  (quantized [M, K/2] uint8 packed,  scales [variable])
+
+_FP4_AXES: Dict[str, _AxisT] = {
+    "M": Var(description="Number of rows."),
+    "K": Const(abbrev="k", description="Number of input columns."),
+    "K_packed": Var(
+        description="Packed column dimension (K/2 for FP4, two values per uint8).",
+    ),
+    "num_scale_elems": Var(
+        description="Total number of scale factor elements (layout-dependent)."
+    ),
+    "one": Var(description="Placeholder for shape [1] scalar tensors."),
+}
+
+fp4_quantize_trace = TraceTemplate(
+    op_type="quantization",
+    name_prefix="fp4_quantize",
+    description="Generic FP4 quantization: bf16/fp16 input → packed FP4 e2m1fn + block scales.",
+    axes=_FP4_AXES,
+    inputs={
+        "input": Tensor(
+            ["M", "K"],
+            param="input",
+            description="Input tensor, fp16/bf16/fp8_e4m3fn.",
+        ),
+        "global_scale": Tensor(
+            ["one"],
+            dtype="float32",
+            optional=True,
+            description="Optional per-tensor global scale (shape [1]).",
+        ),
+        "sf_vec_size": Scalar(
+            "int32",
+            optional=True,
+            description="Scale-factor vector size (16 for NVFP4, 32 for MXFP4).",
+        ),
+    },
+    outputs={
+        "quantized": Tensor(
+            ["M", "K_packed"],
+            dtype="uint8",
+            description="Packed FP4 output (two e2m1fn values per byte).",
+        ),
+        "scales": Tensor(
+            ["num_scale_elems"],
+            dtype="uint8",
+            description="Block scale factors packed as uint8 bytes (layout-dependent shape).",
+        ),
+    },
+    constraints=["K_packed == K // 2"],
+    tags=["status:verified", "quantization:fp4"],
+    reference=_fp4_quantize_reference,
+)
+
+# ── NVFP4 quantization ────────────────────────────────────────────────────────
+nvfp4_quantize_trace = TraceTemplate(
+    op_type="quantization",
+    name_prefix="nvfp4_quantize",
+    description="NVFP4 quantization (sf_vec_size=16). Requires a per-tensor global scale.",
+    axes=_FP4_AXES,
+    inputs={
+        "a": Tensor(["M", "K"], description="Input tensor, fp16/bf16/fp8_e4m3fn."),
+        "a_global_sf": Tensor(
+            ["one"],
+            dtype="float32",
+            description="Global scale factor, shape [1].",
+        ),
+        "sf_vec_size": Scalar(
+            "int32",
+            optional=True,
+            description="Scale-factor vector size (fixed at 16 for NVFP4).",
+        ),
+    },
+    outputs={
+        "quantized": Tensor(
+            ["M", "K_packed"],
+            dtype="uint8",
+            description="Packed FP4 output.",
+        ),
+        "scales": Tensor(
+            ["num_scale_elems"],
+            dtype="uint8",
+            description="Block scale factors packed as uint8 bytes (layout-dependent shape).",
+        ),
+    },
+    constraints=["K_packed == K // 2"],
+    tags=["status:verified", "quantization:nvfp4"],
+    reference=_nvfp4_quantize_reference,
+)
+
+# ── MXFP4 quantization ────────────────────────────────────────────────────────
+mxfp4_quantize_trace = TraceTemplate(
+    op_type="quantization",
+    name_prefix="mxfp4_quantize",
+    description="MXFP4 quantization (sf_vec_size=32, UE8M0 scales). No global scale.",
+    axes=_FP4_AXES,
+    inputs={
+        "a": Tensor(["M", "K"], description="Input tensor, fp16/bf16."),
+    },
+    outputs={
+        "quantized": Tensor(
+            ["M", "K_packed"],
+            dtype="uint8",
+            description="Packed FP4 output.",
+        ),
+        "scales": Tensor(
+            ["num_scale_elems"],
+            dtype="uint8",
+            description="UE8M0 block scale factors (1 byte per 32-element block).",
+        ),
+    },
+    constraints=["K_packed == K // 2"],
+    tags=["status:verified", "quantization:mxfp4"],
+    reference=_mxfp4_quantize_reference,
+)
+
+# ── MXFP8 quantization ────────────────────────────────────────────────────────
+
+mxfp8_quantize_trace = TraceTemplate(
+    op_type="quantization",
+    name_prefix="mxfp8_quantize",
+    description="MXFP8 quantization (block size 32, UE8M0 scales). Output is fp8_e4m3fn.",
+    axes={
+        "M": Var(description="Number of rows."),
+        "K": Const(abbrev="k", description="Number of input columns."),
+        "num_scale_elems": Var(
+            description="Total number of scale factor elements (layout-dependent)."
+        ),
+    },
+    inputs={
+        "input": Tensor(
+            ["M", "K"],
+            param="input",
+            description="Input tensor, fp16/bf16.",
+        ),
+    },
+    outputs={
+        "quantized": Tensor(
+            ["M", "K"],
+            dtype="float8_e4m3fn",
+            description="MXFP8 quantized output.",
+        ),
+        "scales": Tensor(
+            ["num_scale_elems"],
+            dtype="uint8",
+            description="UE8M0 block scale factors (1 byte per 32-element block).",
+        ),
+    },
+    tags=["status:verified", "quantization:mxfp8"],
+    reference=_mxfp8_quantize_reference,
+)
diff --git a/flashinfer/trace/templates/rope.py b/flashinfer/trace/templates/rope.py
new file mode 100644
index 0000000000..99ef43bd72
--- /dev/null
+++ b/flashinfer/trace/templates/rope.py
@@ -0,0 +1,567 @@
+# Copyright (c) 2025 by FlashInfer team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""TraceTemplates for RoPE (Rotary Position Embedding) operations."""
+
+import math
+from typing import Dict, Optional, Tuple, Union
+
+import torch
+
+from ..template import Const, Scalar, Tensor, TraceTemplate, Var
+
+_AxisT = Union[Var, Const]
+_InputT = Union[Tensor, Scalar]
+
+
+# ── Reference helpers ────────────────────────────────────────────────────────
+
+
+@torch.no_grad()
+def _rope_freqs(
+    rotary_dim: int,
+    rope_theta: float,
+    device: torch.device,
+) -> torch.Tensor:
+    """Base RoPE inverse-frequency vector (length rotary_dim // 2)."""
+    i = torch.arange(0, rotary_dim, 2, dtype=torch.float32, device=device)
+    return 1.0 / torch.pow(
+        torch.tensor(rope_theta, dtype=torch.float32, device=device), i / rotary_dim
+    )
+
+
+@torch.no_grad()
+def _llama31_freqs(
+    rotary_dim: int,
+    rope_theta: float,
+    rope_scale: float,
+    low_freq_factor: float,
+    high_freq_factor: float,
+    old_context_len: float,
+    device: torch.device,
+) -> torch.Tensor:
+    """Llama 3.1 piecewise NTK-aware frequency scaling."""
+    freqs = _rope_freqs(rotary_dim, rope_theta, device)
+    low_freq_wavelen = old_context_len / low_freq_factor
+    high_freq_wavelen = old_context_len / high_freq_factor
+    wavelen = 2 * math.pi / freqs
+    # Default: scale by 1/rope_scale (low-frequency regime).
+    new_freqs = freqs / rope_scale
+    # Smooth interpolation for mid-range.
+    smooth = (old_context_len / wavelen - low_freq_factor) / (
+        high_freq_factor - low_freq_factor
+    )
+    mid = (wavelen >= high_freq_wavelen) & (wavelen <= low_freq_wavelen)
+    new_freqs = torch.where(
+        mid,
+        (1.0 - smooth) * freqs / rope_scale + smooth * freqs,
+        new_freqs,
+    )
+    # High frequency (short wavelength): keep original.
+    new_freqs = torch.where(wavelen < high_freq_wavelen, freqs, new_freqs)
+    return new_freqs
+
+
+@torch.no_grad()
+def _rotate(
+    x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, interleave: bool
+) -> torch.Tensor:
+    """Apply RoPE rotation to the last ``rotary_dim`` channels of x.
+
+    cos/sin have shape ``[..., rotary_dim//2]`` broadcastable to x's leading
+    dims. If ``interleave`` the rotation is on even/odd pairs, otherwise on
+    the half-split halves (first-half / second-half).
+    """
+    rotary_dim = cos.shape[-1] * 2
+    x_rot = x[..., :rotary_dim]
+    x_pass = x[..., rotary_dim:]
+    if interleave:
+        x1 = x_rot[..., 0::2]
+        x2 = x_rot[..., 1::2]
+        rotated_1 = x1 * cos - x2 * sin
+        rotated_2 = x2 * cos + x1 * sin
+        interleaved = torch.stack([rotated_1, rotated_2], dim=-1)
+        rotated = interleaved.reshape(*x_rot.shape)
+    else:
+        half = rotary_dim // 2
+        x1 = x_rot[..., :half]
+        x2 = x_rot[..., half:]
+        rotated_1 = x1 * cos - x2 * sin
+        rotated_2 = x2 * cos + x1 * sin
+        rotated = torch.cat([rotated_1, rotated_2], dim=-1)
+    if x_pass.numel() == 0:
+        return rotated.to(x.dtype)
+    return torch.cat([rotated.to(x.dtype), x_pass], dim=-1)
+
+
+@torch.no_grad()
+def _positions_from_indptr(
+    indptr: torch.Tensor, offsets: torch.Tensor, nnz: int
+) -> torch.Tensor:
+    """Expand (indptr, offsets) into a per-token position tensor of length nnz."""
+    positions = torch.zeros(nnz, dtype=torch.float32, device=indptr.device)
+    batch_size = offsets.shape[0]
+    for b in range(batch_size):
+        start = int(indptr[b].item())
+        end = int(indptr[b + 1].item())
+        off = int(offsets[b].item())
+        n = end - start
+        if n > 0:
+            positions[start:end] = off + torch.arange(
+                n, dtype=torch.float32, device=indptr.device
+            )
+    return positions
+
+
+@torch.no_grad()
+def _apply_rope_core(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    positions: torch.Tensor,
+    freqs: torch.Tensor,
+    interleave: bool,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Shared core: given per-token positions and freqs, rotate q and k."""
+    # cos/sin: [nnz, rotary_dim//2]
+    angles = positions.unsqueeze(-1) * freqs.unsqueeze(0)
+    cos = torch.cos(angles).unsqueeze(1)  # [nnz, 1, rotary_dim//2]
+    sin = torch.sin(angles).unsqueeze(1)
+    q_rope = _rotate(q.to(torch.float32), cos, sin, interleave)
+    k_rope = _rotate(k.to(torch.float32), cos, sin, interleave)
+    return q_rope, k_rope
+
+
+# ── Per-template references ──────────────────────────────────────────────────
+
+
+@torch.no_grad()
+def _apply_rope_reference(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    indptr: torch.Tensor,
+    offsets: torch.Tensor,
+    rotary_dim: Optional[int] = None,
+    interleave: bool = False,
+    rope_scale: float = 1,
+    rope_theta: float = 1e4,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    if rotary_dim is None:
+        rotary_dim = q.shape[-1]
+    freqs = _rope_freqs(rotary_dim, rope_theta, q.device) / rope_scale
+    positions = _positions_from_indptr(indptr, offsets, q.shape[0])
+    return _apply_rope_core(q, k, positions, freqs, interleave)
+
+
+@torch.no_grad()
+def _apply_rope_pos_ids_reference(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    pos_ids: torch.Tensor,
+    rotary_dim: Optional[int] = None,
+    interleave: bool = False,
+    rope_scale: float = 1,
+    rope_theta: float = 1e4,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    if rotary_dim is None:
+        rotary_dim = q.shape[-1]
+    freqs = _rope_freqs(rotary_dim, rope_theta, q.device) / rope_scale
+    return _apply_rope_core(q, k, pos_ids.to(torch.float32), freqs, interleave)
+
+
+@torch.no_grad()
+def _apply_llama31_rope_reference(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    indptr: torch.Tensor,
+    offsets: torch.Tensor,
+    rotary_dim: Optional[int] = None,
+    interleave: bool = False,
+    rope_scale: float = 8,
+    rope_theta: float = 5e5,
+    low_freq_factor: float = 1,
+    high_freq_factor: float = 4,
+    old_context_len: int = 8192,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    if rotary_dim is None:
+        rotary_dim = q.shape[-1]
+    freqs = _llama31_freqs(
+        rotary_dim,
+        rope_theta,
+        rope_scale,
+        low_freq_factor,
+        high_freq_factor,
+        float(old_context_len),
+        q.device,
+    )
+    positions = _positions_from_indptr(indptr, offsets, q.shape[0])
+    return _apply_rope_core(q, k, positions, freqs, interleave)
+
+
+@torch.no_grad()
+def _apply_llama31_rope_pos_ids_reference(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    pos_ids: torch.Tensor,
+    rotary_dim: Optional[int] = None,
+    interleave: bool = False,
+    rope_scale: float = 8,
+    rope_theta: float = 5e5,
+    low_freq_factor: float = 1,
+    high_freq_factor: float = 4,
+    old_context_len: int = 8192,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    if rotary_dim is None:
+        rotary_dim = q.shape[-1]
+    freqs = _llama31_freqs(
+        rotary_dim,
+        rope_theta,
+        rope_scale,
+        low_freq_factor,
+        high_freq_factor,
+        float(old_context_len),
+        q.device,
+    )
+    return _apply_rope_core(q, k, pos_ids.to(torch.float32), freqs, interleave)
+
+
+@torch.no_grad()
+def _apply_rope_with_cos_sin_cache_reference(
+    positions: torch.Tensor,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    head_size: int,
+    cos_sin_cache: torch.Tensor,
+    is_neox: bool = True,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Apply RoPE with a precomputed cos/sin cache.
+
+    cos_sin_cache is ``[max_seq_len, rotary_dim]`` where the first half is
+    cos and the second half is sin. is_neox=True → half-split rotation;
+    is_neox=False → interleaved rotation.
+    """
+    rotary_dim = cos_sin_cache.shape[-1]
+    cos_cache = cos_sin_cache[:, : rotary_dim // 2]
+    sin_cache = cos_sin_cache[:, rotary_dim // 2 :]
+    cos = cos_cache[positions.to(torch.long)].unsqueeze(1)  # [nnz, 1, rotary_dim//2]
+    sin = sin_cache[positions.to(torch.long)].unsqueeze(1)
+    # Reshape flattened (nnz, H*D) → (nnz, H, D) for rotation.
+    q_view = query.view(query.shape[0], -1, head_size)
+    k_view = key.view(key.shape[0], -1, head_size)
+    q_rope = _rotate(q_view.to(torch.float32), cos, sin, interleave=not is_neox)
+    k_rope = _rotate(k_view.to(torch.float32), cos, sin, interleave=not is_neox)
+    return (
+        q_rope.reshape(query.shape).to(query.dtype),
+        k_rope.reshape(key.shape).to(key.dtype),
+    )
+
+
+# ── Shared axes ───────────────────────────────────────────────────────────────
+
+_RAGGED_AXES: Dict[str, _AxisT] = {
+    "nnz": Var(description="Total number of tokens across the batch."),
+    "batch_size": Var(description="Number of sequences in the batch."),
+    "num_q_heads": Const(abbrev="h"),
+    "num_k_heads": Const(abbrev="kv"),
+    "head_dim": Const(abbrev="d"),
+}
+
+_POSIDS_AXES: Dict[str, _AxisT] = {
+    "nnz": Var(description="Total number of tokens across the batch."),
+    "num_q_heads": Const(abbrev="h"),
+    "num_k_heads": Const(abbrev="kv"),
+    "head_dim": Const(abbrev="d"),
+}
+
+_COSSIN_AXES: Dict[str, _AxisT] = {
+    "nnz": Var(description="Total number of tokens across the batch."),
+    "num_q_heads_x_head_size": Const(
+        description="num_q_heads * head_size (flattened query dimension).", abbrev=""
+    ),
+    "num_k_heads_x_head_size": Const(
+        description="num_k_heads * head_size (flattened key dimension).", abbrev=""
+    ),
+    "head_size": Const(abbrev="d"),
+    "max_seq_len": Var(description="cos_sin_cache length (max supported position)."),
+    "rotary_dim": Const(
+        description="Rotary dimension (cos+sin concatenated along last axis).",
+        abbrev="",
+    ),
+}
+
+# ── Base ragged RoPE (indptr + offsets) ──────────────────────────────────────
+
+_RAGGED_INPUTS: Dict[str, _InputT] = {
+    "q": Tensor(["nnz", "num_q_heads", "head_dim"]),
+    "k": Tensor(["nnz", "num_k_heads", "head_dim"]),
+    "indptr": Tensor(
+        ["batch_size_plus_1"],
+        dtype="int32",
+        description="Ragged batch indptr, shape (batch_size + 1).",
+    ),
+    "offsets": Tensor(
+        ["batch_size"],
+        dtype="int32",
+        description="Per-sequence starting position offset.",
+    ),
+    "rotary_dim": Scalar(
+        "int32",
+        optional=True,
+        description="If None, uses head_dim. Rotate only the first `rotary_dim` dims.",
+    ),
+    "interleave": Scalar(
+        "int32",
+        optional=True,
+        description="Bool: interleaved (True) vs half-split (False) rotation.",
+    ),
+    "rope_scale": Scalar("float32", optional=True, description="Scale factor."),
+    "rope_theta": Scalar("float32", optional=True, description="Theta value."),
+}
+
+apply_rope_trace = TraceTemplate(
+    op_type="rope",
+    name_prefix="rope",
+    description="Standard RoPE on ragged q/k using indptr + per-seq offsets.",
+    axes={**_RAGGED_AXES, "batch_size_plus_1": Var(description="batch_size + 1.")},
+    inputs=_RAGGED_INPUTS,
+    outputs={
+        "q_rope": Tensor(["nnz", "num_q_heads", "head_dim"], dtype_from="q"),
+        "k_rope": Tensor(["nnz", "num_k_heads", "head_dim"], dtype_from="k"),
+    },
+    constraints=["batch_size_plus_1 == batch_size + 1"],
+    tags=["status:verified"],
+    reference=_apply_rope_reference,
+)
+
+apply_rope_inplace_trace = TraceTemplate(
+    op_type="rope",
+    name_prefix="rope_inplace",
+    description="In-place standard RoPE; q and k are mutated.",
+    axes={**_RAGGED_AXES, "batch_size_plus_1": Var(description="batch_size + 1.")},
+    inputs=_RAGGED_INPUTS,
+    outputs={
+        "q": Tensor(
+            ["nnz", "num_q_heads", "head_dim"],
+            dtype_from="q",
+            description="Updated q (in-place).",
+        ),
+        "k": Tensor(
+            ["nnz", "num_k_heads", "head_dim"],
+            dtype_from="k",
+            description="Updated k (in-place).",
+        ),
+    },
+    constraints=["batch_size_plus_1 == batch_size + 1"],
+    tags=["status:verified"],
+    reference=_apply_rope_reference,
+)
+
+# ── pos_ids RoPE ──────────────────────────────────────────────────────────────
+
+_POSIDS_INPUTS: Dict[str, _InputT] = {
+    "q": Tensor(["nnz", "num_q_heads", "head_dim"]),
+    "k": Tensor(["nnz", "num_k_heads", "head_dim"]),
+    "pos_ids": Tensor(["nnz"], dtype="int32", description="Per-token position index."),
+    "rotary_dim": Scalar("int32", optional=True),
+    "interleave": Scalar("int32", optional=True),
+    "rope_scale": Scalar("float32", optional=True),
+    "rope_theta": Scalar("float32", optional=True),
+}
+
+apply_rope_pos_ids_trace = TraceTemplate(
+    op_type="rope",
+    name_prefix="rope_pos_ids",
+    description="Standard RoPE using explicit per-token position ids.",
+    axes=_POSIDS_AXES,
+    inputs=_POSIDS_INPUTS,
+    outputs={
+        "q_rope": Tensor(["nnz", "num_q_heads", "head_dim"], dtype_from="q"),
+        "k_rope": Tensor(["nnz", "num_k_heads", "head_dim"], dtype_from="k"),
+    },
+    tags=["status:verified"],
+    reference=_apply_rope_pos_ids_reference,
+)
+
+apply_rope_pos_ids_inplace_trace = TraceTemplate(
+    op_type="rope",
+    name_prefix="rope_pos_ids_inplace",
+    description="In-place RoPE using explicit per-token position ids.",
+    axes=_POSIDS_AXES,
+    inputs=_POSIDS_INPUTS,
+    outputs={
+        "q": Tensor(
+            ["nnz", "num_q_heads", "head_dim"],
+            dtype_from="q",
+            description="Updated q (in-place).",
+        ),
+        "k": Tensor(
+            ["nnz", "num_k_heads", "head_dim"],
+            dtype_from="k",
+            description="Updated k (in-place).",
+        ),
+    },
+    tags=["status:verified"],
+    reference=_apply_rope_pos_ids_reference,
+)
+
+# ── Llama 3.1 RoPE ────────────────────────────────────────────────────────────
+
+_LLAMA31_EXTRA: Dict[str, _InputT] = {
+    "low_freq_factor": Scalar(
+        "float32", optional=True, description="Llama 3.1 low-frequency scaling factor."
+    ),
+    "high_freq_factor": Scalar(
+        "float32", optional=True, description="Llama 3.1 high-frequency scaling factor."
+    ),
+    "old_context_len": Scalar(
+        "int32", optional=True, description="Original pretraining context length."
+    ),
+}
+
+_LLAMA31_RAGGED_INPUTS: Dict[str, _InputT] = {**_RAGGED_INPUTS, **_LLAMA31_EXTRA}
+_LLAMA31_POSIDS_INPUTS: Dict[str, _InputT] = {**_POSIDS_INPUTS, **_LLAMA31_EXTRA}
+
+apply_llama31_rope_trace = TraceTemplate(
+    op_type="rope",
+    name_prefix="llama31_rope",
+    description="Llama 3.1 RoPE on ragged q/k with indptr + offsets.",
+    axes={**_RAGGED_AXES, "batch_size_plus_1": Var(description="batch_size + 1.")},
+    inputs=_LLAMA31_RAGGED_INPUTS,
+    outputs={
+        "q_rope": Tensor(["nnz", "num_q_heads", "head_dim"], dtype_from="q"),
+        "k_rope": Tensor(["nnz", "num_k_heads", "head_dim"], dtype_from="k"),
+    },
+    constraints=["batch_size_plus_1 == batch_size + 1"],
+    tags=["status:verified", "model:llama"],
+    reference=_apply_llama31_rope_reference,
+)
+
+apply_llama31_rope_inplace_trace = TraceTemplate(
+    op_type="rope",
+    name_prefix="llama31_rope_inplace",
+    description="In-place Llama 3.1 RoPE with indptr + offsets.",
+    axes={**_RAGGED_AXES, "batch_size_plus_1": Var(description="batch_size + 1.")},
+    inputs=_LLAMA31_RAGGED_INPUTS,
+    outputs={
+        "q": Tensor(
+            ["nnz", "num_q_heads", "head_dim"],
+            dtype_from="q",
+            description="Updated q (in-place).",
+        ),
+        "k": Tensor(
+            ["nnz", "num_k_heads", "head_dim"],
+            dtype_from="k",
+            description="Updated k (in-place).",
+        ),
+    },
+    constraints=["batch_size_plus_1 == batch_size + 1"],
+    tags=["status:verified", "model:llama"],
+    reference=_apply_llama31_rope_reference,
+)
+
+apply_llama31_rope_pos_ids_trace = TraceTemplate(
+    op_type="rope",
+    name_prefix="llama31_rope_pos_ids",
+    description="Llama 3.1 RoPE using per-token position ids.",
+    axes=_POSIDS_AXES,
+    inputs=_LLAMA31_POSIDS_INPUTS,
+    outputs={
+        "q_rope": Tensor(["nnz", "num_q_heads", "head_dim"], dtype_from="q"),
+        "k_rope": Tensor(["nnz", "num_k_heads", "head_dim"], dtype_from="k"),
+    },
+    tags=["status:verified", "model:llama"],
+    reference=_apply_llama31_rope_pos_ids_reference,
+)
+
+apply_llama31_rope_pos_ids_inplace_trace = TraceTemplate(
+    op_type="rope",
+    name_prefix="llama31_rope_pos_ids_inplace",
+    description="In-place Llama 3.1 RoPE using per-token position ids.",
+    axes=_POSIDS_AXES,
+    inputs=_LLAMA31_POSIDS_INPUTS,
+    outputs={
+        "q": Tensor(
+            ["nnz", "num_q_heads", "head_dim"],
+            dtype_from="q",
+            description="Updated q (in-place).",
+        ),
+        "k": Tensor(
+            ["nnz", "num_k_heads", "head_dim"],
+            dtype_from="k",
+            description="Updated k (in-place).",
+        ),
+    },
+    tags=["status:verified", "model:llama"],
+    reference=_apply_llama31_rope_pos_ids_reference,
+)
+
+# ── cos/sin cache variant (SGL/vLLM-compatible) ───────────────────────────────
+
+_COSSIN_INPUTS: Dict[str, _InputT] = {
+    "positions": Tensor(
+        ["nnz"], dtype="int32", description="Per-token position index."
+    ),
+    "query": Tensor(
+        ["nnz", "num_q_heads_x_head_size"],
+        description="Flattened query tensor (nnz, num_q_heads * head_size).",
+    ),
+    "key": Tensor(
+        ["nnz", "num_k_heads_x_head_size"],
+        description="Flattened key tensor (nnz, num_k_heads * head_size).",
+    ),
+    "head_size": Scalar("int32", description="Head dimension."),
+    "cos_sin_cache": Tensor(
+        ["max_seq_len", "rotary_dim"],
+        dtype="float32",
+        description="Precomputed cos+sin cache; cos first half, sin second half.",
+    ),
+    "is_neox": Scalar(
+        "int32", optional=True, description="Bool: Neox (True) vs interleaved (False)."
+    ),
+}
+
+apply_rope_with_cos_sin_cache_trace = TraceTemplate(
+    op_type="rope",
+    name_prefix="rope_cos_sin_cache",
+    description="RoPE with precomputed cos/sin cache (SGL/vLLM-compatible).",
+    axes=_COSSIN_AXES,
+    inputs=_COSSIN_INPUTS,
+    outputs={
+        "query_out": Tensor(["nnz", "num_q_heads_x_head_size"], dtype_from="query"),
+        "key_out": Tensor(["nnz", "num_k_heads_x_head_size"], dtype_from="key"),
+    },
+    tags=["status:verified"],
+    reference=_apply_rope_with_cos_sin_cache_reference,
+)
+
+apply_rope_with_cos_sin_cache_inplace_trace = TraceTemplate(
+    op_type="rope",
+    name_prefix="rope_cos_sin_cache_inplace",
+    description="In-place RoPE with precomputed cos/sin cache.",
+    axes=_COSSIN_AXES,
+    inputs=_COSSIN_INPUTS,
+    outputs={
+        "query": Tensor(
+            ["nnz", "num_q_heads_x_head_size"],
+            dtype_from="query",
+            description="Updated query (in-place).",
+        ),
+        "key": Tensor(
+            ["nnz", "num_k_heads_x_head_size"],
+            dtype_from="key",
+            description="Updated key (in-place).",
+        ),
+    },
+    tags=["status:verified"],
+    reference=_apply_rope_with_cos_sin_cache_reference,
+)
diff --git a/flashinfer/trace/templates/sampling.py b/flashinfer/trace/templates/sampling.py
new file mode 100644
index 0000000000..0a6ba80fe0
--- /dev/null
+++ b/flashinfer/trace/templates/sampling.py
@@ -0,0 +1,566 @@
+# Copyright (c) 2025 by FlashInfer team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""TraceTemplates for sampling operations."""
+
+import torch
+
+from ..template import Const, Scalar, Tensor, TraceTemplate, Var
+
+# ── Top-k sampling ────────────────────────────────────────────────────────────
+
+
+@torch.no_grad()
+def _top_k_sampling_reference(probs, top_k):
+    """Top-k sampling: keep only the k highest probability tokens, renormalize, then sample."""
+    batch_size, vocab_size = probs.shape
+    device = probs.device
+    probs = probs.to(torch.float32)
+    samples = torch.empty(batch_size, dtype=torch.int64, device=device)
+    for i in range(batch_size):
+        row = probs[i]
+        k = int(top_k[i].item())
+        if 0 < k < vocab_size:
+            idx_sorted = torch.argsort(row, descending=True)
+            keep_idx = idx_sorted[:k]
+            filtered = torch.zeros_like(row)
+            filtered[keep_idx] = row[keep_idx]
+            row = filtered / filtered.sum()
+        samples[i] = torch.multinomial(row, 1, replacement=True).squeeze(0)
+    return samples
+
+
+top_k_sampling_trace = TraceTemplate(
+    op_type="sampling",
+    name_prefix="top_k_sampling",
+    description=(
+        "Top-k sampling from probabilities. Keeps only the k highest probability tokens, "
+        "renormalizes, then samples from the filtered distribution."
+    ),
+    axes={
+        "batch_size": Var(description="Number of sequences to sample from"),
+        "vocab_size": Const(description="Vocabulary size.", abbrev="v"),
+    },
+    inputs={
+        "probs": Tensor(
+            ["batch_size", "vocab_size"],
+            description="Probability distributions (after softmax)",
+        ),
+        "top_k": Tensor(
+            ["batch_size"],
+            description="Number of top tokens to consider for sampling per sequence",
+        ),
+    },
+    outputs={
+        "samples": Tensor(
+            ["batch_size"],
+            dtype="int64",
+            description="Sampled token indices",
+        ),
+    },
+    tags=["status:verified"],
+    reference=_top_k_sampling_reference,
+)
+
+# ── Top-p sampling ────────────────────────────────────────────────────────────
+
+
+@torch.no_grad()
+def _top_p_sampling_reference(probs, top_p):
+    """Top-p (nucleus) sampling: filter by cumulative probability threshold, then sample."""
+    batch_size, vocab_size = probs.shape
+    device = probs.device
+    probs = probs.to(torch.float32)
+    out = torch.empty(batch_size, dtype=torch.int64, device=device)
+    for i in range(batch_size):
+        row = probs[i]
+        p = float(top_p[i].item())
+        if p <= 0.0:
+            out[i] = torch.argmax(row).to(torch.int64)
+            continue
+        if p < 1.0:
+            vals, idx = torch.sort(row, descending=True)
+            cdf = torch.cumsum(vals, dim=0)
+            to_remove = cdf > p
+            to_remove[1:] = to_remove[:-1].clone()
+            to_remove[0] = False
+            keep_idx = idx[~to_remove]
+            filtered = torch.zeros_like(row)
+            filtered[keep_idx] = row[keep_idx]
+            row = filtered / filtered.sum()
+        out[i] = torch.multinomial(row, 1, replacement=True).squeeze(0)
+    return out
+
+
+top_p_sampling_trace = TraceTemplate(
+    op_type="sampling",
+    name_prefix="top_p_sampling",
+    description=(
+        "Top-p (nucleus) sampling from probabilities. Filters probabilities using "
+        "cumulative probability threshold, then samples from the filtered distribution."
+    ),
+    axes={
+        "batch_size": Var(description="Number of sequences to sample from"),
+        "vocab_size": Const(description="Vocabulary size.", abbrev="v"),
+    },
+    inputs={
+        "probs": Tensor(
+            ["batch_size", "vocab_size"],
+            description="Probability distributions (after softmax)",
+        ),
+        "top_p": Tensor(
+            ["batch_size"],
+            description="Cumulative probability threshold for nucleus sampling per sequence",
+        ),
+    },
+    outputs={
+        "samples": Tensor(
+            ["batch_size"],
+            dtype="int64",
+            description="Sampled token indices",
+        ),
+    },
+    tags=["status:verified"],
+    reference=_top_p_sampling_reference,
+)
+
+# ── Top-k + Top-p sampling ────────────────────────────────────────────────────
+
+
+@torch.no_grad()
+def _top_k_top_p_sampling_reference(probs, top_k, top_p):
+    """Top-k then top-p (nucleus) sampling: apply both filters, then sample."""
+    batch_size, vocab_size = probs.shape
+    device = probs.device
+    probs = probs.to(torch.float32)
+    samples = torch.empty(batch_size, dtype=torch.int64, device=device)
+    for i in range(batch_size):
+        row = probs[i]
+        k = int(top_k[i].item())
+        p = float(top_p[i].item())
+        if 0 < k < vocab_size:
+            idx_sorted = torch.argsort(row, descending=True)
+            keep_idx_k = idx_sorted[:k]
+            filtered_k = torch.zeros_like(row)
+            filtered_k[keep_idx_k] = row[keep_idx_k]
+            row = filtered_k / filtered_k.sum()
+        if p <= 0.0:
+            samples[i] = torch.argmax(row).to(torch.int64)
+            continue
+        if p < 1.0:
+            vals, idx = torch.sort(row, descending=True)
+            cdf = torch.cumsum(vals, dim=0)
+            to_remove = cdf > p
+            if vocab_size > 1:
+                to_remove[1:] = to_remove[:-1].clone()
+                to_remove[0] = False
+            keep_idx_p = idx[~to_remove]
+            filtered_p = torch.zeros_like(row)
+            filtered_p[keep_idx_p] = row[keep_idx_p]
+            row = filtered_p / filtered_p.sum()
+        samples[i] = torch.multinomial(row, 1, replacement=True).squeeze(0)
+    return samples
+
+
+top_k_top_p_sampling_trace = TraceTemplate(
+    op_type="sampling",
+    name_prefix="top_k_top_p_sampling",
+    description=(
+        "Top-k top-p (nucleus) sampling from probabilities. Filters probabilities using "
+        "top-k and top-p constraints, then samples from the filtered distribution."
+    ),
+    axes={
+        "batch_size": Var(description="Number of sequences to sample from"),
+        "vocab_size": Const(description="Vocabulary size.", abbrev="v"),
+    },
+    inputs={
+        "probs": Tensor(
+            ["batch_size", "vocab_size"],
+            description="Probability distributions (after softmax)",
+        ),
+        "top_k": Tensor(
+            ["batch_size"],
+            description="Number of top tokens to consider for sampling per sequence",
+        ),
+        "top_p": Tensor(
+            ["batch_size"],
+            description="Cumulative probability threshold for nucleus sampling per sequence",
+        ),
+    },
+    outputs={
+        "samples": Tensor(
+            ["batch_size"],
+            dtype="int64",
+            description="Sampled token indices",
+        ),
+    },
+    tags=["status:verified"],
+    reference=_top_k_top_p_sampling_reference,
+)
+
+
+# ── Free-function sampling utilities ─────────────────────────────────────────
+
+
+@torch.no_grad()
+def _softmax_reference(logits, temperature=None, **_unused):
+    """Online safe softmax with optional temperature scaling."""
+    x = logits.to(torch.float32)
+    if temperature is not None:
+        if isinstance(temperature, torch.Tensor):
+            t = temperature.to(torch.float32).reshape(-1, 1)
+        else:
+            t = float(temperature)
+        x = x / t
+    return torch.softmax(x, dim=-1).to(logits.dtype)
+
+
+softmax_trace = TraceTemplate(
+    op_type="sampling",
+    name_prefix="softmax",
+    description="Fused online safe softmax with optional temperature scaling.",
+    axes={
+        "batch_size": Var(),
+        "vocab_size": Const(abbrev="v"),
+    },
+    inputs={
+        "logits": Tensor(["batch_size", "vocab_size"]),
+        "temperature": Scalar(
+            "float32",
+            optional=True,
+            description="Per-tensor or per-row temperature.",
+        ),
+    },
+    outputs={
+        "output": Tensor(["batch_size", "vocab_size"], dtype_from="logits"),
+    },
+    tags=["status:verified"],
+    reference=_softmax_reference,
+)
+
+
+@torch.no_grad()
+def _sampling_from_probs_reference(probs, indices=None, **_unused):
+    """Categorical sampling from probabilities (deterministic: argmax)."""
+    p = probs.to(torch.float32)
+    if indices is not None:
+        p = p[indices.to(torch.long)]
+    return p.argmax(dim=-1).to(torch.int32)
+
+
+_sampling_common_axes: dict[str, Var | Const] = {
+    "batch_size": Var(),
+    "vocab_size": Const(abbrev="v"),
+    "num_indices": Var(description="Length of optional indices tensor."),
+}
+
+sampling_from_probs_trace = TraceTemplate(
+    op_type="sampling",
+    name_prefix="sampling_from_probs",
+    description=(
+        "Fused categorical sampling from [batch_size, vocab_size] probs. "
+        "Reference uses argmax (matches deterministic=True)."
+    ),
+    axes=dict(_sampling_common_axes),
+    inputs={
+        "probs": Tensor(["batch_size", "vocab_size"]),
+        "indices": Tensor(
+            ["num_indices"],
+            dtype="int32",
+            optional=True,
+        ),
+    },
+    outputs={"samples": Tensor(["batch_size"], dtype="int32")},
+    tags=["status:verified"],
+    reference=_sampling_from_probs_reference,
+)
+
+
+@torch.no_grad()
+def _sampling_from_logits_reference(logits, indices=None, **_unused):
+    probs = torch.softmax(logits.to(torch.float32), dim=-1)
+    return _sampling_from_probs_reference(probs, indices=indices)
+
+
+sampling_from_logits_trace = TraceTemplate(
+    op_type="sampling",
+    name_prefix="sampling_from_logits",
+    description=(
+        "Fused sampling from logits (equivalent to softmax + sampling). "
+        "Reference uses softmax + argmax."
+    ),
+    axes=dict(_sampling_common_axes),
+    inputs={
+        "logits": Tensor(["batch_size", "vocab_size"]),
+        "indices": Tensor(
+            ["num_indices"],
+            dtype="int32",
+            optional=True,
+        ),
+    },
+    outputs={"samples": Tensor(["batch_size"], dtype="int32")},
+    tags=["status:verified"],
+    reference=_sampling_from_logits_reference,
+)
+
+
+@torch.no_grad()
+def _min_p_sampling_reference(probs, min_p, indices=None, **_unused):
+    """Min-p sampling: keep probs >= min_p * max_prob, renormalise, then argmax."""
+    p = probs.to(torch.float32)
+    if indices is not None:
+        p = p[indices.to(torch.long)]
+    if isinstance(min_p, torch.Tensor):
+        mp = min_p.to(torch.float32).reshape(-1, 1)
+    else:
+        mp = float(min_p)
+    threshold = p.max(dim=-1, keepdim=True).values * mp
+    mask = p >= threshold
+    p_masked = torch.where(mask, p, torch.zeros_like(p))
+    p_masked = p_masked / (p_masked.sum(dim=-1, keepdim=True) + 1e-20)
+    return p_masked.argmax(dim=-1).to(torch.int32)
+
+
+min_p_sampling_trace = TraceTemplate(
+    op_type="sampling",
+    name_prefix="min_p_sampling",
+    description=(
+        "Fused min-p sampling: keep probs >= min_p * max_prob, renormalise, "
+        "categorical sample."
+    ),
+    axes=dict(_sampling_common_axes),
+    inputs={
+        "probs": Tensor(["batch_size", "vocab_size"]),
+        "min_p": Scalar(
+            "float32",
+            description="Min-p threshold (scalar or per-row tensor).",
+        ),
+        "indices": Tensor(
+            ["num_indices"],
+            dtype="int32",
+            optional=True,
+        ),
+    },
+    outputs={"samples": Tensor(["batch_size"], dtype="int32")},
+    tags=["status:verified"],
+    reference=_min_p_sampling_reference,
+)
+
+
+@torch.no_grad()
+def _top_p_renorm_probs_reference(probs, top_p, **_unused):
+    """Renormalise probs by top-p thresholding."""
+    p = probs.to(torch.float32)
+    if isinstance(top_p, torch.Tensor):
+        tp = top_p.to(torch.float32).reshape(-1, 1)
+    else:
+        tp = float(top_p)
+    sorted_p, sorted_idx = torch.sort(p, dim=-1, descending=True)
+    cumsum = sorted_p.cumsum(dim=-1)
+    keep_sorted = (cumsum - sorted_p) < tp
+    keep = torch.zeros_like(p, dtype=torch.bool).scatter_(-1, sorted_idx, keep_sorted)
+    p_masked = torch.where(keep, p, torch.zeros_like(p))
+    return (p_masked / (p_masked.sum(dim=-1, keepdim=True) + 1e-20)).to(probs.dtype)
+
+
+top_p_renorm_probs_trace = TraceTemplate(
+    op_type="sampling",
+    name_prefix="top_p_renorm_probs",
+    description="Renormalise probabilities by top-p thresholding.",
+    axes={"batch_size": Var(), "vocab_size": Const(abbrev="v")},
+    inputs={
+        "probs": Tensor(["batch_size", "vocab_size"]),
+        "top_p": Scalar("float32"),
+    },
+    outputs={
+        "renormalized": Tensor(["batch_size", "vocab_size"], dtype_from="probs"),
+    },
+    tags=["status:verified"],
+    reference=_top_p_renorm_probs_reference,
+)
+
+
+@torch.no_grad()
+def _top_k_renorm_probs_reference(probs, top_k, **_unused):
+    """Renormalise probs by top-k thresholding."""
+    p = probs.to(torch.float32)
+    if isinstance(top_k, torch.Tensor):
+        k = int(top_k.max().item())
+    else:
+        k = int(top_k)
+    _, topk_idx = torch.topk(p, k=k, dim=-1)
+    mask = torch.zeros_like(p, dtype=torch.bool)
+    mask.scatter_(-1, topk_idx, True)
+    p_masked = torch.where(mask, p, torch.zeros_like(p))
+    return (p_masked / (p_masked.sum(dim=-1, keepdim=True) + 1e-20)).to(probs.dtype)
+
+
+top_k_renorm_probs_trace = TraceTemplate(
+    op_type="sampling",
+    name_prefix="top_k_renorm_probs",
+    description="Renormalise probabilities by top-k thresholding.",
+    axes={"batch_size": Var(), "vocab_size": Const(abbrev="v")},
+    inputs={
+        "probs": Tensor(["batch_size", "vocab_size"]),
+        "top_k": Scalar("int32"),
+    },
+    outputs={
+        "renormalized": Tensor(["batch_size", "vocab_size"], dtype_from="probs"),
+    },
+    tags=["status:verified"],
+    reference=_top_k_renorm_probs_reference,
+)
+
+
+@torch.no_grad()
+def _top_k_mask_logits_reference(logits, top_k, **_unused):
+    """Mask logits outside the top-k to -inf."""
+    x = logits.to(torch.float32)
+    if isinstance(top_k, torch.Tensor):
+        k = int(top_k.max().item())
+    else:
+        k = int(top_k)
+    _, topk_idx = torch.topk(x, k=k, dim=-1)
+    mask = torch.full_like(x, float("-inf"))
+    mask.scatter_(-1, topk_idx, 0.0)
+    return (x + mask).to(logits.dtype)
+
+
+top_k_mask_logits_trace = TraceTemplate(
+    op_type="sampling",
+    name_prefix="top_k_mask_logits",
+    description="Mask out-of-top-k logits to -inf.",
+    axes={"batch_size": Var(), "vocab_size": Const(abbrev="v")},
+    inputs={
+        "logits": Tensor(["batch_size", "vocab_size"]),
+        "top_k": Scalar("int32"),
+    },
+    outputs={
+        "masked_logits": Tensor(["batch_size", "vocab_size"], dtype_from="logits"),
+    },
+    tags=["status:verified"],
+    reference=_top_k_mask_logits_reference,
+)
+
+
+@torch.no_grad()
+def _top_k_top_p_sampling_from_logits_reference(
+    logits, top_k, top_p, indices=None, filter_apply_order="top_k_first", **_unused
+):
+    """top-k + top-p sampling from logits (deterministic: argmax)."""
+    x = logits.to(torch.float32)
+    if filter_apply_order == "top_k_first":
+        x = _top_k_mask_logits_reference(x, top_k)
+        probs = torch.softmax(x, dim=-1)
+        probs = _top_p_renorm_probs_reference(probs, top_p)
+    else:  # "joint"
+        probs = torch.softmax(x, dim=-1)
+        probs = _top_k_renorm_probs_reference(probs, top_k)
+        probs = _top_p_renorm_probs_reference(probs, top_p)
+    if indices is not None:
+        probs = probs[indices.to(torch.long)]
+    return probs.argmax(dim=-1).to(torch.int32)
+
+
+top_k_top_p_sampling_from_logits_trace = TraceTemplate(
+    op_type="sampling",
+    name_prefix="top_k_top_p_sampling_from_logits",
+    description=(
+        "Fused top-k + top-p sampling starting from logits. "
+        "Reference: softmax + top_k_mask + top_p_renorm + argmax."
+    ),
+    axes=dict(_sampling_common_axes),
+    inputs={
+        "logits": Tensor(["batch_size", "vocab_size"]),
+        "top_k": Scalar("int32"),
+        "top_p": Scalar("float32"),
+        "indices": Tensor(
+            ["num_indices"],
+            dtype="int32",
+            optional=True,
+        ),
+    },
+    outputs={"samples": Tensor(["batch_size"], dtype="int32")},
+    tags=["status:verified"],
+    reference=_top_k_top_p_sampling_from_logits_reference,
+)
+
+
+@torch.no_grad()
+def _chain_speculative_sampling_reference(
+    draft_probs,
+    draft_token_ids,
+    target_probs,
+    **_unused,
+):
+    """Deterministic chain speculative sampling: accept draft[i] iff
+    target_prob[draft[i]] >= draft_prob[draft[i]]; emit argmax of the
+    first rejecting target distribution (or last step)."""
+    B, S = draft_token_ids.shape
+    dp = draft_probs.to(torch.float32)
+    tp = target_probs.to(torch.float32)
+    out = torch.full(
+        (B, S + 1),
+        -1,
+        dtype=torch.int32,
+        device=draft_token_ids.device,
+    )
+    for b in range(B):
+        for s in range(S):
+            tok = int(draft_token_ids[b, s].item())
+            if tp[b, s, tok] >= dp[b, s, tok]:
+                out[b, s] = tok
+            else:
+                out[b, s] = int(tp[b, s].argmax().item())
+                break
+        else:
+            out[b, S] = int(tp[b, S].argmax().item())
+    return out
+
+
+chain_speculative_sampling_trace = TraceTemplate(
+    op_type="sampling",
+    name_prefix="chain_speculative_sampling",
+    description=(
+        "Chain speculative sampling: accept/reject draft tokens against target "
+        "distribution and emit the accepted prefix + one sampled final token."
+    ),
+    axes={
+        "batch_size": Var(),
+        "num_speculative": Var(description="Draft tokens per step."),
+        "num_speculative_plus_1": Var(
+            description="num_speculative + 1 (draft_probs axis)."
+        ),
+        "vocab_size": Const(abbrev="v"),
+    },
+    inputs={
+        "draft_probs": Tensor(
+            ["batch_size", "num_speculative_plus_1", "vocab_size"],
+        ),
+        "draft_token_ids": Tensor(
+            ["batch_size", "num_speculative"],
+            dtype="int32",
+        ),
+        "target_probs": Tensor(
+            ["batch_size", "num_speculative_plus_1", "vocab_size"],
+        ),
+    },
+    outputs={
+        "accepted_token_ids": Tensor(
+            ["batch_size", "num_speculative_plus_1"], dtype="int32"
+        ),
+    },
+    tags=["status:verified", "speculative"],
+    reference=_chain_speculative_sampling_reference,
+)
diff --git a/flashinfer/trtllm_low_latency_gemm.py b/flashinfer/trtllm_low_latency_gemm.py
index 3aea77affb..aeeb342409 100644
--- a/flashinfer/trtllm_low_latency_gemm.py
+++ b/flashinfer/trtllm_low_latency_gemm.py
@@ -116,6 +116,9 @@ def gemm_runner():
     )
 
 
+# No @flashinfer_api here: this is an internal helper called from the already-
+# decorated mm_fp8. Decorating here produced nested/duplicate log entries when
+# users called mm_fp8. Direct callers still work, just without per-call logging.
 def trtllm_low_latency_gemm(
     A: torch.Tensor,
     B: torch.Tensor,
diff --git a/flashinfer/xqa.py b/flashinfer/xqa.py
index f11944c5e2..0fe67cbd35 100755
--- a/flashinfer/xqa.py
+++ b/flashinfer/xqa.py
@@ -20,6 +20,7 @@
 import torch
 
 from .api_logging import flashinfer_api
+from .trace.templates.page import xqa_mla_trace, xqa_trace
 from .jit.xqa import gen_xqa_module, gen_xqa_module_mla
 from .jit.utils import filename_safe_dtype_map
 from .utils import (
@@ -150,7 +151,7 @@ def _fake_xqa(
     )
 
 
-@flashinfer_api
+@flashinfer_api(trace=xqa_trace)
 def xqa(
     q: torch.Tensor,
     k_cache: torch.Tensor,
@@ -442,7 +443,7 @@ def _fake_xqa_mla(
     )
 
 
-@flashinfer_api
+@flashinfer_api(trace=xqa_mla_trace)
 def xqa_mla(
     q: torch.Tensor,
     k_cache: torch.Tensor,
diff --git a/tests/trace/example.py b/tests/trace/example.py
new file mode 100644
index 0000000000..a1067a19e5
--- /dev/null
+++ b/tests/trace/example.py
@@ -0,0 +1,802 @@
+"""
+fi_trace example: generate flashinfer-bench definition JSON files via auto-dump.
+
+Run:
+    python tests/trace/example.py
+
+When FLASHINFER_TRACE_DUMP=1 (set below), every @flashinfer_api(trace=...) decorated
+function automatically writes a trace JSON on its first call for each unique input
+shape.  Subsequent calls with the same shape are deduplicated (no re-write).
+
+The output directory is controlled by FLASHINFER_TRACE_DUMP_DIR.
+
+Requires a CUDA-capable GPU.
+
+Results:
+- We would get these example json files under fi_trace_out directory:
+fused_add_rmsnorm_h5120.json
+fused_add_rmsnorm_quant_h7168.json
+gdn_decode_qk4_v8_d128.json
+gdn_mtp_qk4_v8_d128.json
+gdn_prefill_qk4_v8_d128.json
+gemm_bf16_N256_K7168.json
+gemm_bf16_N4096_K4096.json
+gemm_fp4_N2048_K7168_block_size16.json
+gemm_fp8_N1536_K7168.json
+gemm_mxfp8_N4096_K4096.json
+gemma_fused_add_rmsnorm_h4608.json
+gemma_rmsnorm_h4608.json
+gelu_and_mul_h16384.json
+gelu_tanh_and_mul_h16384.json
+gqa_paged_decode_h32_kv8_d128_ps16.json
+gqa_paged_decode_h32_kv8_d128_ps64.json
+gqa_paged_prefill_h32_kv8_d128_ps16.json
+gqa_ragged_h32_kv8_d128.json
+layernorm_h768.json
+merge_state_h32_d128.json
+merge_state_in_place_h32_d128.json
+merge_states_h32_d128.json
+mla_paged_decode_h16_ckv512_kpe64_ps1.json
+mla_paged_decode_h16_ckv512_kpe64_ps64.json
+moe_fp4_block_scale_default_routing_topk8_e32_h7168_i2048.json
+moe_fp4_block_scale_ds_routing_topk8_e32_h7168_i2048_ng8_kg4.json
+moe_fp4_block_scale_llama4_routing_topk1_e32_h7168_i2048.json
+moe_fp4_block_scale_renormalize_naive_routing_topk8_e32_h7168_i2048.json
+moe_fp4_block_scale_renormalize_routing_topk8_e32_h7168_i2048.json
+moe_fp4_block_scale_topk_routing_topk8_e32_h7168_i2048.json
+moe_fp8_block_scale_default_routing_topk8_e32_h7168_i2048.json
+moe_fp8_block_scale_ds_routing_topk8_ng8_kg4_e32_h7168_i2048.json
+moe_fp8_block_scale_llama4_routing_topk1_e32_h7168_i2048.json
+moe_fp8_block_scale_renormalize_naive_routing_topk8_e32_h7168_i2048.json
+moe_fp8_block_scale_renormalize_routing_topk8_e32_h7168_i2048.json
+moe_fp8_block_scale_topk_routing_topk8_e32_h7168_i2048.json
+rmsnorm_h4096.json
+rmsnorm_h7168.json
+rmsnorm_quant_h7168.json
+silu_and_mul_h16384.json
+top_k_sampling_v128256.json
+top_k_top_p_sampling_v128256.json
+top_k_top_p_sampling_v151936.json
+top_p_sampling_v128256.json
+top_p_sampling_v151936.json
+
+Note: top_p_sampling files appear for vocab_size=151936 because
+top_k_top_p_sampling calls top_p_sampling internally.
+FP4 MoE files are only generated on Blackwell (SM100+) GPUs with fp4_quantize available.
+GDN prefill files require SM90+ (Hopper) GPU.
+"""
+
+import contextlib
+import json
+import os
+from pathlib import Path
+
+# Must be set before any flashinfer import: template.py reads these at module load time.
+os.environ.setdefault(
+    "FLASHINFER_TRACE_DUMP_DIR",
+    str(Path(__file__).parent / "fi_trace_out"),
+)
+os.environ.setdefault("FLASHINFER_TRACE_DUMP", "1")
+
+SAVE_DIR = Path(os.environ["FLASHINFER_TRACE_DUMP_DIR"])
+
+import torch
+
+import flashinfer
+import flashinfer.norm
+import flashinfer.sampling
+import flashinfer.gemm
+import flashinfer.gdn_decode
+import flashinfer.fused_moe
+import flashinfer.activation
+import flashinfer.cascade
+from flashinfer.decode import BatchDecodeWithPagedKVCacheWrapper
+from flashinfer.prefill import (
+    BatchPrefillWithPagedKVCacheWrapper,
+    BatchPrefillWithRaggedKVCacheWrapper,
+)
+from flashinfer.mla import BatchMLAPagedAttentionWrapper
+
+device = "cuda"
+WORKSPACE = 128 * 1024 * 1024  # 128 MB
+
+print(f"\nAuto-dumping fi_trace JSON files to {SAVE_DIR}/\n")
+
+# ── rmsnorm ───────────────────────────────────────────────────────────────────
+# Llama-3.1-8B (hidden=4096) and DeepSeek-V3 (hidden=7168)
+for hidden_size in (4096, 7168):
+    hidden = torch.randn(32, hidden_size, dtype=torch.bfloat16, device=device)
+    weight = torch.ones(hidden_size, dtype=torch.bfloat16, device=device)
+    flashinfer.rmsnorm(hidden, weight)
+
+# ── fused_add_rmsnorm (Qwen3-14B, hidden=5120) ───────────────────────────────
+x = torch.randn(32, 5120, dtype=torch.bfloat16, device=device)
+res = torch.randn(32, 5120, dtype=torch.bfloat16, device=device)
+w = torch.ones(5120, dtype=torch.bfloat16, device=device)
+flashinfer.fused_add_rmsnorm(x, res, w)
+
+# ── rmsnorm_quant + fused_add_rmsnorm_quant (DeepSeek-V3 down-proj, h=7168) ──
+# Quantize to FP8 E4M3 after normalization; scale is per-tensor.
+norm_h = 7168
+norm_in = torch.randn(32, norm_h, dtype=torch.bfloat16, device=device)
+norm_w = torch.ones(norm_h, dtype=torch.bfloat16, device=device)
+norm_scale = torch.tensor([1.0], dtype=torch.float32, device=device)
+norm_out = torch.empty(32, norm_h, dtype=torch.float8_e4m3fn, device=device)
+flashinfer.rmsnorm_quant(norm_out, norm_in, norm_w, norm_scale)
+
+norm_res = torch.randn(32, norm_h, dtype=torch.bfloat16, device=device)
+flashinfer.fused_add_rmsnorm_quant(norm_out, norm_in, norm_res, norm_w, norm_scale)
+
+# ── gemma_rmsnorm + gemma_fused_add_rmsnorm (Gemma-2-27B, hidden=4608) ───────
+gemma_h = 4608
+gemma_in = torch.randn(32, gemma_h, dtype=torch.bfloat16, device=device)
+gemma_w = torch.zeros(gemma_h, dtype=torch.bfloat16, device=device)
+flashinfer.gemma_rmsnorm(gemma_in, gemma_w)
+
+gemma_res = torch.randn(32, gemma_h, dtype=torch.bfloat16, device=device)
+flashinfer.gemma_fused_add_rmsnorm(gemma_in, gemma_res, gemma_w)
+
+# ── layernorm (GPT-2/BERT, hidden=768) ────────────────────────────────────────
+ln_h = 768
+ln_in = torch.randn(32, ln_h, dtype=torch.bfloat16, device=device)
+ln_gamma = torch.ones(ln_h, dtype=torch.float32, device=device)
+ln_beta = torch.zeros(ln_h, dtype=torch.float32, device=device)
+flashinfer.layernorm(ln_in, ln_gamma, ln_beta)
+
+# ── sampling (Llama vocab=128256) ─────────────────────────────────────────────
+probs = torch.rand(64, 128256, dtype=torch.float32, device=device)
+top_k = torch.full((64,), 50, dtype=torch.int32, device=device)
+top_p = torch.full((64,), 0.9, dtype=torch.float32, device=device)
+flashinfer.top_k_sampling_from_probs(probs, top_k)
+flashinfer.top_p_sampling_from_probs(probs, top_p)
+flashinfer.top_k_top_p_sampling_from_probs(probs, top_k, top_p)
+
+# ── sampling (Qwen3 vocab=151936) ─────────────────────────────────────────────
+probs = torch.rand(64, 151936, dtype=torch.float32, device=device)
+flashinfer.top_k_top_p_sampling_from_probs(probs, top_k, top_p)
+
+# ── Activation functions (LLaMA/Mistral FFN, hidden=8192 gate+up) ─────────────
+# Input shape is [T, 2*H] where H is the output (post-gate) hidden dim.
+act_input = torch.randn(128, 2 * 8192, dtype=torch.bfloat16, device=device)
+flashinfer.silu_and_mul(act_input)
+flashinfer.gelu_tanh_and_mul(act_input)
+flashinfer.gelu_and_mul(act_input)
+
+# ── Cascade / merge attention states ─────────────────────────────────────────
+# Cascade attention merges partial V/S states from different KV segments.
+ms_T, ms_H, ms_D = 128, 32, 128
+v_a = torch.randn(ms_T, ms_H, ms_D, dtype=torch.bfloat16, device=device)
+s_a = torch.randn(ms_T, ms_H, dtype=torch.float32, device=device)
+v_b = torch.randn(ms_T, ms_H, ms_D, dtype=torch.bfloat16, device=device)
+s_b = torch.randn(ms_T, ms_H, dtype=torch.float32, device=device)
+flashinfer.merge_state(v_a, s_a, v_b, s_b)
+flashinfer.merge_state_in_place(v_a, s_a, v_b, s_b)
+# merge_states: [T, num_states, H, D]
+v_multi = torch.randn(ms_T, 4, ms_H, ms_D, dtype=torch.bfloat16, device=device)
+s_multi = torch.randn(ms_T, 4, ms_H, dtype=torch.float32, device=device)
+flashinfer.merge_states(v_multi, s_multi)
+
+# ── RoPE (Llama-3.1-8B: h=32/kv=8/d=128, batch=4, seq=128) ────────────────────
+rope_B, rope_S, rope_Hq, rope_Hk, rope_D = 4, 128, 32, 8, 128
+rope_nnz = rope_B * rope_S
+rope_q = torch.randn(rope_nnz, rope_Hq, rope_D, dtype=torch.bfloat16, device=device)
+rope_k = torch.randn(rope_nnz, rope_Hk, rope_D, dtype=torch.bfloat16, device=device)
+rope_indptr = torch.arange(rope_B + 1, dtype=torch.int32, device=device) * rope_S
+rope_offsets = torch.zeros(rope_B, dtype=torch.int32, device=device)
+rope_pos_ids = torch.arange(rope_nnz, dtype=torch.int32, device=device) % rope_S
+flashinfer.apply_rope(rope_q, rope_k, rope_indptr, rope_offsets)
+flashinfer.apply_rope_inplace(rope_q.clone(), rope_k.clone(), rope_indptr, rope_offsets)
+flashinfer.apply_rope_pos_ids(rope_q, rope_k, rope_pos_ids)
+flashinfer.apply_rope_pos_ids_inplace(rope_q.clone(), rope_k.clone(), rope_pos_ids)
+flashinfer.apply_llama31_rope(rope_q, rope_k, rope_indptr, rope_offsets)
+flashinfer.apply_llama31_rope_inplace(
+    rope_q.clone(), rope_k.clone(), rope_indptr, rope_offsets
+)
+flashinfer.apply_llama31_rope_pos_ids(rope_q, rope_k, rope_pos_ids)
+flashinfer.apply_llama31_rope_pos_ids_inplace(
+    rope_q.clone(), rope_k.clone(), rope_pos_ids
+)
+
+# ── RoPE with cos/sin cache (SGL/vLLM-compatible) ─────────────────────────────
+rope_query = torch.randn(
+    rope_nnz, rope_Hq * rope_D, dtype=torch.bfloat16, device=device
+)
+rope_key = torch.randn(rope_nnz, rope_Hk * rope_D, dtype=torch.bfloat16, device=device)
+rope_cos_sin = torch.randn(8192, rope_D, dtype=torch.float32, device=device)
+rope_positions = torch.arange(rope_nnz, dtype=torch.int32, device=device) % 8192
+flashinfer.apply_rope_with_cos_sin_cache(
+    rope_positions, rope_query, rope_key, rope_D, rope_cos_sin
+)
+flashinfer.apply_rope_with_cos_sin_cache_inplace(
+    rope_positions, rope_query.clone(), rope_key.clone(), rope_D, rope_cos_sin
+)
+
+# ── Quantization (FP4 / NVFP4 / MXFP4 / MXFP8, SM100+) ────────────────────────
+# Kernels are SM100+ only; trace is dumped before kernel launch so JSONs are
+# generated on any GPU — runtime failures are suppressed.
+from flashinfer.quantization.fp4_quantization import (
+    fp4_quantize,
+    mxfp4_quantize,
+    nvfp4_quantize,
+)
+from flashinfer.quantization.fp8_quantization import mxfp8_quantize
+
+quant_M, quant_K = 128, 4096
+quant_input_bf16 = torch.randn(quant_M, quant_K, dtype=torch.bfloat16, device=device)
+quant_global_sf = torch.tensor([1.0], dtype=torch.float32, device=device)
+
+with contextlib.suppress(Exception):
+    fp4_quantize(quant_input_bf16, quant_global_sf, sf_vec_size=16)
+with contextlib.suppress(Exception):
+    nvfp4_quantize(quant_input_bf16, quant_global_sf)
+with contextlib.suppress(Exception):
+    mxfp4_quantize(quant_input_bf16)
+with contextlib.suppress(Exception):
+    mxfp8_quantize(quant_input_bf16)
+
+# ── Single-request attention (non-batched) ───────────────────────────────────
+sa_Hq, sa_Hk, sa_D, sa_KV = 32, 8, 128, 256
+sa_q_dec = torch.randn(sa_Hq, sa_D, dtype=torch.bfloat16, device=device)
+sa_k_dec = torch.randn(sa_KV, sa_Hk, sa_D, dtype=torch.bfloat16, device=device)
+sa_v_dec = torch.randn(sa_KV, sa_Hk, sa_D, dtype=torch.bfloat16, device=device)
+with contextlib.suppress(Exception):
+    flashinfer.single_decode_with_kv_cache(sa_q_dec, sa_k_dec, sa_v_dec)
+
+sa_Q = 128
+sa_q_pf = torch.randn(sa_Q, sa_Hq, sa_D, dtype=torch.bfloat16, device=device)
+sa_k_pf = torch.randn(sa_KV, sa_Hk, sa_D, dtype=torch.bfloat16, device=device)
+sa_v_pf = torch.randn(sa_KV, sa_Hk, sa_D, dtype=torch.bfloat16, device=device)
+with contextlib.suppress(Exception):
+    flashinfer.single_prefill_with_kv_cache(sa_q_pf, sa_k_pf, sa_v_pf, causal=True)
+
+# ── GEMM bf16 ─────────────────────────────────────────────────────────────────
+# Llama-3.1-8B o_proj (4096×4096) and DeepSeek-V3 moe.gate (256×7168)
+# mm_bf16 expects b in column-major layout with shape [K, N].
+# randn(N, K).T gives shape [K, N] with strides (1, N); the kernel transposes
+# b back to [N, K] (contiguous) before calling the C++ matmul.
+# backend="auto" picks cudnn on SM80/89/90 and cutlass on SM100+.
+for N, K in ((4096, 4096), (256, 7168)):
+    a = torch.randn(128, K, dtype=torch.bfloat16, device=device)
+    b = torch.randn(
+        N, K, dtype=torch.bfloat16, device=device
+    ).T  # [K, N] column-major; b.T is contiguous
+    with contextlib.suppress(Exception):
+        flashinfer.mm_bf16(a, b, backend="auto")
+
+# ── GEMM fp8 block-scale (DeepSeek-V3 q_proj: M×7168→1536, block=128) ────────
+# Trace is dumped before kernel launch; suppress SM100-only runtime failures.
+with contextlib.suppress(Exception):
+    M, K, N, BS = 128, 7168, 1536, 128
+    a_fp8 = torch.zeros(M, K, dtype=torch.float8_e4m3fn, device=device)
+    b_fp8 = torch.zeros(K // BS, N, BS, dtype=torch.float8_e4m3fn, device=device)
+    alpha_fp8 = torch.tensor(1.0, dtype=torch.float32, device=device)
+    flashinfer.mm_fp8(a_fp8, b_fp8, alpha_fp8)
+
+# ── GEMM mxfp8 (Blackwell SM100+: M×4096@4096×4096, block=32) ────────────────
+try:
+    M, K, N = 128, 4096, 4096
+    a_mxfp8 = torch.zeros(M, K, dtype=torch.float8_e4m3fn, device=device)
+    b_mxfp8 = torch.zeros(K, N, dtype=torch.float8_e4m3fn, device=device)
+    a_ds = torch.ones(M, K // 32, dtype=torch.uint8, device=device)
+    b_ds = torch.ones(K // 32, N, dtype=torch.uint8, device=device)
+    flashinfer.gemm.mm_mxfp8(a_mxfp8, b_mxfp8, a_ds, b_ds)
+except Exception:
+    pass  # Requires Blackwell (SM100+)
+
+# ── GEMM fp4 (Blackwell SM100+: M×7168@2048×7168, block=16) ─────────────────
+try:
+    M, K, N, BS4 = 128, 7168, 2048, 16
+    a_fp4 = torch.zeros(M, K, dtype=torch.uint8, device=device)
+    b_fp4 = torch.zeros(K, N, dtype=torch.uint8, device=device)
+    a_d4 = torch.ones(M, K // BS4, dtype=torch.float8_e4m3fn, device=device)
+    b_d4 = torch.ones(K, N // BS4, dtype=torch.float8_e4m3fn, device=device)
+    flashinfer.gemm.mm_fp4(a_fp4, b_fp4, a_d4, b_d4, block_size=BS4)
+except Exception:
+    pass  # Requires Blackwell (SM100+)
+
+# ── GQA paged decode (Llama-3.1-8B, h=32/kv=8/d=128) ────────────────────────
+num_qo, num_kv, head_dim, batch_size = 32, 8, 128, 32
+
+for page_size, num_pages in ((16, 128), (64, 32)):
+    total = batch_size * num_pages
+    kv_indptr = (
+        torch.arange(batch_size + 1, dtype=torch.int32, device=device) * num_pages
+    )
+    kv_indices = torch.arange(total, dtype=torch.int32, device=device)
+    kv_last = torch.full((batch_size,), page_size, dtype=torch.int32, device=device)
+
+    ws = torch.empty(WORKSPACE, dtype=torch.uint8, device=device)
+    dec = BatchDecodeWithPagedKVCacheWrapper(ws, "NHD")
+    dec.plan(
+        kv_indptr,
+        kv_indices,
+        kv_last,
+        num_qo,
+        num_kv,
+        head_dim,
+        page_size,
+        q_data_type=torch.bfloat16,
+        kv_data_type=torch.bfloat16,
+    )
+    q_d = torch.randn(batch_size, num_qo, head_dim, dtype=torch.bfloat16, device=device)
+    kc = torch.randn(
+        total, page_size, num_kv, head_dim, dtype=torch.bfloat16, device=device
+    )
+    vc = torch.randn(
+        total, page_size, num_kv, head_dim, dtype=torch.bfloat16, device=device
+    )
+    dec.run(q_d, (kc, vc))
+
+# ── GQA paged prefill (Llama-3.1-8B, h=32/kv=8/d=128, page_size=16) ─────────
+n_req, total_q, np_pf, page_size = 4, 512, 32, 16
+total_pf = n_req * np_pf
+qo_indptr = torch.tensor([0, 128, 256, 384, 512], dtype=torch.int32, device=device)
+kv_indptr_p = torch.arange(n_req + 1, dtype=torch.int32, device=device) * np_pf
+kv_idx_p = torch.arange(total_pf, dtype=torch.int32, device=device)
+kv_last_p = torch.full((n_req,), page_size, dtype=torch.int32, device=device)
+
+ws_pf = torch.empty(WORKSPACE, dtype=torch.uint8, device=device)
+pf = BatchPrefillWithPagedKVCacheWrapper(ws_pf, "NHD")
+pf.plan(
+    qo_indptr,
+    kv_indptr_p,
+    kv_idx_p,
+    kv_last_p,
+    num_qo,
+    num_kv,
+    head_dim,
+    page_size,
+    causal=True,
+    q_data_type=torch.bfloat16,
+    kv_data_type=torch.bfloat16,
+)
+q_pf = torch.randn(total_q, num_qo, head_dim, dtype=torch.bfloat16, device=device)
+kc_pf = torch.randn(
+    total_pf, page_size, num_kv, head_dim, dtype=torch.bfloat16, device=device
+)
+vc_pf = torch.randn(
+    total_pf, page_size, num_kv, head_dim, dtype=torch.bfloat16, device=device
+)
+pf.run(q_pf, (kc_pf, vc_pf))
+
+# ── GQA ragged prefill (Llama-3.1-8B) ────────────────────────────────────────
+qo_indptr_r = torch.tensor([0, 64, 128, 192, 256], dtype=torch.int32, device=device)
+kv_indptr_r = torch.tensor([0, 128, 256, 384, 512], dtype=torch.int32, device=device)
+
+ws_r = torch.empty(WORKSPACE, dtype=torch.uint8, device=device)
+rag = BatchPrefillWithRaggedKVCacheWrapper(ws_r, "NHD")
+rag.plan(
+    qo_indptr_r,
+    kv_indptr_r,
+    num_qo,
+    num_kv,
+    head_dim,
+    causal=True,
+    q_data_type=torch.bfloat16,
+    kv_data_type=torch.bfloat16,
+)
+q_r = torch.randn(256, num_qo, head_dim, dtype=torch.bfloat16, device=device)
+k_r = torch.randn(512, num_kv, head_dim, dtype=torch.bfloat16, device=device)
+v_r = torch.randn(512, num_kv, head_dim, dtype=torch.bfloat16, device=device)
+rag.run(q_r, k_r, v_r)
+
+# ── MLA paged decode (DeepSeek-V3 TP=8, h=16/ckv=512/kpe=64) ─────────────────
+mla_b, mla_h, ckv, kpe = 128, 16, 512, 64
+
+for mla_ps, mla_np in ((64, 32), (1, 2048)):
+    total_mla = mla_b * mla_np
+    mla_qo_indptr = torch.arange(mla_b + 1, dtype=torch.int32, device=device)
+    mla_kv_indptr = torch.arange(mla_b + 1, dtype=torch.int32, device=device) * mla_np
+    mla_kv_indices = torch.arange(total_mla, dtype=torch.int32, device=device)
+    mla_kv_len = torch.full((mla_b,), mla_np * mla_ps, dtype=torch.int32, device=device)
+
+    ws_mla = torch.empty(WORKSPACE, dtype=torch.uint8, device=device)
+    mla = BatchMLAPagedAttentionWrapper(ws_mla)
+    mla.plan(
+        mla_qo_indptr,
+        mla_kv_indptr,
+        mla_kv_indices,
+        mla_kv_len,
+        mla_h,
+        ckv,
+        kpe,
+        mla_ps,
+        causal=False,
+        sm_scale=1.0 / (ckv**0.5),
+        q_data_type=torch.bfloat16,
+        kv_data_type=torch.bfloat16,
+    )
+    q_nope = torch.randn(mla_b, mla_h, ckv, dtype=torch.bfloat16, device=device)
+    q_pe = torch.randn(mla_b, mla_h, kpe, dtype=torch.bfloat16, device=device)
+    ckv_cache = torch.randn(total_mla, mla_ps, ckv, dtype=torch.bfloat16, device=device)
+    kpe_cache = torch.randn(total_mla, mla_ps, kpe, dtype=torch.bfloat16, device=device)
+    mla.run(q_nope, q_pe, ckv_cache, kpe_cache)
+
+# ── GDN prefill (Qwen3-Next TP=4, chunk prefill) ─────────────────────────────
+with contextlib.suppress(Exception):
+    import flashinfer.gdn_prefill  # noqa: PLC0415
+
+    gp_T, gp_H, gp_HV, gp_K = 256, 4, 8, 128
+    cu_seqlens = torch.tensor([0, 64, 128, 192, 256], dtype=torch.int64, device=device)
+    gp_q = torch.randn(gp_T, gp_H, gp_K, dtype=torch.bfloat16, device=device)
+    gp_k = torch.randn(gp_T, gp_H, gp_K, dtype=torch.bfloat16, device=device)
+    gp_v = torch.randn(gp_T, gp_HV, gp_K, dtype=torch.bfloat16, device=device)
+    flashinfer.gdn_prefill.chunk_gated_delta_rule(
+        gp_q, gp_k, gp_v, cu_seqlens=cu_seqlens
+    )
+
+# ── GDN decode (Qwen3-Next TP=4, qk=4/v=8/d=128) ────────────────────────────
+B, H, HV, K = 4, 4, 8, 128
+q = torch.randn(B, 1, H, K, dtype=torch.bfloat16, device=device)
+k = torch.randn(B, 1, H, K, dtype=torch.bfloat16, device=device)
+v = torch.randn(B, 1, HV, K, dtype=torch.bfloat16, device=device)
+state = torch.zeros(B, HV, K, K, dtype=torch.float32, device=device)
+A_log = torch.zeros(HV, dtype=torch.float32, device=device)
+a = torch.zeros(B, 1, HV, dtype=torch.bfloat16, device=device)
+dt_bias = torch.zeros(HV, dtype=torch.float32, device=device)
+b_ = torch.zeros(B, 1, HV, dtype=torch.bfloat16, device=device)
+flashinfer.gdn_decode.gated_delta_rule_decode(q, k, v, state, A_log, a, dt_bias, b_)
+
+# ── GDN MTP (Qwen3-Next TP=4, spec_len=4) ────────────────────────────────────
+T_mtp, pool_size = 4, 8
+q_m = torch.randn(B, T_mtp, H, K, dtype=torch.bfloat16, device=device)
+k_m = torch.randn(B, T_mtp, H, K, dtype=torch.bfloat16, device=device)
+v_m = torch.randn(B, T_mtp, HV, K, dtype=torch.bfloat16, device=device)
+init_state = torch.zeros(pool_size, HV, K, K, dtype=torch.float32, device=device)
+init_idx = torch.arange(B, dtype=torch.int32, device=device)
+A_log_m = torch.zeros(HV, dtype=torch.float32, device=device)
+a_m = torch.zeros(B, T_mtp, HV, dtype=torch.bfloat16, device=device)
+dt_bias_m = torch.zeros(HV, dtype=torch.float32, device=device)
+b_m = torch.zeros(B, T_mtp, HV, dtype=torch.bfloat16, device=device)
+flashinfer.gdn_decode.gated_delta_rule_mtp(
+    q_m, k_m, v_m, init_state, init_idx, A_log_m, a_m, dt_bias_m, b_m
+)
+
+# ── MoE FP8 (256 experts, 32 local, h=7168, i=2048) ─────────────────────────
+# routing_method_type: 0=Default, 1=Renormalize, 2=DeepSeekV3,
+#                      3=Llama4,   4=RenormalizeNaive, 5=TopK
+T_moe, H_moe, I_moe, E_tot, E_loc, BS = 128, 7168, 2048, 256, 32, 128
+routing_logits = torch.randn(T_moe, E_tot, dtype=torch.float32, device=device)
+routing_bias = torch.zeros(E_tot, dtype=torch.bfloat16, device=device)
+hs = torch.zeros(T_moe, H_moe, dtype=torch.float8_e4m3fn, device=device)
+hs_scale = torch.ones(H_moe // BS, T_moe, dtype=torch.float32, device=device)
+w1 = torch.zeros(E_loc, 2 * I_moe, H_moe, dtype=torch.float8_e4m3fn, device=device)
+w1s = torch.ones(
+    E_loc, (2 * I_moe) // BS, H_moe // BS, dtype=torch.float32, device=device
+)
+w2 = torch.zeros(E_loc, H_moe, I_moe, dtype=torch.float8_e4m3fn, device=device)
+w2s = torch.ones(E_loc, H_moe // BS, I_moe // BS, dtype=torch.float32, device=device)
+_moe_common = dict(
+    num_experts=E_tot,
+    intermediate_size=I_moe,
+    local_expert_offset=0,
+    local_num_experts=E_loc,
+    routed_scaling_factor=2.5,
+)
+_moe_args = (routing_logits, routing_bias, hs, hs_scale, w1, w1s, w2, w2s)
+
+# Each routing type in its own try/except so a GPU-support failure on one
+# variant does not prevent the remaining traces from being dumped.
+
+# 0: Default routing (Softmax -> TopK)
+with contextlib.suppress(Exception):
+    flashinfer.fused_moe.trtllm_fp8_block_scale_moe(
+        *_moe_args,
+        top_k=8,
+        n_group=None,
+        topk_group=None,
+        routing_method_type=0,
+        **_moe_common,
+    )
+
+# 1: Renormalize routing (TopK -> Softmax)
+with contextlib.suppress(Exception):
+    flashinfer.fused_moe.trtllm_fp8_block_scale_moe(
+        *_moe_args,
+        top_k=8,
+        n_group=None,
+        topk_group=None,
+        routing_method_type=1,
+        **_moe_common,
+    )
+
+# 2: DeepSeekV3 routing (Sigmoid -> group selection -> top_k=8)
+with contextlib.suppress(Exception):
+    flashinfer.fused_moe.trtllm_fp8_block_scale_moe(
+        *_moe_args,
+        top_k=8,
+        n_group=8,
+        topk_group=4,
+        routing_method_type=2,
+        **_moe_common,
+    )
+
+# 3: Llama4 routing (Top1 -> Sigmoid)
+with contextlib.suppress(Exception):
+    flashinfer.fused_moe.trtllm_fp8_block_scale_moe(
+        *_moe_args,
+        top_k=1,
+        n_group=None,
+        topk_group=None,
+        routing_method_type=3,
+        **_moe_common,
+    )
+
+# 4: RenormalizeNaive routing (Softmax -> TopK -> Renormalize)
+with contextlib.suppress(Exception):
+    flashinfer.fused_moe.trtllm_fp8_block_scale_moe(
+        *_moe_args,
+        top_k=8,
+        n_group=None,
+        topk_group=None,
+        routing_method_type=4,
+        **_moe_common,
+    )
+
+# 5: TopK routing (plain TopK, no normalisation)
+with contextlib.suppress(Exception):
+    flashinfer.fused_moe.trtllm_fp8_block_scale_moe(
+        *_moe_args,
+        top_k=8,
+        n_group=None,
+        topk_group=None,
+        routing_method_type=5,
+        **_moe_common,
+    )
+
+# ── MoE FP4 (NvFP4, 256 experts, 32 local, h=7168, i=2048) ──────────────────
+# routing_method_type: 0=Default, 1=Renormalize, 2=DeepSeekV3,
+#                      3=Llama4,   4=RenormalizeNaive, 5=TopK
+# NvFP4: block_size=16; hidden_states packed as [T, H//2] uint8,
+#        scale as [T, H//16] float8.
+try:
+    import flashinfer
+    from flashinfer import fp4_quantize
+
+    T_fp4, H_fp4, I_fp4, E_tot_fp4, E_loc_fp4 = 128, 7168, 2048, 256, 32
+    SF_VEC = 16
+
+    routing_logits_fp4 = torch.randn(
+        T_fp4, E_tot_fp4, dtype=torch.bfloat16, device=device
+    )
+    hs_bf16 = torch.randn(T_fp4, H_fp4, dtype=torch.bfloat16, device=device) * 0.1
+    hs_fp4, hs_fp4_scale = fp4_quantize(
+        hs_bf16,
+        torch.tensor([448.0 * 6.0], device=device),
+        sf_vec_size=SF_VEC,
+        sf_use_ue8m0=False,
+        is_sf_swizzled_layout=False,
+    )
+    hs_fp4_scale = hs_fp4_scale.view(torch.float8_e4m3fn).reshape(T_fp4, -1)
+
+    w13_bf16 = (
+        torch.randn(E_loc_fp4, 2 * I_fp4, H_fp4, dtype=torch.bfloat16, device=device)
+        * 0.1
+    )
+    w13_fp4, w13_fp4_scale = fp4_quantize(
+        w13_bf16,
+        torch.tensor([448.0 * 6.0], device=device),
+        sf_vec_size=SF_VEC,
+        sf_use_ue8m0=False,
+    )
+    w13_fp4_scale = w13_fp4_scale.view(torch.float8_e4m3fn).reshape(
+        E_loc_fp4, 2 * I_fp4, -1
+    )
+    w2_bf16 = (
+        torch.randn(E_loc_fp4, H_fp4, I_fp4, dtype=torch.bfloat16, device=device) * 0.1
+    )
+    w2_fp4, w2_fp4_scale = fp4_quantize(
+        w2_bf16,
+        torch.tensor([448.0 * 6.0], device=device),
+        sf_vec_size=SF_VEC,
+        sf_use_ue8m0=False,
+    )
+    w2_fp4_scale = w2_fp4_scale.view(torch.float8_e4m3fn).reshape(E_loc_fp4, H_fp4, -1)
+
+    scale_val = 1.0 / 448.0 / 6.0
+    out1_scale = torch.full((E_loc_fp4,), scale_val**2, device=device)
+    out1_gate_scale = torch.full((E_loc_fp4,), scale_val**2, device=device)
+    out2_scale = torch.full((E_loc_fp4,), scale_val**2, device=device)
+
+    _fp4_moe_common = dict(
+        num_experts=E_tot_fp4,
+        intermediate_size=I_fp4,
+        local_expert_offset=0,
+        local_num_experts=E_loc_fp4,
+        routed_scaling_factor=None,
+    )
+    _fp4_moe_args = (
+        routing_logits_fp4,
+        None,  # routing_bias
+        hs_fp4,
+        hs_fp4_scale,
+        w13_fp4,
+        w13_fp4_scale,
+        None,  # gemm1_bias
+        None,  # gemm1_alpha
+        None,  # gemm1_beta
+        None,  # gemm1_clamp_limit
+        w2_fp4,
+        w2_fp4_scale,
+        None,  # gemm2_bias
+        out1_scale,
+        out1_gate_scale,
+        out2_scale,
+    )
+except Exception:
+    _fp4_moe_args = None  # fp4_quantize unavailable
+
+if _fp4_moe_args is not None:
+    # 0: Default routing (Softmax -> TopK)
+    with contextlib.suppress(Exception):
+        flashinfer.fused_moe.trtllm_fp4_block_scale_moe(
+            *_fp4_moe_args,
+            top_k=8,
+            n_group=None,
+            topk_group=None,
+            routing_method_type=0,
+            **_fp4_moe_common,
+        )
+
+    # 1: Renormalize routing (TopK -> Softmax)
+    with contextlib.suppress(Exception):
+        flashinfer.fused_moe.trtllm_fp4_block_scale_moe(
+            *_fp4_moe_args,
+            top_k=8,
+            n_group=None,
+            topk_group=None,
+            routing_method_type=1,
+            **_fp4_moe_common,
+        )
+
+    # 2: DeepSeekV3 routing (Sigmoid -> group selection -> top_k=8)
+    with contextlib.suppress(Exception):
+        flashinfer.fused_moe.trtllm_fp4_block_scale_moe(
+            *_fp4_moe_args,
+            top_k=8,
+            n_group=8,
+            topk_group=4,
+            routing_method_type=2,
+            **_fp4_moe_common,
+        )
+
+    # 3: Llama4 routing (Top1 -> Sigmoid)
+    with contextlib.suppress(Exception):
+        flashinfer.fused_moe.trtllm_fp4_block_scale_moe(
+            *_fp4_moe_args,
+            top_k=1,
+            n_group=None,
+            topk_group=None,
+            routing_method_type=3,
+            **_fp4_moe_common,
+        )
+
+    # 4: RenormalizeNaive routing (Softmax -> TopK -> Renormalize)
+    with contextlib.suppress(Exception):
+        flashinfer.fused_moe.trtllm_fp4_block_scale_moe(
+            *_fp4_moe_args,
+            top_k=8,
+            n_group=None,
+            topk_group=None,
+            routing_method_type=4,
+            **_fp4_moe_common,
+        )
+
+    # 5: TopK routing (plain TopK, no normalisation)
+    with contextlib.suppress(Exception):
+        flashinfer.fused_moe.trtllm_fp4_block_scale_moe(
+            *_fp4_moe_args,
+            top_k=8,
+            n_group=None,
+            topk_group=None,
+            routing_method_type=5,
+            **_fp4_moe_common,
+        )
+
+# ── Summary ───────────────────────────────────────────────────────────────────
+files = sorted(SAVE_DIR.glob("*.json"))
+print(f"\nWrote {len(files)} definition files:\n")
+for f in files:
+    defn = json.loads(f.read_text())
+    print(f"  {f.name}")
+    print(f"    op_type : {defn['op_type']}")
+    print(f"    fi_api  : {next(t for t in defn['tags'] if t.startswith('fi_api:'))}")
+    const_axes = {
+        k: v["value"]
+        for k, v in defn["axes"].items()
+        if v["type"] == "const" and "value" in v
+    }
+    if const_axes:
+        print(f"    axes    : {const_axes}")
+    print()
+
+
+# ── Extra APIs (category A+B additions) ───────────────────────────────────────
+# Many of these require SM100+ kernels; traces dump before the kernel runs so
+# the JSONs appear on any GPU. Wrap runtime-only calls in contextlib.suppress.
+
+# append_paged_kv_cache: exercise via a single page write.
+with contextlib.suppress(Exception):
+    from flashinfer import append_paged_kv_cache
+
+    _pap_B, _pap_H, _pap_D, _pap_PS = 2, 8, 128, 16
+    _pap_nnz = 4
+    _k_cache = torch.zeros(
+        4, _pap_PS, _pap_H, _pap_D, dtype=torch.bfloat16, device=device
+    )
+    _v_cache = torch.zeros_like(_k_cache)
+    _append_k = torch.randn(
+        _pap_nnz, _pap_H, _pap_D, dtype=torch.bfloat16, device=device
+    )
+    _append_v = torch.randn_like(_append_k)
+    _bidx = torch.tensor([0, 0, 1, 1], dtype=torch.int32, device=device)
+    _pos = torch.tensor([0, 1, 0, 1], dtype=torch.int32, device=device)
+    _kv_idx = torch.tensor([0, 1, 2, 3], dtype=torch.int32, device=device)
+    _kv_indptr = torch.tensor([0, 2, 4], dtype=torch.int32, device=device)
+    _last = torch.tensor([2, 2], dtype=torch.int32, device=device)
+    append_paged_kv_cache(
+        _append_k,
+        _append_v,
+        _bidx,
+        _pos,
+        (_k_cache, _v_cache),
+        _kv_idx,
+        _kv_indptr,
+        _last,
+    )
+
+# SegmentGEMMWrapper: small per-segment matmul.
+with contextlib.suppress(Exception):
+    ws = torch.empty(WORKSPACE, dtype=torch.uint8, device=device)
+    seg = flashinfer.SegmentGEMMWrapper(ws)
+    seg_x = torch.randn(256, 128, dtype=torch.bfloat16, device=device)
+    seg_w = torch.randn(4, 128, 64, dtype=torch.bfloat16, device=device)
+    seg_indptr = torch.tensor([0, 64, 128, 192, 256], dtype=torch.int64, device=device)
+    seg.run(
+        seg_x,
+        seg_w,
+        batch_size=4,
+        weight_column_major=False,
+        seg_indptr=seg_indptr,
+    )
+
+# softmax + sampling_from_probs + sampling_from_logits + min_p_sampling.
+_sp_probs = torch.rand(64, 32000, dtype=torch.float32, device=device)
+_sp_probs = _sp_probs / _sp_probs.sum(dim=-1, keepdim=True)
+_sp_logits = torch.randn(64, 32000, dtype=torch.float32, device=device)
+with contextlib.suppress(Exception):
+    flashinfer.softmax(_sp_logits, temperature=1.0)
+with contextlib.suppress(Exception):
+    flashinfer.sampling_from_probs(_sp_probs)
+with contextlib.suppress(Exception):
+    flashinfer.sampling_from_logits(_sp_logits)
+with contextlib.suppress(Exception):
+    flashinfer.min_p_sampling_from_probs(_sp_probs, 0.1)
+with contextlib.suppress(Exception):
+    flashinfer.top_p_renorm_probs(_sp_probs, 0.9)
+with contextlib.suppress(Exception):
+    flashinfer.top_k_renorm_probs(_sp_probs, 50)
+with contextlib.suppress(Exception):
+    flashinfer.top_k_mask_logits(_sp_logits, 50)
+with contextlib.suppress(Exception):
+    flashinfer.top_k_top_p_sampling_from_logits(_sp_logits, 50, 0.9)
+
+# chain_speculative_sampling.
+with contextlib.suppress(Exception):
+    _csd_B, _csd_S, _csd_V = 4, 3, 32000
+    _draft_p = torch.softmax(
+        torch.randn(_csd_B, _csd_S + 1, _csd_V, dtype=torch.float32, device=device),
+        dim=-1,
+    )
+    _target_p = torch.softmax(
+        torch.randn(_csd_B, _csd_S + 1, _csd_V, dtype=torch.float32, device=device),
+        dim=-1,
+    )
+    _draft_ids = torch.randint(
+        0,
+        _csd_V,
+        (_csd_B, _csd_S),
+        dtype=torch.int32,
+        device=device,
+    )
+    flashinfer.chain_speculative_sampling(_draft_p, _draft_ids, _target_p)
diff --git a/tests/trace/example_cuda_graph.py b/tests/trace/example_cuda_graph.py
new file mode 100644
index 0000000000..c3a13d1976
--- /dev/null
+++ b/tests/trace/example_cuda_graph.py
@@ -0,0 +1,146 @@
+"""
+fi_trace + CUDA graph example.
+
+Demonstrates that @flashinfer_api(trace=...) auto-dump is compatible with
+`torch.cuda.graph` capture:
+
+  * The schema extraction path reads only CPU-side tensor metadata (shape,
+    dtype) and writes a JSON file on the host thread — no CUDA stream ops,
+    so nothing gets baked into the captured graph.
+  * On graph *replay*, Python code does not run at all, so auto-dump cannot
+    fire again. The _DUMPED_NAMES dedup in flashinfer/trace/template.py
+    already prevents re-writes even when Python does run.
+
+Run:
+    python tests/trace/example_cuda_graph.py
+
+Produces one file in ./fi_trace_out_cudagraph/:
+    gqa_paged_decode_h32_kv8_d128_ps16.json
+"""
+
+import os
+from pathlib import Path
+
+# Must be set before any flashinfer import: template.py reads these at import time.
+SAVE_DIR = Path(__file__).parent / "fi_trace_out_cudagraph"
+os.environ.setdefault("FLASHINFER_TRACE_DUMP_DIR", str(SAVE_DIR))
+os.environ.setdefault("FLASHINFER_TRACE_DUMP", "1")
+
+import torch
+
+from flashinfer.decode import CUDAGraphBatchDecodeWithPagedKVCacheWrapper
+
+
+def main() -> None:
+    device = "cuda"
+    if not torch.cuda.is_available():
+        raise SystemExit("CUDA is required for this example.")
+
+    # Llama-3.1-8B paged decode: 32 qo heads / 8 kv heads / head_dim=128, 32 seqs
+    batch_size, num_qo, num_kv, head_dim, page_size = 32, 32, 8, 128, 16
+    num_pages_per_seq = 8
+    total_pages = batch_size * num_pages_per_seq
+    workspace = 128 * 1024 * 1024  # 128 MB
+
+    # Static buffers the wrapper reuses across captures.
+    kv_indptr_buf = torch.empty(batch_size + 1, dtype=torch.int32, device=device)
+    kv_indices_buf = torch.empty(total_pages, dtype=torch.int32, device=device)
+    kv_last_buf = torch.empty(batch_size, dtype=torch.int32, device=device)
+    ws = torch.empty(workspace, dtype=torch.uint8, device=device)
+
+    wrapper = CUDAGraphBatchDecodeWithPagedKVCacheWrapper(
+        ws, kv_indptr_buf, kv_indices_buf, kv_last_buf, "NHD"
+    )
+
+    # Fill the static buffers with the layout we will replay against.
+    kv_indptr_buf.copy_(
+        torch.arange(batch_size + 1, dtype=torch.int32, device=device)
+        * num_pages_per_seq
+    )
+    kv_indices_buf.copy_(torch.arange(total_pages, dtype=torch.int32, device=device))
+    kv_last_buf.copy_(
+        torch.full((batch_size,), page_size, dtype=torch.int32, device=device)
+    )
+
+    # Plan runs on the CPU — never captured.
+    wrapper.plan(
+        kv_indptr_buf,
+        kv_indices_buf,
+        kv_last_buf,
+        num_qo,
+        num_kv,
+        head_dim,
+        page_size,
+        q_data_type=torch.bfloat16,
+        kv_data_type=torch.bfloat16,
+    )
+
+    q = torch.randn(batch_size, num_qo, head_dim, dtype=torch.bfloat16, device=device)
+    kc = torch.randn(
+        total_pages, page_size, num_kv, head_dim, dtype=torch.bfloat16, device=device
+    )
+    vc = torch.randn(
+        total_pages, page_size, num_kv, head_dim, dtype=torch.bfloat16, device=device
+    )
+
+    expected = SAVE_DIR / "gqa_paged_decode_h32_kv8_d128_ps16.json"
+    if expected.exists():
+        expected.unlink()  # Start clean so we can observe the first dump.
+
+    # Warmup on a side stream so the first captured iteration is well-behaved.
+    # The first wrapper.run() triggers auto-dump on the host thread (schema
+    # extraction is CPU-only: .shape / .dtype / json.dumps). Subsequent calls
+    # hit the _DUMPED_NAMES dedup and skip file I/O.
+    s = torch.cuda.Stream()
+    s.wait_stream(torch.cuda.current_stream())
+    with torch.cuda.stream(s):
+        for _ in range(3):
+            _ = wrapper.run(q, (kc, vc))
+    torch.cuda.current_stream().wait_stream(s)
+
+    assert expected.exists(), (
+        f"Expected trace JSON at {expected} to be written on the first call."
+    )
+    size_after_warmup = expected.stat().st_size
+    mtime_after_warmup = expected.stat().st_mtime_ns
+    print(f"[warmup]  wrote {expected.name} ({size_after_warmup} bytes)")
+
+    # Capture: the @flashinfer_api(trace=...) wrapper's Python code still
+    # runs once inside the capture block, but dedup skips the write. Kernel
+    # launches are captured into the graph; host-side file I/O is never a
+    # captured CUDA op, so it cannot corrupt the graph even when it does fire.
+    g = torch.cuda.CUDAGraph()
+    with torch.cuda.graph(g):
+        out_captured = wrapper.run(q, (kc, vc))
+
+    assert expected.stat().st_mtime_ns == mtime_after_warmup, (
+        "Trace file was rewritten during capture — dedup failed."
+    )
+    print("[capture] graph captured; trace file untouched (dedup skipped re-write)")
+
+    # Replay: Python doesn't run at all, so auto-dump definitely cannot fire.
+    for _ in range(5):
+        g.replay()
+    torch.cuda.synchronize()
+    assert expected.stat().st_mtime_ns == mtime_after_warmup, (
+        "Trace file was rewritten during replay — auto-dump is not replay-idempotent."
+    )
+    print("[replay]  5 replays completed; trace file still untouched")
+
+    # Correctness: eager call should match the graph output (same inputs,
+    # same plan). Use the bound method's own fi_trace to confirm the schema
+    # was generated even without file dump.
+    eager_out = wrapper.run(q, (kc, vc))
+    torch.testing.assert_close(out_captured, eager_out, rtol=1e-3, atol=1e-3)
+    print("[verify]  captured output matches eager reference")
+
+    # fi_trace() is still directly callable on the bound method for ad-hoc use.
+    # Takes kwargs; positional tensor args are not supported.
+    schema = wrapper.run.fi_trace(q=q, paged_kv_cache=(kc, vc))
+    print(
+        f"[fi_trace] {schema['name']} op_type={schema['op_type']} axes={schema['axes']}"
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/trace/example_sglang.py b/tests/trace/example_sglang.py
new file mode 100644
index 0000000000..68a699cd90
--- /dev/null
+++ b/tests/trace/example_sglang.py
@@ -0,0 +1,69 @@
+"""
+fi_trace + sglang example: run one inference pass in sglang with the
+flashinfer backend and verify trace JSONs are produced.
+
+sglang calls flashinfer APIs (rmsnorm, RoPE, attention, GEMM, activation,
+sampling) during a forward pass; every ``@flashinfer_api(trace=...)``
+decorated call writes a trace JSON when ``FLASHINFER_TRACE_DUMP=1`` is set.
+
+Uses the locally cached Llama-3.2-3B-Instruct. One inference pass (prefill
++ one decode step) is sufficient to exercise most of the instrumented
+flashinfer APIs.
+"""
+
+import os
+import shutil
+from pathlib import Path
+
+
+# Must be set before any flashinfer / sglang import.
+SAVE_DIR = Path(__file__).parent / "fi_trace_out_sglang"
+os.environ["FLASHINFER_TRACE_DUMP_DIR"] = str(SAVE_DIR)
+os.environ["FLASHINFER_TRACE_DUMP"] = "1"
+# Disable cubin cache download to avoid network hit.
+os.environ.setdefault("SGLANG_SKIP_CUBIN_DOWNLOAD", "1")
+
+if SAVE_DIR.exists():
+    shutil.rmtree(SAVE_DIR)
+
+from sglang.srt.entrypoints.engine import Engine  # noqa: E402
+
+
+def main() -> None:
+    model = os.environ.get("FI_TRACE_SGLANG_MODEL", "meta-llama/Llama-3.2-3B-Instruct")
+    print(f"Loading sglang Engine with model={model} (attention_backend=flashinfer)")
+    engine = Engine(
+        model_path=model,
+        attention_backend="flashinfer",
+        disable_cuda_graph=True,  # keep the first call on the Python path
+        mem_fraction_static=0.5,
+        tp_size=1,
+        disable_radix_cache=True,
+        log_level="warning",
+    )
+
+    prompts = ["The capital of France is"]
+    sampling_params = {
+        "temperature": 0.0,
+        "max_new_tokens": 4,
+        "top_k": 50,
+        "top_p": 0.9,
+    }
+    print("Running one inference pass…")
+    outputs = engine.generate(prompts, sampling_params)
+    for p, out in zip(prompts, outputs, strict=True):
+        text = out.get("text") if isinstance(out, dict) else out
+        print(f"  prompt: {p!r}")
+        print(f"  output: {text!r}")
+
+    engine.shutdown()
+
+    json_files = sorted(SAVE_DIR.glob("*.json"))
+    print()
+    print(f"Produced {len(json_files)} trace JSON files in {SAVE_DIR}:")
+    for f in json_files:
+        print(f"  {f.name}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/trace/fi_trace_out/append_paged_kv_cache_kv8_d128.json b/tests/trace/fi_trace_out/append_paged_kv_cache_kv8_d128.json
new file mode 100644
index 0000000000..208ceb0eee
--- /dev/null
+++ b/tests/trace/fi_trace_out/append_paged_kv_cache_kv8_d128.json
@@ -0,0 +1,116 @@
+{
+  "name": "append_paged_kv_cache_kv8_d128",
+  "description": "Append a batch of (key, value) rows into a paged KV cache at positions determined by (batch_indices, positions) and the per-seq kv_indptr/kv_indices/kv_last_page_len layout.",
+  "op_type": "page_append",
+  "tags": [
+    "fi_api:flashinfer.page.append_paged_kv_cache",
+    "status:verified"
+  ],
+  "axes": {
+    "nnz_kv": {
+      "type": "var",
+      "description": "Total K/V tokens to append."
+    },
+    "num_kv_heads": {
+      "type": "const",
+      "value": 8
+    },
+    "head_dim": {
+      "type": "const",
+      "value": 128
+    },
+    "num_pages": {
+      "type": "var"
+    },
+    "page_size": {
+      "type": "const"
+    },
+    "batch_size": {
+      "type": "var"
+    },
+    "batch_size_plus_1": {
+      "type": "var",
+      "description": "batch_size + 1."
+    },
+    "num_kv_indices": {
+      "type": "var",
+      "description": "Flat length of kv_indices."
+    }
+  },
+  "constraints": [
+    "batch_size_plus_1 == batch_size + 1"
+  ],
+  "inputs": {
+    "append_key": {
+      "shape": [
+        "nnz_kv",
+        "num_kv_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "append_value": {
+      "shape": [
+        "nnz_kv",
+        "num_kv_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "batch_indices": {
+      "shape": [
+        "nnz_kv"
+      ],
+      "dtype": "int32",
+      "description": "Per-token batch index."
+    },
+    "positions": {
+      "shape": [
+        "nnz_kv"
+      ],
+      "dtype": "int32",
+      "description": "Per-token absolute position."
+    },
+    "paged_kv_cache": {
+      "shape": [
+        "num_pages",
+        "page_size",
+        "num_kv_heads",
+        "head_dim"
+      ],
+      "dtype": "unknown",
+      "description": "Paged KV cache (tuple or single tensor)."
+    },
+    "kv_indices": {
+      "shape": [
+        "num_kv_indices"
+      ],
+      "dtype": "int32"
+    },
+    "kv_indptr": {
+      "shape": [
+        "batch_size_plus_1"
+      ],
+      "dtype": "int32"
+    },
+    "kv_last_page_len": {
+      "shape": [
+        "batch_size"
+      ],
+      "dtype": "int32"
+    }
+  },
+  "outputs": {
+    "paged_kv_cache": {
+      "shape": [
+        "num_pages",
+        "page_size",
+        "num_kv_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16",
+      "description": "Updated paged KV cache (in-place)."
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _append_paged_kv_cache_reference(\n    append_key,\n    append_value,\n    batch_indices,\n    positions,\n    paged_kv_cache,\n    kv_indices,\n    kv_indptr,\n    kv_last_page_len,\n    kv_layout=\"NHD\",\n    **_unused,\n):\n    \"\"\"Append (append_key, append_value) into the paged KV cache at the\n    specified (batch_indices, positions) offsets.\n\n    Mutates ``paged_kv_cache`` in place. Accepts both tuple ``(k, v)`` and\n    single-tensor interleaved layouts. Only the NHD layout is modelled here;\n    HND is a permutation of the same data.\n    \"\"\"\n    if isinstance(paged_kv_cache, tuple):\n        k_cache, v_cache = paged_kv_cache\n    else:\n        # Single tensor: [num_pages, 2, page_size, num_kv_heads, head_dim] in NHD\n        k_cache = paged_kv_cache[:, 0]\n        v_cache = paged_kv_cache[:, 1]\n    N = int(batch_indices.shape[0])\n    page_size = k_cache.shape[1] if kv_layout == \"NHD\" else k_cache.shape[2]\n    for i in range(N):\n        b = int(batch_indices[i].item())\n        pos = int(positions[i].item())\n        page_offset = pos // page_size\n        in_page_offset = pos % page_size\n        # kv_indices maps to the global page id for this (batch, page_offset).\n        idx_base = int(kv_indptr[b].item())\n        page_id = int(kv_indices[idx_base + page_offset].item())\n        if kv_layout == \"NHD\":\n            k_cache[page_id, in_page_offset] = append_key[i]\n            v_cache[page_id, in_page_offset] = append_value[i]\n        else:  # HND\n            k_cache[page_id, :, in_page_offset] = append_key[i]\n            v_cache[page_id, :, in_page_offset] = append_value[i]\n    return paged_kv_cache\n"
+}
diff --git a/tests/trace/fi_trace_out/chain_speculative_sampling_v32000.json b/tests/trace/fi_trace_out/chain_speculative_sampling_v32000.json
new file mode 100644
index 0000000000..0d187285a8
--- /dev/null
+++ b/tests/trace/fi_trace_out/chain_speculative_sampling_v32000.json
@@ -0,0 +1,62 @@
+{
+  "name": "chain_speculative_sampling_v32000",
+  "description": "Chain speculative sampling: accept/reject draft tokens against target distribution and emit the accepted prefix + one sampled final token.",
+  "op_type": "sampling",
+  "tags": [
+    "fi_api:flashinfer.sampling.chain_speculative_sampling",
+    "status:verified",
+    "speculative"
+  ],
+  "axes": {
+    "batch_size": {
+      "type": "var"
+    },
+    "num_speculative": {
+      "type": "var",
+      "description": "Draft tokens per step."
+    },
+    "num_speculative_plus_1": {
+      "type": "var",
+      "description": "num_speculative + 1 (draft_probs axis)."
+    },
+    "vocab_size": {
+      "type": "const",
+      "value": 32000
+    }
+  },
+  "inputs": {
+    "draft_probs": {
+      "shape": [
+        "batch_size",
+        "num_speculative_plus_1",
+        "vocab_size"
+      ],
+      "dtype": "float32"
+    },
+    "draft_token_ids": {
+      "shape": [
+        "batch_size",
+        "num_speculative"
+      ],
+      "dtype": "int32"
+    },
+    "target_probs": {
+      "shape": [
+        "batch_size",
+        "num_speculative_plus_1",
+        "vocab_size"
+      ],
+      "dtype": "float32"
+    }
+  },
+  "outputs": {
+    "accepted_token_ids": {
+      "shape": [
+        "batch_size",
+        "num_speculative_plus_1"
+      ],
+      "dtype": "int32"
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _chain_speculative_sampling_reference(\n    draft_probs, draft_token_ids, target_probs, **_unused,\n):\n    \"\"\"Deterministic chain speculative sampling: accept draft[i] iff\n    target_prob[draft[i]] >= draft_prob[draft[i]]; emit argmax of the\n    first rejecting target distribution (or last step).\"\"\"\n    B, S = draft_token_ids.shape\n    dp = draft_probs.to(torch.float32)\n    tp = target_probs.to(torch.float32)\n    out = torch.full(\n        (B, S + 1), -1, dtype=torch.int32, device=draft_token_ids.device,\n    )\n    for b in range(B):\n        for s in range(S):\n            tok = int(draft_token_ids[b, s].item())\n            if tp[b, s, tok] >= dp[b, s, tok]:\n                out[b, s] = tok\n            else:\n                out[b, s] = int(tp[b, s].argmax().item())\n                break\n        else:\n            out[b, S] = int(tp[b, S].argmax().item())\n    return out\n"
+}
diff --git a/tests/trace/fi_trace_out/fp4_quantize_k4096.json b/tests/trace/fi_trace_out/fp4_quantize_k4096.json
new file mode 100644
index 0000000000..3c0a4510ba
--- /dev/null
+++ b/tests/trace/fi_trace_out/fp4_quantize_k4096.json
@@ -0,0 +1,78 @@
+{
+  "name": "fp4_quantize_k4096",
+  "description": "Generic FP4 quantization: bf16/fp16 input \u2192 packed FP4 e2m1fn + block scales.",
+  "op_type": "quantization",
+  "tags": [
+    "fi_api:flashinfer.quantization.fp4_quantization.fp4_quantize",
+    "status:verified",
+    "quantization:fp4"
+  ],
+  "axes": {
+    "M": {
+      "type": "var",
+      "description": "Number of rows."
+    },
+    "K": {
+      "type": "const",
+      "value": 4096,
+      "description": "Number of input columns."
+    },
+    "K_packed": {
+      "type": "var",
+      "description": "Packed column dimension (K/2 for FP4, two values per uint8)."
+    },
+    "num_scale_elems": {
+      "type": "var",
+      "description": "Total number of scale factor elements (layout-dependent)."
+    },
+    "one": {
+      "type": "var",
+      "description": "Placeholder for shape [1] scalar tensors."
+    }
+  },
+  "constraints": [
+    "K_packed == K // 2"
+  ],
+  "inputs": {
+    "input": {
+      "shape": [
+        "M",
+        "K"
+      ],
+      "dtype": "bfloat16",
+      "description": "Input tensor, fp16/bf16/fp8_e4m3fn."
+    },
+    "global_scale": {
+      "shape": [
+        "one"
+      ],
+      "dtype": "float32",
+      "optional": true,
+      "description": "Optional per-tensor global scale (shape [1])."
+    },
+    "sf_vec_size": {
+      "shape": null,
+      "dtype": "int32",
+      "optional": true,
+      "description": "Scale-factor vector size (16 for NVFP4, 32 for MXFP4)."
+    }
+  },
+  "outputs": {
+    "quantized": {
+      "shape": [
+        "M",
+        "K_packed"
+      ],
+      "dtype": "uint8",
+      "description": "Packed FP4 output (two e2m1fn values per byte)."
+    },
+    "scales": {
+      "shape": [
+        "num_scale_elems"
+      ],
+      "dtype": "uint8",
+      "description": "Block scale factors packed as uint8 bytes (layout-dependent shape)."
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _fp4_quantize_reference(\n    input: torch.Tensor,\n    global_scale: Optional[torch.Tensor] = None,\n    sf_vec_size: int = 16,\n    sf_use_ue8m0: bool = False,\n    is_sf_swizzled_layout: bool = True,\n    is_sf_8x4_layout: bool = False,\n    enable_pdl: Optional[bool] = None,\n    backend: str = \"cuda\",\n) -> Tuple[torch.Tensor, torch.Tensor]:\n    \"\"\"Reference FP4 quantize. Produces packed uint8 + scales in LINEAR layout.\n\n    The runtime API may return scales in a swizzled layout; consumers should\n    dequantize before comparing.\n    \"\"\"\n    packed, scales = _quantize_fp4_block_scale(\n        input.reshape(-1, input.shape[-1]),\n        block_size=int(sf_vec_size),\n        use_ue8m0=bool(sf_use_ue8m0),\n        global_scale=global_scale,\n    )\n    packed = packed.reshape(*input.shape[:-1], input.shape[-1] // 2)\n    scales = scales.reshape(*input.shape[:-1], input.shape[-1] // int(sf_vec_size))\n    return packed, scales\n"
+}
diff --git a/tests/trace/fi_trace_out/fp4_quantize_k7168.json b/tests/trace/fi_trace_out/fp4_quantize_k7168.json
new file mode 100644
index 0000000000..d6ad123c66
--- /dev/null
+++ b/tests/trace/fi_trace_out/fp4_quantize_k7168.json
@@ -0,0 +1,78 @@
+{
+  "name": "fp4_quantize_k7168",
+  "description": "Generic FP4 quantization: bf16/fp16 input \u2192 packed FP4 e2m1fn + block scales.",
+  "op_type": "quantization",
+  "tags": [
+    "fi_api:flashinfer.quantization.fp4_quantization.fp4_quantize",
+    "status:verified",
+    "quantization:fp4"
+  ],
+  "axes": {
+    "M": {
+      "type": "var",
+      "description": "Number of rows."
+    },
+    "K": {
+      "type": "const",
+      "value": 7168,
+      "description": "Number of input columns."
+    },
+    "K_packed": {
+      "type": "var",
+      "description": "Packed column dimension (K/2 for FP4, two values per uint8)."
+    },
+    "num_scale_elems": {
+      "type": "var",
+      "description": "Total number of scale factor elements (layout-dependent)."
+    },
+    "one": {
+      "type": "var",
+      "description": "Placeholder for shape [1] scalar tensors."
+    }
+  },
+  "constraints": [
+    "K_packed == K // 2"
+  ],
+  "inputs": {
+    "input": {
+      "shape": [
+        "M",
+        "K"
+      ],
+      "dtype": "bfloat16",
+      "description": "Input tensor, fp16/bf16/fp8_e4m3fn."
+    },
+    "global_scale": {
+      "shape": [
+        "one"
+      ],
+      "dtype": "float32",
+      "optional": true,
+      "description": "Optional per-tensor global scale (shape [1])."
+    },
+    "sf_vec_size": {
+      "shape": null,
+      "dtype": "int32",
+      "optional": true,
+      "description": "Scale-factor vector size (16 for NVFP4, 32 for MXFP4)."
+    }
+  },
+  "outputs": {
+    "quantized": {
+      "shape": [
+        "M",
+        "K_packed"
+      ],
+      "dtype": "uint8",
+      "description": "Packed FP4 output (two e2m1fn values per byte)."
+    },
+    "scales": {
+      "shape": [
+        "num_scale_elems"
+      ],
+      "dtype": "uint8",
+      "description": "Block scale factors packed as uint8 bytes (layout-dependent shape)."
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _fp4_quantize_reference(\n    input: torch.Tensor,\n    global_scale: Optional[torch.Tensor] = None,\n    sf_vec_size: int = 16,\n    sf_use_ue8m0: bool = False,\n    is_sf_swizzled_layout: bool = True,\n    is_sf_8x4_layout: bool = False,\n    enable_pdl: Optional[bool] = None,\n    backend: str = \"cuda\",\n) -> Tuple[torch.Tensor, torch.Tensor]:\n    \"\"\"Reference FP4 quantize. Produces packed uint8 + scales in LINEAR layout.\n\n    The runtime API may return scales in a swizzled layout; consumers should\n    dequantize before comparing.\n    \"\"\"\n    packed, scales = _quantize_fp4_block_scale(\n        input.reshape(-1, input.shape[-1]),\n        block_size=int(sf_vec_size),\n        use_ue8m0=bool(sf_use_ue8m0),\n        global_scale=global_scale,\n    )\n    packed = packed.reshape(*input.shape[:-1], input.shape[-1] // 2)\n    scales = scales.reshape(*input.shape[:-1], input.shape[-1] // int(sf_vec_size))\n    return packed, scales\n"
+}
diff --git a/tests/trace/fi_trace_out/fused_add_rmsnorm_h5120.json b/tests/trace/fi_trace_out/fused_add_rmsnorm_h5120.json
new file mode 100644
index 0000000000..a2a5efd989
--- /dev/null
+++ b/tests/trace/fi_trace_out/fused_add_rmsnorm_h5120.json
@@ -0,0 +1,59 @@
+{
+  "name": "fused_add_rmsnorm_h5120",
+  "description": "Fused Add + RMSNorm. Epsilon is fixed at 1e-6.",
+  "op_type": "rmsnorm",
+  "tags": [
+    "fi_api:flashinfer.norm.fused_add_rmsnorm",
+    "status:verified",
+    "fused"
+  ],
+  "axes": {
+    "batch_size": {
+      "type": "var"
+    },
+    "hidden_size": {
+      "type": "const",
+      "value": 5120
+    }
+  },
+  "inputs": {
+    "hidden_states": {
+      "shape": [
+        "batch_size",
+        "hidden_size"
+      ],
+      "dtype": "bfloat16"
+    },
+    "residual": {
+      "shape": [
+        "batch_size",
+        "hidden_size"
+      ],
+      "dtype": "bfloat16"
+    },
+    "weight": {
+      "shape": [
+        "hidden_size"
+      ],
+      "dtype": "bfloat16"
+    }
+  },
+  "outputs": {
+    "output": {
+      "shape": [
+        "batch_size",
+        "hidden_size"
+      ],
+      "dtype": "bfloat16"
+    },
+    "residual": {
+      "shape": [
+        "batch_size",
+        "hidden_size"
+      ],
+      "dtype": "bfloat16",
+      "description": "Updated residual (in-place: residual += hidden_states)."
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _fused_add_rmsnorm_reference(hidden_states, residual, weight):\n    \"\"\"Fused Add + RMSNorm. Epsilon is fixed at 1e-6.\"\"\"\n    EPS = 1e-6\n    x = hidden_states.to(torch.float32) + residual.to(torch.float32)\n    inv_rms = torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + EPS)\n    y = (x * inv_rms) * weight.to(torch.float32)\n    return y.to(hidden_states.dtype)\n"
+}
diff --git a/tests/trace/fi_trace_out/fused_add_rmsnorm_quant_h7168.json b/tests/trace/fi_trace_out/fused_add_rmsnorm_quant_h7168.json
new file mode 100644
index 0000000000..10b7f6bb43
--- /dev/null
+++ b/tests/trace/fi_trace_out/fused_add_rmsnorm_quant_h7168.json
@@ -0,0 +1,66 @@
+{
+  "name": "fused_add_rmsnorm_quant_h7168",
+  "description": "Fused Add + RMSNorm + FP8 quantization. residual += input; out = quantize(rmsnorm(residual, weight), scale).",
+  "op_type": "rmsnorm",
+  "tags": [
+    "fi_api:flashinfer.norm.fused_add_rmsnorm_quant",
+    "status:verified",
+    "fused",
+    "quantization:fp8"
+  ],
+  "axes": {
+    "batch_size": {
+      "type": "var"
+    },
+    "hidden_size": {
+      "type": "const",
+      "value": 7168
+    }
+  },
+  "inputs": {
+    "hidden_states": {
+      "shape": [
+        "batch_size",
+        "hidden_size"
+      ],
+      "dtype": "bfloat16"
+    },
+    "residual": {
+      "shape": [
+        "batch_size",
+        "hidden_size"
+      ],
+      "dtype": "bfloat16"
+    },
+    "weight": {
+      "shape": [
+        "hidden_size"
+      ],
+      "dtype": "bfloat16"
+    },
+    "scale": {
+      "shape": null,
+      "dtype": "float32",
+      "description": "Per-tensor quantization scale, shape (1,)."
+    }
+  },
+  "outputs": {
+    "out": {
+      "shape": [
+        "batch_size",
+        "hidden_size"
+      ],
+      "dtype": "bfloat16",
+      "description": "Quantized output (dtype matches pre-allocated out tensor)."
+    },
+    "residual": {
+      "shape": [
+        "batch_size",
+        "hidden_size"
+      ],
+      "dtype": "bfloat16",
+      "description": "Updated residual (in-place: residual += input)."
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _fused_add_rmsnorm_quant_reference(hidden_states, residual, weight, scale):\n    \"\"\"Fused Add + RMSNorm + FP8 quantize.\n\n    ``residual' = hidden_states + residual``\n    ``out = quantize(rmsnorm(residual', weight), scale)``\n    Returns ``(out, residual')``.\n    \"\"\"\n    EPS = 1e-6\n    x = hidden_states.to(torch.float32) + residual.to(torch.float32)\n    inv_rms = torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + EPS)\n    y = (x * inv_rms) * weight.to(torch.float32)\n    s = (\n        scale.to(torch.float32).reshape(())\n        if isinstance(scale, torch.Tensor)\n        else float(scale)\n    )\n    y = y / s\n    fp8_max = 448.0\n    y = y.clamp(-fp8_max, fp8_max).to(torch.float8_e4m3fn)\n    return y, x.to(hidden_states.dtype)\n"
+}
diff --git a/tests/trace/fi_trace_out/gdn_decode_qk4_v8_d128.json b/tests/trace/fi_trace_out/gdn_decode_qk4_v8_d128.json
new file mode 100644
index 0000000000..75f481b0a5
--- /dev/null
+++ b/tests/trace/fi_trace_out/gdn_decode_qk4_v8_d128.json
@@ -0,0 +1,150 @@
+{
+  "name": "gdn_decode_qk4_v8_d128",
+  "description": "Gated Delta Net decode with GVA configuration and k-last state layout. Single-token generation with recurrent state update.",
+  "op_type": "gdn",
+  "tags": [
+    "fi_api:flashinfer.gdn_decode.gated_delta_rule_decode",
+    "stage:decode",
+    "status:verified"
+  ],
+  "axes": {
+    "batch_size": {
+      "type": "var",
+      "description": "Number of sequences being decoded concurrently."
+    },
+    "seq_len": {
+      "type": "const",
+      "value": 1,
+      "description": "Sequence length (always 1 for single-token decode)."
+    },
+    "num_q_heads": {
+      "type": "const",
+      "value": 4,
+      "description": "Number of query heads (same as key heads in GVA mode)."
+    },
+    "num_k_heads": {
+      "type": "const",
+      "value": 4,
+      "description": "Number of key heads."
+    },
+    "num_v_heads": {
+      "type": "const",
+      "value": 8,
+      "description": "Number of value heads (GVA: more value heads than query heads)."
+    },
+    "head_size": {
+      "type": "const",
+      "value": 128,
+      "description": "Dimension of each attention head (K dimension in query/key space, V dimension in value space)."
+    }
+  },
+  "constraints": [
+    "num_v_heads >= num_q_heads",
+    "num_v_heads % num_q_heads == 0",
+    "num_k_heads == num_q_heads"
+  ],
+  "inputs": {
+    "q": {
+      "shape": [
+        "batch_size",
+        "seq_len",
+        "num_q_heads",
+        "head_size"
+      ],
+      "dtype": "bfloat16",
+      "description": "Query tensor for single token decode."
+    },
+    "k": {
+      "shape": [
+        "batch_size",
+        "seq_len",
+        "num_k_heads",
+        "head_size"
+      ],
+      "dtype": "bfloat16",
+      "description": "Key tensor for single token decode."
+    },
+    "v": {
+      "shape": [
+        "batch_size",
+        "seq_len",
+        "num_v_heads",
+        "head_size"
+      ],
+      "dtype": "bfloat16",
+      "description": "Value tensor for single token decode."
+    },
+    "state": {
+      "shape": [
+        "batch_size",
+        "num_v_heads",
+        "head_size",
+        "head_size"
+      ],
+      "dtype": "float32",
+      "optional": true,
+      "description": "Recurrent state in k-last layout [B, H, V, K]."
+    },
+    "A_log": {
+      "shape": [
+        "num_v_heads"
+      ],
+      "dtype": "float32",
+      "description": "Log decay parameter (learnable). Used to compute g = exp(-exp(A_log) * softplus(a + dt_bias))."
+    },
+    "a": {
+      "shape": [
+        "batch_size",
+        "seq_len",
+        "num_v_heads"
+      ],
+      "dtype": "bfloat16",
+      "description": "Input-dependent decay from projection."
+    },
+    "dt_bias": {
+      "shape": [
+        "num_v_heads"
+      ],
+      "dtype": "float32",
+      "description": "Decay bias (learnable). Added to 'a' before softplus."
+    },
+    "b": {
+      "shape": [
+        "batch_size",
+        "seq_len",
+        "num_v_heads"
+      ],
+      "dtype": "bfloat16",
+      "description": "Update gate input from projection. beta = sigmoid(b)."
+    },
+    "scale": {
+      "shape": null,
+      "dtype": "float32",
+      "optional": true,
+      "description": "Scale factor. Default is 1/sqrt(head_size)."
+    }
+  },
+  "outputs": {
+    "output": {
+      "shape": [
+        "batch_size",
+        "seq_len",
+        "num_v_heads",
+        "head_size"
+      ],
+      "dtype": "bfloat16",
+      "description": "Attention output. Shape follows num_v_heads in GVA mode."
+    },
+    "new_state": {
+      "shape": [
+        "batch_size",
+        "num_v_heads",
+        "head_size",
+        "head_size"
+      ],
+      "dtype": "float32",
+      "description": "Updated recurrent state in k-last layout [B, H, V, K]."
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _gdn_decode_reference(q, k, v, state, A_log, a, dt_bias, b, scale):\n    \"\"\"\n    Gated Delta Net decode reference implementation (k-last layout).\n\n    State layout: [B, H, V, K] (k-last, K dimension at the end)\n\n    Gate computation:\n    g = exp(-exp(A_log) * softplus(a + dt_bias))\n    beta = sigmoid(b)\n\n    Delta rule update:\n    state_new = g * state_old + k^T @ (beta * v + (1-beta) * k @ state_old) - k^T @ (k @ state_old)\n    output = scale * q @ state_new\n    \"\"\"\n    B, T, num_q_heads, K = q.shape\n    _, _, num_k_heads, _ = k.shape\n    _, _, num_v_heads, V = v.shape\n    num_heads = num_v_heads\n    device = q.device\n\n    if scale is None or scale == 0.0:\n        scale = 1.0 / math.sqrt(K)\n\n    x = a.float() + dt_bias.float()  # [B, 1, HV]\n    g = torch.exp(-torch.exp(A_log.float()) * F.softplus(x))  # [B, 1, HV]\n    beta = torch.sigmoid(b.float())  # [B, 1, HV]\n\n    q_f32 = q.squeeze(1).float()\n    k_f32 = k.squeeze(1).float()\n    v_f32 = v.squeeze(1).float()\n    g_f32 = g.squeeze(1).float()\n    beta_f32 = beta.squeeze(1).float()\n\n    if state is not None:\n        state_f32 = state.float()\n    else:\n        state_f32 = torch.zeros(B, num_heads, V, K, dtype=torch.float32, device=device)\n\n    q_exp = q_f32.repeat_interleave(num_v_heads // num_q_heads, dim=1)\n    k_exp = k_f32.repeat_interleave(num_v_heads // num_k_heads, dim=1)\n\n    new_state = torch.zeros_like(state_f32)\n    output = torch.zeros(B, num_heads, V, dtype=torch.float32, device=device)\n\n    for b_idx in range(B):\n        for h_idx in range(num_heads):\n            q_h = q_exp[b_idx, h_idx]\n            k_h = k_exp[b_idx, h_idx]\n            v_h = v_f32[b_idx, h_idx]\n            h_state = (\n                state_f32[b_idx, h_idx].clone().transpose(-1, -2)\n            )  # [V,K] -> [K,V]\n            g_val = g_f32[b_idx, h_idx]\n            beta_val = beta_f32[b_idx, h_idx]\n\n            old_state = g_val * h_state\n            old_v = k_h @ old_state\n            new_v = beta_val * v_h + (1 - beta_val) * old_v\n            state_remove = k_h.unsqueeze(1) @ old_v.unsqueeze(0)\n            state_update = k_h.unsqueeze(1) @ new_v.unsqueeze(0)\n            h_state = old_state - state_remove + state_update\n\n            output[b_idx, h_idx] = scale * (q_h @ h_state)\n            new_state[b_idx, h_idx] = h_state.transpose(-1, -2)  # [K,V] -> [V,K]\n\n    output = output.unsqueeze(1).to(torch.bfloat16)\n    return output, new_state\n"
+}
diff --git a/tests/trace/fi_trace_out/gdn_mtp_qk4_v8_d128.json b/tests/trace/fi_trace_out/gdn_mtp_qk4_v8_d128.json
new file mode 100644
index 0000000000..e005e07dc0
--- /dev/null
+++ b/tests/trace/fi_trace_out/gdn_mtp_qk4_v8_d128.json
@@ -0,0 +1,172 @@
+{
+  "name": "gdn_mtp_qk4_v8_d128",
+  "description": "Gated Delta Net Multi-Token Prediction (MTP) with GVA configuration. Used for speculative decoding verification where multiple tokens (T > 1) need to be processed in sequence. State layout is k-last [pool_size, H, V, K].",
+  "op_type": "gdn",
+  "tags": [
+    "fi_api:flashinfer.gdn_decode.gated_delta_rule_mtp",
+    "stage:mtp",
+    "status:verified"
+  ],
+  "axes": {
+    "batch_size": {
+      "type": "var",
+      "description": "Number of sequences being verified concurrently."
+    },
+    "seq_len": {
+      "type": "var",
+      "description": "Number of tokens to process (T > 1 for MTP)."
+    },
+    "num_q_heads": {
+      "type": "const",
+      "value": 4,
+      "description": "Number of query heads (same as key heads in GVA mode)."
+    },
+    "num_k_heads": {
+      "type": "const",
+      "value": 4,
+      "description": "Number of key heads."
+    },
+    "num_v_heads": {
+      "type": "const",
+      "value": 8,
+      "description": "Number of value heads (GVA: more value heads than query heads)."
+    },
+    "head_size": {
+      "type": "const",
+      "value": 128,
+      "description": "Dimension of each attention head (K dimension in query/key space, V dimension in value space)."
+    },
+    "pool_size": {
+      "type": "var",
+      "description": "Size of the state pool for efficient batching."
+    }
+  },
+  "constraints": [
+    "num_v_heads >= num_q_heads",
+    "num_v_heads % num_q_heads == 0",
+    "num_k_heads == num_q_heads",
+    "seq_len > 1"
+  ],
+  "inputs": {
+    "q": {
+      "shape": [
+        "batch_size",
+        "seq_len",
+        "num_q_heads",
+        "head_size"
+      ],
+      "dtype": "bfloat16",
+      "description": "Query tensor for multiple tokens."
+    },
+    "k": {
+      "shape": [
+        "batch_size",
+        "seq_len",
+        "num_k_heads",
+        "head_size"
+      ],
+      "dtype": "bfloat16",
+      "description": "Key tensor for multiple tokens."
+    },
+    "v": {
+      "shape": [
+        "batch_size",
+        "seq_len",
+        "num_v_heads",
+        "head_size"
+      ],
+      "dtype": "bfloat16",
+      "description": "Value tensor for multiple tokens."
+    },
+    "initial_state": {
+      "shape": [
+        "pool_size",
+        "num_v_heads",
+        "head_size",
+        "head_size"
+      ],
+      "dtype": "float32",
+      "description": "Initial recurrent state pool in k-last layout [pool_size, H, V, K]."
+    },
+    "initial_state_indices": {
+      "shape": [
+        "batch_size"
+      ],
+      "dtype": "int32",
+      "description": "Indices mapping each batch to its initial state in the pool."
+    },
+    "A_log": {
+      "shape": [
+        "num_v_heads"
+      ],
+      "dtype": "float32",
+      "description": "Log decay parameter (learnable). Used to compute g = exp(-exp(A_log) * softplus(a + dt_bias))."
+    },
+    "a": {
+      "shape": [
+        "batch_size",
+        "seq_len",
+        "num_v_heads"
+      ],
+      "dtype": "bfloat16",
+      "description": "Input-dependent decay from projection."
+    },
+    "dt_bias": {
+      "shape": [
+        "num_v_heads"
+      ],
+      "dtype": "float32",
+      "description": "Decay bias (learnable). Added to 'a' before softplus."
+    },
+    "b": {
+      "shape": [
+        "batch_size",
+        "seq_len",
+        "num_v_heads"
+      ],
+      "dtype": "bfloat16",
+      "description": "Update gate input from projection. beta = sigmoid(b)."
+    },
+    "scale": {
+      "shape": null,
+      "dtype": "float32",
+      "optional": true,
+      "description": "Scale factor. Default is 1/sqrt(head_size)."
+    },
+    "intermediate_states_buffer": {
+      "shape": [
+        "pool_size",
+        "seq_len",
+        "num_v_heads",
+        "head_size",
+        "head_size"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "Optional buffer for caching intermediate states for potential rollback."
+    }
+  },
+  "outputs": {
+    "output": {
+      "shape": [
+        "batch_size",
+        "seq_len",
+        "num_v_heads",
+        "head_size"
+      ],
+      "dtype": "bfloat16",
+      "description": "Attention output for all T tokens. Shape follows num_v_heads in GVA mode."
+    },
+    "final_state": {
+      "shape": [
+        "pool_size",
+        "num_v_heads",
+        "head_size",
+        "head_size"
+      ],
+      "dtype": "float32",
+      "description": "Updated recurrent state pool in k-last layout [pool_size, H, V, K]."
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _gdn_mtp_reference(\n    q,\n    k,\n    v,\n    initial_state,\n    initial_state_indices,\n    A_log,\n    a,\n    dt_bias,\n    b,\n    scale,\n    intermediate_states_buffer=None,\n):\n    \"\"\"\n    Gated Delta Net MTP (Multi-Token Prediction) reference implementation.\n\n    State layout: [pool_size, H, V, K] (k-last, K dimension at the end)\n\n    Gate computation:\n    g = exp(-exp(A_log) * softplus(a + dt_bias))\n    beta = sigmoid(b)\n\n    For each token t in sequence:\n        state_new = g_t * state_old + k_t^T @ (beta_t * v_t + (1-beta_t) * k_t @ state_old) - k_t^T @ (k_t @ state_old)\n        output_t = scale * q_t @ state_new\n        state_old = state_new  # Update for next token\n    \"\"\"\n    B, T, num_q_heads, head_size = q.shape\n    _, _, num_k_heads, _ = k.shape\n    _, _, num_v_heads, _ = v.shape\n    device = q.device\n\n    if scale is None or scale == 0.0:\n        scale = 1.0 / math.sqrt(head_size)\n\n    x = a.float() + dt_bias.float()  # [B, T, HV]\n    g = torch.exp(-torch.exp(A_log.float()) * F.softplus(x))  # [B, T, HV]\n    beta = torch.sigmoid(b.float())  # [B, T, HV]\n\n    q_exp = q.repeat_interleave(num_v_heads // num_q_heads, dim=2)  # [B, T, HV, K]\n    k_exp = k.repeat_interleave(num_v_heads // num_k_heads, dim=2)  # [B, T, HV, K]\n\n    output = torch.zeros(\n        (B, T, num_v_heads, head_size), dtype=torch.bfloat16, device=device\n    )\n    cache_intermediate = intermediate_states_buffer is not None\n    final_state = initial_state.clone().float()\n\n    for b_idx in range(B):\n        state_idx = int(initial_state_indices[b_idx].item())\n        state_HVK = (\n            initial_state[state_idx].clone().float().transpose(-1, -2)\n        )  # [H,V,K] -> [H,K,V]\n\n        for t in range(T):\n            q_HK = q_exp[b_idx, t].float()  # [HV, K]\n            k_HK = k_exp[b_idx, t].float()  # [HV, K]\n            v_HV = v[b_idx, t].float()  # [HV, V]\n            g_H = g[b_idx, t]  # [HV]\n            beta_H = beta[b_idx, t]  # [HV]\n\n            for h_idx in range(num_v_heads):\n                q_h = q_HK[h_idx]\n                k_h = k_HK[h_idx]\n                v_h = v_HV[h_idx]\n                h_state = state_HVK[h_idx]\n                g_val = g_H[h_idx]\n                beta_val = beta_H[h_idx]\n\n                old_state = g_val * h_state\n                old_v = k_h @ old_state\n                new_v = beta_val * v_h + (1 - beta_val) * old_v\n                state_remove = k_h.unsqueeze(1) @ old_v.unsqueeze(0)\n                state_update = k_h.unsqueeze(1) @ new_v.unsqueeze(0)\n                h_state = old_state - state_remove + state_update\n\n                output[b_idx, t, h_idx] = (scale * (q_h @ h_state)).to(torch.bfloat16)\n                state_HVK[h_idx] = h_state\n\n            if cache_intermediate:\n                intermediate_states_buffer[state_idx, t] = state_HVK.transpose(\n                    -1, -2\n                )  # [H,K,V] -> [H,V,K]\n\n        # Commit accumulated state back to the pool slot [H,K,V] -> [H,V,K].\n        final_state[state_idx] = state_HVK.transpose(-1, -2)\n\n    return output, final_state\n"
+}
diff --git a/tests/trace/fi_trace_out/gdn_prefill_qk4_v8_d128.json b/tests/trace/fi_trace_out/gdn_prefill_qk4_v8_d128.json
new file mode 100644
index 0000000000..42c4f0b83c
--- /dev/null
+++ b/tests/trace/fi_trace_out/gdn_prefill_qk4_v8_d128.json
@@ -0,0 +1,158 @@
+{
+  "name": "gdn_prefill_qk4_v8_d128",
+  "description": "Gated Delta Net prefill with GVA configuration and k-last state layout. The state is in k-last layout [N, H, V, K].",
+  "op_type": "gdn",
+  "tags": [
+    "fi_api:flashinfer.gdn_prefill.chunk_gated_delta_rule",
+    "stage:prefill",
+    "status:verified"
+  ],
+  "axes": {
+    "total_seq_len": {
+      "type": "var",
+      "description": "Total number of tokens across all sequences in the batch."
+    },
+    "num_seqs": {
+      "type": "var",
+      "description": "Number of sequences in the batch."
+    },
+    "num_q_heads": {
+      "type": "const",
+      "value": 4,
+      "description": "Number of query heads (same as key heads in GVA mode)."
+    },
+    "num_k_heads": {
+      "type": "const",
+      "value": 4,
+      "description": "Number of key heads."
+    },
+    "num_v_heads": {
+      "type": "const",
+      "value": 8,
+      "description": "Number of value heads (GVA: more value heads than query heads)."
+    },
+    "head_size": {
+      "type": "const",
+      "value": 128,
+      "description": "Dimension of each attention head (K dimension in query/key space, V dimension in value space)."
+    },
+    "len_cu_seqlens": {
+      "type": "var",
+      "description": "Length of cu_seqlens array (num_seqs + 1)."
+    }
+  },
+  "constraints": [
+    "num_v_heads >= num_q_heads",
+    "num_v_heads % num_q_heads == 0",
+    "num_k_heads == num_q_heads",
+    "len_cu_seqlens == num_seqs + 1",
+    "total_seq_len == cu_seqlens[-1].item()"
+  ],
+  "inputs": {
+    "q": {
+      "shape": [
+        "total_seq_len",
+        "num_q_heads",
+        "head_size"
+      ],
+      "dtype": "bfloat16",
+      "description": "Query tensor."
+    },
+    "k": {
+      "shape": [
+        "total_seq_len",
+        "num_k_heads",
+        "head_size"
+      ],
+      "dtype": "bfloat16",
+      "description": "Key tensor."
+    },
+    "v": {
+      "shape": [
+        "total_seq_len",
+        "num_v_heads",
+        "head_size"
+      ],
+      "dtype": "bfloat16",
+      "description": "Value tensor."
+    },
+    "state": {
+      "shape": [
+        "num_seqs",
+        "num_v_heads",
+        "head_size",
+        "head_size"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "Recurrent state in k-last layout [N, H, V, K]."
+    },
+    "A_log": {
+      "shape": [
+        "num_v_heads"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "Log decay parameter (conceptual; not passed directly \u2014 precomputed into g)."
+    },
+    "a": {
+      "shape": [
+        "total_seq_len",
+        "num_v_heads"
+      ],
+      "dtype": "unknown",
+      "description": "Precomputed gate values (g = exp(-exp(A_log) * softplus(a + dt_bias)))."
+    },
+    "dt_bias": {
+      "shape": [
+        "num_v_heads"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "Decay bias (conceptual; not passed directly \u2014 precomputed into g)."
+    },
+    "b": {
+      "shape": [
+        "total_seq_len",
+        "num_v_heads"
+      ],
+      "dtype": "unknown",
+      "description": "Update gate values (beta = sigmoid(b))."
+    },
+    "cu_seqlens": {
+      "shape": [
+        "len_cu_seqlens"
+      ],
+      "dtype": "int64",
+      "description": "Cumulative sequence lengths for variable-length batching."
+    },
+    "scale": {
+      "shape": null,
+      "dtype": "float32",
+      "optional": true,
+      "description": "Scale factor. Default is 1/sqrt(head_size)."
+    }
+  },
+  "outputs": {
+    "output": {
+      "shape": [
+        "total_seq_len",
+        "num_v_heads",
+        "head_size"
+      ],
+      "dtype": "bfloat16",
+      "description": "Attention output. Shape follows num_v_heads in GVA mode."
+    },
+    "new_state": {
+      "shape": [
+        "num_seqs",
+        "num_v_heads",
+        "head_size",
+        "head_size"
+      ],
+      "dtype": "float32",
+      "description": "Updated recurrent state in k-last layout [N, H, V, K]."
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _gdn_prefill_reference(q, k, v, state, A_log, a, dt_bias, b, cu_seqlens, scale):\n    \"\"\"\n    Gated Delta Net prefill reference implementation (k-last layout).\n\n    State layout: [H, V, K] (k-last, K dimension at the end)\n\n    Gate computation:\n    g = exp(-exp(A_log) * softplus(a + dt_bias))\n    beta = sigmoid(b)\n\n    Delta rule update:\n    state_new = g * state_old + k^T @ (beta * v + (1-beta) * k @ state_old) - k^T @ (k @ state_old)\n    output = scale * q @ state_new\n    \"\"\"\n    total_seq_len, num_q_heads, head_size = q.shape\n    num_v_heads = v.shape[1]\n    num_k_heads = k.shape[1]\n    num_sab_heads = max(num_q_heads, num_v_heads)\n    num_seqs = cu_seqlens.size(0) - 1\n    device = q.device\n\n    if scale is None or scale == 0.0:\n        scale = 1.0 / math.sqrt(head_size)\n\n    x = a.float() + dt_bias.float()  # [total_seq_len, HV]\n    g = torch.exp(-torch.exp(A_log.float()) * F.softplus(x))  # [total_seq_len, HV]\n    beta = torch.sigmoid(b.float())  # [total_seq_len, HV]\n\n    q_exp = q.repeat_interleave(num_v_heads // num_q_heads, dim=1)\n    k_exp = k.repeat_interleave(num_v_heads // num_k_heads, dim=1)\n\n    output = torch.zeros(\n        (total_seq_len, num_sab_heads, head_size), dtype=torch.bfloat16, device=device\n    )\n    new_state = torch.zeros(\n        (num_seqs, num_sab_heads, head_size, head_size),\n        dtype=torch.float32,\n        device=device,\n    )\n\n    for seq_idx in range(num_seqs):\n        seq_start = int(cu_seqlens[seq_idx].item())\n        seq_end = int(cu_seqlens[seq_idx + 1].item())\n        seq_len = seq_end - seq_start\n        if seq_len <= 0:\n            continue\n\n        if state is not None:\n            state_HKV = (\n                state[seq_idx].clone().float().transpose(-1, -2)\n            )  # [H,V,K] -> [H,K,V]\n        else:\n            state_HKV = torch.zeros(\n                (num_sab_heads, head_size, head_size),\n                dtype=torch.float32,\n                device=device,\n            )\n\n        for i in range(seq_len):\n            t = seq_start + i\n            q_H1K = q_exp[t].unsqueeze(1).float()\n            k_H1K = k_exp[t].unsqueeze(1).float()\n            v_H1V = v[t].unsqueeze(1).float()\n            g_H11 = g[t].unsqueeze(1).unsqueeze(2)\n            beta_H11 = beta[t].unsqueeze(1).unsqueeze(2)\n\n            old_state_HKV = g_H11 * state_HKV\n            old_v_H1V = q_H1K.float() @ old_state_HKV  # reuse shape pattern\n            old_v_H1V = k_H1K @ old_state_HKV\n            new_v_H1V = beta_H11 * v_H1V + (1 - beta_H11) * old_v_H1V\n            state_remove = torch.einsum(\n                \"hkl,hlv->hkv\", k_H1K.transpose(-1, -2), old_v_H1V\n            )\n            state_update = torch.einsum(\n                \"hkl,hlv->hkv\", k_H1K.transpose(-1, -2), new_v_H1V\n            )\n            state_HKV = old_state_HKV - state_remove + state_update\n\n            o_H1V = scale * (q_H1K @ state_HKV)\n            output[t] = o_H1V.squeeze(1).to(torch.bfloat16)\n\n        new_state[seq_idx] = state_HKV.transpose(-1, -2)  # [H,K,V] -> [H,V,K]\n\n    return output, new_state\n"
+}
diff --git a/tests/trace/fi_trace_out/gelu_and_mul_h16384.json b/tests/trace/fi_trace_out/gelu_and_mul_h16384.json
new file mode 100644
index 0000000000..181db814ea
--- /dev/null
+++ b/tests/trace/fi_trace_out/gelu_and_mul_h16384.json
@@ -0,0 +1,41 @@
+{
+  "name": "gelu_and_mul_h16384",
+  "description": "Fused GeLU (exact) + Mul: gelu(x[:H]) * x[H:].",
+  "op_type": "activation",
+  "tags": [
+    "fi_api:flashinfer.activation.gelu_and_mul",
+    "status:verified",
+    "fused"
+  ],
+  "axes": {
+    "num_tokens": {
+      "type": "var",
+      "description": "Total number of tokens."
+    },
+    "hidden_size": {
+      "type": "const",
+      "value": 16384,
+      "description": "Output hidden size (input is 2*h)."
+    }
+  },
+  "inputs": {
+    "input": {
+      "shape": [
+        "num_tokens",
+        "hidden_size"
+      ],
+      "dtype": "bfloat16",
+      "description": "Gated input tensor of shape [num_tokens, 2*hidden_size]."
+    }
+  },
+  "outputs": {
+    "output": {
+      "shape": [
+        "num_tokens",
+        "hidden_size"
+      ],
+      "dtype": "bfloat16"
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _gelu_and_mul_reference(input):\n    \"\"\"Fused GeLU (exact) + Mul: gelu(x[:H]) * x[H:]\"\"\"\n    half = input.shape[-1] // 2\n    return F.gelu(input[..., :half]) * input[..., half:]\n"
+}
diff --git a/tests/trace/fi_trace_out/gelu_tanh_and_mul_h16384.json b/tests/trace/fi_trace_out/gelu_tanh_and_mul_h16384.json
new file mode 100644
index 0000000000..f0e7a8dd02
--- /dev/null
+++ b/tests/trace/fi_trace_out/gelu_tanh_and_mul_h16384.json
@@ -0,0 +1,41 @@
+{
+  "name": "gelu_tanh_and_mul_h16384",
+  "description": "Fused GeLU (tanh approx) + Mul: gelu_tanh(x[:H]) * x[H:]. Used in BERT/GPT FFN.",
+  "op_type": "activation",
+  "tags": [
+    "fi_api:flashinfer.activation.gelu_tanh_and_mul",
+    "status:verified",
+    "fused"
+  ],
+  "axes": {
+    "num_tokens": {
+      "type": "var",
+      "description": "Total number of tokens."
+    },
+    "hidden_size": {
+      "type": "const",
+      "value": 16384,
+      "description": "Output hidden size (input is 2*h)."
+    }
+  },
+  "inputs": {
+    "input": {
+      "shape": [
+        "num_tokens",
+        "hidden_size"
+      ],
+      "dtype": "bfloat16",
+      "description": "Gated input tensor of shape [num_tokens, 2*hidden_size]."
+    }
+  },
+  "outputs": {
+    "output": {
+      "shape": [
+        "num_tokens",
+        "hidden_size"
+      ],
+      "dtype": "bfloat16"
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _gelu_tanh_and_mul_reference(input):\n    \"\"\"Fused GeLU (tanh approx) + Mul: gelu_tanh(x[:H]) * x[H:]\"\"\"\n    half = input.shape[-1] // 2\n    return F.gelu(input[..., :half], approximate=\"tanh\") * input[..., half:]\n"
+}
diff --git a/tests/trace/fi_trace_out/gemm_bf16_N256_K7168.json b/tests/trace/fi_trace_out/gemm_bf16_N256_K7168.json
new file mode 100644
index 0000000000..fa80fe9be2
--- /dev/null
+++ b/tests/trace/fi_trace_out/gemm_bf16_N256_K7168.json
@@ -0,0 +1,49 @@
+{
+  "name": "gemm_bf16_N256_K7168",
+  "description": "General matrix multiply (GEMM) C = A @ B (B is column-major [K, N]).",
+  "op_type": "gemm_bf16",
+  "tags": [
+    "fi_api:flashinfer.gemm.gemm_base.mm_bf16",
+    "status:verified"
+  ],
+  "axes": {
+    "M": {
+      "type": "var"
+    },
+    "N": {
+      "type": "const",
+      "value": 256
+    },
+    "K": {
+      "type": "const",
+      "value": 7168
+    }
+  },
+  "inputs": {
+    "A": {
+      "shape": [
+        "M",
+        "K"
+      ],
+      "dtype": "bfloat16"
+    },
+    "B": {
+      "shape": [
+        "K",
+        "N"
+      ],
+      "dtype": "bfloat16",
+      "description": "Weight matrix in column-major layout (physical shape [K, N])."
+    }
+  },
+  "outputs": {
+    "C": {
+      "shape": [
+        "M",
+        "N"
+      ],
+      "dtype": "bfloat16"
+    }
+  },
+  "reference": "def _mm_reference(A, B):\n    # B is physically [K, N] (column-major weight), so C = A @ B.\n    return torch.matmul(A, B)\n"
+}
diff --git a/tests/trace/fi_trace_out/gemm_bf16_N4096_K4096.json b/tests/trace/fi_trace_out/gemm_bf16_N4096_K4096.json
new file mode 100644
index 0000000000..0e3f8420d1
--- /dev/null
+++ b/tests/trace/fi_trace_out/gemm_bf16_N4096_K4096.json
@@ -0,0 +1,49 @@
+{
+  "name": "gemm_bf16_N4096_K4096",
+  "description": "General matrix multiply (GEMM) C = A @ B (B is column-major [K, N]).",
+  "op_type": "gemm_bf16",
+  "tags": [
+    "fi_api:flashinfer.gemm.gemm_base.mm_bf16",
+    "status:verified"
+  ],
+  "axes": {
+    "M": {
+      "type": "var"
+    },
+    "N": {
+      "type": "const",
+      "value": 4096
+    },
+    "K": {
+      "type": "const",
+      "value": 4096
+    }
+  },
+  "inputs": {
+    "A": {
+      "shape": [
+        "M",
+        "K"
+      ],
+      "dtype": "bfloat16"
+    },
+    "B": {
+      "shape": [
+        "K",
+        "N"
+      ],
+      "dtype": "bfloat16",
+      "description": "Weight matrix in column-major layout (physical shape [K, N])."
+    }
+  },
+  "outputs": {
+    "C": {
+      "shape": [
+        "M",
+        "N"
+      ],
+      "dtype": "bfloat16"
+    }
+  },
+  "reference": "def _mm_reference(A, B):\n    # B is physically [K, N] (column-major weight), so C = A @ B.\n    return torch.matmul(A, B)\n"
+}
diff --git a/tests/trace/fi_trace_out/gemm_fp4_N2048_K7168_block_size16.json b/tests/trace/fi_trace_out/gemm_fp4_N2048_K7168_block_size16.json
new file mode 100644
index 0000000000..a79eae3c54
--- /dev/null
+++ b/tests/trace/fi_trace_out/gemm_fp4_N2048_K7168_block_size16.json
@@ -0,0 +1,77 @@
+{
+  "name": "gemm_fp4_N2048_K7168_block_size16",
+  "description": "FP4 GEMM C = A @ B. A and B are fp4 (e2m1fn_x2 packed as uint8); scale tensors use block_size.",
+  "op_type": "gemm_fp4",
+  "tags": [
+    "fi_api:flashinfer.gemm.gemm_base.mm_fp4",
+    "status:verified",
+    "quantization:fp4"
+  ],
+  "axes": {
+    "M": {
+      "type": "var"
+    },
+    "N": {
+      "type": "const",
+      "value": 2048
+    },
+    "K": {
+      "type": "const",
+      "value": 7168
+    },
+    "block_size": {
+      "type": "const",
+      "value": 16,
+      "description": "FP4 quantization block size (16 for nvfp4, 32 for mxfp4)."
+    }
+  },
+  "inputs": {
+    "A": {
+      "shape": [
+        "M",
+        "K"
+      ],
+      "dtype": "uint8",
+      "description": "Input A tensor, fp4 e2m1fn_x2 packed as uint8."
+    },
+    "B": {
+      "shape": [
+        "K",
+        "N"
+      ],
+      "dtype": "uint8",
+      "description": "Input B tensor, fp4 e2m1fn_x2 packed as uint8, column-major."
+    },
+    "a_descale": {
+      "shape": [
+        "M",
+        "K_div_block_size"
+      ],
+      "dtype": "float8_e4m3fn",
+      "description": "Block scale for A, shape [M, K//block_size], float8_e4m3fn or uint8."
+    },
+    "b_descale": {
+      "shape": [
+        "K",
+        "N_div_block_size"
+      ],
+      "dtype": "float8_e4m3fn",
+      "description": "Block scale for B, shape [K, N//block_size], float8_e4m3fn or uint8."
+    },
+    "block_size": {
+      "shape": null,
+      "dtype": "int32",
+      "description": "FP4 quantization block size (16 for nvfp4, 32 for mxfp4)."
+    }
+  },
+  "outputs": {
+    "C": {
+      "shape": [
+        "M",
+        "N"
+      ],
+      "dtype": "bfloat16"
+    }
+  },
+  "reference": "def _mm_fp4_reference(A, B, a_descale, b_descale, block_size=16):\n    \"\"\"Dequantize FP4 inputs and compute C = A @ B.\n\n    A and B are fp4 e2m1fn values packed two-per-byte as uint8.\n    a_descale: [M, K//block_size], b_descale: [K, N//block_size].\n    The reference unpacks the nibbles and applies the block scales.\n    \"\"\"\n\n    def _unpack_fp4(packed, rows, cols):\n        # Each byte holds two fp4 nibbles (low nibble = first element).\n        lo = (packed & 0x0F).to(torch.float32)\n        hi = ((packed >> 4) & 0x0F).to(torch.float32)\n        # Interleave low/high nibbles along the last dimension.\n        out = torch.stack([lo, hi], dim=-1).reshape(rows, cols)\n        return out\n\n    M, K_packed = A.shape\n    K = K_packed * 2\n    _, N_packed = B.shape\n    N = N_packed * 2\n\n    A_fp32 = _unpack_fp4(A, M, K)\n    B_fp32 = _unpack_fp4(B, K, N)\n\n    # Apply per-block scales.\n    a_scale = a_descale.to(torch.float32).repeat_interleave(block_size, dim=1)  # [M, K]\n    b_scale = b_descale.to(torch.float32).repeat_interleave(block_size, dim=1)  # [K, N]\n    A_scaled = A_fp32 * a_scale\n    B_scaled = B_fp32 * b_scale\n    return torch.matmul(A_scaled, B_scaled).to(torch.bfloat16)\n"
+}
diff --git a/tests/trace/fi_trace_out/gemm_fp8_N1536_K7168.json b/tests/trace/fi_trace_out/gemm_fp8_N1536_K7168.json
new file mode 100644
index 0000000000..bfa75489ae
--- /dev/null
+++ b/tests/trace/fi_trace_out/gemm_fp8_N1536_K7168.json
@@ -0,0 +1,51 @@
+{
+  "name": "gemm_fp8_N1536_K7168",
+  "description": "FP8 block-scale GEMM C = A @ B (TRT-LLM layout). A is [M, K] float8_e4m3fn; B is [K//block_size, N, block_size] float8_e4m3fn.",
+  "op_type": "gemm_fp8",
+  "tags": [
+    "fi_api:flashinfer.gemm.gemm_base.mm_fp8",
+    "status:verified",
+    "quantization:float8_e4m3fn"
+  ],
+  "axes": {
+    "M": {
+      "type": "var"
+    },
+    "N": {
+      "type": "const",
+      "value": 1536
+    },
+    "K": {
+      "type": "const",
+      "value": 7168
+    }
+  },
+  "inputs": {
+    "A": {
+      "shape": [
+        "M",
+        "K"
+      ],
+      "dtype": "float8_e4m3fn"
+    },
+    "B": {
+      "shape": [
+        "K_div_block_size",
+        "N",
+        "block_size"
+      ],
+      "dtype": "float8_e4m3fn",
+      "description": "FP8 weight in TRT-LLM block layout [K//block_size, N, block_size]."
+    }
+  },
+  "outputs": {
+    "C": {
+      "shape": [
+        "M",
+        "N"
+      ],
+      "dtype": "bfloat16"
+    }
+  },
+  "reference": "def _mm_fp8_reference(A, B):\n    \"\"\"Dequantize FP8 block-scale inputs and compute C = A @ B.\n\n    B is in TRT-LLM block layout [K//block_size, N, block_size] and is\n    reshaped to [K, N] before the matmul.\n    \"\"\"\n    K_div_bs, N, block_size = B.shape\n    B_fp32 = B.reshape(K_div_bs * block_size, N).to(torch.float32)\n    A_fp32 = A.to(torch.float32)\n    return torch.matmul(A_fp32, B_fp32).to(torch.bfloat16)\n"
+}
diff --git a/tests/trace/fi_trace_out/gemm_mxfp8_N4096_K4096.json b/tests/trace/fi_trace_out/gemm_mxfp8_N4096_K4096.json
new file mode 100644
index 0000000000..70a65a5d8d
--- /dev/null
+++ b/tests/trace/fi_trace_out/gemm_mxfp8_N4096_K4096.json
@@ -0,0 +1,67 @@
+{
+  "name": "gemm_mxfp8_N4096_K4096",
+  "description": "MXFP8 GEMM C = A @ B (MX block size 32). A and B are float8_e4m3fn; scale tensors use block size 32.",
+  "op_type": "gemm_mxfp8",
+  "tags": [
+    "fi_api:flashinfer.gemm.gemm_base.mm_mxfp8",
+    "status:verified",
+    "quantization:mxfp8"
+  ],
+  "axes": {
+    "M": {
+      "type": "var"
+    },
+    "N": {
+      "type": "const",
+      "value": 4096
+    },
+    "K": {
+      "type": "const",
+      "value": 4096
+    }
+  },
+  "inputs": {
+    "A": {
+      "shape": [
+        "M",
+        "K"
+      ],
+      "dtype": "float8_e4m3fn",
+      "description": "Input A tensor, float8_e4m3fn."
+    },
+    "B": {
+      "shape": [
+        "K",
+        "N"
+      ],
+      "dtype": "float8_e4m3fn",
+      "description": "Input B tensor, float8_e4m3fn, column-major."
+    },
+    "a_descale": {
+      "shape": [
+        "M",
+        "K_div_32"
+      ],
+      "dtype": "uint8",
+      "description": "Block scale for A, shape [M, K//32], uint8."
+    },
+    "b_descale": {
+      "shape": [
+        "K_div_32",
+        "N"
+      ],
+      "dtype": "uint8",
+      "description": "Block scale for B, shape [K//32, N], uint8."
+    }
+  },
+  "outputs": {
+    "C": {
+      "shape": [
+        "M",
+        "N"
+      ],
+      "dtype": "bfloat16"
+    }
+  },
+  "reference": "def _mm_mxfp8_reference(A, B, a_descale, b_descale):\n    \"\"\"Dequantize MXFP8 inputs (block size 32) and compute C = A @ B.\n\n    a_descale: [M, K//32] uint8 interpreted as float scale per block.\n    b_descale: [K//32, N] uint8 interpreted as float scale per block.\n    \"\"\"\n    _, K = A.shape\n    block_size = 32\n    A_fp32 = A.to(torch.float32)\n    B_fp32 = B.to(torch.float32)\n    # Apply per-block scales along the K dimension.\n    a_scale = a_descale.to(torch.float32).repeat_interleave(block_size, dim=1)  # [M, K]\n    b_scale = b_descale.to(torch.float32).repeat_interleave(block_size, dim=0)  # [K, N]\n    A_scaled = A_fp32 * a_scale\n    B_scaled = B_fp32 * b_scale\n    return torch.matmul(A_scaled, B_scaled).to(torch.bfloat16)\n"
+}
diff --git a/tests/trace/fi_trace_out/gemma_fused_add_rmsnorm_h4608.json b/tests/trace/fi_trace_out/gemma_fused_add_rmsnorm_h4608.json
new file mode 100644
index 0000000000..66183f86c4
--- /dev/null
+++ b/tests/trace/fi_trace_out/gemma_fused_add_rmsnorm_h4608.json
@@ -0,0 +1,60 @@
+{
+  "name": "gemma_fused_add_rmsnorm_h4608",
+  "description": "Gemma-style Fused Add + RMSNorm: residual += input; out = gemma_rmsnorm(residual).",
+  "op_type": "rmsnorm",
+  "tags": [
+    "fi_api:flashinfer.norm.gemma_fused_add_rmsnorm",
+    "status:verified",
+    "fused",
+    "model:gemma"
+  ],
+  "axes": {
+    "batch_size": {
+      "type": "var"
+    },
+    "hidden_size": {
+      "type": "const",
+      "value": 4608
+    }
+  },
+  "inputs": {
+    "hidden_states": {
+      "shape": [
+        "batch_size",
+        "hidden_size"
+      ],
+      "dtype": "bfloat16"
+    },
+    "residual": {
+      "shape": [
+        "batch_size",
+        "hidden_size"
+      ],
+      "dtype": "bfloat16"
+    },
+    "weight": {
+      "shape": [
+        "hidden_size"
+      ],
+      "dtype": "bfloat16"
+    }
+  },
+  "outputs": {
+    "output": {
+      "shape": [
+        "batch_size",
+        "hidden_size"
+      ],
+      "dtype": "bfloat16"
+    },
+    "residual": {
+      "shape": [
+        "batch_size",
+        "hidden_size"
+      ],
+      "dtype": "bfloat16",
+      "description": "Updated residual (in-place: residual += input)."
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _gemma_fused_add_rmsnorm_reference(input, residual, weight):\n    \"\"\"Gemma-style Fused Add + RMSNorm.\"\"\"\n    EPS = 1e-6\n    x = input.to(torch.float32) + residual.to(torch.float32)\n    inv_rms = torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + EPS)\n    return (x * inv_rms * (weight.to(torch.float32) + 1)).to(input.dtype)\n"
+}
diff --git a/tests/trace/fi_trace_out/gemma_rmsnorm_h4608.json b/tests/trace/fi_trace_out/gemma_rmsnorm_h4608.json
new file mode 100644
index 0000000000..8ba99df65b
--- /dev/null
+++ b/tests/trace/fi_trace_out/gemma_rmsnorm_h4608.json
@@ -0,0 +1,44 @@
+{
+  "name": "gemma_rmsnorm_h4608",
+  "description": "Gemma-style RMSNorm: out = rmsnorm(x) * (weight + 1).",
+  "op_type": "rmsnorm",
+  "tags": [
+    "fi_api:flashinfer.norm.gemma_rmsnorm",
+    "status:verified",
+    "model:gemma"
+  ],
+  "axes": {
+    "batch_size": {
+      "type": "var"
+    },
+    "hidden_size": {
+      "type": "const",
+      "value": 4608
+    }
+  },
+  "inputs": {
+    "hidden_states": {
+      "shape": [
+        "batch_size",
+        "hidden_size"
+      ],
+      "dtype": "bfloat16"
+    },
+    "weight": {
+      "shape": [
+        "hidden_size"
+      ],
+      "dtype": "bfloat16"
+    }
+  },
+  "outputs": {
+    "output": {
+      "shape": [
+        "batch_size",
+        "hidden_size"
+      ],
+      "dtype": "bfloat16"
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _gemma_rmsnorm_reference(input, weight):\n    \"\"\"Gemma-style RMSNorm: out = rmsnorm(input) * (weight + 1). Epsilon fixed at 1e-6.\"\"\"\n    EPS = 1e-6\n    x = input.to(torch.float32)\n    inv_rms = torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + EPS)\n    return (x * inv_rms * (weight.to(torch.float32) + 1)).to(input.dtype)\n"
+}
diff --git a/tests/trace/fi_trace_out/gqa_paged_decode_h32_kv8_d128_ps16.json b/tests/trace/fi_trace_out/gqa_paged_decode_h32_kv8_d128_ps16.json
new file mode 100644
index 0000000000..f45c2f6df9
--- /dev/null
+++ b/tests/trace/fi_trace_out/gqa_paged_decode_h32_kv8_d128_ps16.json
@@ -0,0 +1,116 @@
+{
+  "name": "gqa_paged_decode_h32_kv8_d128_ps16",
+  "description": "Batched GQA decode (1 query per seq) with a paged KV cache as a (k_cache, v_cache) tuple and ragged kv_indptr+kv_indices baked in at plan() time. Wraps BatchDecodeWithPagedKVCacheWrapper.run().",
+  "op_type": "gqa_paged",
+  "tags": [
+    "fi_api:flashinfer.decode.BatchDecodeWithPagedKVCacheWrapper.run",
+    "stage:decode",
+    "status:verified"
+  ],
+  "axes": {
+    "batch_size": {
+      "type": "var",
+      "description": "Total number of query tokens."
+    },
+    "num_qo_heads": {
+      "type": "const",
+      "value": 32
+    },
+    "num_kv_heads": {
+      "type": "const",
+      "value": 8
+    },
+    "head_dim": {
+      "type": "const",
+      "value": 128
+    },
+    "num_pages": {
+      "type": "var"
+    },
+    "page_size": {
+      "type": "const",
+      "value": 16
+    },
+    "len_indptr": {
+      "type": "var",
+      "description": "Length of kv_indptr array."
+    },
+    "num_kv_indices": {
+      "type": "var",
+      "description": "Total number of KV page indices."
+    }
+  },
+  "constraints": [
+    "len_indptr == batch_size + 1",
+    "num_kv_indices == kv_indptr[-1].item()"
+  ],
+  "inputs": {
+    "q": {
+      "shape": [
+        "batch_size",
+        "num_qo_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "k_cache": {
+      "shape": [
+        "num_pages",
+        "page_size",
+        "num_kv_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "v_cache": {
+      "shape": [
+        "num_pages",
+        "page_size",
+        "num_kv_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "kv_indptr": {
+      "shape": [
+        "len_indptr"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "KV page offsets for each sequence. Set during plan(), not run()."
+    },
+    "kv_indices": {
+      "shape": [
+        "num_kv_indices"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "Page IDs for KV cache lookups. Set during plan(), not run()."
+    },
+    "sm_scale": {
+      "shape": null,
+      "dtype": "float32",
+      "optional": true,
+      "description": "Softmax scale. Default is (1/sqrt(head_dim)). Set during plan(), not run()."
+    }
+  },
+  "outputs": {
+    "output": {
+      "shape": [
+        "batch_size",
+        "num_qo_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "lse": {
+      "shape": [
+        "batch_size",
+        "num_qo_heads"
+      ],
+      "dtype": "float32",
+      "description": "The 2-based log-sum-exp of attention logits."
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _gqa_paged_decode_reference(q, k_cache, v_cache, kv_indptr, kv_indices, sm_scale):\n    batch_size, num_qo_heads, head_dim = q.shape\n    _, page_size, num_kv_heads, _ = k_cache.shape\n\n    output = torch.zeros(\n        (batch_size, num_qo_heads, head_dim), dtype=torch.bfloat16, device=q.device\n    )\n    lse = torch.full(\n        (batch_size, num_qo_heads), -float(\"inf\"), dtype=torch.float32, device=q.device\n    )\n\n    gqa_ratio = num_qo_heads // num_kv_heads\n    k_cache_f32 = k_cache.to(torch.float32)\n    v_cache_f32 = v_cache.to(torch.float32)\n\n    for b in range(batch_size):\n        page_start = int(kv_indptr[b].item())\n        page_end = int(kv_indptr[b + 1].item())\n        if page_start >= page_end:\n            output[b].zero_()\n            continue\n        # kv_indices are page IDs. Gather pages first, then flatten the\n        # [num_selected_pages, page_size] axis into a single token axis.\n        page_ids = kv_indices[page_start:page_end].to(torch.long)\n        k_b = k_cache_f32[page_ids].reshape(-1, num_kv_heads, head_dim)\n        v_b = v_cache_f32[page_ids].reshape(-1, num_kv_heads, head_dim)\n        q_b = q[b].to(torch.float32)  # [num_qo_heads, head_dim]\n        for h in range(num_qo_heads):\n            kv_h = h // gqa_ratio\n            logits = torch.matmul(q_b[h], k_b[:, kv_h].T) * sm_scale\n            lse[b, h] = torch.logsumexp(logits, dim=-1) / math.log(2.0)\n            attn = torch.softmax(logits, dim=-1)\n            output[b, h] = torch.matmul(attn, v_b[:, kv_h]).to(torch.bfloat16)\n\n    return output, lse\n"
+}
diff --git a/tests/trace/fi_trace_out/gqa_paged_decode_h32_kv8_d128_ps64.json b/tests/trace/fi_trace_out/gqa_paged_decode_h32_kv8_d128_ps64.json
new file mode 100644
index 0000000000..fa29a5e06a
--- /dev/null
+++ b/tests/trace/fi_trace_out/gqa_paged_decode_h32_kv8_d128_ps64.json
@@ -0,0 +1,116 @@
+{
+  "name": "gqa_paged_decode_h32_kv8_d128_ps64",
+  "description": "Batched GQA decode (1 query per seq) with a paged KV cache as a (k_cache, v_cache) tuple and ragged kv_indptr+kv_indices baked in at plan() time. Wraps BatchDecodeWithPagedKVCacheWrapper.run().",
+  "op_type": "gqa_paged",
+  "tags": [
+    "fi_api:flashinfer.decode.BatchDecodeWithPagedKVCacheWrapper.run",
+    "stage:decode",
+    "status:verified"
+  ],
+  "axes": {
+    "batch_size": {
+      "type": "var",
+      "description": "Total number of query tokens."
+    },
+    "num_qo_heads": {
+      "type": "const",
+      "value": 32
+    },
+    "num_kv_heads": {
+      "type": "const",
+      "value": 8
+    },
+    "head_dim": {
+      "type": "const",
+      "value": 128
+    },
+    "num_pages": {
+      "type": "var"
+    },
+    "page_size": {
+      "type": "const",
+      "value": 64
+    },
+    "len_indptr": {
+      "type": "var",
+      "description": "Length of kv_indptr array."
+    },
+    "num_kv_indices": {
+      "type": "var",
+      "description": "Total number of KV page indices."
+    }
+  },
+  "constraints": [
+    "len_indptr == batch_size + 1",
+    "num_kv_indices == kv_indptr[-1].item()"
+  ],
+  "inputs": {
+    "q": {
+      "shape": [
+        "batch_size",
+        "num_qo_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "k_cache": {
+      "shape": [
+        "num_pages",
+        "page_size",
+        "num_kv_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "v_cache": {
+      "shape": [
+        "num_pages",
+        "page_size",
+        "num_kv_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "kv_indptr": {
+      "shape": [
+        "len_indptr"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "KV page offsets for each sequence. Set during plan(), not run()."
+    },
+    "kv_indices": {
+      "shape": [
+        "num_kv_indices"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "Page IDs for KV cache lookups. Set during plan(), not run()."
+    },
+    "sm_scale": {
+      "shape": null,
+      "dtype": "float32",
+      "optional": true,
+      "description": "Softmax scale. Default is (1/sqrt(head_dim)). Set during plan(), not run()."
+    }
+  },
+  "outputs": {
+    "output": {
+      "shape": [
+        "batch_size",
+        "num_qo_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "lse": {
+      "shape": [
+        "batch_size",
+        "num_qo_heads"
+      ],
+      "dtype": "float32",
+      "description": "The 2-based log-sum-exp of attention logits."
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _gqa_paged_decode_reference(q, k_cache, v_cache, kv_indptr, kv_indices, sm_scale):\n    batch_size, num_qo_heads, head_dim = q.shape\n    _, page_size, num_kv_heads, _ = k_cache.shape\n\n    output = torch.zeros(\n        (batch_size, num_qo_heads, head_dim), dtype=torch.bfloat16, device=q.device\n    )\n    lse = torch.full(\n        (batch_size, num_qo_heads), -float(\"inf\"), dtype=torch.float32, device=q.device\n    )\n\n    gqa_ratio = num_qo_heads // num_kv_heads\n    k_cache_f32 = k_cache.to(torch.float32)\n    v_cache_f32 = v_cache.to(torch.float32)\n\n    for b in range(batch_size):\n        page_start = int(kv_indptr[b].item())\n        page_end = int(kv_indptr[b + 1].item())\n        if page_start >= page_end:\n            output[b].zero_()\n            continue\n        # kv_indices are page IDs. Gather pages first, then flatten the\n        # [num_selected_pages, page_size] axis into a single token axis.\n        page_ids = kv_indices[page_start:page_end].to(torch.long)\n        k_b = k_cache_f32[page_ids].reshape(-1, num_kv_heads, head_dim)\n        v_b = v_cache_f32[page_ids].reshape(-1, num_kv_heads, head_dim)\n        q_b = q[b].to(torch.float32)  # [num_qo_heads, head_dim]\n        for h in range(num_qo_heads):\n            kv_h = h // gqa_ratio\n            logits = torch.matmul(q_b[h], k_b[:, kv_h].T) * sm_scale\n            lse[b, h] = torch.logsumexp(logits, dim=-1) / math.log(2.0)\n            attn = torch.softmax(logits, dim=-1)\n            output[b, h] = torch.matmul(attn, v_b[:, kv_h]).to(torch.bfloat16)\n\n    return output, lse\n"
+}
diff --git a/tests/trace/fi_trace_out/gqa_paged_prefill_h32_kv8_d128_ps16.json b/tests/trace/fi_trace_out/gqa_paged_prefill_h32_kv8_d128_ps16.json
new file mode 100644
index 0000000000..3fd1cd852a
--- /dev/null
+++ b/tests/trace/fi_trace_out/gqa_paged_prefill_h32_kv8_d128_ps16.json
@@ -0,0 +1,124 @@
+{
+  "name": "gqa_paged_prefill_h32_kv8_d128_ps16",
+  "description": "Batched GQA prefill (multi-token per seq, causal) with a paged KV cache. Adds qo_indptr to gqa_paged_decode's indptr/indices. Wraps BatchPrefillWithPagedKVCacheWrapper.run().",
+  "op_type": "gqa_paged",
+  "tags": [
+    "fi_api:flashinfer.prefill.BatchPrefillWithPagedKVCacheWrapper.run",
+    "stage:prefill",
+    "status:verified"
+  ],
+  "axes": {
+    "num_qo_heads": {
+      "type": "const",
+      "value": 32
+    },
+    "num_kv_heads": {
+      "type": "const",
+      "value": 8
+    },
+    "head_dim": {
+      "type": "const",
+      "value": 128
+    },
+    "page_size": {
+      "type": "const",
+      "value": 16
+    },
+    "len_indptr": {
+      "type": "var",
+      "description": "Length of indptr arrays (batch_size + 1)."
+    },
+    "total_q": {
+      "type": "var",
+      "description": "Total number of query tokens."
+    },
+    "num_kv_indices": {
+      "type": "var",
+      "description": "Total number of KV page indices."
+    },
+    "num_pages": {
+      "type": "var"
+    }
+  },
+  "constraints": [
+    "total_q == qo_indptr[-1].item()",
+    "num_kv_indices == kv_indptr[-1].item()"
+  ],
+  "inputs": {
+    "q": {
+      "shape": [
+        "total_q",
+        "num_qo_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "k_cache": {
+      "shape": [
+        "num_pages",
+        "page_size",
+        "num_kv_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "v_cache": {
+      "shape": [
+        "num_pages",
+        "page_size",
+        "num_kv_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "qo_indptr": {
+      "shape": [
+        "len_indptr"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "Query offsets for each sequence. Set during plan(), not run()."
+    },
+    "kv_indptr": {
+      "shape": [
+        "len_indptr"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "KV page offsets for each sequence. Set during plan(), not run()."
+    },
+    "kv_indices": {
+      "shape": [
+        "num_kv_indices"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "Page IDs for KV cache lookups. Set during plan(), not run()."
+    },
+    "sm_scale": {
+      "shape": null,
+      "dtype": "float32",
+      "optional": true,
+      "description": "Softmax scale. Default is (1/sqrt(head_dim)). Set during plan(), not run()."
+    }
+  },
+  "outputs": {
+    "output": {
+      "shape": [
+        "total_q",
+        "num_qo_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "lse": {
+      "shape": [
+        "total_q",
+        "num_qo_heads"
+      ],
+      "dtype": "float32",
+      "description": "The 2-based log-sum-exp of attention logits."
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _gqa_paged_prefill_reference(\n    q, k_cache, v_cache, qo_indptr, kv_indptr, kv_indices, sm_scale\n):\n    total_q, num_qo_heads, head_dim = q.shape\n    num_pages, page_size, num_kv_heads, _ = k_cache.shape\n    len_indptr = qo_indptr.shape[0]\n\n    output = torch.zeros(\n        (total_q, num_qo_heads, head_dim), dtype=torch.bfloat16, device=q.device\n    )\n    lse = torch.full(\n        (total_q, num_qo_heads), -float(\"inf\"), dtype=torch.float32, device=q.device\n    )\n\n    gqa_ratio = num_qo_heads // num_kv_heads\n    q_f32 = q.to(torch.float32)\n    k_cache_f32 = k_cache.to(torch.float32)\n    v_cache_f32 = v_cache.to(torch.float32)\n\n    for b in range(len_indptr - 1):\n        q_start = int(qo_indptr[b].item())\n        q_end = int(qo_indptr[b + 1].item())\n        kv_start = int(kv_indptr[b].item())\n        kv_end = int(kv_indptr[b + 1].item())\n        if q_start >= q_end or kv_start >= kv_end:\n            continue\n        # kv_indices are page IDs. Gather pages and flatten to a token axis.\n        page_ids = kv_indices[kv_start:kv_end].to(torch.long)\n        k_b = k_cache_f32[page_ids].reshape(-1, num_kv_heads, head_dim)\n        v_b = v_cache_f32[page_ids].reshape(-1, num_kv_heads, head_dim)\n        num_kv_tokens = k_b.shape[0]\n        q_b = q_f32[q_start:q_end]\n        delta = num_kv_tokens - q_b.shape[0]\n        for q_idx in range(q_b.shape[0]):\n            max_kv = min(q_idx + 1 + delta, num_kv_tokens)\n            if max_kv <= 0:\n                continue\n            global_q = q_start + q_idx\n            for h in range(num_qo_heads):\n                kv_h = h // gqa_ratio\n                logits = torch.matmul(q_b[q_idx, h], k_b[:max_kv, kv_h].T) * sm_scale\n                lse[global_q, h] = torch.logsumexp(logits, dim=-1) / math.log(2.0)\n                attn = torch.softmax(logits, dim=-1)\n                output[global_q, h] = torch.matmul(attn, v_b[:max_kv, kv_h]).to(\n                    torch.bfloat16\n                )\n\n    return output, lse\n"
+}
diff --git a/tests/trace/fi_trace_out/gqa_ragged_h32_kv8_d128.json b/tests/trace/fi_trace_out/gqa_ragged_h32_kv8_d128.json
new file mode 100644
index 0000000000..f22ed03d8d
--- /dev/null
+++ b/tests/trace/fi_trace_out/gqa_ragged_h32_kv8_d128.json
@@ -0,0 +1,108 @@
+{
+  "name": "gqa_ragged_h32_kv8_d128",
+  "description": "Batched GQA prefill (causal) with contiguous (non-paged) K/V tensors and qo_indptr/kv_indptr offsets baked in at plan() time. Wraps BatchPrefillWithRaggedKVCacheWrapper.run().",
+  "op_type": "gqa_ragged",
+  "tags": [
+    "fi_api:flashinfer.prefill.BatchPrefillWithRaggedKVCacheWrapper.run",
+    "stage:prefill",
+    "status:verified"
+  ],
+  "axes": {
+    "num_qo_heads": {
+      "type": "const",
+      "value": 32
+    },
+    "num_kv_heads": {
+      "type": "const",
+      "value": 8
+    },
+    "head_dim": {
+      "type": "const",
+      "value": 128
+    },
+    "len_indptr": {
+      "type": "var",
+      "description": "Length of indptr arrays (batch_size + 1)."
+    },
+    "total_q": {
+      "type": "var",
+      "description": "Total number of query tokens."
+    },
+    "total_kv": {
+      "type": "var",
+      "description": "Total key-value tokens across all sequences."
+    }
+  },
+  "constraints": [
+    "total_q == qo_indptr[-1].item()",
+    "total_kv == kv_indptr[-1].item()"
+  ],
+  "inputs": {
+    "q": {
+      "shape": [
+        "total_q",
+        "num_qo_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "k": {
+      "shape": [
+        "total_kv",
+        "num_kv_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "v": {
+      "shape": [
+        "total_kv",
+        "num_kv_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "qo_indptr": {
+      "shape": [
+        "len_indptr"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "Query offsets for each sequence. Set during plan(), not run()."
+    },
+    "kv_indptr": {
+      "shape": [
+        "len_indptr"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "Key-value offsets for each sequence. Set during plan(), not run()."
+    },
+    "sm_scale": {
+      "shape": null,
+      "dtype": "float32",
+      "optional": true,
+      "description": "Softmax scale. Default is (1/sqrt(head_dim)). Set during plan(), not run()."
+    }
+  },
+  "outputs": {
+    "output": {
+      "shape": [
+        "total_q",
+        "num_qo_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16",
+      "description": "Attention output tensor."
+    },
+    "lse": {
+      "shape": [
+        "total_q",
+        "num_qo_heads"
+      ],
+      "dtype": "float32",
+      "description": "The 2-based log-sum-exp of attention logits."
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _gqa_ragged_prefill_reference(q, k, v, qo_indptr, kv_indptr, sm_scale):\n    total_q, num_qo_heads, head_dim = q.shape\n    total_kv, num_kv_heads, _ = k.shape\n    len_indptr = qo_indptr.shape[0]\n\n    output = torch.zeros(\n        (total_q, num_qo_heads, head_dim), dtype=torch.bfloat16, device=q.device\n    )\n    lse = torch.full(\n        (total_q, num_qo_heads), -float(\"inf\"), dtype=torch.float32, device=q.device\n    )\n\n    gqa_ratio = num_qo_heads // num_kv_heads\n    q_f32 = q.to(torch.float32)\n    k_f32 = k.to(torch.float32)\n    v_f32 = v.to(torch.float32)\n\n    for b in range(len_indptr - 1):\n        q_start = int(qo_indptr[b].item())\n        q_end = int(qo_indptr[b + 1].item())\n        kv_start = int(kv_indptr[b].item())\n        kv_end = int(kv_indptr[b + 1].item())\n        if q_start >= q_end or kv_start >= kv_end:\n            continue\n        q_b = q_f32[q_start:q_end]  # [S, num_qo_heads, head_dim]\n        k_b = k_f32[kv_start:kv_end]  # [T, num_kv_heads, head_dim]\n        v_b = v_f32[kv_start:kv_end]\n        num_q_tokens = q_b.shape[0]\n        num_kv_tokens = k_b.shape[0]\n        delta = num_kv_tokens - num_q_tokens\n        for q_idx in range(num_q_tokens):\n            max_kv = min(q_idx + 1 + delta, num_kv_tokens)\n            if max_kv <= 0:\n                continue\n            global_q = q_start + q_idx\n            for h in range(num_qo_heads):\n                kv_h = h // gqa_ratio\n                logits = torch.matmul(q_b[q_idx, h], k_b[:max_kv, kv_h].T) * sm_scale\n                lse[global_q, h] = torch.logsumexp(logits, dim=-1) / math.log(2.0)\n                attn = torch.softmax(logits, dim=-1)\n                output[global_q, h] = torch.matmul(attn, v_b[:max_kv, kv_h]).to(\n                    torch.bfloat16\n                )\n\n    return output, lse\n"
+}
diff --git a/tests/trace/fi_trace_out/layernorm_h768.json b/tests/trace/fi_trace_out/layernorm_h768.json
new file mode 100644
index 0000000000..af7dddae38
--- /dev/null
+++ b/tests/trace/fi_trace_out/layernorm_h768.json
@@ -0,0 +1,51 @@
+{
+  "name": "layernorm_h768",
+  "description": "Standard LayerNorm with gamma and beta. Epsilon fixed at 1e-6.",
+  "op_type": "layernorm",
+  "tags": [
+    "fi_api:flashinfer.norm.layernorm",
+    "status:verified"
+  ],
+  "axes": {
+    "batch_size": {
+      "type": "var"
+    },
+    "hidden_size": {
+      "type": "const",
+      "value": 768
+    }
+  },
+  "inputs": {
+    "hidden_states": {
+      "shape": [
+        "batch_size",
+        "hidden_size"
+      ],
+      "dtype": "bfloat16"
+    },
+    "weight": {
+      "shape": [
+        "hidden_size"
+      ],
+      "dtype": "float32",
+      "description": "Scale (gamma) tensor, float32."
+    },
+    "bias": {
+      "shape": [
+        "hidden_size"
+      ],
+      "dtype": "float32",
+      "description": "Bias (beta) tensor, float32."
+    }
+  },
+  "outputs": {
+    "output": {
+      "shape": [
+        "batch_size",
+        "hidden_size"
+      ],
+      "dtype": "bfloat16"
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _layernorm_reference(input, weight, bias):\n    \"\"\"Standard LayerNorm with gamma (weight) and beta (bias). Epsilon fixed at 1e-6.\"\"\"\n    EPS = 1e-6\n    x = input.to(torch.float32)\n    mean = x.mean(dim=-1, keepdim=True)\n    var = ((x - mean) ** 2).mean(dim=-1, keepdim=True)\n    x_norm = (x - mean) / torch.sqrt(var + EPS)\n    return (x_norm * weight.to(torch.float32) + bias.to(torch.float32)).to(input.dtype)\n"
+}
diff --git a/tests/trace/fi_trace_out/llama31_rope_h32_kv8_d128.json b/tests/trace/fi_trace_out/llama31_rope_h32_kv8_d128.json
new file mode 100644
index 0000000000..23d6a7e849
--- /dev/null
+++ b/tests/trace/fi_trace_out/llama31_rope_h32_kv8_d128.json
@@ -0,0 +1,132 @@
+{
+  "name": "llama31_rope_h32_kv8_d128",
+  "description": "Llama 3.1 RoPE on ragged q/k with indptr + offsets.",
+  "op_type": "rope",
+  "tags": [
+    "fi_api:flashinfer.rope.apply_llama31_rope",
+    "status:verified",
+    "model:llama"
+  ],
+  "axes": {
+    "nnz": {
+      "type": "var",
+      "description": "Total number of tokens across the batch."
+    },
+    "batch_size": {
+      "type": "var",
+      "description": "Number of sequences in the batch."
+    },
+    "num_q_heads": {
+      "type": "const",
+      "value": 32
+    },
+    "num_k_heads": {
+      "type": "const",
+      "value": 8
+    },
+    "head_dim": {
+      "type": "const",
+      "value": 128
+    },
+    "batch_size_plus_1": {
+      "type": "var",
+      "description": "batch_size + 1."
+    }
+  },
+  "constraints": [
+    "batch_size_plus_1 == batch_size + 1"
+  ],
+  "inputs": {
+    "q": {
+      "shape": [
+        "nnz",
+        "num_q_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "k": {
+      "shape": [
+        "nnz",
+        "num_k_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "indptr": {
+      "shape": [
+        "batch_size_plus_1"
+      ],
+      "dtype": "int32",
+      "description": "Ragged batch indptr, shape (batch_size + 1)."
+    },
+    "offsets": {
+      "shape": [
+        "batch_size"
+      ],
+      "dtype": "int32",
+      "description": "Per-sequence starting position offset."
+    },
+    "rotary_dim": {
+      "shape": null,
+      "dtype": "int32",
+      "optional": true,
+      "description": "If None, uses head_dim. Rotate only the first `rotary_dim` dims."
+    },
+    "interleave": {
+      "shape": null,
+      "dtype": "int32",
+      "optional": true,
+      "description": "Bool: interleaved (True) vs half-split (False) rotation."
+    },
+    "rope_scale": {
+      "shape": null,
+      "dtype": "float32",
+      "optional": true,
+      "description": "Scale factor."
+    },
+    "rope_theta": {
+      "shape": null,
+      "dtype": "float32",
+      "optional": true,
+      "description": "Theta value."
+    },
+    "low_freq_factor": {
+      "shape": null,
+      "dtype": "float32",
+      "optional": true,
+      "description": "Llama 3.1 low-frequency scaling factor."
+    },
+    "high_freq_factor": {
+      "shape": null,
+      "dtype": "float32",
+      "optional": true,
+      "description": "Llama 3.1 high-frequency scaling factor."
+    },
+    "old_context_len": {
+      "shape": null,
+      "dtype": "int32",
+      "optional": true,
+      "description": "Original pretraining context length."
+    }
+  },
+  "outputs": {
+    "q_rope": {
+      "shape": [
+        "nnz",
+        "num_q_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "k_rope": {
+      "shape": [
+        "nnz",
+        "num_k_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _apply_llama31_rope_reference(\n    q: torch.Tensor,\n    k: torch.Tensor,\n    indptr: torch.Tensor,\n    offsets: torch.Tensor,\n    rotary_dim: Optional[int] = None,\n    interleave: bool = False,\n    rope_scale: float = 8,\n    rope_theta: float = 5e5,\n    low_freq_factor: float = 1,\n    high_freq_factor: float = 4,\n    old_context_len: int = 8192,\n) -> Tuple[torch.Tensor, torch.Tensor]:\n    if rotary_dim is None:\n        rotary_dim = q.shape[-1]\n    freqs = _llama31_freqs(\n        rotary_dim,\n        rope_theta,\n        rope_scale,\n        low_freq_factor,\n        high_freq_factor,\n        float(old_context_len),\n        q.device,\n    )\n    positions = _positions_from_indptr(indptr, offsets, q.shape[0])\n    return _apply_rope_core(q, k, positions, freqs, interleave)\n"
+}
diff --git a/tests/trace/fi_trace_out/llama31_rope_inplace_h32_kv8_d128.json b/tests/trace/fi_trace_out/llama31_rope_inplace_h32_kv8_d128.json
new file mode 100644
index 0000000000..66109d1df0
--- /dev/null
+++ b/tests/trace/fi_trace_out/llama31_rope_inplace_h32_kv8_d128.json
@@ -0,0 +1,134 @@
+{
+  "name": "llama31_rope_inplace_h32_kv8_d128",
+  "description": "In-place Llama 3.1 RoPE with indptr + offsets.",
+  "op_type": "rope",
+  "tags": [
+    "fi_api:flashinfer.rope.apply_llama31_rope_inplace",
+    "status:verified",
+    "model:llama"
+  ],
+  "axes": {
+    "nnz": {
+      "type": "var",
+      "description": "Total number of tokens across the batch."
+    },
+    "batch_size": {
+      "type": "var",
+      "description": "Number of sequences in the batch."
+    },
+    "num_q_heads": {
+      "type": "const",
+      "value": 32
+    },
+    "num_k_heads": {
+      "type": "const",
+      "value": 8
+    },
+    "head_dim": {
+      "type": "const",
+      "value": 128
+    },
+    "batch_size_plus_1": {
+      "type": "var",
+      "description": "batch_size + 1."
+    }
+  },
+  "constraints": [
+    "batch_size_plus_1 == batch_size + 1"
+  ],
+  "inputs": {
+    "q": {
+      "shape": [
+        "nnz",
+        "num_q_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "k": {
+      "shape": [
+        "nnz",
+        "num_k_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "indptr": {
+      "shape": [
+        "batch_size_plus_1"
+      ],
+      "dtype": "int32",
+      "description": "Ragged batch indptr, shape (batch_size + 1)."
+    },
+    "offsets": {
+      "shape": [
+        "batch_size"
+      ],
+      "dtype": "int32",
+      "description": "Per-sequence starting position offset."
+    },
+    "rotary_dim": {
+      "shape": null,
+      "dtype": "int32",
+      "optional": true,
+      "description": "If None, uses head_dim. Rotate only the first `rotary_dim` dims."
+    },
+    "interleave": {
+      "shape": null,
+      "dtype": "int32",
+      "optional": true,
+      "description": "Bool: interleaved (True) vs half-split (False) rotation."
+    },
+    "rope_scale": {
+      "shape": null,
+      "dtype": "float32",
+      "optional": true,
+      "description": "Scale factor."
+    },
+    "rope_theta": {
+      "shape": null,
+      "dtype": "float32",
+      "optional": true,
+      "description": "Theta value."
+    },
+    "low_freq_factor": {
+      "shape": null,
+      "dtype": "float32",
+      "optional": true,
+      "description": "Llama 3.1 low-frequency scaling factor."
+    },
+    "high_freq_factor": {
+      "shape": null,
+      "dtype": "float32",
+      "optional": true,
+      "description": "Llama 3.1 high-frequency scaling factor."
+    },
+    "old_context_len": {
+      "shape": null,
+      "dtype": "int32",
+      "optional": true,
+      "description": "Original pretraining context length."
+    }
+  },
+  "outputs": {
+    "q": {
+      "shape": [
+        "nnz",
+        "num_q_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16",
+      "description": "Updated q (in-place)."
+    },
+    "k": {
+      "shape": [
+        "nnz",
+        "num_k_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16",
+      "description": "Updated k (in-place)."
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _apply_llama31_rope_reference(\n    q: torch.Tensor,\n    k: torch.Tensor,\n    indptr: torch.Tensor,\n    offsets: torch.Tensor,\n    rotary_dim: Optional[int] = None,\n    interleave: bool = False,\n    rope_scale: float = 8,\n    rope_theta: float = 5e5,\n    low_freq_factor: float = 1,\n    high_freq_factor: float = 4,\n    old_context_len: int = 8192,\n) -> Tuple[torch.Tensor, torch.Tensor]:\n    if rotary_dim is None:\n        rotary_dim = q.shape[-1]\n    freqs = _llama31_freqs(\n        rotary_dim,\n        rope_theta,\n        rope_scale,\n        low_freq_factor,\n        high_freq_factor,\n        float(old_context_len),\n        q.device,\n    )\n    positions = _positions_from_indptr(indptr, offsets, q.shape[0])\n    return _apply_rope_core(q, k, positions, freqs, interleave)\n"
+}
diff --git a/tests/trace/fi_trace_out/llama31_rope_pos_ids_h32_kv8_d128.json b/tests/trace/fi_trace_out/llama31_rope_pos_ids_h32_kv8_d128.json
new file mode 100644
index 0000000000..306b57ab3d
--- /dev/null
+++ b/tests/trace/fi_trace_out/llama31_rope_pos_ids_h32_kv8_d128.json
@@ -0,0 +1,110 @@
+{
+  "name": "llama31_rope_pos_ids_h32_kv8_d128",
+  "description": "Llama 3.1 RoPE using per-token position ids.",
+  "op_type": "rope",
+  "tags": [
+    "fi_api:flashinfer.rope.apply_llama31_rope_pos_ids",
+    "status:verified",
+    "model:llama"
+  ],
+  "axes": {
+    "nnz": {
+      "type": "var",
+      "description": "Total number of tokens across the batch."
+    },
+    "num_q_heads": {
+      "type": "const",
+      "value": 32
+    },
+    "num_k_heads": {
+      "type": "const",
+      "value": 8
+    },
+    "head_dim": {
+      "type": "const",
+      "value": 128
+    }
+  },
+  "inputs": {
+    "q": {
+      "shape": [
+        "nnz",
+        "num_q_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "k": {
+      "shape": [
+        "nnz",
+        "num_k_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "pos_ids": {
+      "shape": [
+        "nnz"
+      ],
+      "dtype": "int32",
+      "description": "Per-token position index."
+    },
+    "rotary_dim": {
+      "shape": null,
+      "dtype": "int32",
+      "optional": true
+    },
+    "interleave": {
+      "shape": null,
+      "dtype": "int32",
+      "optional": true
+    },
+    "rope_scale": {
+      "shape": null,
+      "dtype": "float32",
+      "optional": true
+    },
+    "rope_theta": {
+      "shape": null,
+      "dtype": "float32",
+      "optional": true
+    },
+    "low_freq_factor": {
+      "shape": null,
+      "dtype": "float32",
+      "optional": true,
+      "description": "Llama 3.1 low-frequency scaling factor."
+    },
+    "high_freq_factor": {
+      "shape": null,
+      "dtype": "float32",
+      "optional": true,
+      "description": "Llama 3.1 high-frequency scaling factor."
+    },
+    "old_context_len": {
+      "shape": null,
+      "dtype": "int32",
+      "optional": true,
+      "description": "Original pretraining context length."
+    }
+  },
+  "outputs": {
+    "q_rope": {
+      "shape": [
+        "nnz",
+        "num_q_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "k_rope": {
+      "shape": [
+        "nnz",
+        "num_k_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _apply_llama31_rope_pos_ids_reference(\n    q: torch.Tensor,\n    k: torch.Tensor,\n    pos_ids: torch.Tensor,\n    rotary_dim: Optional[int] = None,\n    interleave: bool = False,\n    rope_scale: float = 8,\n    rope_theta: float = 5e5,\n    low_freq_factor: float = 1,\n    high_freq_factor: float = 4,\n    old_context_len: int = 8192,\n) -> Tuple[torch.Tensor, torch.Tensor]:\n    if rotary_dim is None:\n        rotary_dim = q.shape[-1]\n    freqs = _llama31_freqs(\n        rotary_dim,\n        rope_theta,\n        rope_scale,\n        low_freq_factor,\n        high_freq_factor,\n        float(old_context_len),\n        q.device,\n    )\n    return _apply_rope_core(q, k, pos_ids.to(torch.float32), freqs, interleave)\n"
+}
diff --git a/tests/trace/fi_trace_out/llama31_rope_pos_ids_inplace_h32_kv8_d128.json b/tests/trace/fi_trace_out/llama31_rope_pos_ids_inplace_h32_kv8_d128.json
new file mode 100644
index 0000000000..e9cfa1df1a
--- /dev/null
+++ b/tests/trace/fi_trace_out/llama31_rope_pos_ids_inplace_h32_kv8_d128.json
@@ -0,0 +1,112 @@
+{
+  "name": "llama31_rope_pos_ids_inplace_h32_kv8_d128",
+  "description": "In-place Llama 3.1 RoPE using per-token position ids.",
+  "op_type": "rope",
+  "tags": [
+    "fi_api:flashinfer.rope.apply_llama31_rope_pos_ids_inplace",
+    "status:verified",
+    "model:llama"
+  ],
+  "axes": {
+    "nnz": {
+      "type": "var",
+      "description": "Total number of tokens across the batch."
+    },
+    "num_q_heads": {
+      "type": "const",
+      "value": 32
+    },
+    "num_k_heads": {
+      "type": "const",
+      "value": 8
+    },
+    "head_dim": {
+      "type": "const",
+      "value": 128
+    }
+  },
+  "inputs": {
+    "q": {
+      "shape": [
+        "nnz",
+        "num_q_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "k": {
+      "shape": [
+        "nnz",
+        "num_k_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "pos_ids": {
+      "shape": [
+        "nnz"
+      ],
+      "dtype": "int32",
+      "description": "Per-token position index."
+    },
+    "rotary_dim": {
+      "shape": null,
+      "dtype": "int32",
+      "optional": true
+    },
+    "interleave": {
+      "shape": null,
+      "dtype": "int32",
+      "optional": true
+    },
+    "rope_scale": {
+      "shape": null,
+      "dtype": "float32",
+      "optional": true
+    },
+    "rope_theta": {
+      "shape": null,
+      "dtype": "float32",
+      "optional": true
+    },
+    "low_freq_factor": {
+      "shape": null,
+      "dtype": "float32",
+      "optional": true,
+      "description": "Llama 3.1 low-frequency scaling factor."
+    },
+    "high_freq_factor": {
+      "shape": null,
+      "dtype": "float32",
+      "optional": true,
+      "description": "Llama 3.1 high-frequency scaling factor."
+    },
+    "old_context_len": {
+      "shape": null,
+      "dtype": "int32",
+      "optional": true,
+      "description": "Original pretraining context length."
+    }
+  },
+  "outputs": {
+    "q": {
+      "shape": [
+        "nnz",
+        "num_q_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16",
+      "description": "Updated q (in-place)."
+    },
+    "k": {
+      "shape": [
+        "nnz",
+        "num_k_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16",
+      "description": "Updated k (in-place)."
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _apply_llama31_rope_pos_ids_reference(\n    q: torch.Tensor,\n    k: torch.Tensor,\n    pos_ids: torch.Tensor,\n    rotary_dim: Optional[int] = None,\n    interleave: bool = False,\n    rope_scale: float = 8,\n    rope_theta: float = 5e5,\n    low_freq_factor: float = 1,\n    high_freq_factor: float = 4,\n    old_context_len: int = 8192,\n) -> Tuple[torch.Tensor, torch.Tensor]:\n    if rotary_dim is None:\n        rotary_dim = q.shape[-1]\n    freqs = _llama31_freqs(\n        rotary_dim,\n        rope_theta,\n        rope_scale,\n        low_freq_factor,\n        high_freq_factor,\n        float(old_context_len),\n        q.device,\n    )\n    return _apply_rope_core(q, k, pos_ids.to(torch.float32), freqs, interleave)\n"
+}
diff --git a/tests/trace/fi_trace_out/merge_state_h32_d128.json b/tests/trace/fi_trace_out/merge_state_h32_d128.json
new file mode 100644
index 0000000000..f9230ea0c5
--- /dev/null
+++ b/tests/trace/fi_trace_out/merge_state_h32_d128.json
@@ -0,0 +1,77 @@
+{
+  "name": "merge_state_h32_d128",
+  "description": "Merge two attention (V, S) states for cascade/speculative attention.",
+  "op_type": "cascade_merge",
+  "tags": [
+    "fi_api:flashinfer.cascade.merge_state",
+    "status:verified"
+  ],
+  "axes": {
+    "seq_len": {
+      "type": "var",
+      "description": "Number of query tokens."
+    },
+    "num_heads": {
+      "type": "const",
+      "value": 32
+    },
+    "head_dim": {
+      "type": "const",
+      "value": 128
+    }
+  },
+  "inputs": {
+    "v_a": {
+      "shape": [
+        "seq_len",
+        "num_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16",
+      "description": "Attention output from KV segment A."
+    },
+    "s_a": {
+      "shape": [
+        "seq_len",
+        "num_heads"
+      ],
+      "dtype": "float32",
+      "description": "Logsumexp (base-2) from KV segment A."
+    },
+    "v_b": {
+      "shape": [
+        "seq_len",
+        "num_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16",
+      "description": "Attention output from KV segment B."
+    },
+    "s_b": {
+      "shape": [
+        "seq_len",
+        "num_heads"
+      ],
+      "dtype": "float32",
+      "description": "Logsumexp (base-2) from KV segment B."
+    }
+  },
+  "outputs": {
+    "v_merged": {
+      "shape": [
+        "seq_len",
+        "num_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "s_merged": {
+      "shape": [
+        "seq_len",
+        "num_heads"
+      ],
+      "dtype": "float32"
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _merge_state_reference(v_a, s_a, v_b, s_b):\n    \"\"\"Merge two attention (V, S) states via numerically stable log-sum-exp.\"\"\"\n    # s_a, s_b are log2-scale logsumexp values; convert to natural scale\n    s_a = s_a.to(torch.float32) * math.log(2.0)\n    s_b = s_b.to(torch.float32) * math.log(2.0)\n    v_a = v_a.to(torch.float32)\n    v_b = v_b.to(torch.float32)\n    s_max = torch.maximum(s_a, s_b)\n    exp_a = torch.exp(s_a - s_max)\n    exp_b = torch.exp(s_b - s_max)\n    exp_sum = exp_a + exp_b\n    v_merged = (\n        v_a * exp_a.unsqueeze(-1) + v_b * exp_b.unsqueeze(-1)\n    ) / exp_sum.unsqueeze(-1)\n    s_merged = (s_max + torch.log(exp_sum)) / math.log(2.0)\n    return v_merged.to(v_a.dtype), s_merged.to(torch.float32)\n"
+}
diff --git a/tests/trace/fi_trace_out/merge_state_in_place_h32_d128.json b/tests/trace/fi_trace_out/merge_state_in_place_h32_d128.json
new file mode 100644
index 0000000000..baf1961b34
--- /dev/null
+++ b/tests/trace/fi_trace_out/merge_state_in_place_h32_d128.json
@@ -0,0 +1,87 @@
+{
+  "name": "merge_state_in_place_h32_d128",
+  "description": "Merge attention (V, S) states in-place. v and s are updated with merged result.",
+  "op_type": "cascade_merge",
+  "tags": [
+    "fi_api:flashinfer.cascade.merge_state_in_place",
+    "status:verified"
+  ],
+  "axes": {
+    "seq_len": {
+      "type": "var",
+      "description": "Number of query tokens."
+    },
+    "num_heads": {
+      "type": "const",
+      "value": 32
+    },
+    "head_dim": {
+      "type": "const",
+      "value": 128
+    }
+  },
+  "inputs": {
+    "v": {
+      "shape": [
+        "seq_len",
+        "num_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16",
+      "description": "Attention output (updated in-place with merged result)."
+    },
+    "s": {
+      "shape": [
+        "seq_len",
+        "num_heads"
+      ],
+      "dtype": "float32",
+      "description": "Logsumexp (base-2) (updated in-place)."
+    },
+    "v_other": {
+      "shape": [
+        "seq_len",
+        "num_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16",
+      "description": "Other attention output to merge in."
+    },
+    "s_other": {
+      "shape": [
+        "seq_len",
+        "num_heads"
+      ],
+      "dtype": "float32",
+      "description": "Other logsumexp (base-2) to merge in."
+    },
+    "mask": {
+      "shape": [
+        "seq_len"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "Boolean mask; if set, only merge where mask is True."
+    }
+  },
+  "outputs": {
+    "v": {
+      "shape": [
+        "seq_len",
+        "num_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16",
+      "description": "Updated v (in-place)."
+    },
+    "s": {
+      "shape": [
+        "seq_len",
+        "num_heads"
+      ],
+      "dtype": "float32",
+      "description": "Updated s (in-place)."
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _merge_state_in_place_reference(v, s, v_other, s_other, mask=None):\n    \"\"\"In-place LSE-weighted merge of (v, s) with (v_other, s_other).\n\n    When ``mask`` is provided, only rows where mask is True are merged;\n    other rows are returned unchanged. Scales are base-2 logsumexp as in\n    ``_merge_state_reference``.\n    \"\"\"\n    s_a = s.to(torch.float32) * math.log(2.0)\n    s_b = s_other.to(torch.float32) * math.log(2.0)\n    v_a = v.to(torch.float32)\n    v_b = v_other.to(torch.float32)\n    s_max = torch.maximum(s_a, s_b)\n    exp_a = torch.exp(s_a - s_max)\n    exp_b = torch.exp(s_b - s_max)\n    exp_sum = exp_a + exp_b\n    v_merged = (\n        v_a * exp_a.unsqueeze(-1) + v_b * exp_b.unsqueeze(-1)\n    ) / exp_sum.unsqueeze(-1)\n    s_merged = (s_max + torch.log(exp_sum)) / math.log(2.0)\n    if mask is not None:\n        m = mask.to(torch.bool)\n        v_merged = torch.where(m[:, None, None], v_merged, v_a)\n        s_merged = torch.where(m[:, None], s_merged, s.to(torch.float32))\n    return v_merged.to(v.dtype), s_merged.to(torch.float32)\n"
+}
diff --git a/tests/trace/fi_trace_out/merge_states_h32_d128.json b/tests/trace/fi_trace_out/merge_states_h32_d128.json
new file mode 100644
index 0000000000..b971b960c6
--- /dev/null
+++ b/tests/trace/fi_trace_out/merge_states_h32_d128.json
@@ -0,0 +1,66 @@
+{
+  "name": "merge_states_h32_d128",
+  "description": "Merge multiple (num_states) attention (V, S) states.",
+  "op_type": "cascade_merge",
+  "tags": [
+    "fi_api:flashinfer.cascade.merge_states",
+    "status:verified"
+  ],
+  "axes": {
+    "seq_len": {
+      "type": "var",
+      "description": "Number of query tokens."
+    },
+    "num_states": {
+      "type": "var",
+      "description": "Number of KV segments to merge."
+    },
+    "num_heads": {
+      "type": "const",
+      "value": 32
+    },
+    "head_dim": {
+      "type": "const",
+      "value": 128
+    }
+  },
+  "inputs": {
+    "v": {
+      "shape": [
+        "seq_len",
+        "num_states",
+        "num_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16",
+      "description": "Attention outputs from all KV segments."
+    },
+    "s": {
+      "shape": [
+        "seq_len",
+        "num_states",
+        "num_heads"
+      ],
+      "dtype": "float32",
+      "description": "Logsumexp (base-2) values from all KV segments."
+    }
+  },
+  "outputs": {
+    "v_merged": {
+      "shape": [
+        "seq_len",
+        "num_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "s_merged": {
+      "shape": [
+        "seq_len",
+        "num_heads"
+      ],
+      "dtype": "float32"
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _merge_states_reference(v, s):\n    \"\"\"Merge num_states attention (V, S) states via numerically stable log-sum-exp.\"\"\"\n    # v: [seq_len, num_states, num_heads, head_dim]\n    # s: [seq_len, num_states, num_heads]  (log2 scale)\n    s_nat = s.to(torch.float32) * math.log(2.0)\n    v_f32 = v.to(torch.float32)\n    s_max, _ = s_nat.max(dim=1, keepdim=True)\n    exp_s = torch.exp(s_nat - s_max)  # [seq_len, num_states, num_heads]\n    exp_sum = exp_s.sum(dim=1, keepdim=True)\n    weights = exp_s / exp_sum  # [seq_len, num_states, num_heads]\n    v_merged = (v_f32 * weights.unsqueeze(-1)).sum(dim=1)\n    s_merged = (s_max.squeeze(1) + torch.log(exp_sum.squeeze(1))) / math.log(2.0)\n    return v_merged.to(v.dtype), s_merged.to(torch.float32)\n"
+}
diff --git a/tests/trace/fi_trace_out/min_p_sampling_v32000.json b/tests/trace/fi_trace_out/min_p_sampling_v32000.json
new file mode 100644
index 0000000000..72df2ee9e7
--- /dev/null
+++ b/tests/trace/fi_trace_out/min_p_sampling_v32000.json
@@ -0,0 +1,52 @@
+{
+  "name": "min_p_sampling_v32000",
+  "description": "Fused min-p sampling: keep probs >= min_p * max_prob, renormalise, categorical sample.",
+  "op_type": "sampling",
+  "tags": [
+    "fi_api:flashinfer.sampling.min_p_sampling_from_probs",
+    "status:verified"
+  ],
+  "axes": {
+    "batch_size": {
+      "type": "var"
+    },
+    "vocab_size": {
+      "type": "const",
+      "value": 32000
+    },
+    "num_indices": {
+      "type": "var",
+      "description": "Length of optional indices tensor."
+    }
+  },
+  "inputs": {
+    "probs": {
+      "shape": [
+        "batch_size",
+        "vocab_size"
+      ],
+      "dtype": "float32"
+    },
+    "min_p": {
+      "shape": null,
+      "dtype": "float32",
+      "description": "Min-p threshold (scalar or per-row tensor)."
+    },
+    "indices": {
+      "shape": [
+        "num_indices"
+      ],
+      "dtype": "unknown",
+      "optional": true
+    }
+  },
+  "outputs": {
+    "samples": {
+      "shape": [
+        "batch_size"
+      ],
+      "dtype": "int32"
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _min_p_sampling_reference(probs, min_p, indices=None, **_unused):\n    \"\"\"Min-p sampling: keep probs >= min_p * max_prob, renormalise, then argmax.\"\"\"\n    p = probs.to(torch.float32)\n    if indices is not None:\n        p = p[indices.to(torch.long)]\n    if isinstance(min_p, torch.Tensor):\n        mp = min_p.to(torch.float32).reshape(-1, 1)\n    else:\n        mp = float(min_p)\n    threshold = p.max(dim=-1, keepdim=True).values * mp\n    mask = p >= threshold\n    p_masked = torch.where(mask, p, torch.zeros_like(p))\n    p_masked = p_masked / (p_masked.sum(dim=-1, keepdim=True) + 1e-20)\n    return p_masked.argmax(dim=-1).to(torch.int32)\n"
+}
diff --git a/tests/trace/fi_trace_out/mla_paged_decode_h16_ckv512_kpe64_ps1.json b/tests/trace/fi_trace_out/mla_paged_decode_h16_ckv512_kpe64_ps1.json
new file mode 100644
index 0000000000..b4434f32fe
--- /dev/null
+++ b/tests/trace/fi_trace_out/mla_paged_decode_h16_ckv512_kpe64_ps1.json
@@ -0,0 +1,127 @@
+{
+  "name": "mla_paged_decode_h16_ckv512_kpe64_ps1",
+  "description": "Batched MLA decode (DeepSeek-V2/V3/R1). Query and KV are split into NoPE (ckv, head_dim_ckv=512) and RoPE (kpe, head_dim_kpe=64) parts: inputs are (q_nope, q_pe) and (ckv_cache, kpe_cache). Wraps BatchMLAPagedAttentionWrapper.run() post matrix-absorption.",
+  "op_type": "mla_paged",
+  "tags": [
+    "fi_api:flashinfer.mla._core.BatchMLAPagedAttentionWrapper.run",
+    "stage:decode",
+    "status:verified"
+  ],
+  "axes": {
+    "batch_size": {
+      "type": "var"
+    },
+    "num_qo_heads": {
+      "type": "const",
+      "value": 16,
+      "description": "Number of query heads after tensor parallel split."
+    },
+    "head_dim_ckv": {
+      "type": "const",
+      "value": 512
+    },
+    "head_dim_kpe": {
+      "type": "const",
+      "value": 64
+    },
+    "page_size": {
+      "type": "const",
+      "value": 1
+    },
+    "num_pages": {
+      "type": "var",
+      "description": "Total number of allocated pages in the KV cache."
+    },
+    "len_indptr": {
+      "type": "var",
+      "description": "Length of kv_indptr array."
+    },
+    "num_kv_indices": {
+      "type": "var",
+      "description": "Total number of KV page indices."
+    }
+  },
+  "constraints": [
+    "len_indptr == batch_size + 1",
+    "num_kv_indices == kv_indptr[-1].item()"
+  ],
+  "inputs": {
+    "q_nope": {
+      "shape": [
+        "batch_size",
+        "num_qo_heads",
+        "head_dim_ckv"
+      ],
+      "dtype": "bfloat16",
+      "description": "Query tensor without positional encoding component."
+    },
+    "q_pe": {
+      "shape": [
+        "batch_size",
+        "num_qo_heads",
+        "head_dim_kpe"
+      ],
+      "dtype": "bfloat16",
+      "description": "Query positional encoding component."
+    },
+    "ckv_cache": {
+      "shape": [
+        "num_pages",
+        "page_size",
+        "head_dim_ckv"
+      ],
+      "dtype": "bfloat16",
+      "description": "Compressed key-value cache."
+    },
+    "kpe_cache": {
+      "shape": [
+        "num_pages",
+        "page_size",
+        "head_dim_kpe"
+      ],
+      "dtype": "bfloat16",
+      "description": "Key positional encoding cache."
+    },
+    "kv_indptr": {
+      "shape": [
+        "len_indptr"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "KV page offsets for each sequence. Set during plan(), not run()."
+    },
+    "kv_indices": {
+      "shape": [
+        "num_kv_indices"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "Page indices for KV cache lookups. Set during plan(), not run()."
+    },
+    "sm_scale": {
+      "shape": null,
+      "dtype": "float32",
+      "optional": true,
+      "description": "Softmax scale. Default is (1/sqrt(128 + 64) = 1/sqrt(192)), based on head dimensions before matrix absorption. Set during plan(), not run()."
+    }
+  },
+  "outputs": {
+    "output": {
+      "shape": [
+        "batch_size",
+        "num_qo_heads",
+        "head_dim_ckv"
+      ],
+      "dtype": "bfloat16"
+    },
+    "lse": {
+      "shape": [
+        "batch_size",
+        "num_qo_heads"
+      ],
+      "dtype": "float32",
+      "description": "The 2-based log-sum-exp of attention logits."
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _mla_paged_decode_reference(\n    q_nope, q_pe, ckv_cache, kpe_cache, kv_indptr, kv_indices, sm_scale\n):\n    batch_size, num_qo_heads, head_dim_ckv = q_nope.shape\n    _, _, head_dim_kpe = q_pe.shape\n\n    # [num_pages, page_size, head_dim_*] \u2014 keep the page dim; flatten after gather.\n    Kc_all = ckv_cache.to(torch.float32)\n    Kp_all = kpe_cache.to(torch.float32)\n\n    output = torch.zeros(\n        (batch_size, num_qo_heads, head_dim_ckv),\n        dtype=torch.bfloat16,\n        device=q_nope.device,\n    )\n    lse = torch.full(\n        (batch_size, num_qo_heads),\n        -float(\"inf\"),\n        dtype=torch.float32,\n        device=q_nope.device,\n    )\n\n    for b in range(batch_size):\n        page_beg = int(kv_indptr[b].item())\n        page_end = int(kv_indptr[b + 1].item())\n        if page_beg >= page_end:\n            output[b].zero_()\n            continue\n        # kv_indices are page IDs; gather pages then flatten to a token axis.\n        page_ids = kv_indices[page_beg:page_end].to(torch.long)\n        Kc = Kc_all[page_ids].reshape(-1, head_dim_ckv)  # [L, head_dim_ckv]\n        Kp = Kp_all[page_ids].reshape(-1, head_dim_kpe)  # [L, head_dim_kpe]\n        qn = q_nope[b].to(torch.float32)  # [num_qo_heads, head_dim_ckv]\n        qp = q_pe[b].to(torch.float32)  # [num_qo_heads, head_dim_kpe]\n        logits = ((qn @ Kc.T) + (qp @ Kp.T)) * sm_scale  # [num_qo_heads, L]\n        lse[b] = torch.logsumexp(logits, dim=-1) / math.log(2.0)\n        output[b] = (torch.softmax(logits, dim=-1) @ Kc).to(torch.bfloat16)\n\n    return output, lse\n"
+}
diff --git a/tests/trace/fi_trace_out/mla_paged_decode_h16_ckv512_kpe64_ps64.json b/tests/trace/fi_trace_out/mla_paged_decode_h16_ckv512_kpe64_ps64.json
new file mode 100644
index 0000000000..bc949c246b
--- /dev/null
+++ b/tests/trace/fi_trace_out/mla_paged_decode_h16_ckv512_kpe64_ps64.json
@@ -0,0 +1,127 @@
+{
+  "name": "mla_paged_decode_h16_ckv512_kpe64_ps64",
+  "description": "Batched MLA decode (DeepSeek-V2/V3/R1). Query and KV are split into NoPE (ckv, head_dim_ckv=512) and RoPE (kpe, head_dim_kpe=64) parts: inputs are (q_nope, q_pe) and (ckv_cache, kpe_cache). Wraps BatchMLAPagedAttentionWrapper.run() post matrix-absorption.",
+  "op_type": "mla_paged",
+  "tags": [
+    "fi_api:flashinfer.mla._core.BatchMLAPagedAttentionWrapper.run",
+    "stage:decode",
+    "status:verified"
+  ],
+  "axes": {
+    "batch_size": {
+      "type": "var"
+    },
+    "num_qo_heads": {
+      "type": "const",
+      "value": 16,
+      "description": "Number of query heads after tensor parallel split."
+    },
+    "head_dim_ckv": {
+      "type": "const",
+      "value": 512
+    },
+    "head_dim_kpe": {
+      "type": "const",
+      "value": 64
+    },
+    "page_size": {
+      "type": "const",
+      "value": 64
+    },
+    "num_pages": {
+      "type": "var",
+      "description": "Total number of allocated pages in the KV cache."
+    },
+    "len_indptr": {
+      "type": "var",
+      "description": "Length of kv_indptr array."
+    },
+    "num_kv_indices": {
+      "type": "var",
+      "description": "Total number of KV page indices."
+    }
+  },
+  "constraints": [
+    "len_indptr == batch_size + 1",
+    "num_kv_indices == kv_indptr[-1].item()"
+  ],
+  "inputs": {
+    "q_nope": {
+      "shape": [
+        "batch_size",
+        "num_qo_heads",
+        "head_dim_ckv"
+      ],
+      "dtype": "bfloat16",
+      "description": "Query tensor without positional encoding component."
+    },
+    "q_pe": {
+      "shape": [
+        "batch_size",
+        "num_qo_heads",
+        "head_dim_kpe"
+      ],
+      "dtype": "bfloat16",
+      "description": "Query positional encoding component."
+    },
+    "ckv_cache": {
+      "shape": [
+        "num_pages",
+        "page_size",
+        "head_dim_ckv"
+      ],
+      "dtype": "bfloat16",
+      "description": "Compressed key-value cache."
+    },
+    "kpe_cache": {
+      "shape": [
+        "num_pages",
+        "page_size",
+        "head_dim_kpe"
+      ],
+      "dtype": "bfloat16",
+      "description": "Key positional encoding cache."
+    },
+    "kv_indptr": {
+      "shape": [
+        "len_indptr"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "KV page offsets for each sequence. Set during plan(), not run()."
+    },
+    "kv_indices": {
+      "shape": [
+        "num_kv_indices"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "Page indices for KV cache lookups. Set during plan(), not run()."
+    },
+    "sm_scale": {
+      "shape": null,
+      "dtype": "float32",
+      "optional": true,
+      "description": "Softmax scale. Default is (1/sqrt(128 + 64) = 1/sqrt(192)), based on head dimensions before matrix absorption. Set during plan(), not run()."
+    }
+  },
+  "outputs": {
+    "output": {
+      "shape": [
+        "batch_size",
+        "num_qo_heads",
+        "head_dim_ckv"
+      ],
+      "dtype": "bfloat16"
+    },
+    "lse": {
+      "shape": [
+        "batch_size",
+        "num_qo_heads"
+      ],
+      "dtype": "float32",
+      "description": "The 2-based log-sum-exp of attention logits."
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _mla_paged_decode_reference(\n    q_nope, q_pe, ckv_cache, kpe_cache, kv_indptr, kv_indices, sm_scale\n):\n    batch_size, num_qo_heads, head_dim_ckv = q_nope.shape\n    _, _, head_dim_kpe = q_pe.shape\n\n    # [num_pages, page_size, head_dim_*] \u2014 keep the page dim; flatten after gather.\n    Kc_all = ckv_cache.to(torch.float32)\n    Kp_all = kpe_cache.to(torch.float32)\n\n    output = torch.zeros(\n        (batch_size, num_qo_heads, head_dim_ckv),\n        dtype=torch.bfloat16,\n        device=q_nope.device,\n    )\n    lse = torch.full(\n        (batch_size, num_qo_heads),\n        -float(\"inf\"),\n        dtype=torch.float32,\n        device=q_nope.device,\n    )\n\n    for b in range(batch_size):\n        page_beg = int(kv_indptr[b].item())\n        page_end = int(kv_indptr[b + 1].item())\n        if page_beg >= page_end:\n            output[b].zero_()\n            continue\n        # kv_indices are page IDs; gather pages then flatten to a token axis.\n        page_ids = kv_indices[page_beg:page_end].to(torch.long)\n        Kc = Kc_all[page_ids].reshape(-1, head_dim_ckv)  # [L, head_dim_ckv]\n        Kp = Kp_all[page_ids].reshape(-1, head_dim_kpe)  # [L, head_dim_kpe]\n        qn = q_nope[b].to(torch.float32)  # [num_qo_heads, head_dim_ckv]\n        qp = q_pe[b].to(torch.float32)  # [num_qo_heads, head_dim_kpe]\n        logits = ((qn @ Kc.T) + (qp @ Kp.T)) * sm_scale  # [num_qo_heads, L]\n        lse[b] = torch.logsumexp(logits, dim=-1) / math.log(2.0)\n        output[b] = (torch.softmax(logits, dim=-1) @ Kc).to(torch.bfloat16)\n\n    return output, lse\n"
+}
diff --git a/tests/trace/fi_trace_out/moe_fp4_block_scale_default_routing_topk8_e32_h7168_i2048.json b/tests/trace/fi_trace_out/moe_fp4_block_scale_default_routing_topk8_e32_h7168_i2048.json
new file mode 100644
index 0000000000..73905b9d0b
--- /dev/null
+++ b/tests/trace/fi_trace_out/moe_fp4_block_scale_default_routing_topk8_e32_h7168_i2048.json
@@ -0,0 +1,225 @@
+{
+  "name": "moe_fp4_block_scale_default_routing_topk8_e32_h7168_i2048",
+  "description": "NvFP4 block-scale MoE with Default routing (Softmax \u2192 TopK).",
+  "op_type": "moe",
+  "tags": [
+    "fi_api:flashinfer.fused_moe.core.trtllm_fp4_block_scale_moe",
+    "status:experimental",
+    "quantization:nvfp4"
+  ],
+  "axes": {
+    "seq_len": {
+      "type": "var",
+      "description": "Number of tokens."
+    },
+    "num_experts": {
+      "type": "const",
+      "value": 256,
+      "description": "Total number of experts."
+    },
+    "top_k": {
+      "type": "const",
+      "value": 8,
+      "description": "Number of experts selected per token."
+    },
+    "num_local_experts": {
+      "type": "const",
+      "value": 32,
+      "description": "Number of local experts."
+    },
+    "hidden_size": {
+      "type": "const",
+      "value": 7168,
+      "description": "Hidden dimension size."
+    },
+    "intermediate_size": {
+      "type": "const",
+      "value": 2048,
+      "description": "MoE intermediate layer size."
+    },
+    "gemm1_out_size": {
+      "type": "const",
+      "value": 4096,
+      "description": "Output size of FC1 (2 \u00d7 intermediate_size for SwiGLU)."
+    },
+    "num_packed_hidden": {
+      "type": "const",
+      "value": 3584,
+      "description": "Packed hidden dimension (hidden_size // 2 for NvFP4)."
+    },
+    "num_fp4_hidden_blocks": {
+      "type": "const",
+      "value": 448,
+      "description": "Number of FP4 scale blocks along hidden_size (hidden_size // 16 for NvFP4)."
+    },
+    "num_packed_intermediate": {
+      "type": "const",
+      "value": 1024,
+      "description": "Packed intermediate dimension (intermediate_size // 2 for NvFP4)."
+    },
+    "num_fp4_intermediate_blocks": {
+      "type": "const",
+      "value": 128,
+      "description": "Number of FP4 scale blocks along intermediate_size (intermediate_size // 16 for NvFP4)."
+    }
+  },
+  "inputs": {
+    "routing_logits": {
+      "shape": [
+        "seq_len",
+        "num_experts"
+      ],
+      "dtype": "bfloat16",
+      "description": "Routing logits for expert selection."
+    },
+    "routing_bias": {
+      "shape": [
+        "num_experts"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "Bias added to routing logits. Pass None when not used."
+    },
+    "hidden_states": {
+      "shape": [
+        "seq_len",
+        "num_packed_hidden"
+      ],
+      "dtype": "uint8",
+      "description": "Input hidden states, NvFP4-packed (uint8, 2 fp4 per byte)."
+    },
+    "hidden_states_scale": {
+      "shape": [
+        "seq_len",
+        "num_fp4_hidden_blocks"
+      ],
+      "dtype": "float8_e4m3fn",
+      "optional": true,
+      "description": "Block-wise scale factors for hidden_states (float8). None for bf16 input."
+    },
+    "gemm1_weights": {
+      "shape": [
+        "num_local_experts",
+        "gemm1_out_size",
+        "num_packed_hidden"
+      ],
+      "dtype": "uint8",
+      "description": "FC1 weights, NvFP4-packed (uint8). Shape includes gate+up for SwiGLU."
+    },
+    "gemm1_weights_scale": {
+      "shape": [
+        "num_local_experts",
+        "gemm1_out_size",
+        "num_fp4_hidden_blocks"
+      ],
+      "dtype": "float8_e4m3fn",
+      "description": "Block-wise scale factors for gemm1_weights (float8)."
+    },
+    "gemm1_bias": {
+      "shape": [
+        "num_local_experts",
+        "gemm1_out_size"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "FC1 bias (float32). Optional."
+    },
+    "gemm1_alpha": {
+      "shape": [
+        "num_local_experts"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "Per-expert SwiGLU alpha (float32). Optional."
+    },
+    "gemm1_beta": {
+      "shape": [
+        "num_local_experts"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "Per-expert SwiGLU beta (float32). Optional."
+    },
+    "gemm1_clamp_limit": {
+      "shape": [
+        "num_local_experts"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "Per-expert SwiGLU clamp limit (float32). Optional."
+    },
+    "gemm2_weights": {
+      "shape": [
+        "num_local_experts",
+        "hidden_size",
+        "num_packed_intermediate"
+      ],
+      "dtype": "uint8",
+      "description": "FC2 weights, NvFP4-packed (uint8)."
+    },
+    "gemm2_weights_scale": {
+      "shape": [
+        "num_local_experts",
+        "hidden_size",
+        "num_fp4_intermediate_blocks"
+      ],
+      "dtype": "float8_e4m3fn",
+      "description": "Block-wise scale factors for gemm2_weights (float8)."
+    },
+    "gemm2_bias": {
+      "shape": [
+        "num_local_experts",
+        "hidden_size"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "FC2 bias (float32). Optional."
+    },
+    "output1_scale_scalar": {
+      "shape": [
+        "num_local_experts"
+      ],
+      "dtype": "float32",
+      "optional": true,
+      "description": "Per-expert output scale for FC1 activation (float32). Optional."
+    },
+    "output1_scale_gate_scalar": {
+      "shape": [
+        "num_local_experts"
+      ],
+      "dtype": "float32",
+      "optional": true,
+      "description": "Per-expert output scale for FC1 gate (float32). Optional."
+    },
+    "output2_scale_scalar": {
+      "shape": [
+        "num_local_experts"
+      ],
+      "dtype": "float32",
+      "optional": true,
+      "description": "Per-expert output scale for FC2 (float32). Optional."
+    },
+    "local_expert_offset": {
+      "shape": null,
+      "dtype": "int32",
+      "description": "Offset of local experts in the global expert array."
+    },
+    "routed_scaling_factor": {
+      "shape": null,
+      "dtype": "float32",
+      "optional": true,
+      "description": "Scaling factor applied to routing weights. None for some routing methods."
+    }
+  },
+  "outputs": {
+    "output": {
+      "shape": [
+        "seq_len",
+        "hidden_size"
+      ],
+      "dtype": "bfloat16",
+      "description": "Final MoE output tensor."
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _trtllm_fp4_block_scale_moe_default_routing_reference(\n    routing_logits,\n    routing_bias,\n    hidden_states,\n    hidden_states_scale,\n    gemm1_weights,\n    gemm1_weights_scale,\n    gemm1_bias,\n    gemm2_weights,\n    gemm2_weights_scale,\n    gemm2_bias,\n    top_k,\n    local_expert_offset,\n    routed_scaling_factor,\n):\n    \"\"\"FP4 MoE with Default routing (Softmax \u2192 TopK).\"\"\"\n    TOP_K = int(top_k)\n    E_global = routing_logits.shape[1]\n    logits = routing_logits.to(torch.float32)\n    if routing_bias is not None:\n        logits = logits + routing_bias.to(torch.float32).reshape(-1)\n    s = torch.softmax(logits, dim=-1)\n    _, topk_idx = torch.topk(s, k=TOP_K, dim=1, largest=True, sorted=False)\n    scale = 1.0 if routed_scaling_factor is None else float(routed_scaling_factor)\n    w_topk = s.gather(1, topk_idx) * scale\n    return _fp4_moe_run_experts(\n        hidden_states,\n        hidden_states_scale,\n        gemm1_weights,\n        gemm1_weights_scale,\n        gemm2_weights,\n        gemm2_weights_scale,\n        gemm1_bias,\n        gemm2_bias,\n        w_topk,\n        topk_idx,\n        local_expert_offset,\n        E_global,\n    )\n"
+}
diff --git a/tests/trace/fi_trace_out/moe_fp4_block_scale_ds_routing_topk8_e32_h7168_i2048_ng8_kg4.json b/tests/trace/fi_trace_out/moe_fp4_block_scale_ds_routing_topk8_e32_h7168_i2048_ng8_kg4.json
new file mode 100644
index 0000000000..f7e1fa1242
--- /dev/null
+++ b/tests/trace/fi_trace_out/moe_fp4_block_scale_ds_routing_topk8_e32_h7168_i2048_ng8_kg4.json
@@ -0,0 +1,235 @@
+{
+  "name": "moe_fp4_block_scale_ds_routing_topk8_e32_h7168_i2048_ng8_kg4",
+  "description": "NvFP4 block-scale MoE with DeepSeekV3 routing (Sigmoid \u2192 group selection \u2192 top_k).",
+  "op_type": "moe",
+  "tags": [
+    "fi_api:flashinfer.fused_moe.core.trtllm_fp4_block_scale_moe",
+    "status:experimental",
+    "quantization:nvfp4"
+  ],
+  "axes": {
+    "seq_len": {
+      "type": "var",
+      "description": "Number of tokens."
+    },
+    "num_experts": {
+      "type": "const",
+      "value": 256,
+      "description": "Total number of experts."
+    },
+    "top_k": {
+      "type": "const",
+      "value": 8,
+      "description": "Number of experts selected per token."
+    },
+    "num_local_experts": {
+      "type": "const",
+      "value": 32,
+      "description": "Number of local experts."
+    },
+    "hidden_size": {
+      "type": "const",
+      "value": 7168,
+      "description": "Hidden dimension size."
+    },
+    "intermediate_size": {
+      "type": "const",
+      "value": 2048,
+      "description": "MoE intermediate layer size."
+    },
+    "gemm1_out_size": {
+      "type": "const",
+      "value": 4096,
+      "description": "Output size of FC1 (2 \u00d7 intermediate_size for SwiGLU)."
+    },
+    "num_packed_hidden": {
+      "type": "const",
+      "value": 3584,
+      "description": "Packed hidden dimension (hidden_size // 2 for NvFP4)."
+    },
+    "num_fp4_hidden_blocks": {
+      "type": "const",
+      "value": 448,
+      "description": "Number of FP4 scale blocks along hidden_size (hidden_size // 16 for NvFP4)."
+    },
+    "num_packed_intermediate": {
+      "type": "const",
+      "value": 1024,
+      "description": "Packed intermediate dimension (intermediate_size // 2 for NvFP4)."
+    },
+    "num_fp4_intermediate_blocks": {
+      "type": "const",
+      "value": 128,
+      "description": "Number of FP4 scale blocks along intermediate_size (intermediate_size // 16 for NvFP4)."
+    },
+    "n_group": {
+      "type": "const",
+      "value": 8,
+      "description": "Number of expert groups for group routing."
+    },
+    "topk_group": {
+      "type": "const",
+      "value": 4,
+      "description": "Number of groups selected in top-k routing."
+    }
+  },
+  "inputs": {
+    "routing_logits": {
+      "shape": [
+        "seq_len",
+        "num_experts"
+      ],
+      "dtype": "bfloat16",
+      "description": "Routing logits for expert selection."
+    },
+    "routing_bias": {
+      "shape": [
+        "num_experts"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "Bias added to routing logits. Pass None when not used."
+    },
+    "hidden_states": {
+      "shape": [
+        "seq_len",
+        "num_packed_hidden"
+      ],
+      "dtype": "uint8",
+      "description": "Input hidden states, NvFP4-packed (uint8, 2 fp4 per byte)."
+    },
+    "hidden_states_scale": {
+      "shape": [
+        "seq_len",
+        "num_fp4_hidden_blocks"
+      ],
+      "dtype": "float8_e4m3fn",
+      "optional": true,
+      "description": "Block-wise scale factors for hidden_states (float8). None for bf16 input."
+    },
+    "gemm1_weights": {
+      "shape": [
+        "num_local_experts",
+        "gemm1_out_size",
+        "num_packed_hidden"
+      ],
+      "dtype": "uint8",
+      "description": "FC1 weights, NvFP4-packed (uint8). Shape includes gate+up for SwiGLU."
+    },
+    "gemm1_weights_scale": {
+      "shape": [
+        "num_local_experts",
+        "gemm1_out_size",
+        "num_fp4_hidden_blocks"
+      ],
+      "dtype": "float8_e4m3fn",
+      "description": "Block-wise scale factors for gemm1_weights (float8)."
+    },
+    "gemm1_bias": {
+      "shape": [
+        "num_local_experts",
+        "gemm1_out_size"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "FC1 bias (float32). Optional."
+    },
+    "gemm1_alpha": {
+      "shape": [
+        "num_local_experts"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "Per-expert SwiGLU alpha (float32). Optional."
+    },
+    "gemm1_beta": {
+      "shape": [
+        "num_local_experts"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "Per-expert SwiGLU beta (float32). Optional."
+    },
+    "gemm1_clamp_limit": {
+      "shape": [
+        "num_local_experts"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "Per-expert SwiGLU clamp limit (float32). Optional."
+    },
+    "gemm2_weights": {
+      "shape": [
+        "num_local_experts",
+        "hidden_size",
+        "num_packed_intermediate"
+      ],
+      "dtype": "uint8",
+      "description": "FC2 weights, NvFP4-packed (uint8)."
+    },
+    "gemm2_weights_scale": {
+      "shape": [
+        "num_local_experts",
+        "hidden_size",
+        "num_fp4_intermediate_blocks"
+      ],
+      "dtype": "float8_e4m3fn",
+      "description": "Block-wise scale factors for gemm2_weights (float8)."
+    },
+    "gemm2_bias": {
+      "shape": [
+        "num_local_experts",
+        "hidden_size"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "FC2 bias (float32). Optional."
+    },
+    "output1_scale_scalar": {
+      "shape": [
+        "num_local_experts"
+      ],
+      "dtype": "float32",
+      "optional": true,
+      "description": "Per-expert output scale for FC1 activation (float32). Optional."
+    },
+    "output1_scale_gate_scalar": {
+      "shape": [
+        "num_local_experts"
+      ],
+      "dtype": "float32",
+      "optional": true,
+      "description": "Per-expert output scale for FC1 gate (float32). Optional."
+    },
+    "output2_scale_scalar": {
+      "shape": [
+        "num_local_experts"
+      ],
+      "dtype": "float32",
+      "optional": true,
+      "description": "Per-expert output scale for FC2 (float32). Optional."
+    },
+    "local_expert_offset": {
+      "shape": null,
+      "dtype": "int32",
+      "description": "Offset of local experts in the global expert array."
+    },
+    "routed_scaling_factor": {
+      "shape": null,
+      "dtype": "float32",
+      "optional": true,
+      "description": "Scaling factor applied to routing weights. None for some routing methods."
+    }
+  },
+  "outputs": {
+    "output": {
+      "shape": [
+        "seq_len",
+        "hidden_size"
+      ],
+      "dtype": "bfloat16",
+      "description": "Final MoE output tensor."
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _trtllm_fp4_block_scale_moe_ds_routing_reference(\n    routing_logits,\n    routing_bias,\n    hidden_states,\n    hidden_states_scale,\n    gemm1_weights,\n    gemm1_weights_scale,\n    gemm1_bias,\n    gemm2_weights,\n    gemm2_weights_scale,\n    gemm2_bias,\n    top_k,\n    n_group,\n    topk_group,\n    local_expert_offset,\n    routed_scaling_factor,\n):\n    \"\"\"FP4 MoE with DeepSeek-V3 routing: sigmoid + groups + top_k.\"\"\"\n    TOP_K = int(top_k)\n    N_GROUP = int(n_group)\n    TOPK_GROUP = int(topk_group)\n    E_global = routing_logits.shape[1]\n    T = routing_logits.shape[0]\n\n    logits = routing_logits.to(torch.float32)\n    bias = routing_bias.to(torch.float32).reshape(-1)\n    s = 1.0 / (1.0 + torch.exp(-logits))\n    s_with_bias = s + bias\n\n    group_size = E_global // N_GROUP\n    s_wb_grouped = s_with_bias.view(T, N_GROUP, group_size)\n    top2_vals, _ = torch.topk(s_wb_grouped, k=2, dim=2, largest=True, sorted=False)\n    group_scores = top2_vals.sum(dim=2)\n\n    _, group_idx = torch.topk(\n        group_scores, k=TOPK_GROUP, dim=1, largest=True, sorted=False\n    )\n    group_mask = torch.zeros_like(group_scores)\n    group_mask.scatter_(1, group_idx, 1.0)\n    score_mask = (\n        group_mask.unsqueeze(2).expand(T, N_GROUP, group_size).reshape(T, E_global)\n    )\n\n    neg_inf = torch.finfo(torch.float32).min\n    scores_pruned = s_with_bias.masked_fill(score_mask == 0, neg_inf)\n    _, topk_idx = torch.topk(scores_pruned, k=TOP_K, dim=1, largest=True, sorted=False)\n\n    M = torch.zeros_like(s)\n    M.scatter_(1, topk_idx, 1.0)\n    raw_w = s * M\n    weights_sum = raw_w.sum(dim=1, keepdim=True) + 1e-20\n    scale = 1.0 if routed_scaling_factor is None else float(routed_scaling_factor)\n    full_weights = (raw_w / weights_sum) * scale\n    w_topk = full_weights.gather(1, topk_idx)\n\n    return _fp4_moe_run_experts(\n        hidden_states,\n        hidden_states_scale,\n        gemm1_weights,\n        gemm1_weights_scale,\n        gemm2_weights,\n        gemm2_weights_scale,\n        gemm1_bias,\n        gemm2_bias,\n        w_topk,\n        topk_idx,\n        local_expert_offset,\n        E_global,\n    )\n"
+}
diff --git a/tests/trace/fi_trace_out/moe_fp4_block_scale_llama4_routing_topk1_e32_h7168_i2048.json b/tests/trace/fi_trace_out/moe_fp4_block_scale_llama4_routing_topk1_e32_h7168_i2048.json
new file mode 100644
index 0000000000..2d372f6e97
--- /dev/null
+++ b/tests/trace/fi_trace_out/moe_fp4_block_scale_llama4_routing_topk1_e32_h7168_i2048.json
@@ -0,0 +1,225 @@
+{
+  "name": "moe_fp4_block_scale_llama4_routing_topk1_e32_h7168_i2048",
+  "description": "NvFP4 block-scale MoE with Llama4 routing (Top1 \u2192 Sigmoid).",
+  "op_type": "moe",
+  "tags": [
+    "fi_api:flashinfer.fused_moe.core.trtllm_fp4_block_scale_moe",
+    "status:experimental",
+    "quantization:nvfp4"
+  ],
+  "axes": {
+    "seq_len": {
+      "type": "var",
+      "description": "Number of tokens."
+    },
+    "num_experts": {
+      "type": "const",
+      "value": 256,
+      "description": "Total number of experts."
+    },
+    "top_k": {
+      "type": "const",
+      "value": 1,
+      "description": "Number of experts selected per token."
+    },
+    "num_local_experts": {
+      "type": "const",
+      "value": 32,
+      "description": "Number of local experts."
+    },
+    "hidden_size": {
+      "type": "const",
+      "value": 7168,
+      "description": "Hidden dimension size."
+    },
+    "intermediate_size": {
+      "type": "const",
+      "value": 2048,
+      "description": "MoE intermediate layer size."
+    },
+    "gemm1_out_size": {
+      "type": "const",
+      "value": 4096,
+      "description": "Output size of FC1 (2 \u00d7 intermediate_size for SwiGLU)."
+    },
+    "num_packed_hidden": {
+      "type": "const",
+      "value": 3584,
+      "description": "Packed hidden dimension (hidden_size // 2 for NvFP4)."
+    },
+    "num_fp4_hidden_blocks": {
+      "type": "const",
+      "value": 448,
+      "description": "Number of FP4 scale blocks along hidden_size (hidden_size // 16 for NvFP4)."
+    },
+    "num_packed_intermediate": {
+      "type": "const",
+      "value": 1024,
+      "description": "Packed intermediate dimension (intermediate_size // 2 for NvFP4)."
+    },
+    "num_fp4_intermediate_blocks": {
+      "type": "const",
+      "value": 128,
+      "description": "Number of FP4 scale blocks along intermediate_size (intermediate_size // 16 for NvFP4)."
+    }
+  },
+  "inputs": {
+    "routing_logits": {
+      "shape": [
+        "seq_len",
+        "num_experts"
+      ],
+      "dtype": "bfloat16",
+      "description": "Routing logits for expert selection."
+    },
+    "routing_bias": {
+      "shape": [
+        "num_experts"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "Bias added to routing logits. Pass None when not used."
+    },
+    "hidden_states": {
+      "shape": [
+        "seq_len",
+        "num_packed_hidden"
+      ],
+      "dtype": "uint8",
+      "description": "Input hidden states, NvFP4-packed (uint8, 2 fp4 per byte)."
+    },
+    "hidden_states_scale": {
+      "shape": [
+        "seq_len",
+        "num_fp4_hidden_blocks"
+      ],
+      "dtype": "float8_e4m3fn",
+      "optional": true,
+      "description": "Block-wise scale factors for hidden_states (float8). None for bf16 input."
+    },
+    "gemm1_weights": {
+      "shape": [
+        "num_local_experts",
+        "gemm1_out_size",
+        "num_packed_hidden"
+      ],
+      "dtype": "uint8",
+      "description": "FC1 weights, NvFP4-packed (uint8). Shape includes gate+up for SwiGLU."
+    },
+    "gemm1_weights_scale": {
+      "shape": [
+        "num_local_experts",
+        "gemm1_out_size",
+        "num_fp4_hidden_blocks"
+      ],
+      "dtype": "float8_e4m3fn",
+      "description": "Block-wise scale factors for gemm1_weights (float8)."
+    },
+    "gemm1_bias": {
+      "shape": [
+        "num_local_experts",
+        "gemm1_out_size"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "FC1 bias (float32). Optional."
+    },
+    "gemm1_alpha": {
+      "shape": [
+        "num_local_experts"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "Per-expert SwiGLU alpha (float32). Optional."
+    },
+    "gemm1_beta": {
+      "shape": [
+        "num_local_experts"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "Per-expert SwiGLU beta (float32). Optional."
+    },
+    "gemm1_clamp_limit": {
+      "shape": [
+        "num_local_experts"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "Per-expert SwiGLU clamp limit (float32). Optional."
+    },
+    "gemm2_weights": {
+      "shape": [
+        "num_local_experts",
+        "hidden_size",
+        "num_packed_intermediate"
+      ],
+      "dtype": "uint8",
+      "description": "FC2 weights, NvFP4-packed (uint8)."
+    },
+    "gemm2_weights_scale": {
+      "shape": [
+        "num_local_experts",
+        "hidden_size",
+        "num_fp4_intermediate_blocks"
+      ],
+      "dtype": "float8_e4m3fn",
+      "description": "Block-wise scale factors for gemm2_weights (float8)."
+    },
+    "gemm2_bias": {
+      "shape": [
+        "num_local_experts",
+        "hidden_size"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "FC2 bias (float32). Optional."
+    },
+    "output1_scale_scalar": {
+      "shape": [
+        "num_local_experts"
+      ],
+      "dtype": "float32",
+      "optional": true,
+      "description": "Per-expert output scale for FC1 activation (float32). Optional."
+    },
+    "output1_scale_gate_scalar": {
+      "shape": [
+        "num_local_experts"
+      ],
+      "dtype": "float32",
+      "optional": true,
+      "description": "Per-expert output scale for FC1 gate (float32). Optional."
+    },
+    "output2_scale_scalar": {
+      "shape": [
+        "num_local_experts"
+      ],
+      "dtype": "float32",
+      "optional": true,
+      "description": "Per-expert output scale for FC2 (float32). Optional."
+    },
+    "local_expert_offset": {
+      "shape": null,
+      "dtype": "int32",
+      "description": "Offset of local experts in the global expert array."
+    },
+    "routed_scaling_factor": {
+      "shape": null,
+      "dtype": "float32",
+      "optional": true,
+      "description": "Scaling factor applied to routing weights. None for some routing methods."
+    }
+  },
+  "outputs": {
+    "output": {
+      "shape": [
+        "seq_len",
+        "hidden_size"
+      ],
+      "dtype": "bfloat16",
+      "description": "Final MoE output tensor."
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _trtllm_fp4_block_scale_moe_llama4_routing_reference(\n    routing_logits,\n    routing_bias,\n    hidden_states,\n    hidden_states_scale,\n    gemm1_weights,\n    gemm1_weights_scale,\n    gemm1_bias,\n    gemm2_weights,\n    gemm2_weights_scale,\n    gemm2_bias,\n    top_k,\n    local_expert_offset,\n    routed_scaling_factor,\n):\n    \"\"\"FP4 MoE with Llama4 routing (Top1 \u2192 Sigmoid). top_k is fixed at 1.\"\"\"\n    E_global = routing_logits.shape[1]\n    logits = routing_logits.to(torch.float32)\n    if routing_bias is not None:\n        logits = logits + routing_bias.to(torch.float32).reshape(-1)\n    topk_idx = logits.argmax(dim=-1, keepdim=True)\n    top1_logit = logits.gather(1, topk_idx)\n    scale = 1.0 if routed_scaling_factor is None else float(routed_scaling_factor)\n    w_topk = (1.0 / (1.0 + torch.exp(-top1_logit))) * scale\n    return _fp4_moe_run_experts(\n        hidden_states,\n        hidden_states_scale,\n        gemm1_weights,\n        gemm1_weights_scale,\n        gemm2_weights,\n        gemm2_weights_scale,\n        gemm1_bias,\n        gemm2_bias,\n        w_topk,\n        topk_idx,\n        local_expert_offset,\n        E_global,\n    )\n"
+}
diff --git a/tests/trace/fi_trace_out/moe_fp4_block_scale_renormalize_naive_routing_topk8_e32_h7168_i2048.json b/tests/trace/fi_trace_out/moe_fp4_block_scale_renormalize_naive_routing_topk8_e32_h7168_i2048.json
new file mode 100644
index 0000000000..49ea91fcfe
--- /dev/null
+++ b/tests/trace/fi_trace_out/moe_fp4_block_scale_renormalize_naive_routing_topk8_e32_h7168_i2048.json
@@ -0,0 +1,225 @@
+{
+  "name": "moe_fp4_block_scale_renormalize_naive_routing_topk8_e32_h7168_i2048",
+  "description": "NvFP4 block-scale MoE with RenormalizeNaive routing (Softmax \u2192 TopK \u2192 Renormalize).",
+  "op_type": "moe",
+  "tags": [
+    "fi_api:flashinfer.fused_moe.core.trtllm_fp4_block_scale_moe",
+    "status:experimental",
+    "quantization:nvfp4"
+  ],
+  "axes": {
+    "seq_len": {
+      "type": "var",
+      "description": "Number of tokens."
+    },
+    "num_experts": {
+      "type": "const",
+      "value": 256,
+      "description": "Total number of experts."
+    },
+    "top_k": {
+      "type": "const",
+      "value": 8,
+      "description": "Number of experts selected per token."
+    },
+    "num_local_experts": {
+      "type": "const",
+      "value": 32,
+      "description": "Number of local experts."
+    },
+    "hidden_size": {
+      "type": "const",
+      "value": 7168,
+      "description": "Hidden dimension size."
+    },
+    "intermediate_size": {
+      "type": "const",
+      "value": 2048,
+      "description": "MoE intermediate layer size."
+    },
+    "gemm1_out_size": {
+      "type": "const",
+      "value": 4096,
+      "description": "Output size of FC1 (2 \u00d7 intermediate_size for SwiGLU)."
+    },
+    "num_packed_hidden": {
+      "type": "const",
+      "value": 3584,
+      "description": "Packed hidden dimension (hidden_size // 2 for NvFP4)."
+    },
+    "num_fp4_hidden_blocks": {
+      "type": "const",
+      "value": 448,
+      "description": "Number of FP4 scale blocks along hidden_size (hidden_size // 16 for NvFP4)."
+    },
+    "num_packed_intermediate": {
+      "type": "const",
+      "value": 1024,
+      "description": "Packed intermediate dimension (intermediate_size // 2 for NvFP4)."
+    },
+    "num_fp4_intermediate_blocks": {
+      "type": "const",
+      "value": 128,
+      "description": "Number of FP4 scale blocks along intermediate_size (intermediate_size // 16 for NvFP4)."
+    }
+  },
+  "inputs": {
+    "routing_logits": {
+      "shape": [
+        "seq_len",
+        "num_experts"
+      ],
+      "dtype": "bfloat16",
+      "description": "Routing logits for expert selection."
+    },
+    "routing_bias": {
+      "shape": [
+        "num_experts"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "Bias added to routing logits. Pass None when not used."
+    },
+    "hidden_states": {
+      "shape": [
+        "seq_len",
+        "num_packed_hidden"
+      ],
+      "dtype": "uint8",
+      "description": "Input hidden states, NvFP4-packed (uint8, 2 fp4 per byte)."
+    },
+    "hidden_states_scale": {
+      "shape": [
+        "seq_len",
+        "num_fp4_hidden_blocks"
+      ],
+      "dtype": "float8_e4m3fn",
+      "optional": true,
+      "description": "Block-wise scale factors for hidden_states (float8). None for bf16 input."
+    },
+    "gemm1_weights": {
+      "shape": [
+        "num_local_experts",
+        "gemm1_out_size",
+        "num_packed_hidden"
+      ],
+      "dtype": "uint8",
+      "description": "FC1 weights, NvFP4-packed (uint8). Shape includes gate+up for SwiGLU."
+    },
+    "gemm1_weights_scale": {
+      "shape": [
+        "num_local_experts",
+        "gemm1_out_size",
+        "num_fp4_hidden_blocks"
+      ],
+      "dtype": "float8_e4m3fn",
+      "description": "Block-wise scale factors for gemm1_weights (float8)."
+    },
+    "gemm1_bias": {
+      "shape": [
+        "num_local_experts",
+        "gemm1_out_size"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "FC1 bias (float32). Optional."
+    },
+    "gemm1_alpha": {
+      "shape": [
+        "num_local_experts"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "Per-expert SwiGLU alpha (float32). Optional."
+    },
+    "gemm1_beta": {
+      "shape": [
+        "num_local_experts"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "Per-expert SwiGLU beta (float32). Optional."
+    },
+    "gemm1_clamp_limit": {
+      "shape": [
+        "num_local_experts"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "Per-expert SwiGLU clamp limit (float32). Optional."
+    },
+    "gemm2_weights": {
+      "shape": [
+        "num_local_experts",
+        "hidden_size",
+        "num_packed_intermediate"
+      ],
+      "dtype": "uint8",
+      "description": "FC2 weights, NvFP4-packed (uint8)."
+    },
+    "gemm2_weights_scale": {
+      "shape": [
+        "num_local_experts",
+        "hidden_size",
+        "num_fp4_intermediate_blocks"
+      ],
+      "dtype": "float8_e4m3fn",
+      "description": "Block-wise scale factors for gemm2_weights (float8)."
+    },
+    "gemm2_bias": {
+      "shape": [
+        "num_local_experts",
+        "hidden_size"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "FC2 bias (float32). Optional."
+    },
+    "output1_scale_scalar": {
+      "shape": [
+        "num_local_experts"
+      ],
+      "dtype": "float32",
+      "optional": true,
+      "description": "Per-expert output scale for FC1 activation (float32). Optional."
+    },
+    "output1_scale_gate_scalar": {
+      "shape": [
+        "num_local_experts"
+      ],
+      "dtype": "float32",
+      "optional": true,
+      "description": "Per-expert output scale for FC1 gate (float32). Optional."
+    },
+    "output2_scale_scalar": {
+      "shape": [
+        "num_local_experts"
+      ],
+      "dtype": "float32",
+      "optional": true,
+      "description": "Per-expert output scale for FC2 (float32). Optional."
+    },
+    "local_expert_offset": {
+      "shape": null,
+      "dtype": "int32",
+      "description": "Offset of local experts in the global expert array."
+    },
+    "routed_scaling_factor": {
+      "shape": null,
+      "dtype": "float32",
+      "optional": true,
+      "description": "Scaling factor applied to routing weights. None for some routing methods."
+    }
+  },
+  "outputs": {
+    "output": {
+      "shape": [
+        "seq_len",
+        "hidden_size"
+      ],
+      "dtype": "bfloat16",
+      "description": "Final MoE output tensor."
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _trtllm_fp4_block_scale_moe_renormalize_naive_routing_reference(\n    routing_logits,\n    routing_bias,\n    hidden_states,\n    hidden_states_scale,\n    gemm1_weights,\n    gemm1_weights_scale,\n    gemm1_bias,\n    gemm2_weights,\n    gemm2_weights_scale,\n    gemm2_bias,\n    top_k,\n    local_expert_offset,\n    routed_scaling_factor,\n):\n    \"\"\"FP4 MoE with RenormalizeNaive routing (Softmax \u2192 TopK \u2192 sum-to-1).\"\"\"\n    TOP_K = int(top_k)\n    E_global = routing_logits.shape[1]\n    logits = routing_logits.to(torch.float32)\n    if routing_bias is not None:\n        logits = logits + routing_bias.to(torch.float32).reshape(-1)\n    s = torch.softmax(logits, dim=-1)\n    _, topk_idx = torch.topk(s, k=TOP_K, dim=1, largest=True, sorted=False)\n    gathered = s.gather(1, topk_idx)\n    w_topk = gathered / (gathered.sum(dim=1, keepdim=True) + 1e-20)\n    scale = 1.0 if routed_scaling_factor is None else float(routed_scaling_factor)\n    w_topk = w_topk * scale\n    return _fp4_moe_run_experts(\n        hidden_states,\n        hidden_states_scale,\n        gemm1_weights,\n        gemm1_weights_scale,\n        gemm2_weights,\n        gemm2_weights_scale,\n        gemm1_bias,\n        gemm2_bias,\n        w_topk,\n        topk_idx,\n        local_expert_offset,\n        E_global,\n    )\n"
+}
diff --git a/tests/trace/fi_trace_out/moe_fp4_block_scale_renormalize_routing_topk8_e32_h7168_i2048.json b/tests/trace/fi_trace_out/moe_fp4_block_scale_renormalize_routing_topk8_e32_h7168_i2048.json
new file mode 100644
index 0000000000..a77a8bcde5
--- /dev/null
+++ b/tests/trace/fi_trace_out/moe_fp4_block_scale_renormalize_routing_topk8_e32_h7168_i2048.json
@@ -0,0 +1,225 @@
+{
+  "name": "moe_fp4_block_scale_renormalize_routing_topk8_e32_h7168_i2048",
+  "description": "NvFP4 block-scale MoE with Renormalize routing (TopK \u2192 Softmax).",
+  "op_type": "moe",
+  "tags": [
+    "fi_api:flashinfer.fused_moe.core.trtllm_fp4_block_scale_moe",
+    "status:experimental",
+    "quantization:nvfp4"
+  ],
+  "axes": {
+    "seq_len": {
+      "type": "var",
+      "description": "Number of tokens."
+    },
+    "num_experts": {
+      "type": "const",
+      "value": 256,
+      "description": "Total number of experts."
+    },
+    "top_k": {
+      "type": "const",
+      "value": 8,
+      "description": "Number of experts selected per token."
+    },
+    "num_local_experts": {
+      "type": "const",
+      "value": 32,
+      "description": "Number of local experts."
+    },
+    "hidden_size": {
+      "type": "const",
+      "value": 7168,
+      "description": "Hidden dimension size."
+    },
+    "intermediate_size": {
+      "type": "const",
+      "value": 2048,
+      "description": "MoE intermediate layer size."
+    },
+    "gemm1_out_size": {
+      "type": "const",
+      "value": 4096,
+      "description": "Output size of FC1 (2 \u00d7 intermediate_size for SwiGLU)."
+    },
+    "num_packed_hidden": {
+      "type": "const",
+      "value": 3584,
+      "description": "Packed hidden dimension (hidden_size // 2 for NvFP4)."
+    },
+    "num_fp4_hidden_blocks": {
+      "type": "const",
+      "value": 448,
+      "description": "Number of FP4 scale blocks along hidden_size (hidden_size // 16 for NvFP4)."
+    },
+    "num_packed_intermediate": {
+      "type": "const",
+      "value": 1024,
+      "description": "Packed intermediate dimension (intermediate_size // 2 for NvFP4)."
+    },
+    "num_fp4_intermediate_blocks": {
+      "type": "const",
+      "value": 128,
+      "description": "Number of FP4 scale blocks along intermediate_size (intermediate_size // 16 for NvFP4)."
+    }
+  },
+  "inputs": {
+    "routing_logits": {
+      "shape": [
+        "seq_len",
+        "num_experts"
+      ],
+      "dtype": "bfloat16",
+      "description": "Routing logits for expert selection."
+    },
+    "routing_bias": {
+      "shape": [
+        "num_experts"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "Bias added to routing logits. Pass None when not used."
+    },
+    "hidden_states": {
+      "shape": [
+        "seq_len",
+        "num_packed_hidden"
+      ],
+      "dtype": "uint8",
+      "description": "Input hidden states, NvFP4-packed (uint8, 2 fp4 per byte)."
+    },
+    "hidden_states_scale": {
+      "shape": [
+        "seq_len",
+        "num_fp4_hidden_blocks"
+      ],
+      "dtype": "float8_e4m3fn",
+      "optional": true,
+      "description": "Block-wise scale factors for hidden_states (float8). None for bf16 input."
+    },
+    "gemm1_weights": {
+      "shape": [
+        "num_local_experts",
+        "gemm1_out_size",
+        "num_packed_hidden"
+      ],
+      "dtype": "uint8",
+      "description": "FC1 weights, NvFP4-packed (uint8). Shape includes gate+up for SwiGLU."
+    },
+    "gemm1_weights_scale": {
+      "shape": [
+        "num_local_experts",
+        "gemm1_out_size",
+        "num_fp4_hidden_blocks"
+      ],
+      "dtype": "float8_e4m3fn",
+      "description": "Block-wise scale factors for gemm1_weights (float8)."
+    },
+    "gemm1_bias": {
+      "shape": [
+        "num_local_experts",
+        "gemm1_out_size"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "FC1 bias (float32). Optional."
+    },
+    "gemm1_alpha": {
+      "shape": [
+        "num_local_experts"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "Per-expert SwiGLU alpha (float32). Optional."
+    },
+    "gemm1_beta": {
+      "shape": [
+        "num_local_experts"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "Per-expert SwiGLU beta (float32). Optional."
+    },
+    "gemm1_clamp_limit": {
+      "shape": [
+        "num_local_experts"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "Per-expert SwiGLU clamp limit (float32). Optional."
+    },
+    "gemm2_weights": {
+      "shape": [
+        "num_local_experts",
+        "hidden_size",
+        "num_packed_intermediate"
+      ],
+      "dtype": "uint8",
+      "description": "FC2 weights, NvFP4-packed (uint8)."
+    },
+    "gemm2_weights_scale": {
+      "shape": [
+        "num_local_experts",
+        "hidden_size",
+        "num_fp4_intermediate_blocks"
+      ],
+      "dtype": "float8_e4m3fn",
+      "description": "Block-wise scale factors for gemm2_weights (float8)."
+    },
+    "gemm2_bias": {
+      "shape": [
+        "num_local_experts",
+        "hidden_size"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "FC2 bias (float32). Optional."
+    },
+    "output1_scale_scalar": {
+      "shape": [
+        "num_local_experts"
+      ],
+      "dtype": "float32",
+      "optional": true,
+      "description": "Per-expert output scale for FC1 activation (float32). Optional."
+    },
+    "output1_scale_gate_scalar": {
+      "shape": [
+        "num_local_experts"
+      ],
+      "dtype": "float32",
+      "optional": true,
+      "description": "Per-expert output scale for FC1 gate (float32). Optional."
+    },
+    "output2_scale_scalar": {
+      "shape": [
+        "num_local_experts"
+      ],
+      "dtype": "float32",
+      "optional": true,
+      "description": "Per-expert output scale for FC2 (float32). Optional."
+    },
+    "local_expert_offset": {
+      "shape": null,
+      "dtype": "int32",
+      "description": "Offset of local experts in the global expert array."
+    },
+    "routed_scaling_factor": {
+      "shape": null,
+      "dtype": "float32",
+      "optional": true,
+      "description": "Scaling factor applied to routing weights. None for some routing methods."
+    }
+  },
+  "outputs": {
+    "output": {
+      "shape": [
+        "seq_len",
+        "hidden_size"
+      ],
+      "dtype": "bfloat16",
+      "description": "Final MoE output tensor."
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _trtllm_fp4_block_scale_moe_renormalize_routing_reference(\n    routing_logits,\n    routing_bias,\n    hidden_states,\n    hidden_states_scale,\n    gemm1_weights,\n    gemm1_weights_scale,\n    gemm1_bias,\n    gemm2_weights,\n    gemm2_weights_scale,\n    gemm2_bias,\n    top_k,\n    local_expert_offset,\n    routed_scaling_factor,\n):\n    \"\"\"FP4 MoE with Renormalize routing (TopK on logits \u2192 Softmax).\"\"\"\n    TOP_K = int(top_k)\n    E_global = routing_logits.shape[1]\n    logits = routing_logits.to(torch.float32)\n    if routing_bias is not None:\n        logits = logits + routing_bias.to(torch.float32).reshape(-1)\n    _, topk_idx = torch.topk(logits, k=TOP_K, dim=1, largest=True, sorted=False)\n    gathered = logits.gather(1, topk_idx)\n    scale = 1.0 if routed_scaling_factor is None else float(routed_scaling_factor)\n    w_topk = torch.softmax(gathered, dim=-1) * scale\n    return _fp4_moe_run_experts(\n        hidden_states,\n        hidden_states_scale,\n        gemm1_weights,\n        gemm1_weights_scale,\n        gemm2_weights,\n        gemm2_weights_scale,\n        gemm1_bias,\n        gemm2_bias,\n        w_topk,\n        topk_idx,\n        local_expert_offset,\n        E_global,\n    )\n"
+}
diff --git a/tests/trace/fi_trace_out/moe_fp4_block_scale_topk_routing_topk8_e32_h7168_i2048.json b/tests/trace/fi_trace_out/moe_fp4_block_scale_topk_routing_topk8_e32_h7168_i2048.json
new file mode 100644
index 0000000000..7815139e08
--- /dev/null
+++ b/tests/trace/fi_trace_out/moe_fp4_block_scale_topk_routing_topk8_e32_h7168_i2048.json
@@ -0,0 +1,225 @@
+{
+  "name": "moe_fp4_block_scale_topk_routing_topk8_e32_h7168_i2048",
+  "description": "NvFP4 block-scale MoE with TopK-only routing (no softmax, uniform weights).",
+  "op_type": "moe",
+  "tags": [
+    "fi_api:flashinfer.fused_moe.core.trtllm_fp4_block_scale_moe",
+    "status:experimental",
+    "quantization:nvfp4"
+  ],
+  "axes": {
+    "seq_len": {
+      "type": "var",
+      "description": "Number of tokens."
+    },
+    "num_experts": {
+      "type": "const",
+      "value": 256,
+      "description": "Total number of experts."
+    },
+    "top_k": {
+      "type": "const",
+      "value": 8,
+      "description": "Number of experts selected per token."
+    },
+    "num_local_experts": {
+      "type": "const",
+      "value": 32,
+      "description": "Number of local experts."
+    },
+    "hidden_size": {
+      "type": "const",
+      "value": 7168,
+      "description": "Hidden dimension size."
+    },
+    "intermediate_size": {
+      "type": "const",
+      "value": 2048,
+      "description": "MoE intermediate layer size."
+    },
+    "gemm1_out_size": {
+      "type": "const",
+      "value": 4096,
+      "description": "Output size of FC1 (2 \u00d7 intermediate_size for SwiGLU)."
+    },
+    "num_packed_hidden": {
+      "type": "const",
+      "value": 3584,
+      "description": "Packed hidden dimension (hidden_size // 2 for NvFP4)."
+    },
+    "num_fp4_hidden_blocks": {
+      "type": "const",
+      "value": 448,
+      "description": "Number of FP4 scale blocks along hidden_size (hidden_size // 16 for NvFP4)."
+    },
+    "num_packed_intermediate": {
+      "type": "const",
+      "value": 1024,
+      "description": "Packed intermediate dimension (intermediate_size // 2 for NvFP4)."
+    },
+    "num_fp4_intermediate_blocks": {
+      "type": "const",
+      "value": 128,
+      "description": "Number of FP4 scale blocks along intermediate_size (intermediate_size // 16 for NvFP4)."
+    }
+  },
+  "inputs": {
+    "routing_logits": {
+      "shape": [
+        "seq_len",
+        "num_experts"
+      ],
+      "dtype": "bfloat16",
+      "description": "Routing logits for expert selection."
+    },
+    "routing_bias": {
+      "shape": [
+        "num_experts"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "Bias added to routing logits. Pass None when not used."
+    },
+    "hidden_states": {
+      "shape": [
+        "seq_len",
+        "num_packed_hidden"
+      ],
+      "dtype": "uint8",
+      "description": "Input hidden states, NvFP4-packed (uint8, 2 fp4 per byte)."
+    },
+    "hidden_states_scale": {
+      "shape": [
+        "seq_len",
+        "num_fp4_hidden_blocks"
+      ],
+      "dtype": "float8_e4m3fn",
+      "optional": true,
+      "description": "Block-wise scale factors for hidden_states (float8). None for bf16 input."
+    },
+    "gemm1_weights": {
+      "shape": [
+        "num_local_experts",
+        "gemm1_out_size",
+        "num_packed_hidden"
+      ],
+      "dtype": "uint8",
+      "description": "FC1 weights, NvFP4-packed (uint8). Shape includes gate+up for SwiGLU."
+    },
+    "gemm1_weights_scale": {
+      "shape": [
+        "num_local_experts",
+        "gemm1_out_size",
+        "num_fp4_hidden_blocks"
+      ],
+      "dtype": "float8_e4m3fn",
+      "description": "Block-wise scale factors for gemm1_weights (float8)."
+    },
+    "gemm1_bias": {
+      "shape": [
+        "num_local_experts",
+        "gemm1_out_size"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "FC1 bias (float32). Optional."
+    },
+    "gemm1_alpha": {
+      "shape": [
+        "num_local_experts"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "Per-expert SwiGLU alpha (float32). Optional."
+    },
+    "gemm1_beta": {
+      "shape": [
+        "num_local_experts"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "Per-expert SwiGLU beta (float32). Optional."
+    },
+    "gemm1_clamp_limit": {
+      "shape": [
+        "num_local_experts"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "Per-expert SwiGLU clamp limit (float32). Optional."
+    },
+    "gemm2_weights": {
+      "shape": [
+        "num_local_experts",
+        "hidden_size",
+        "num_packed_intermediate"
+      ],
+      "dtype": "uint8",
+      "description": "FC2 weights, NvFP4-packed (uint8)."
+    },
+    "gemm2_weights_scale": {
+      "shape": [
+        "num_local_experts",
+        "hidden_size",
+        "num_fp4_intermediate_blocks"
+      ],
+      "dtype": "float8_e4m3fn",
+      "description": "Block-wise scale factors for gemm2_weights (float8)."
+    },
+    "gemm2_bias": {
+      "shape": [
+        "num_local_experts",
+        "hidden_size"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "FC2 bias (float32). Optional."
+    },
+    "output1_scale_scalar": {
+      "shape": [
+        "num_local_experts"
+      ],
+      "dtype": "float32",
+      "optional": true,
+      "description": "Per-expert output scale for FC1 activation (float32). Optional."
+    },
+    "output1_scale_gate_scalar": {
+      "shape": [
+        "num_local_experts"
+      ],
+      "dtype": "float32",
+      "optional": true,
+      "description": "Per-expert output scale for FC1 gate (float32). Optional."
+    },
+    "output2_scale_scalar": {
+      "shape": [
+        "num_local_experts"
+      ],
+      "dtype": "float32",
+      "optional": true,
+      "description": "Per-expert output scale for FC2 (float32). Optional."
+    },
+    "local_expert_offset": {
+      "shape": null,
+      "dtype": "int32",
+      "description": "Offset of local experts in the global expert array."
+    },
+    "routed_scaling_factor": {
+      "shape": null,
+      "dtype": "float32",
+      "optional": true,
+      "description": "Scaling factor applied to routing weights. None for some routing methods."
+    }
+  },
+  "outputs": {
+    "output": {
+      "shape": [
+        "seq_len",
+        "hidden_size"
+      ],
+      "dtype": "bfloat16",
+      "description": "Final MoE output tensor."
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _trtllm_fp4_block_scale_moe_topk_routing_reference(\n    routing_logits,\n    routing_bias,\n    hidden_states,\n    hidden_states_scale,\n    gemm1_weights,\n    gemm1_weights_scale,\n    gemm1_bias,\n    gemm2_weights,\n    gemm2_weights_scale,\n    gemm2_bias,\n    top_k,\n    local_expert_offset,\n    routed_scaling_factor,\n):\n    \"\"\"FP4 MoE with TopK-only routing (uniform weights).\"\"\"\n    TOP_K = int(top_k)\n    E_global = routing_logits.shape[1]\n    logits = routing_logits.to(torch.float32)\n    if routing_bias is not None:\n        logits = logits + routing_bias.to(torch.float32).reshape(-1)\n    _, topk_idx = torch.topk(logits, k=TOP_K, dim=1, largest=True, sorted=False)\n    T = logits.shape[0]\n    scale = 1.0 if routed_scaling_factor is None else float(routed_scaling_factor)\n    w_topk = torch.full(\n        (T, TOP_K), scale / TOP_K, dtype=torch.float32, device=logits.device\n    )\n    return _fp4_moe_run_experts(\n        hidden_states,\n        hidden_states_scale,\n        gemm1_weights,\n        gemm1_weights_scale,\n        gemm2_weights,\n        gemm2_weights_scale,\n        gemm1_bias,\n        gemm2_bias,\n        w_topk,\n        topk_idx,\n        local_expert_offset,\n        E_global,\n    )\n"
+}
diff --git a/tests/trace/fi_trace_out/moe_fp8_block_scale_default_routing_topk8_e32_h7168_i2048.json b/tests/trace/fi_trace_out/moe_fp8_block_scale_default_routing_topk8_e32_h7168_i2048.json
new file mode 100644
index 0000000000..969189d2f9
--- /dev/null
+++ b/tests/trace/fi_trace_out/moe_fp8_block_scale_default_routing_topk8_e32_h7168_i2048.json
@@ -0,0 +1,157 @@
+{
+  "name": "moe_fp8_block_scale_default_routing_topk8_e32_h7168_i2048",
+  "description": "FP8 block scale MoE with Default routing (Softmax \u2192 TopK).",
+  "op_type": "moe",
+  "tags": [
+    "fi_api:flashinfer.fused_moe.core.trtllm_fp8_block_scale_moe",
+    "status:verified",
+    "quantization:float8_e4m3fn"
+  ],
+  "axes": {
+    "seq_len": {
+      "type": "var",
+      "description": "Sequence length (number of tokens)"
+    },
+    "num_experts": {
+      "type": "const",
+      "value": 256,
+      "description": "Total number of experts."
+    },
+    "top_k": {
+      "type": "const",
+      "value": 8,
+      "description": "Number of experts to route to per token."
+    },
+    "num_local_experts": {
+      "type": "const",
+      "value": 32,
+      "description": "Number of local experts."
+    },
+    "hidden_size": {
+      "type": "const",
+      "value": 7168,
+      "description": "Hidden dimension size."
+    },
+    "intermediate_size": {
+      "type": "const",
+      "value": 2048,
+      "description": "MoE intermediate layer size."
+    },
+    "gemm1_out_size": {
+      "type": "const",
+      "value": 4096,
+      "description": "Output size of the first GEMM (W13). Should be 2 * intermediate_size."
+    },
+    "num_hidden_blocks": {
+      "type": "const",
+      "value": 56,
+      "description": "Number of quantized blocks along the hidden_size dimension (block_size=128)."
+    },
+    "num_intermediate_blocks": {
+      "type": "const",
+      "value": 16,
+      "description": "Number of quantized blocks along the intermediate_size dimension (block_size=128)."
+    },
+    "num_gemm1_out_blocks": {
+      "type": "const",
+      "value": 32,
+      "description": "Number of quantized blocks along the gemm1_out_size dimension (block_size=128)."
+    }
+  },
+  "inputs": {
+    "routing_logits": {
+      "shape": [
+        "seq_len",
+        "num_experts"
+      ],
+      "dtype": "float32",
+      "description": "Routing logits for expert selection."
+    },
+    "routing_bias": {
+      "shape": [
+        "num_experts"
+      ],
+      "dtype": "bfloat16",
+      "optional": true,
+      "description": "Bias added to logits before routing. Pass None for no bias."
+    },
+    "hidden_states": {
+      "shape": [
+        "seq_len",
+        "hidden_size"
+      ],
+      "dtype": "float8_e4m3fn",
+      "description": "Input hidden states tensor (FP8 quantized)."
+    },
+    "hidden_states_scale": {
+      "shape": [
+        "num_hidden_blocks",
+        "seq_len"
+      ],
+      "dtype": "float32",
+      "description": "Block-wise scaling factors for hidden states."
+    },
+    "gemm1_weights": {
+      "shape": [
+        "num_local_experts",
+        "gemm1_out_size",
+        "hidden_size"
+      ],
+      "dtype": "float8_e4m3fn",
+      "description": "First GEMM weights for all local experts (gate and up projections)."
+    },
+    "gemm1_weights_scale": {
+      "shape": [
+        "num_local_experts",
+        "num_gemm1_out_blocks",
+        "num_hidden_blocks"
+      ],
+      "dtype": "float32",
+      "description": "Block-wise scaling factors for first GEMM weights."
+    },
+    "gemm2_weights": {
+      "shape": [
+        "num_local_experts",
+        "hidden_size",
+        "intermediate_size"
+      ],
+      "dtype": "float8_e4m3fn",
+      "description": "Second GEMM weights for all local experts (down projection)."
+    },
+    "gemm2_weights_scale": {
+      "shape": [
+        "num_local_experts",
+        "num_hidden_blocks",
+        "num_intermediate_blocks"
+      ],
+      "dtype": "float32",
+      "description": "Block-wise scaling factors for second GEMM weights."
+    },
+    "top_k": {
+      "shape": null,
+      "dtype": "int32",
+      "description": "Number of experts to route to per token."
+    },
+    "local_expert_offset": {
+      "shape": null,
+      "dtype": "int32",
+      "description": "Offset of local experts in global expert space."
+    },
+    "routed_scaling_factor": {
+      "shape": null,
+      "dtype": "float32",
+      "description": "Scaling factor applied to routing weights."
+    }
+  },
+  "outputs": {
+    "output": {
+      "shape": [
+        "seq_len",
+        "hidden_size"
+      ],
+      "dtype": "bfloat16",
+      "description": "Final MoE output tensor."
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _trtllm_fp8_block_scale_moe_default_routing_reference(\n    routing_logits,\n    routing_bias,\n    hidden_states,\n    hidden_states_scale,\n    gemm1_weights,\n    gemm1_weights_scale,\n    gemm2_weights,\n    gemm2_weights_scale,\n    top_k,\n    local_expert_offset,\n    routed_scaling_factor,\n):\n    \"\"\"\n    FP8 block-scale MoE with Default routing: Softmax \u2192 TopK.\n    routing_bias is added to logits before softmax when provided.\n    \"\"\"\n    TOP_K = int(top_k)\n    E_global = routing_logits.shape[1]\n    logits = routing_logits.to(torch.float32)\n    if routing_bias is not None:\n        logits = logits + routing_bias.to(torch.float32).reshape(-1)\n    s = torch.softmax(logits, dim=-1)\n    _, topk_idx = torch.topk(s, k=TOP_K, dim=1, largest=True, sorted=False)\n    weights = s.gather(1, topk_idx) * routed_scaling_factor\n    return _fp8_moe_run_experts(\n        hidden_states,\n        hidden_states_scale,\n        gemm1_weights,\n        gemm1_weights_scale,\n        gemm2_weights,\n        gemm2_weights_scale,\n        weights,\n        topk_idx,\n        local_expert_offset,\n        E_global,\n    )\n"
+}
diff --git a/tests/trace/fi_trace_out/moe_fp8_block_scale_ds_routing_topk8_ng8_kg4_e32_h7168_i2048.json b/tests/trace/fi_trace_out/moe_fp8_block_scale_ds_routing_topk8_ng8_kg4_e32_h7168_i2048.json
new file mode 100644
index 0000000000..bea3ad4faf
--- /dev/null
+++ b/tests/trace/fi_trace_out/moe_fp8_block_scale_ds_routing_topk8_ng8_kg4_e32_h7168_i2048.json
@@ -0,0 +1,176 @@
+{
+  "name": "moe_fp8_block_scale_ds_routing_topk8_ng8_kg4_e32_h7168_i2048",
+  "description": "FP8 block scale MoE with DeepSeek-V3 routing. Includes grouped sigmoid routing and two grouped-GEMM.",
+  "op_type": "moe",
+  "tags": [
+    "fi_api:flashinfer.fused_moe.core.trtllm_fp8_block_scale_moe",
+    "status:verified",
+    "quantization:float8_e4m3fn"
+  ],
+  "axes": {
+    "seq_len": {
+      "type": "var",
+      "description": "Sequence length (number of tokens)"
+    },
+    "num_experts": {
+      "type": "const",
+      "value": 256,
+      "description": "Total number of experts."
+    },
+    "top_k": {
+      "type": "const",
+      "value": 8,
+      "description": "Number of experts to route to per token."
+    },
+    "n_group": {
+      "type": "const",
+      "value": 8,
+      "description": "Number of expert groups for group routing."
+    },
+    "topk_group": {
+      "type": "const",
+      "value": 4,
+      "description": "Number of groups to select for top-k routing."
+    },
+    "num_local_experts": {
+      "type": "const",
+      "value": 32,
+      "description": "Number of local experts."
+    },
+    "hidden_size": {
+      "type": "const",
+      "value": 7168,
+      "description": "Hidden dimension size."
+    },
+    "intermediate_size": {
+      "type": "const",
+      "value": 2048,
+      "description": "MoE intermediate layer size."
+    },
+    "gemm1_out_size": {
+      "type": "const",
+      "value": 4096,
+      "description": "Output size of the first GEMM (W13). Should be 2 * intermediate_size."
+    },
+    "num_hidden_blocks": {
+      "type": "const",
+      "value": 56,
+      "description": "Number of quantized blocks along the hidden_size dimension (block_size=128)."
+    },
+    "num_intermediate_blocks": {
+      "type": "const",
+      "value": 16,
+      "description": "Number of quantized blocks along the intermediate_size dimension (block_size=128)."
+    },
+    "num_gemm1_out_blocks": {
+      "type": "const",
+      "value": 32,
+      "description": "Number of quantized blocks along the gemm1_out_size dimension (block_size=128)."
+    }
+  },
+  "inputs": {
+    "routing_logits": {
+      "shape": [
+        "seq_len",
+        "num_experts"
+      ],
+      "dtype": "float32",
+      "description": "Routing logits for expert selection."
+    },
+    "routing_bias": {
+      "shape": [
+        "num_experts"
+      ],
+      "dtype": "bfloat16",
+      "description": "Bias tensor for routing. Pass all zeros for no bias."
+    },
+    "hidden_states": {
+      "shape": [
+        "seq_len",
+        "hidden_size"
+      ],
+      "dtype": "float8_e4m3fn",
+      "description": "Input hidden states tensor (FP8 quantized)."
+    },
+    "hidden_states_scale": {
+      "shape": [
+        "num_hidden_blocks",
+        "seq_len"
+      ],
+      "dtype": "float32",
+      "description": "Block-wise scaling factors for hidden states."
+    },
+    "gemm1_weights": {
+      "shape": [
+        "num_local_experts",
+        "gemm1_out_size",
+        "hidden_size"
+      ],
+      "dtype": "float8_e4m3fn",
+      "description": "First GEMM weights for all local experts (gate and up projections)."
+    },
+    "gemm1_weights_scale": {
+      "shape": [
+        "num_local_experts",
+        "num_gemm1_out_blocks",
+        "num_hidden_blocks"
+      ],
+      "dtype": "float32",
+      "description": "Block-wise scaling factors for first GEMM weights."
+    },
+    "gemm2_weights": {
+      "shape": [
+        "num_local_experts",
+        "hidden_size",
+        "intermediate_size"
+      ],
+      "dtype": "float8_e4m3fn",
+      "description": "Second GEMM weights for all local experts (down projection)."
+    },
+    "gemm2_weights_scale": {
+      "shape": [
+        "num_local_experts",
+        "num_hidden_blocks",
+        "num_intermediate_blocks"
+      ],
+      "dtype": "float32",
+      "description": "Block-wise scaling factors for second GEMM weights."
+    },
+    "top_k": {
+      "shape": null,
+      "dtype": "int32",
+      "description": "Number of experts to route to per token (DeepSeek-V3 uses 8)."
+    },
+    "n_group": {
+      "shape": null,
+      "dtype": "int32",
+      "description": "Number of expert groups (DeepSeek-V3 uses 8)."
+    },
+    "topk_group": {
+      "shape": null,
+      "dtype": "int32",
+      "description": "Number of groups to keep after group-level top-k (DeepSeek-V3 uses 4)."
+    },
+    "local_expert_offset": {
+      "shape": null,
+      "dtype": "int32",
+      "description": "Offset of local experts in global expert space."
+    },
+    "routed_scaling_factor": {
+      "shape": null,
+      "dtype": "float32",
+      "description": "Scaling factor for routing weights."
+    }
+  },
+  "outputs": {
+    "output": {
+      "shape": [
+        "seq_len",
+        "hidden_size"
+      ],
+      "dtype": "bfloat16",
+      "description": "Final MoE output tensor."
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _trtllm_fp8_block_scale_moe_ds_routing_reference(\n    routing_logits,\n    routing_bias,\n    hidden_states,\n    hidden_states_scale,\n    gemm1_weights,\n    gemm1_weights_scale,\n    gemm2_weights,\n    gemm2_weights_scale,\n    top_k,\n    n_group,\n    topk_group,\n    local_expert_offset,\n    routed_scaling_factor,\n):\n    \"\"\"\n    FP8 block-scale MoE with DeepSeek-V3 routing:\n        s = sigmoid(logits)\n        s_with_bias = s + bias\n        group by n_group; per group take top-2 sum \u2192 pick topk_group groups\n        on the kept groups, take global top_k experts\n        combine with weights derived from s (without bias), normalised and\n        scaled by routed_scaling_factor\n    \"\"\"\n    E_global = routing_logits.shape[1]\n    T = routing_logits.shape[0]\n    TOP_K = int(top_k)\n    N_GROUP = int(n_group)\n    TOPK_GROUP = int(topk_group)\n\n    logits = routing_logits.to(torch.float32)\n    bias = routing_bias.to(torch.float32).reshape(-1)\n\n    s = 1.0 / (1.0 + torch.exp(-logits))\n    s_with_bias = s + bias\n\n    group_size = E_global // N_GROUP\n    s_wb_grouped = s_with_bias.view(T, N_GROUP, group_size)\n    top2_vals, _ = torch.topk(s_wb_grouped, k=2, dim=2, largest=True, sorted=False)\n    group_scores = top2_vals.sum(dim=2)\n\n    _, group_idx = torch.topk(\n        group_scores, k=TOPK_GROUP, dim=1, largest=True, sorted=False\n    )\n    group_mask = torch.zeros_like(group_scores)\n    group_mask.scatter_(1, group_idx, 1.0)\n    score_mask = (\n        group_mask.unsqueeze(2).expand(T, N_GROUP, group_size).reshape(T, E_global)\n    )\n\n    neg_inf = torch.finfo(torch.float32).min\n    scores_pruned = s_with_bias.masked_fill(score_mask == 0, neg_inf)\n    _, topk_idx = torch.topk(scores_pruned, k=TOP_K, dim=1, largest=True, sorted=False)\n\n    M = torch.zeros_like(s)\n    M.scatter_(1, topk_idx, 1.0)\n    raw_w = s * M\n    weights_sum = raw_w.sum(dim=1, keepdim=True) + 1e-20\n    weights = (raw_w / weights_sum) * routed_scaling_factor\n\n    # Gather per-row weights into [T, TOP_K] for the shared GEMM helper\n    w_topk = weights.gather(1, topk_idx)\n\n    return _fp8_moe_run_experts(\n        hidden_states,\n        hidden_states_scale,\n        gemm1_weights,\n        gemm1_weights_scale,\n        gemm2_weights,\n        gemm2_weights_scale,\n        w_topk,\n        topk_idx,\n        local_expert_offset,\n        E_global,\n    )\n"
+}
diff --git a/tests/trace/fi_trace_out/moe_fp8_block_scale_llama4_routing_topk1_e32_h7168_i2048.json b/tests/trace/fi_trace_out/moe_fp8_block_scale_llama4_routing_topk1_e32_h7168_i2048.json
new file mode 100644
index 0000000000..7359c2d9b6
--- /dev/null
+++ b/tests/trace/fi_trace_out/moe_fp8_block_scale_llama4_routing_topk1_e32_h7168_i2048.json
@@ -0,0 +1,157 @@
+{
+  "name": "moe_fp8_block_scale_llama4_routing_topk1_e32_h7168_i2048",
+  "description": "FP8 block scale MoE with Llama4 routing (Top1 \u2192 Sigmoid).",
+  "op_type": "moe",
+  "tags": [
+    "fi_api:flashinfer.fused_moe.core.trtllm_fp8_block_scale_moe",
+    "status:verified",
+    "quantization:float8_e4m3fn"
+  ],
+  "axes": {
+    "seq_len": {
+      "type": "var",
+      "description": "Sequence length (number of tokens)"
+    },
+    "num_experts": {
+      "type": "const",
+      "value": 256,
+      "description": "Total number of experts."
+    },
+    "top_k": {
+      "type": "const",
+      "value": 1,
+      "description": "Number of experts to route to per token."
+    },
+    "num_local_experts": {
+      "type": "const",
+      "value": 32,
+      "description": "Number of local experts."
+    },
+    "hidden_size": {
+      "type": "const",
+      "value": 7168,
+      "description": "Hidden dimension size."
+    },
+    "intermediate_size": {
+      "type": "const",
+      "value": 2048,
+      "description": "MoE intermediate layer size."
+    },
+    "gemm1_out_size": {
+      "type": "const",
+      "value": 4096,
+      "description": "Output size of the first GEMM (W13). Should be 2 * intermediate_size."
+    },
+    "num_hidden_blocks": {
+      "type": "const",
+      "value": 56,
+      "description": "Number of quantized blocks along the hidden_size dimension (block_size=128)."
+    },
+    "num_intermediate_blocks": {
+      "type": "const",
+      "value": 16,
+      "description": "Number of quantized blocks along the intermediate_size dimension (block_size=128)."
+    },
+    "num_gemm1_out_blocks": {
+      "type": "const",
+      "value": 32,
+      "description": "Number of quantized blocks along the gemm1_out_size dimension (block_size=128)."
+    }
+  },
+  "inputs": {
+    "routing_logits": {
+      "shape": [
+        "seq_len",
+        "num_experts"
+      ],
+      "dtype": "float32",
+      "description": "Routing logits for expert selection."
+    },
+    "routing_bias": {
+      "shape": [
+        "num_experts"
+      ],
+      "dtype": "bfloat16",
+      "optional": true,
+      "description": "Bias added to logits before routing. Pass None for no bias."
+    },
+    "hidden_states": {
+      "shape": [
+        "seq_len",
+        "hidden_size"
+      ],
+      "dtype": "float8_e4m3fn",
+      "description": "Input hidden states tensor (FP8 quantized)."
+    },
+    "hidden_states_scale": {
+      "shape": [
+        "num_hidden_blocks",
+        "seq_len"
+      ],
+      "dtype": "float32",
+      "description": "Block-wise scaling factors for hidden states."
+    },
+    "gemm1_weights": {
+      "shape": [
+        "num_local_experts",
+        "gemm1_out_size",
+        "hidden_size"
+      ],
+      "dtype": "float8_e4m3fn",
+      "description": "First GEMM weights for all local experts (gate and up projections)."
+    },
+    "gemm1_weights_scale": {
+      "shape": [
+        "num_local_experts",
+        "num_gemm1_out_blocks",
+        "num_hidden_blocks"
+      ],
+      "dtype": "float32",
+      "description": "Block-wise scaling factors for first GEMM weights."
+    },
+    "gemm2_weights": {
+      "shape": [
+        "num_local_experts",
+        "hidden_size",
+        "intermediate_size"
+      ],
+      "dtype": "float8_e4m3fn",
+      "description": "Second GEMM weights for all local experts (down projection)."
+    },
+    "gemm2_weights_scale": {
+      "shape": [
+        "num_local_experts",
+        "num_hidden_blocks",
+        "num_intermediate_blocks"
+      ],
+      "dtype": "float32",
+      "description": "Block-wise scaling factors for second GEMM weights."
+    },
+    "top_k": {
+      "shape": null,
+      "dtype": "int32",
+      "description": "Number of experts to route to per token."
+    },
+    "local_expert_offset": {
+      "shape": null,
+      "dtype": "int32",
+      "description": "Offset of local experts in global expert space."
+    },
+    "routed_scaling_factor": {
+      "shape": null,
+      "dtype": "float32",
+      "description": "Scaling factor applied to routing weights."
+    }
+  },
+  "outputs": {
+    "output": {
+      "shape": [
+        "seq_len",
+        "hidden_size"
+      ],
+      "dtype": "bfloat16",
+      "description": "Final MoE output tensor."
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _trtllm_fp8_block_scale_moe_llama4_routing_reference(\n    routing_logits,\n    routing_bias,\n    hidden_states,\n    hidden_states_scale,\n    gemm1_weights,\n    gemm1_weights_scale,\n    gemm2_weights,\n    gemm2_weights_scale,\n    top_k,\n    local_expert_offset,\n    routed_scaling_factor,\n):\n    \"\"\"\n    FP8 block-scale MoE with Llama4 routing: Top1 \u2192 Sigmoid.\n    Single expert selected per token; weight derived from sigmoid of its logit.\n    By definition Llama4 routing uses top_k=1; the parameter is accepted for\n    schema consistency with the other routing methods.\n    \"\"\"\n    E_global = routing_logits.shape[1]\n    logits = routing_logits.to(torch.float32)\n    if routing_bias is not None:\n        logits = logits + routing_bias.to(torch.float32).reshape(-1)\n    topk_idx = logits.argmax(dim=-1, keepdim=True)  # [T, 1]\n    top1_logit = logits.gather(1, topk_idx)\n    weights = (1.0 / (1.0 + torch.exp(-top1_logit))) * routed_scaling_factor\n    return _fp8_moe_run_experts(\n        hidden_states,\n        hidden_states_scale,\n        gemm1_weights,\n        gemm1_weights_scale,\n        gemm2_weights,\n        gemm2_weights_scale,\n        weights,\n        topk_idx,\n        local_expert_offset,\n        E_global,\n    )\n"
+}
diff --git a/tests/trace/fi_trace_out/moe_fp8_block_scale_renormalize_naive_routing_topk8_e32_h7168_i2048.json b/tests/trace/fi_trace_out/moe_fp8_block_scale_renormalize_naive_routing_topk8_e32_h7168_i2048.json
new file mode 100644
index 0000000000..d55e617145
--- /dev/null
+++ b/tests/trace/fi_trace_out/moe_fp8_block_scale_renormalize_naive_routing_topk8_e32_h7168_i2048.json
@@ -0,0 +1,157 @@
+{
+  "name": "moe_fp8_block_scale_renormalize_naive_routing_topk8_e32_h7168_i2048",
+  "description": "FP8 block scale MoE with RenormalizeNaive routing (Softmax \u2192 TopK \u2192 Renormalize).",
+  "op_type": "moe",
+  "tags": [
+    "fi_api:flashinfer.fused_moe.core.trtllm_fp8_block_scale_moe",
+    "status:verified",
+    "quantization:float8_e4m3fn"
+  ],
+  "axes": {
+    "seq_len": {
+      "type": "var",
+      "description": "Sequence length (number of tokens)"
+    },
+    "num_experts": {
+      "type": "const",
+      "value": 256,
+      "description": "Total number of experts."
+    },
+    "top_k": {
+      "type": "const",
+      "value": 8,
+      "description": "Number of experts to route to per token."
+    },
+    "num_local_experts": {
+      "type": "const",
+      "value": 32,
+      "description": "Number of local experts."
+    },
+    "hidden_size": {
+      "type": "const",
+      "value": 7168,
+      "description": "Hidden dimension size."
+    },
+    "intermediate_size": {
+      "type": "const",
+      "value": 2048,
+      "description": "MoE intermediate layer size."
+    },
+    "gemm1_out_size": {
+      "type": "const",
+      "value": 4096,
+      "description": "Output size of the first GEMM (W13). Should be 2 * intermediate_size."
+    },
+    "num_hidden_blocks": {
+      "type": "const",
+      "value": 56,
+      "description": "Number of quantized blocks along the hidden_size dimension (block_size=128)."
+    },
+    "num_intermediate_blocks": {
+      "type": "const",
+      "value": 16,
+      "description": "Number of quantized blocks along the intermediate_size dimension (block_size=128)."
+    },
+    "num_gemm1_out_blocks": {
+      "type": "const",
+      "value": 32,
+      "description": "Number of quantized blocks along the gemm1_out_size dimension (block_size=128)."
+    }
+  },
+  "inputs": {
+    "routing_logits": {
+      "shape": [
+        "seq_len",
+        "num_experts"
+      ],
+      "dtype": "float32",
+      "description": "Routing logits for expert selection."
+    },
+    "routing_bias": {
+      "shape": [
+        "num_experts"
+      ],
+      "dtype": "bfloat16",
+      "optional": true,
+      "description": "Bias added to logits before routing. Pass None for no bias."
+    },
+    "hidden_states": {
+      "shape": [
+        "seq_len",
+        "hidden_size"
+      ],
+      "dtype": "float8_e4m3fn",
+      "description": "Input hidden states tensor (FP8 quantized)."
+    },
+    "hidden_states_scale": {
+      "shape": [
+        "num_hidden_blocks",
+        "seq_len"
+      ],
+      "dtype": "float32",
+      "description": "Block-wise scaling factors for hidden states."
+    },
+    "gemm1_weights": {
+      "shape": [
+        "num_local_experts",
+        "gemm1_out_size",
+        "hidden_size"
+      ],
+      "dtype": "float8_e4m3fn",
+      "description": "First GEMM weights for all local experts (gate and up projections)."
+    },
+    "gemm1_weights_scale": {
+      "shape": [
+        "num_local_experts",
+        "num_gemm1_out_blocks",
+        "num_hidden_blocks"
+      ],
+      "dtype": "float32",
+      "description": "Block-wise scaling factors for first GEMM weights."
+    },
+    "gemm2_weights": {
+      "shape": [
+        "num_local_experts",
+        "hidden_size",
+        "intermediate_size"
+      ],
+      "dtype": "float8_e4m3fn",
+      "description": "Second GEMM weights for all local experts (down projection)."
+    },
+    "gemm2_weights_scale": {
+      "shape": [
+        "num_local_experts",
+        "num_hidden_blocks",
+        "num_intermediate_blocks"
+      ],
+      "dtype": "float32",
+      "description": "Block-wise scaling factors for second GEMM weights."
+    },
+    "top_k": {
+      "shape": null,
+      "dtype": "int32",
+      "description": "Number of experts to route to per token."
+    },
+    "local_expert_offset": {
+      "shape": null,
+      "dtype": "int32",
+      "description": "Offset of local experts in global expert space."
+    },
+    "routed_scaling_factor": {
+      "shape": null,
+      "dtype": "float32",
+      "description": "Scaling factor applied to routing weights."
+    }
+  },
+  "outputs": {
+    "output": {
+      "shape": [
+        "seq_len",
+        "hidden_size"
+      ],
+      "dtype": "bfloat16",
+      "description": "Final MoE output tensor."
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _trtllm_fp8_block_scale_moe_renormalize_naive_routing_reference(\n    routing_logits,\n    routing_bias,\n    hidden_states,\n    hidden_states_scale,\n    gemm1_weights,\n    gemm1_weights_scale,\n    gemm2_weights,\n    gemm2_weights_scale,\n    top_k,\n    local_expert_offset,\n    routed_scaling_factor,\n):\n    \"\"\"\n    FP8 block-scale MoE with RenormalizeNaive routing: Softmax \u2192 TopK \u2192 Renormalize.\n    Same as Default but the selected weights are re-normalised to sum to 1.\n    \"\"\"\n    TOP_K = int(top_k)\n    E_global = routing_logits.shape[1]\n    logits = routing_logits.to(torch.float32)\n    if routing_bias is not None:\n        logits = logits + routing_bias.to(torch.float32).reshape(-1)\n    s = torch.softmax(logits, dim=-1)\n    _, topk_idx = torch.topk(s, k=TOP_K, dim=1, largest=True, sorted=False)\n    gathered = s.gather(1, topk_idx)\n    weights = gathered / (gathered.sum(dim=1, keepdim=True) + 1e-20)\n    weights = weights * routed_scaling_factor\n    return _fp8_moe_run_experts(\n        hidden_states,\n        hidden_states_scale,\n        gemm1_weights,\n        gemm1_weights_scale,\n        gemm2_weights,\n        gemm2_weights_scale,\n        weights,\n        topk_idx,\n        local_expert_offset,\n        E_global,\n    )\n"
+}
diff --git a/tests/trace/fi_trace_out/moe_fp8_block_scale_renormalize_routing_topk8_e32_h7168_i2048.json b/tests/trace/fi_trace_out/moe_fp8_block_scale_renormalize_routing_topk8_e32_h7168_i2048.json
new file mode 100644
index 0000000000..21c72b18a1
--- /dev/null
+++ b/tests/trace/fi_trace_out/moe_fp8_block_scale_renormalize_routing_topk8_e32_h7168_i2048.json
@@ -0,0 +1,157 @@
+{
+  "name": "moe_fp8_block_scale_renormalize_routing_topk8_e32_h7168_i2048",
+  "description": "FP8 block scale MoE with Renormalize routing (TopK \u2192 Softmax).",
+  "op_type": "moe",
+  "tags": [
+    "fi_api:flashinfer.fused_moe.core.trtllm_fp8_block_scale_moe",
+    "status:verified",
+    "quantization:float8_e4m3fn"
+  ],
+  "axes": {
+    "seq_len": {
+      "type": "var",
+      "description": "Sequence length (number of tokens)"
+    },
+    "num_experts": {
+      "type": "const",
+      "value": 256,
+      "description": "Total number of experts."
+    },
+    "top_k": {
+      "type": "const",
+      "value": 8,
+      "description": "Number of experts to route to per token."
+    },
+    "num_local_experts": {
+      "type": "const",
+      "value": 32,
+      "description": "Number of local experts."
+    },
+    "hidden_size": {
+      "type": "const",
+      "value": 7168,
+      "description": "Hidden dimension size."
+    },
+    "intermediate_size": {
+      "type": "const",
+      "value": 2048,
+      "description": "MoE intermediate layer size."
+    },
+    "gemm1_out_size": {
+      "type": "const",
+      "value": 4096,
+      "description": "Output size of the first GEMM (W13). Should be 2 * intermediate_size."
+    },
+    "num_hidden_blocks": {
+      "type": "const",
+      "value": 56,
+      "description": "Number of quantized blocks along the hidden_size dimension (block_size=128)."
+    },
+    "num_intermediate_blocks": {
+      "type": "const",
+      "value": 16,
+      "description": "Number of quantized blocks along the intermediate_size dimension (block_size=128)."
+    },
+    "num_gemm1_out_blocks": {
+      "type": "const",
+      "value": 32,
+      "description": "Number of quantized blocks along the gemm1_out_size dimension (block_size=128)."
+    }
+  },
+  "inputs": {
+    "routing_logits": {
+      "shape": [
+        "seq_len",
+        "num_experts"
+      ],
+      "dtype": "float32",
+      "description": "Routing logits for expert selection."
+    },
+    "routing_bias": {
+      "shape": [
+        "num_experts"
+      ],
+      "dtype": "bfloat16",
+      "optional": true,
+      "description": "Bias added to logits before routing. Pass None for no bias."
+    },
+    "hidden_states": {
+      "shape": [
+        "seq_len",
+        "hidden_size"
+      ],
+      "dtype": "float8_e4m3fn",
+      "description": "Input hidden states tensor (FP8 quantized)."
+    },
+    "hidden_states_scale": {
+      "shape": [
+        "num_hidden_blocks",
+        "seq_len"
+      ],
+      "dtype": "float32",
+      "description": "Block-wise scaling factors for hidden states."
+    },
+    "gemm1_weights": {
+      "shape": [
+        "num_local_experts",
+        "gemm1_out_size",
+        "hidden_size"
+      ],
+      "dtype": "float8_e4m3fn",
+      "description": "First GEMM weights for all local experts (gate and up projections)."
+    },
+    "gemm1_weights_scale": {
+      "shape": [
+        "num_local_experts",
+        "num_gemm1_out_blocks",
+        "num_hidden_blocks"
+      ],
+      "dtype": "float32",
+      "description": "Block-wise scaling factors for first GEMM weights."
+    },
+    "gemm2_weights": {
+      "shape": [
+        "num_local_experts",
+        "hidden_size",
+        "intermediate_size"
+      ],
+      "dtype": "float8_e4m3fn",
+      "description": "Second GEMM weights for all local experts (down projection)."
+    },
+    "gemm2_weights_scale": {
+      "shape": [
+        "num_local_experts",
+        "num_hidden_blocks",
+        "num_intermediate_blocks"
+      ],
+      "dtype": "float32",
+      "description": "Block-wise scaling factors for second GEMM weights."
+    },
+    "top_k": {
+      "shape": null,
+      "dtype": "int32",
+      "description": "Number of experts to route to per token."
+    },
+    "local_expert_offset": {
+      "shape": null,
+      "dtype": "int32",
+      "description": "Offset of local experts in global expert space."
+    },
+    "routed_scaling_factor": {
+      "shape": null,
+      "dtype": "float32",
+      "description": "Scaling factor applied to routing weights."
+    }
+  },
+  "outputs": {
+    "output": {
+      "shape": [
+        "seq_len",
+        "hidden_size"
+      ],
+      "dtype": "bfloat16",
+      "description": "Final MoE output tensor."
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _trtllm_fp8_block_scale_moe_renormalize_routing_reference(\n    routing_logits,\n    routing_bias,\n    hidden_states,\n    hidden_states_scale,\n    gemm1_weights,\n    gemm1_weights_scale,\n    gemm2_weights,\n    gemm2_weights_scale,\n    top_k,\n    local_expert_offset,\n    routed_scaling_factor,\n):\n    \"\"\"\n    FP8 block-scale MoE with Renormalize routing: TopK \u2192 Softmax.\n    TopK is applied on raw logits; weights are then derived by softmax\n    over the selected logits.\n    \"\"\"\n    TOP_K = int(top_k)\n    E_global = routing_logits.shape[1]\n    logits = routing_logits.to(torch.float32)\n    if routing_bias is not None:\n        logits = logits + routing_bias.to(torch.float32).reshape(-1)\n    _, topk_idx = torch.topk(logits, k=TOP_K, dim=1, largest=True, sorted=False)\n    gathered = logits.gather(1, topk_idx)\n    weights = torch.softmax(gathered, dim=-1) * routed_scaling_factor\n    return _fp8_moe_run_experts(\n        hidden_states,\n        hidden_states_scale,\n        gemm1_weights,\n        gemm1_weights_scale,\n        gemm2_weights,\n        gemm2_weights_scale,\n        weights,\n        topk_idx,\n        local_expert_offset,\n        E_global,\n    )\n"
+}
diff --git a/tests/trace/fi_trace_out/moe_fp8_block_scale_topk_routing_topk8_e32_h7168_i2048.json b/tests/trace/fi_trace_out/moe_fp8_block_scale_topk_routing_topk8_e32_h7168_i2048.json
new file mode 100644
index 0000000000..fa32d64cf7
--- /dev/null
+++ b/tests/trace/fi_trace_out/moe_fp8_block_scale_topk_routing_topk8_e32_h7168_i2048.json
@@ -0,0 +1,157 @@
+{
+  "name": "moe_fp8_block_scale_topk_routing_topk8_e32_h7168_i2048",
+  "description": "FP8 block scale MoE with TopK-only routing (no softmax, uniform weights).",
+  "op_type": "moe",
+  "tags": [
+    "fi_api:flashinfer.fused_moe.core.trtllm_fp8_block_scale_moe",
+    "status:verified",
+    "quantization:float8_e4m3fn"
+  ],
+  "axes": {
+    "seq_len": {
+      "type": "var",
+      "description": "Sequence length (number of tokens)"
+    },
+    "num_experts": {
+      "type": "const",
+      "value": 256,
+      "description": "Total number of experts."
+    },
+    "top_k": {
+      "type": "const",
+      "value": 8,
+      "description": "Number of experts to route to per token."
+    },
+    "num_local_experts": {
+      "type": "const",
+      "value": 32,
+      "description": "Number of local experts."
+    },
+    "hidden_size": {
+      "type": "const",
+      "value": 7168,
+      "description": "Hidden dimension size."
+    },
+    "intermediate_size": {
+      "type": "const",
+      "value": 2048,
+      "description": "MoE intermediate layer size."
+    },
+    "gemm1_out_size": {
+      "type": "const",
+      "value": 4096,
+      "description": "Output size of the first GEMM (W13). Should be 2 * intermediate_size."
+    },
+    "num_hidden_blocks": {
+      "type": "const",
+      "value": 56,
+      "description": "Number of quantized blocks along the hidden_size dimension (block_size=128)."
+    },
+    "num_intermediate_blocks": {
+      "type": "const",
+      "value": 16,
+      "description": "Number of quantized blocks along the intermediate_size dimension (block_size=128)."
+    },
+    "num_gemm1_out_blocks": {
+      "type": "const",
+      "value": 32,
+      "description": "Number of quantized blocks along the gemm1_out_size dimension (block_size=128)."
+    }
+  },
+  "inputs": {
+    "routing_logits": {
+      "shape": [
+        "seq_len",
+        "num_experts"
+      ],
+      "dtype": "float32",
+      "description": "Routing logits for expert selection."
+    },
+    "routing_bias": {
+      "shape": [
+        "num_experts"
+      ],
+      "dtype": "bfloat16",
+      "optional": true,
+      "description": "Bias added to logits before routing. Pass None for no bias."
+    },
+    "hidden_states": {
+      "shape": [
+        "seq_len",
+        "hidden_size"
+      ],
+      "dtype": "float8_e4m3fn",
+      "description": "Input hidden states tensor (FP8 quantized)."
+    },
+    "hidden_states_scale": {
+      "shape": [
+        "num_hidden_blocks",
+        "seq_len"
+      ],
+      "dtype": "float32",
+      "description": "Block-wise scaling factors for hidden states."
+    },
+    "gemm1_weights": {
+      "shape": [
+        "num_local_experts",
+        "gemm1_out_size",
+        "hidden_size"
+      ],
+      "dtype": "float8_e4m3fn",
+      "description": "First GEMM weights for all local experts (gate and up projections)."
+    },
+    "gemm1_weights_scale": {
+      "shape": [
+        "num_local_experts",
+        "num_gemm1_out_blocks",
+        "num_hidden_blocks"
+      ],
+      "dtype": "float32",
+      "description": "Block-wise scaling factors for first GEMM weights."
+    },
+    "gemm2_weights": {
+      "shape": [
+        "num_local_experts",
+        "hidden_size",
+        "intermediate_size"
+      ],
+      "dtype": "float8_e4m3fn",
+      "description": "Second GEMM weights for all local experts (down projection)."
+    },
+    "gemm2_weights_scale": {
+      "shape": [
+        "num_local_experts",
+        "num_hidden_blocks",
+        "num_intermediate_blocks"
+      ],
+      "dtype": "float32",
+      "description": "Block-wise scaling factors for second GEMM weights."
+    },
+    "top_k": {
+      "shape": null,
+      "dtype": "int32",
+      "description": "Number of experts to route to per token."
+    },
+    "local_expert_offset": {
+      "shape": null,
+      "dtype": "int32",
+      "description": "Offset of local experts in global expert space."
+    },
+    "routed_scaling_factor": {
+      "shape": null,
+      "dtype": "float32",
+      "description": "Scaling factor applied to routing weights."
+    }
+  },
+  "outputs": {
+    "output": {
+      "shape": [
+        "seq_len",
+        "hidden_size"
+      ],
+      "dtype": "bfloat16",
+      "description": "Final MoE output tensor."
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _trtllm_fp8_block_scale_moe_topk_routing_reference(\n    routing_logits,\n    routing_bias,\n    hidden_states,\n    hidden_states_scale,\n    gemm1_weights,\n    gemm1_weights_scale,\n    gemm2_weights,\n    gemm2_weights_scale,\n    top_k,\n    local_expert_offset,\n    routed_scaling_factor,\n):\n    \"\"\"\n    FP8 block-scale MoE with TopK-only routing: TopK, uniform weights.\n    No softmax or sigmoid; all selected experts receive equal weight.\n    \"\"\"\n    TOP_K = int(top_k)\n    E_global = routing_logits.shape[1]\n    logits = routing_logits.to(torch.float32)\n    if routing_bias is not None:\n        logits = logits + routing_bias.to(torch.float32).reshape(-1)\n    _, topk_idx = torch.topk(logits, k=TOP_K, dim=1, largest=True, sorted=False)\n    T = logits.shape[0]\n    weights = torch.full(\n        (T, TOP_K),\n        routed_scaling_factor / TOP_K,\n        dtype=torch.float32,\n        device=logits.device,\n    )\n    return _fp8_moe_run_experts(\n        hidden_states,\n        hidden_states_scale,\n        gemm1_weights,\n        gemm1_weights_scale,\n        gemm2_weights,\n        gemm2_weights_scale,\n        weights,\n        topk_idx,\n        local_expert_offset,\n        E_global,\n    )\n"
+}
diff --git a/tests/trace/fi_trace_out/mxfp4_quantize_k4096.json b/tests/trace/fi_trace_out/mxfp4_quantize_k4096.json
new file mode 100644
index 0000000000..5b9b49d606
--- /dev/null
+++ b/tests/trace/fi_trace_out/mxfp4_quantize_k4096.json
@@ -0,0 +1,64 @@
+{
+  "name": "mxfp4_quantize_k4096",
+  "description": "MXFP4 quantization (sf_vec_size=32, UE8M0 scales). No global scale.",
+  "op_type": "quantization",
+  "tags": [
+    "fi_api:flashinfer.quantization.fp4_quantization.mxfp4_quantize",
+    "status:verified",
+    "quantization:mxfp4"
+  ],
+  "axes": {
+    "M": {
+      "type": "var",
+      "description": "Number of rows."
+    },
+    "K": {
+      "type": "const",
+      "value": 4096,
+      "description": "Number of input columns."
+    },
+    "K_packed": {
+      "type": "var",
+      "description": "Packed column dimension (K/2 for FP4, two values per uint8)."
+    },
+    "num_scale_elems": {
+      "type": "var",
+      "description": "Total number of scale factor elements (layout-dependent)."
+    },
+    "one": {
+      "type": "var",
+      "description": "Placeholder for shape [1] scalar tensors."
+    }
+  },
+  "constraints": [
+    "K_packed == K // 2"
+  ],
+  "inputs": {
+    "a": {
+      "shape": [
+        "M",
+        "K"
+      ],
+      "dtype": "bfloat16",
+      "description": "Input tensor, fp16/bf16."
+    }
+  },
+  "outputs": {
+    "quantized": {
+      "shape": [
+        "M",
+        "K_packed"
+      ],
+      "dtype": "uint8",
+      "description": "Packed FP4 output."
+    },
+    "scales": {
+      "shape": [
+        "num_scale_elems"
+      ],
+      "dtype": "uint8",
+      "description": "UE8M0 block scale factors (1 byte per 32-element block)."
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _mxfp4_quantize_reference(\n    a: torch.Tensor,\n    backend: str = \"cuda\",\n    enable_pdl: Optional[bool] = None,\n) -> Tuple[torch.Tensor, torch.Tensor]:\n    \"\"\"Reference MXFP4 quantize (block_size=32, UE8M0 scales).\"\"\"\n    return _fp4_quantize_reference(\n        a,\n        global_scale=None,\n        sf_vec_size=32,\n        sf_use_ue8m0=True,\n    )\n"
+}
diff --git a/tests/trace/fi_trace_out/mxfp8_quantize_k4096.json b/tests/trace/fi_trace_out/mxfp8_quantize_k4096.json
new file mode 100644
index 0000000000..f94ad85690
--- /dev/null
+++ b/tests/trace/fi_trace_out/mxfp8_quantize_k4096.json
@@ -0,0 +1,53 @@
+{
+  "name": "mxfp8_quantize_k4096",
+  "description": "MXFP8 quantization (block size 32, UE8M0 scales). Output is fp8_e4m3fn.",
+  "op_type": "quantization",
+  "tags": [
+    "fi_api:flashinfer.quantization.fp8_quantization.mxfp8_quantize",
+    "status:verified",
+    "quantization:mxfp8"
+  ],
+  "axes": {
+    "M": {
+      "type": "var",
+      "description": "Number of rows."
+    },
+    "K": {
+      "type": "const",
+      "value": 4096,
+      "description": "Number of input columns."
+    },
+    "num_scale_elems": {
+      "type": "var",
+      "description": "Total number of scale factor elements (layout-dependent)."
+    }
+  },
+  "inputs": {
+    "input": {
+      "shape": [
+        "M",
+        "K"
+      ],
+      "dtype": "bfloat16",
+      "description": "Input tensor, fp16/bf16."
+    }
+  },
+  "outputs": {
+    "quantized": {
+      "shape": [
+        "M",
+        "K"
+      ],
+      "dtype": "float8_e4m3fn",
+      "description": "MXFP8 quantized output."
+    },
+    "scales": {
+      "shape": [
+        "num_scale_elems"
+      ],
+      "dtype": "uint8",
+      "description": "UE8M0 block scale factors (1 byte per 32-element block)."
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _mxfp8_quantize_reference(\n    input: torch.Tensor,\n    is_sf_swizzled_layout: bool = True,\n    alignment: int = 32,\n    enable_pdl: Optional[bool] = None,\n    backend: str = \"cuda\",\n    sf_swizzle_layout=None,\n) -> Tuple[torch.Tensor, torch.Tensor]:\n    \"\"\"Reference MXFP8 quantize (block_size=32, UE8M0 scales).\"\"\"\n    return _quantize_mxfp8(\n        input.reshape(-1, input.shape[-1]),\n        block_size=int(alignment),\n    )\n"
+}
diff --git a/tests/trace/fi_trace_out/nvfp4_quantize_k4096.json b/tests/trace/fi_trace_out/nvfp4_quantize_k4096.json
new file mode 100644
index 0000000000..e5cbf248b5
--- /dev/null
+++ b/tests/trace/fi_trace_out/nvfp4_quantize_k4096.json
@@ -0,0 +1,77 @@
+{
+  "name": "nvfp4_quantize_k4096",
+  "description": "NVFP4 quantization (sf_vec_size=16). Requires a per-tensor global scale.",
+  "op_type": "quantization",
+  "tags": [
+    "fi_api:flashinfer.quantization.fp4_quantization.nvfp4_quantize",
+    "status:verified",
+    "quantization:nvfp4"
+  ],
+  "axes": {
+    "M": {
+      "type": "var",
+      "description": "Number of rows."
+    },
+    "K": {
+      "type": "const",
+      "value": 4096,
+      "description": "Number of input columns."
+    },
+    "K_packed": {
+      "type": "var",
+      "description": "Packed column dimension (K/2 for FP4, two values per uint8)."
+    },
+    "num_scale_elems": {
+      "type": "var",
+      "description": "Total number of scale factor elements (layout-dependent)."
+    },
+    "one": {
+      "type": "var",
+      "description": "Placeholder for shape [1] scalar tensors."
+    }
+  },
+  "constraints": [
+    "K_packed == K // 2"
+  ],
+  "inputs": {
+    "a": {
+      "shape": [
+        "M",
+        "K"
+      ],
+      "dtype": "bfloat16",
+      "description": "Input tensor, fp16/bf16/fp8_e4m3fn."
+    },
+    "a_global_sf": {
+      "shape": [
+        "one"
+      ],
+      "dtype": "float32",
+      "description": "Global scale factor, shape [1]."
+    },
+    "sf_vec_size": {
+      "shape": null,
+      "dtype": "int32",
+      "optional": true,
+      "description": "Scale-factor vector size (fixed at 16 for NVFP4)."
+    }
+  },
+  "outputs": {
+    "quantized": {
+      "shape": [
+        "M",
+        "K_packed"
+      ],
+      "dtype": "uint8",
+      "description": "Packed FP4 output."
+    },
+    "scales": {
+      "shape": [
+        "num_scale_elems"
+      ],
+      "dtype": "uint8",
+      "description": "Block scale factors packed as uint8 bytes (layout-dependent shape)."
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _nvfp4_quantize_reference(\n    a: torch.Tensor,\n    a_global_sf: torch.Tensor,\n    sfLayout=None,\n    do_shuffle: bool = False,\n    sf_vec_size: int = 16,\n    enable_pdl: Optional[bool] = None,\n    backend: str = \"cuda\",\n) -> Tuple[torch.Tensor, torch.Tensor]:\n    \"\"\"Reference NvFP4 quantize (block_size=16, fp8_e4m3fn scales).\"\"\"\n    return _fp4_quantize_reference(\n        a,\n        global_scale=a_global_sf,\n        sf_vec_size=sf_vec_size,\n        sf_use_ue8m0=False,\n    )\n"
+}
diff --git a/tests/trace/fi_trace_out/rmsnorm_h4096.json b/tests/trace/fi_trace_out/rmsnorm_h4096.json
new file mode 100644
index 0000000000..9bfac0e557
--- /dev/null
+++ b/tests/trace/fi_trace_out/rmsnorm_h4096.json
@@ -0,0 +1,43 @@
+{
+  "name": "rmsnorm_h4096",
+  "description": "Root Mean Square Normalization. Epsilon is fixed at 1e-6.",
+  "op_type": "rmsnorm",
+  "tags": [
+    "fi_api:flashinfer.norm.rmsnorm",
+    "status:verified"
+  ],
+  "axes": {
+    "batch_size": {
+      "type": "var"
+    },
+    "hidden_size": {
+      "type": "const",
+      "value": 4096
+    }
+  },
+  "inputs": {
+    "hidden_states": {
+      "shape": [
+        "batch_size",
+        "hidden_size"
+      ],
+      "dtype": "bfloat16"
+    },
+    "weight": {
+      "shape": [
+        "hidden_size"
+      ],
+      "dtype": "bfloat16"
+    }
+  },
+  "outputs": {
+    "output": {
+      "shape": [
+        "batch_size",
+        "hidden_size"
+      ],
+      "dtype": "bfloat16"
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _rmsnorm_reference(hidden_states, weight):\n    \"\"\"Root Mean Square Normalization. Epsilon is fixed at 1e-6.\"\"\"\n    EPS = 1e-6\n    x = hidden_states.to(torch.float32)\n    inv_rms = torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + EPS)\n    y = (x * inv_rms) * weight.to(torch.float32)\n    return y.to(hidden_states.dtype)\n"
+}
diff --git a/tests/trace/fi_trace_out/rmsnorm_h7168.json b/tests/trace/fi_trace_out/rmsnorm_h7168.json
new file mode 100644
index 0000000000..f1e6940f0b
--- /dev/null
+++ b/tests/trace/fi_trace_out/rmsnorm_h7168.json
@@ -0,0 +1,43 @@
+{
+  "name": "rmsnorm_h7168",
+  "description": "Root Mean Square Normalization. Epsilon is fixed at 1e-6.",
+  "op_type": "rmsnorm",
+  "tags": [
+    "fi_api:flashinfer.norm.rmsnorm",
+    "status:verified"
+  ],
+  "axes": {
+    "batch_size": {
+      "type": "var"
+    },
+    "hidden_size": {
+      "type": "const",
+      "value": 7168
+    }
+  },
+  "inputs": {
+    "hidden_states": {
+      "shape": [
+        "batch_size",
+        "hidden_size"
+      ],
+      "dtype": "bfloat16"
+    },
+    "weight": {
+      "shape": [
+        "hidden_size"
+      ],
+      "dtype": "bfloat16"
+    }
+  },
+  "outputs": {
+    "output": {
+      "shape": [
+        "batch_size",
+        "hidden_size"
+      ],
+      "dtype": "bfloat16"
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _rmsnorm_reference(hidden_states, weight):\n    \"\"\"Root Mean Square Normalization. Epsilon is fixed at 1e-6.\"\"\"\n    EPS = 1e-6\n    x = hidden_states.to(torch.float32)\n    inv_rms = torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + EPS)\n    y = (x * inv_rms) * weight.to(torch.float32)\n    return y.to(hidden_states.dtype)\n"
+}
diff --git a/tests/trace/fi_trace_out/rmsnorm_quant_h7168.json b/tests/trace/fi_trace_out/rmsnorm_quant_h7168.json
new file mode 100644
index 0000000000..81f03e85ae
--- /dev/null
+++ b/tests/trace/fi_trace_out/rmsnorm_quant_h7168.json
@@ -0,0 +1,50 @@
+{
+  "name": "rmsnorm_quant_h7168",
+  "description": "RMSNorm + FP8 quantization. out = quantize(rmsnorm(input, weight), scale).",
+  "op_type": "rmsnorm",
+  "tags": [
+    "fi_api:flashinfer.norm.rmsnorm_quant",
+    "status:verified",
+    "quantization:fp8"
+  ],
+  "axes": {
+    "batch_size": {
+      "type": "var"
+    },
+    "hidden_size": {
+      "type": "const",
+      "value": 7168
+    }
+  },
+  "inputs": {
+    "hidden_states": {
+      "shape": [
+        "batch_size",
+        "hidden_size"
+      ],
+      "dtype": "bfloat16"
+    },
+    "weight": {
+      "shape": [
+        "hidden_size"
+      ],
+      "dtype": "bfloat16"
+    },
+    "scale": {
+      "shape": null,
+      "dtype": "float32",
+      "description": "Per-tensor quantization scale, shape (1,)."
+    }
+  },
+  "outputs": {
+    "out": {
+      "shape": [
+        "batch_size",
+        "hidden_size"
+      ],
+      "dtype": "bfloat16",
+      "description": "Quantized output (dtype matches pre-allocated out tensor)."
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _rmsnorm_quant_reference(hidden_states, weight, scale):\n    \"\"\"RMSNorm followed by per-tensor FP8 (e4m3fn) quantization.\n\n    ``out = clamp(rmsnorm(input, weight) / scale, fp8_min, fp8_max).to(fp8_e4m3fn)``.\n    Epsilon is fixed at 1e-6.\n    \"\"\"\n    EPS = 1e-6\n    x = hidden_states.to(torch.float32)\n    inv_rms = torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + EPS)\n    y = (x * inv_rms) * weight.to(torch.float32)\n    s = (\n        scale.to(torch.float32).reshape(())\n        if isinstance(scale, torch.Tensor)\n        else float(scale)\n    )\n    y = y / s\n    fp8_max = 448.0  # float8_e4m3fn max finite value\n    y = y.clamp(-fp8_max, fp8_max)\n    return y.to(torch.float8_e4m3fn)\n"
+}
diff --git a/tests/trace/fi_trace_out/rope_cos_sin_cache_d128.json b/tests/trace/fi_trace_out/rope_cos_sin_cache_d128.json
new file mode 100644
index 0000000000..ae1a7ea719
--- /dev/null
+++ b/tests/trace/fi_trace_out/rope_cos_sin_cache_d128.json
@@ -0,0 +1,99 @@
+{
+  "name": "rope_cos_sin_cache_d128",
+  "description": "RoPE with precomputed cos/sin cache (SGL/vLLM-compatible).",
+  "op_type": "rope",
+  "tags": [
+    "fi_api:flashinfer.rope.apply_rope_with_cos_sin_cache",
+    "status:verified"
+  ],
+  "axes": {
+    "nnz": {
+      "type": "var",
+      "description": "Total number of tokens across the batch."
+    },
+    "num_q_heads_x_head_size": {
+      "type": "const",
+      "value": 4096,
+      "description": "num_q_heads * head_size (flattened query dimension)."
+    },
+    "num_k_heads_x_head_size": {
+      "type": "const",
+      "value": 1024,
+      "description": "num_k_heads * head_size (flattened key dimension)."
+    },
+    "head_size": {
+      "type": "const",
+      "value": 128
+    },
+    "max_seq_len": {
+      "type": "var",
+      "description": "cos_sin_cache length (max supported position)."
+    },
+    "rotary_dim": {
+      "type": "const",
+      "value": 128,
+      "description": "Rotary dimension (cos+sin concatenated along last axis)."
+    }
+  },
+  "inputs": {
+    "positions": {
+      "shape": [
+        "nnz"
+      ],
+      "dtype": "int32",
+      "description": "Per-token position index."
+    },
+    "query": {
+      "shape": [
+        "nnz",
+        "num_q_heads_x_head_size"
+      ],
+      "dtype": "bfloat16",
+      "description": "Flattened query tensor (nnz, num_q_heads * head_size)."
+    },
+    "key": {
+      "shape": [
+        "nnz",
+        "num_k_heads_x_head_size"
+      ],
+      "dtype": "bfloat16",
+      "description": "Flattened key tensor (nnz, num_k_heads * head_size)."
+    },
+    "head_size": {
+      "shape": null,
+      "dtype": "int32",
+      "description": "Head dimension."
+    },
+    "cos_sin_cache": {
+      "shape": [
+        "max_seq_len",
+        "rotary_dim"
+      ],
+      "dtype": "float32",
+      "description": "Precomputed cos+sin cache; cos first half, sin second half."
+    },
+    "is_neox": {
+      "shape": null,
+      "dtype": "int32",
+      "optional": true,
+      "description": "Bool: Neox (True) vs interleaved (False)."
+    }
+  },
+  "outputs": {
+    "query_out": {
+      "shape": [
+        "nnz",
+        "num_q_heads_x_head_size"
+      ],
+      "dtype": "bfloat16"
+    },
+    "key_out": {
+      "shape": [
+        "nnz",
+        "num_k_heads_x_head_size"
+      ],
+      "dtype": "bfloat16"
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _apply_rope_with_cos_sin_cache_reference(\n    positions: torch.Tensor,\n    query: torch.Tensor,\n    key: torch.Tensor,\n    head_size: int,\n    cos_sin_cache: torch.Tensor,\n    is_neox: bool = True,\n) -> Tuple[torch.Tensor, torch.Tensor]:\n    \"\"\"Apply RoPE with a precomputed cos/sin cache.\n\n    cos_sin_cache is ``[max_seq_len, rotary_dim]`` where the first half is\n    cos and the second half is sin. is_neox=True \u2192 half-split rotation;\n    is_neox=False \u2192 interleaved rotation.\n    \"\"\"\n    rotary_dim = cos_sin_cache.shape[-1]\n    cos_cache = cos_sin_cache[:, : rotary_dim // 2]\n    sin_cache = cos_sin_cache[:, rotary_dim // 2 :]\n    cos = cos_cache[positions.to(torch.long)].unsqueeze(1)  # [nnz, 1, rotary_dim//2]\n    sin = sin_cache[positions.to(torch.long)].unsqueeze(1)\n    # Reshape flattened (nnz, H*D) \u2192 (nnz, H, D) for rotation.\n    q_view = query.view(query.shape[0], -1, head_size)\n    k_view = key.view(key.shape[0], -1, head_size)\n    q_rope = _rotate(q_view.to(torch.float32), cos, sin, interleave=not is_neox)\n    k_rope = _rotate(k_view.to(torch.float32), cos, sin, interleave=not is_neox)\n    return (\n        q_rope.reshape(query.shape).to(query.dtype),\n        k_rope.reshape(key.shape).to(key.dtype),\n    )\n"
+}
diff --git a/tests/trace/fi_trace_out/rope_cos_sin_cache_inplace_d128.json b/tests/trace/fi_trace_out/rope_cos_sin_cache_inplace_d128.json
new file mode 100644
index 0000000000..0a9fa4d85e
--- /dev/null
+++ b/tests/trace/fi_trace_out/rope_cos_sin_cache_inplace_d128.json
@@ -0,0 +1,101 @@
+{
+  "name": "rope_cos_sin_cache_inplace_d128",
+  "description": "In-place RoPE with precomputed cos/sin cache.",
+  "op_type": "rope",
+  "tags": [
+    "fi_api:flashinfer.rope.apply_rope_with_cos_sin_cache_inplace",
+    "status:verified"
+  ],
+  "axes": {
+    "nnz": {
+      "type": "var",
+      "description": "Total number of tokens across the batch."
+    },
+    "num_q_heads_x_head_size": {
+      "type": "const",
+      "value": 4096,
+      "description": "num_q_heads * head_size (flattened query dimension)."
+    },
+    "num_k_heads_x_head_size": {
+      "type": "const",
+      "value": 1024,
+      "description": "num_k_heads * head_size (flattened key dimension)."
+    },
+    "head_size": {
+      "type": "const",
+      "value": 128
+    },
+    "max_seq_len": {
+      "type": "var",
+      "description": "cos_sin_cache length (max supported position)."
+    },
+    "rotary_dim": {
+      "type": "const",
+      "value": 128,
+      "description": "Rotary dimension (cos+sin concatenated along last axis)."
+    }
+  },
+  "inputs": {
+    "positions": {
+      "shape": [
+        "nnz"
+      ],
+      "dtype": "int32",
+      "description": "Per-token position index."
+    },
+    "query": {
+      "shape": [
+        "nnz",
+        "num_q_heads_x_head_size"
+      ],
+      "dtype": "bfloat16",
+      "description": "Flattened query tensor (nnz, num_q_heads * head_size)."
+    },
+    "key": {
+      "shape": [
+        "nnz",
+        "num_k_heads_x_head_size"
+      ],
+      "dtype": "bfloat16",
+      "description": "Flattened key tensor (nnz, num_k_heads * head_size)."
+    },
+    "head_size": {
+      "shape": null,
+      "dtype": "int32",
+      "description": "Head dimension."
+    },
+    "cos_sin_cache": {
+      "shape": [
+        "max_seq_len",
+        "rotary_dim"
+      ],
+      "dtype": "float32",
+      "description": "Precomputed cos+sin cache; cos first half, sin second half."
+    },
+    "is_neox": {
+      "shape": null,
+      "dtype": "int32",
+      "optional": true,
+      "description": "Bool: Neox (True) vs interleaved (False)."
+    }
+  },
+  "outputs": {
+    "query": {
+      "shape": [
+        "nnz",
+        "num_q_heads_x_head_size"
+      ],
+      "dtype": "bfloat16",
+      "description": "Updated query (in-place)."
+    },
+    "key": {
+      "shape": [
+        "nnz",
+        "num_k_heads_x_head_size"
+      ],
+      "dtype": "bfloat16",
+      "description": "Updated key (in-place)."
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _apply_rope_with_cos_sin_cache_reference(\n    positions: torch.Tensor,\n    query: torch.Tensor,\n    key: torch.Tensor,\n    head_size: int,\n    cos_sin_cache: torch.Tensor,\n    is_neox: bool = True,\n) -> Tuple[torch.Tensor, torch.Tensor]:\n    \"\"\"Apply RoPE with a precomputed cos/sin cache.\n\n    cos_sin_cache is ``[max_seq_len, rotary_dim]`` where the first half is\n    cos and the second half is sin. is_neox=True \u2192 half-split rotation;\n    is_neox=False \u2192 interleaved rotation.\n    \"\"\"\n    rotary_dim = cos_sin_cache.shape[-1]\n    cos_cache = cos_sin_cache[:, : rotary_dim // 2]\n    sin_cache = cos_sin_cache[:, rotary_dim // 2 :]\n    cos = cos_cache[positions.to(torch.long)].unsqueeze(1)  # [nnz, 1, rotary_dim//2]\n    sin = sin_cache[positions.to(torch.long)].unsqueeze(1)\n    # Reshape flattened (nnz, H*D) \u2192 (nnz, H, D) for rotation.\n    q_view = query.view(query.shape[0], -1, head_size)\n    k_view = key.view(key.shape[0], -1, head_size)\n    q_rope = _rotate(q_view.to(torch.float32), cos, sin, interleave=not is_neox)\n    k_rope = _rotate(k_view.to(torch.float32), cos, sin, interleave=not is_neox)\n    return (\n        q_rope.reshape(query.shape).to(query.dtype),\n        k_rope.reshape(key.shape).to(key.dtype),\n    )\n"
+}
diff --git a/tests/trace/fi_trace_out/rope_h32_kv8_d128.json b/tests/trace/fi_trace_out/rope_h32_kv8_d128.json
new file mode 100644
index 0000000000..0f72faf1be
--- /dev/null
+++ b/tests/trace/fi_trace_out/rope_h32_kv8_d128.json
@@ -0,0 +1,113 @@
+{
+  "name": "rope_h32_kv8_d128",
+  "description": "Standard RoPE on ragged q/k using indptr + per-seq offsets.",
+  "op_type": "rope",
+  "tags": [
+    "fi_api:flashinfer.rope.apply_rope",
+    "status:verified"
+  ],
+  "axes": {
+    "nnz": {
+      "type": "var",
+      "description": "Total number of tokens across the batch."
+    },
+    "batch_size": {
+      "type": "var",
+      "description": "Number of sequences in the batch."
+    },
+    "num_q_heads": {
+      "type": "const",
+      "value": 32
+    },
+    "num_k_heads": {
+      "type": "const",
+      "value": 8
+    },
+    "head_dim": {
+      "type": "const",
+      "value": 128
+    },
+    "batch_size_plus_1": {
+      "type": "var",
+      "description": "batch_size + 1."
+    }
+  },
+  "constraints": [
+    "batch_size_plus_1 == batch_size + 1"
+  ],
+  "inputs": {
+    "q": {
+      "shape": [
+        "nnz",
+        "num_q_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "k": {
+      "shape": [
+        "nnz",
+        "num_k_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "indptr": {
+      "shape": [
+        "batch_size_plus_1"
+      ],
+      "dtype": "int32",
+      "description": "Ragged batch indptr, shape (batch_size + 1)."
+    },
+    "offsets": {
+      "shape": [
+        "batch_size"
+      ],
+      "dtype": "int32",
+      "description": "Per-sequence starting position offset."
+    },
+    "rotary_dim": {
+      "shape": null,
+      "dtype": "int32",
+      "optional": true,
+      "description": "If None, uses head_dim. Rotate only the first `rotary_dim` dims."
+    },
+    "interleave": {
+      "shape": null,
+      "dtype": "int32",
+      "optional": true,
+      "description": "Bool: interleaved (True) vs half-split (False) rotation."
+    },
+    "rope_scale": {
+      "shape": null,
+      "dtype": "float32",
+      "optional": true,
+      "description": "Scale factor."
+    },
+    "rope_theta": {
+      "shape": null,
+      "dtype": "float32",
+      "optional": true,
+      "description": "Theta value."
+    }
+  },
+  "outputs": {
+    "q_rope": {
+      "shape": [
+        "nnz",
+        "num_q_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "k_rope": {
+      "shape": [
+        "nnz",
+        "num_k_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _apply_rope_reference(\n    q: torch.Tensor,\n    k: torch.Tensor,\n    indptr: torch.Tensor,\n    offsets: torch.Tensor,\n    rotary_dim: Optional[int] = None,\n    interleave: bool = False,\n    rope_scale: float = 1,\n    rope_theta: float = 1e4,\n) -> Tuple[torch.Tensor, torch.Tensor]:\n    if rotary_dim is None:\n        rotary_dim = q.shape[-1]\n    freqs = _rope_freqs(rotary_dim, rope_theta, q.device) / rope_scale\n    positions = _positions_from_indptr(indptr, offsets, q.shape[0])\n    return _apply_rope_core(q, k, positions, freqs, interleave)\n"
+}
diff --git a/tests/trace/fi_trace_out/rope_inplace_h32_kv8_d128.json b/tests/trace/fi_trace_out/rope_inplace_h32_kv8_d128.json
new file mode 100644
index 0000000000..a66ba71c23
--- /dev/null
+++ b/tests/trace/fi_trace_out/rope_inplace_h32_kv8_d128.json
@@ -0,0 +1,115 @@
+{
+  "name": "rope_inplace_h32_kv8_d128",
+  "description": "In-place standard RoPE; q and k are mutated.",
+  "op_type": "rope",
+  "tags": [
+    "fi_api:flashinfer.rope.apply_rope_inplace",
+    "status:verified"
+  ],
+  "axes": {
+    "nnz": {
+      "type": "var",
+      "description": "Total number of tokens across the batch."
+    },
+    "batch_size": {
+      "type": "var",
+      "description": "Number of sequences in the batch."
+    },
+    "num_q_heads": {
+      "type": "const",
+      "value": 32
+    },
+    "num_k_heads": {
+      "type": "const",
+      "value": 8
+    },
+    "head_dim": {
+      "type": "const",
+      "value": 128
+    },
+    "batch_size_plus_1": {
+      "type": "var",
+      "description": "batch_size + 1."
+    }
+  },
+  "constraints": [
+    "batch_size_plus_1 == batch_size + 1"
+  ],
+  "inputs": {
+    "q": {
+      "shape": [
+        "nnz",
+        "num_q_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "k": {
+      "shape": [
+        "nnz",
+        "num_k_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "indptr": {
+      "shape": [
+        "batch_size_plus_1"
+      ],
+      "dtype": "int32",
+      "description": "Ragged batch indptr, shape (batch_size + 1)."
+    },
+    "offsets": {
+      "shape": [
+        "batch_size"
+      ],
+      "dtype": "int32",
+      "description": "Per-sequence starting position offset."
+    },
+    "rotary_dim": {
+      "shape": null,
+      "dtype": "int32",
+      "optional": true,
+      "description": "If None, uses head_dim. Rotate only the first `rotary_dim` dims."
+    },
+    "interleave": {
+      "shape": null,
+      "dtype": "int32",
+      "optional": true,
+      "description": "Bool: interleaved (True) vs half-split (False) rotation."
+    },
+    "rope_scale": {
+      "shape": null,
+      "dtype": "float32",
+      "optional": true,
+      "description": "Scale factor."
+    },
+    "rope_theta": {
+      "shape": null,
+      "dtype": "float32",
+      "optional": true,
+      "description": "Theta value."
+    }
+  },
+  "outputs": {
+    "q": {
+      "shape": [
+        "nnz",
+        "num_q_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16",
+      "description": "Updated q (in-place)."
+    },
+    "k": {
+      "shape": [
+        "nnz",
+        "num_k_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16",
+      "description": "Updated k (in-place)."
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _apply_rope_reference(\n    q: torch.Tensor,\n    k: torch.Tensor,\n    indptr: torch.Tensor,\n    offsets: torch.Tensor,\n    rotary_dim: Optional[int] = None,\n    interleave: bool = False,\n    rope_scale: float = 1,\n    rope_theta: float = 1e4,\n) -> Tuple[torch.Tensor, torch.Tensor]:\n    if rotary_dim is None:\n        rotary_dim = q.shape[-1]\n    freqs = _rope_freqs(rotary_dim, rope_theta, q.device) / rope_scale\n    positions = _positions_from_indptr(indptr, offsets, q.shape[0])\n    return _apply_rope_core(q, k, positions, freqs, interleave)\n"
+}
diff --git a/tests/trace/fi_trace_out/rope_pos_ids_h32_kv8_d128.json b/tests/trace/fi_trace_out/rope_pos_ids_h32_kv8_d128.json
new file mode 100644
index 0000000000..041ecda240
--- /dev/null
+++ b/tests/trace/fi_trace_out/rope_pos_ids_h32_kv8_d128.json
@@ -0,0 +1,91 @@
+{
+  "name": "rope_pos_ids_h32_kv8_d128",
+  "description": "Standard RoPE using explicit per-token position ids.",
+  "op_type": "rope",
+  "tags": [
+    "fi_api:flashinfer.rope.apply_rope_pos_ids",
+    "status:verified"
+  ],
+  "axes": {
+    "nnz": {
+      "type": "var",
+      "description": "Total number of tokens across the batch."
+    },
+    "num_q_heads": {
+      "type": "const",
+      "value": 32
+    },
+    "num_k_heads": {
+      "type": "const",
+      "value": 8
+    },
+    "head_dim": {
+      "type": "const",
+      "value": 128
+    }
+  },
+  "inputs": {
+    "q": {
+      "shape": [
+        "nnz",
+        "num_q_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "k": {
+      "shape": [
+        "nnz",
+        "num_k_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "pos_ids": {
+      "shape": [
+        "nnz"
+      ],
+      "dtype": "int32",
+      "description": "Per-token position index."
+    },
+    "rotary_dim": {
+      "shape": null,
+      "dtype": "int32",
+      "optional": true
+    },
+    "interleave": {
+      "shape": null,
+      "dtype": "int32",
+      "optional": true
+    },
+    "rope_scale": {
+      "shape": null,
+      "dtype": "float32",
+      "optional": true
+    },
+    "rope_theta": {
+      "shape": null,
+      "dtype": "float32",
+      "optional": true
+    }
+  },
+  "outputs": {
+    "q_rope": {
+      "shape": [
+        "nnz",
+        "num_q_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "k_rope": {
+      "shape": [
+        "nnz",
+        "num_k_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _apply_rope_pos_ids_reference(\n    q: torch.Tensor,\n    k: torch.Tensor,\n    pos_ids: torch.Tensor,\n    rotary_dim: Optional[int] = None,\n    interleave: bool = False,\n    rope_scale: float = 1,\n    rope_theta: float = 1e4,\n) -> Tuple[torch.Tensor, torch.Tensor]:\n    if rotary_dim is None:\n        rotary_dim = q.shape[-1]\n    freqs = _rope_freqs(rotary_dim, rope_theta, q.device) / rope_scale\n    return _apply_rope_core(q, k, pos_ids.to(torch.float32), freqs, interleave)\n"
+}
diff --git a/tests/trace/fi_trace_out/rope_pos_ids_inplace_h32_kv8_d128.json b/tests/trace/fi_trace_out/rope_pos_ids_inplace_h32_kv8_d128.json
new file mode 100644
index 0000000000..5c21d56e82
--- /dev/null
+++ b/tests/trace/fi_trace_out/rope_pos_ids_inplace_h32_kv8_d128.json
@@ -0,0 +1,93 @@
+{
+  "name": "rope_pos_ids_inplace_h32_kv8_d128",
+  "description": "In-place RoPE using explicit per-token position ids.",
+  "op_type": "rope",
+  "tags": [
+    "fi_api:flashinfer.rope.apply_rope_pos_ids_inplace",
+    "status:verified"
+  ],
+  "axes": {
+    "nnz": {
+      "type": "var",
+      "description": "Total number of tokens across the batch."
+    },
+    "num_q_heads": {
+      "type": "const",
+      "value": 32
+    },
+    "num_k_heads": {
+      "type": "const",
+      "value": 8
+    },
+    "head_dim": {
+      "type": "const",
+      "value": 128
+    }
+  },
+  "inputs": {
+    "q": {
+      "shape": [
+        "nnz",
+        "num_q_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "k": {
+      "shape": [
+        "nnz",
+        "num_k_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "pos_ids": {
+      "shape": [
+        "nnz"
+      ],
+      "dtype": "int32",
+      "description": "Per-token position index."
+    },
+    "rotary_dim": {
+      "shape": null,
+      "dtype": "int32",
+      "optional": true
+    },
+    "interleave": {
+      "shape": null,
+      "dtype": "int32",
+      "optional": true
+    },
+    "rope_scale": {
+      "shape": null,
+      "dtype": "float32",
+      "optional": true
+    },
+    "rope_theta": {
+      "shape": null,
+      "dtype": "float32",
+      "optional": true
+    }
+  },
+  "outputs": {
+    "q": {
+      "shape": [
+        "nnz",
+        "num_q_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16",
+      "description": "Updated q (in-place)."
+    },
+    "k": {
+      "shape": [
+        "nnz",
+        "num_k_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16",
+      "description": "Updated k (in-place)."
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _apply_rope_pos_ids_reference(\n    q: torch.Tensor,\n    k: torch.Tensor,\n    pos_ids: torch.Tensor,\n    rotary_dim: Optional[int] = None,\n    interleave: bool = False,\n    rope_scale: float = 1,\n    rope_theta: float = 1e4,\n) -> Tuple[torch.Tensor, torch.Tensor]:\n    if rotary_dim is None:\n        rotary_dim = q.shape[-1]\n    freqs = _rope_freqs(rotary_dim, rope_theta, q.device) / rope_scale\n    return _apply_rope_core(q, k, pos_ids.to(torch.float32), freqs, interleave)\n"
+}
diff --git a/tests/trace/fi_trace_out/sampling_from_logits_v32000.json b/tests/trace/fi_trace_out/sampling_from_logits_v32000.json
new file mode 100644
index 0000000000..f468349f45
--- /dev/null
+++ b/tests/trace/fi_trace_out/sampling_from_logits_v32000.json
@@ -0,0 +1,47 @@
+{
+  "name": "sampling_from_logits_v32000",
+  "description": "Fused sampling from logits (equivalent to softmax + sampling). Reference uses softmax + argmax.",
+  "op_type": "sampling",
+  "tags": [
+    "fi_api:flashinfer.sampling.sampling_from_logits",
+    "status:verified"
+  ],
+  "axes": {
+    "batch_size": {
+      "type": "var"
+    },
+    "vocab_size": {
+      "type": "const",
+      "value": 32000
+    },
+    "num_indices": {
+      "type": "var",
+      "description": "Length of optional indices tensor."
+    }
+  },
+  "inputs": {
+    "logits": {
+      "shape": [
+        "batch_size",
+        "vocab_size"
+      ],
+      "dtype": "float32"
+    },
+    "indices": {
+      "shape": [
+        "num_indices"
+      ],
+      "dtype": "unknown",
+      "optional": true
+    }
+  },
+  "outputs": {
+    "samples": {
+      "shape": [
+        "batch_size"
+      ],
+      "dtype": "int32"
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _sampling_from_logits_reference(logits, indices=None, **_unused):\n    probs = torch.softmax(logits.to(torch.float32), dim=-1)\n    return _sampling_from_probs_reference(probs, indices=indices)\n"
+}
diff --git a/tests/trace/fi_trace_out/sampling_from_probs_v32000.json b/tests/trace/fi_trace_out/sampling_from_probs_v32000.json
new file mode 100644
index 0000000000..ea953f1e76
--- /dev/null
+++ b/tests/trace/fi_trace_out/sampling_from_probs_v32000.json
@@ -0,0 +1,47 @@
+{
+  "name": "sampling_from_probs_v32000",
+  "description": "Fused categorical sampling from [batch_size, vocab_size] probs. Reference uses argmax (matches deterministic=True).",
+  "op_type": "sampling",
+  "tags": [
+    "fi_api:flashinfer.sampling.sampling_from_probs",
+    "status:verified"
+  ],
+  "axes": {
+    "batch_size": {
+      "type": "var"
+    },
+    "vocab_size": {
+      "type": "const",
+      "value": 32000
+    },
+    "num_indices": {
+      "type": "var",
+      "description": "Length of optional indices tensor."
+    }
+  },
+  "inputs": {
+    "probs": {
+      "shape": [
+        "batch_size",
+        "vocab_size"
+      ],
+      "dtype": "float32"
+    },
+    "indices": {
+      "shape": [
+        "num_indices"
+      ],
+      "dtype": "unknown",
+      "optional": true
+    }
+  },
+  "outputs": {
+    "samples": {
+      "shape": [
+        "batch_size"
+      ],
+      "dtype": "int32"
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _sampling_from_probs_reference(probs, indices=None, **_unused):\n    \"\"\"Categorical sampling from probabilities (deterministic: argmax).\"\"\"\n    p = probs.to(torch.float32)\n    if indices is not None:\n        p = p[indices.to(torch.long)]\n    return p.argmax(dim=-1).to(torch.int32)\n"
+}
diff --git a/tests/trace/fi_trace_out/segment_gemm_run_k128_n64.json b/tests/trace/fi_trace_out/segment_gemm_run_k128_n64.json
new file mode 100644
index 0000000000..0d86d7b178
--- /dev/null
+++ b/tests/trace/fi_trace_out/segment_gemm_run_k128_n64.json
@@ -0,0 +1,56 @@
+{
+  "name": "segment_gemm_run_k128_n64",
+  "description": "SegmentGEMMWrapper.run(): variable-size batched GEMM over concatenated row segments. x is a ragged stack of per-segment inputs; weights may be shared or per-segment.",
+  "op_type": "segment_gemm",
+  "tags": [
+    "fi_api:flashinfer.gemm.gemm_base.SegmentGEMMWrapper.run",
+    "status:verified"
+  ],
+  "axes": {
+    "total_rows": {
+      "type": "var",
+      "description": "Total rows across all segments."
+    },
+    "K": {
+      "type": "const",
+      "value": 128
+    },
+    "N": {
+      "type": "const",
+      "value": 64
+    },
+    "batch_size": {
+      "type": "var",
+      "description": "Number of segments."
+    }
+  },
+  "inputs": {
+    "x": {
+      "shape": [
+        "total_rows",
+        "K"
+      ],
+      "dtype": "bfloat16",
+      "description": "Stacked segment inputs, row-concatenated."
+    },
+    "weights": {
+      "shape": [
+        "batch_size",
+        "K",
+        "N"
+      ],
+      "dtype": "bfloat16",
+      "description": "Per-segment weight tensors (may be shared across segments)."
+    }
+  },
+  "outputs": {
+    "output": {
+      "shape": [
+        "total_rows",
+        "N"
+      ],
+      "dtype": "bfloat16"
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _segment_gemm_run_reference(x, weights, **_unused):\n    \"\"\"Batched matmul: per-segment weights applied to stacked rows. Assumes\n    the caller passes a seg_indptr via kwargs; falls back to broadcasting\n    the first weight if unavailable.\"\"\"\n    seg_indptr = _unused.get(\"seg_indptr\")\n    if seg_indptr is None:\n        return torch.matmul(x.to(torch.float32), weights[0].to(torch.float32)).to(x.dtype)\n    out = torch.zeros(\n        (x.shape[0], weights.shape[-1]), dtype=torch.float32, device=x.device,\n    )\n    for i in range(weights.shape[0]):\n        start = int(seg_indptr[i].item())\n        end = int(seg_indptr[i + 1].item())\n        out[start:end] = x[start:end].to(torch.float32) @ weights[i].to(torch.float32)\n    return out.to(x.dtype)\n"
+}
diff --git a/tests/trace/fi_trace_out/silu_and_mul_h16384.json b/tests/trace/fi_trace_out/silu_and_mul_h16384.json
new file mode 100644
index 0000000000..53c49e34f1
--- /dev/null
+++ b/tests/trace/fi_trace_out/silu_and_mul_h16384.json
@@ -0,0 +1,41 @@
+{
+  "name": "silu_and_mul_h16384",
+  "description": "Fused SiLU + Mul: silu(x[:H]) * x[H:]. Used in LLaMA/Mistral FFN.",
+  "op_type": "activation",
+  "tags": [
+    "fi_api:flashinfer.activation.silu_and_mul",
+    "status:verified",
+    "fused"
+  ],
+  "axes": {
+    "num_tokens": {
+      "type": "var",
+      "description": "Total number of tokens (batch_size * seq_len)."
+    },
+    "hidden_size": {
+      "type": "const",
+      "value": 16384,
+      "description": "Output hidden size (input is 2*h)."
+    }
+  },
+  "inputs": {
+    "input": {
+      "shape": [
+        "num_tokens",
+        "hidden_size"
+      ],
+      "dtype": "bfloat16",
+      "description": "Gated input tensor of shape [num_tokens, 2*hidden_size]."
+    }
+  },
+  "outputs": {
+    "output": {
+      "shape": [
+        "num_tokens",
+        "hidden_size"
+      ],
+      "dtype": "bfloat16"
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _silu_and_mul_reference(input):\n    \"\"\"Fused SiLU + Mul: silu(input[..., :H]) * input[..., H:]\"\"\"\n    half = input.shape[-1] // 2\n    return F.silu(input[..., :half]) * input[..., half:]\n"
+}
diff --git a/tests/trace/fi_trace_out/single_decode_h32_kv8_d128.json b/tests/trace/fi_trace_out/single_decode_h32_kv8_d128.json
new file mode 100644
index 0000000000..101fbb92fa
--- /dev/null
+++ b/tests/trace/fi_trace_out/single_decode_h32_kv8_d128.json
@@ -0,0 +1,65 @@
+{
+  "name": "single_decode_h32_kv8_d128",
+  "description": "Single-request decode. Q has no batch dim ([num_qo_heads, head_dim]); K and V are contiguous ([kv_len, num_kv_heads, head_dim]). No paging, no plan().",
+  "op_type": "single_decode",
+  "tags": [
+    "fi_api:flashinfer.decode.single_decode_with_kv_cache",
+    "status:verified",
+    "stage:decode"
+  ],
+  "axes": {
+    "num_qo_heads": {
+      "type": "const",
+      "value": 32
+    },
+    "num_kv_heads": {
+      "type": "const",
+      "value": 8
+    },
+    "head_dim": {
+      "type": "const",
+      "value": 128
+    },
+    "kv_len": {
+      "type": "var",
+      "description": "Length of the K/V context."
+    }
+  },
+  "inputs": {
+    "q": {
+      "shape": [
+        "num_qo_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "k": {
+      "shape": [
+        "kv_len",
+        "num_kv_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16",
+      "description": "Key cache, shape varies with kv_layout (default NHD)."
+    },
+    "v": {
+      "shape": [
+        "kv_len",
+        "num_kv_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16",
+      "description": "Value cache, shape varies with kv_layout (default NHD)."
+    }
+  },
+  "outputs": {
+    "output": {
+      "shape": [
+        "num_qo_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _single_decode_reference(q, k, v, **kwargs):\n    \"\"\"Single-request decode: q @ K.T \u2192 softmax \u2192 @ V, broadcasting GQA.\"\"\"\n    num_qo_heads, head_dim = q.shape\n    kv_len, num_kv_heads, _ = k.shape\n    gqa_ratio = num_qo_heads // num_kv_heads\n    sm_scale = kwargs.get(\"sm_scale\")\n    if sm_scale is None:\n        sm_scale = 1.0 / math.sqrt(head_dim)\n    output = torch.zeros_like(q, dtype=torch.float32)\n    for h in range(num_qo_heads):\n        kv_h = h // gqa_ratio\n        logits = (\n            torch.matmul(q[h].to(torch.float32), k[:, kv_h].to(torch.float32).T)\n            * sm_scale\n        )\n        attn = torch.softmax(logits, dim=-1)\n        output[h] = torch.matmul(attn, v[:, kv_h].to(torch.float32))\n    return output.to(q.dtype)\n"
+}
diff --git a/tests/trace/fi_trace_out/single_prefill_h32_kv8_d128.json b/tests/trace/fi_trace_out/single_prefill_h32_kv8_d128.json
new file mode 100644
index 0000000000..c2d63279f9
--- /dev/null
+++ b/tests/trace/fi_trace_out/single_prefill_h32_kv8_d128.json
@@ -0,0 +1,69 @@
+{
+  "name": "single_prefill_h32_kv8_d128",
+  "description": "Single-request prefill. Q is [qo_len, H, D]; K, V are contiguous [kv_len, Hkv, D]. No paging, no plan(). Optional causal mask and custom_mask.",
+  "op_type": "single_prefill",
+  "tags": [
+    "fi_api:flashinfer.prefill.single_prefill_with_kv_cache",
+    "status:verified",
+    "stage:prefill"
+  ],
+  "axes": {
+    "num_qo_heads": {
+      "type": "const",
+      "value": 32
+    },
+    "num_kv_heads": {
+      "type": "const",
+      "value": 8
+    },
+    "head_dim": {
+      "type": "const",
+      "value": 128
+    },
+    "qo_len": {
+      "type": "var",
+      "description": "Length of the query sequence."
+    },
+    "kv_len": {
+      "type": "var",
+      "description": "Length of the K/V sequence."
+    }
+  },
+  "inputs": {
+    "q": {
+      "shape": [
+        "qo_len",
+        "num_qo_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "k": {
+      "shape": [
+        "kv_len",
+        "num_kv_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "v": {
+      "shape": [
+        "kv_len",
+        "num_kv_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    }
+  },
+  "outputs": {
+    "output": {
+      "shape": [
+        "qo_len",
+        "num_qo_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _single_prefill_reference(q, k, v, **kwargs):\n    \"\"\"Single-request prefill: standard SDPA with optional causal mask.\"\"\"\n    qo_len, num_qo_heads, head_dim = q.shape\n    kv_len, num_kv_heads, _ = k.shape\n    gqa_ratio = num_qo_heads // num_kv_heads\n    causal = bool(kwargs.get(\"causal\", False))\n    sm_scale = kwargs.get(\"sm_scale\")\n    if sm_scale is None:\n        sm_scale = 1.0 / math.sqrt(head_dim)\n    output = torch.zeros_like(q, dtype=torch.float32)\n    delta = kv_len - qo_len\n    for h in range(num_qo_heads):\n        kv_h = h // gqa_ratio\n        logits = (\n            torch.matmul(q[:, h].to(torch.float32), k[:, kv_h].to(torch.float32).T)\n            * sm_scale\n        )\n        if causal:\n            mask = torch.full_like(logits, float(\"-inf\"))\n            for qi in range(qo_len):\n                mask[qi, : qi + 1 + max(0, delta)] = 0.0\n            logits = logits + mask\n        attn = torch.softmax(logits, dim=-1)\n        output[:, h] = torch.matmul(attn, v[:, kv_h].to(torch.float32))\n    return output.to(q.dtype)\n"
+}
diff --git a/tests/trace/fi_trace_out/softmax_v32000.json b/tests/trace/fi_trace_out/softmax_v32000.json
new file mode 100644
index 0000000000..9b0221bd36
--- /dev/null
+++ b/tests/trace/fi_trace_out/softmax_v32000.json
@@ -0,0 +1,43 @@
+{
+  "name": "softmax_v32000",
+  "description": "Fused online safe softmax with optional temperature scaling.",
+  "op_type": "sampling",
+  "tags": [
+    "fi_api:flashinfer.sampling.softmax",
+    "status:verified"
+  ],
+  "axes": {
+    "batch_size": {
+      "type": "var"
+    },
+    "vocab_size": {
+      "type": "const",
+      "value": 32000
+    }
+  },
+  "inputs": {
+    "logits": {
+      "shape": [
+        "batch_size",
+        "vocab_size"
+      ],
+      "dtype": "float32"
+    },
+    "temperature": {
+      "shape": null,
+      "dtype": "float32",
+      "optional": true,
+      "description": "Per-tensor or per-row temperature."
+    }
+  },
+  "outputs": {
+    "output": {
+      "shape": [
+        "batch_size",
+        "vocab_size"
+      ],
+      "dtype": "float32"
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _softmax_reference(logits, temperature=None, **_unused):\n    \"\"\"Online safe softmax with optional temperature scaling.\"\"\"\n    x = logits.to(torch.float32)\n    if temperature is not None:\n        if isinstance(temperature, torch.Tensor):\n            t = temperature.to(torch.float32).reshape(-1, 1)\n        else:\n            t = float(temperature)\n        x = x / t\n    return torch.softmax(x, dim=-1).to(logits.dtype)\n"
+}
diff --git a/tests/trace/fi_trace_out/top_k_mask_logits_v32000.json b/tests/trace/fi_trace_out/top_k_mask_logits_v32000.json
new file mode 100644
index 0000000000..f20f5c855f
--- /dev/null
+++ b/tests/trace/fi_trace_out/top_k_mask_logits_v32000.json
@@ -0,0 +1,41 @@
+{
+  "name": "top_k_mask_logits_v32000",
+  "description": "Mask out-of-top-k logits to -inf.",
+  "op_type": "sampling",
+  "tags": [
+    "fi_api:flashinfer.sampling.top_k_mask_logits",
+    "status:verified"
+  ],
+  "axes": {
+    "batch_size": {
+      "type": "var"
+    },
+    "vocab_size": {
+      "type": "const",
+      "value": 32000
+    }
+  },
+  "inputs": {
+    "logits": {
+      "shape": [
+        "batch_size",
+        "vocab_size"
+      ],
+      "dtype": "float32"
+    },
+    "top_k": {
+      "shape": null,
+      "dtype": "int32"
+    }
+  },
+  "outputs": {
+    "masked_logits": {
+      "shape": [
+        "batch_size",
+        "vocab_size"
+      ],
+      "dtype": "float32"
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _top_k_mask_logits_reference(logits, top_k, **_unused):\n    \"\"\"Mask logits outside the top-k to -inf.\"\"\"\n    x = logits.to(torch.float32)\n    if isinstance(top_k, torch.Tensor):\n        k = int(top_k.max().item())\n    else:\n        k = int(top_k)\n    _, topk_idx = torch.topk(x, k=k, dim=-1)\n    mask = torch.full_like(x, float(\"-inf\"))\n    mask.scatter_(-1, topk_idx, 0.0)\n    return (x + mask).to(logits.dtype)\n"
+}
diff --git a/tests/trace/fi_trace_out/top_k_renorm_probs_v128256.json b/tests/trace/fi_trace_out/top_k_renorm_probs_v128256.json
new file mode 100644
index 0000000000..56ba4d30a2
--- /dev/null
+++ b/tests/trace/fi_trace_out/top_k_renorm_probs_v128256.json
@@ -0,0 +1,41 @@
+{
+  "name": "top_k_renorm_probs_v128256",
+  "description": "Renormalise probabilities by top-k thresholding.",
+  "op_type": "sampling",
+  "tags": [
+    "fi_api:flashinfer.sampling.top_k_renorm_probs",
+    "status:verified"
+  ],
+  "axes": {
+    "batch_size": {
+      "type": "var"
+    },
+    "vocab_size": {
+      "type": "const",
+      "value": 128256
+    }
+  },
+  "inputs": {
+    "probs": {
+      "shape": [
+        "batch_size",
+        "vocab_size"
+      ],
+      "dtype": "float32"
+    },
+    "top_k": {
+      "shape": null,
+      "dtype": "int32"
+    }
+  },
+  "outputs": {
+    "renormalized": {
+      "shape": [
+        "batch_size",
+        "vocab_size"
+      ],
+      "dtype": "float32"
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _top_k_renorm_probs_reference(probs, top_k, **_unused):\n    \"\"\"Renormalise probs by top-k thresholding.\"\"\"\n    p = probs.to(torch.float32)\n    if isinstance(top_k, torch.Tensor):\n        k = int(top_k.max().item())\n    else:\n        k = int(top_k)\n    _, topk_idx = torch.topk(p, k=k, dim=-1)\n    mask = torch.zeros_like(p, dtype=torch.bool)\n    mask.scatter_(-1, topk_idx, True)\n    p_masked = torch.where(mask, p, torch.zeros_like(p))\n    return (p_masked / (p_masked.sum(dim=-1, keepdim=True) + 1e-20)).to(probs.dtype)\n"
+}
diff --git a/tests/trace/fi_trace_out/top_k_renorm_probs_v151936.json b/tests/trace/fi_trace_out/top_k_renorm_probs_v151936.json
new file mode 100644
index 0000000000..4efd70b0d8
--- /dev/null
+++ b/tests/trace/fi_trace_out/top_k_renorm_probs_v151936.json
@@ -0,0 +1,41 @@
+{
+  "name": "top_k_renorm_probs_v151936",
+  "description": "Renormalise probabilities by top-k thresholding.",
+  "op_type": "sampling",
+  "tags": [
+    "fi_api:flashinfer.sampling.top_k_renorm_probs",
+    "status:verified"
+  ],
+  "axes": {
+    "batch_size": {
+      "type": "var"
+    },
+    "vocab_size": {
+      "type": "const",
+      "value": 151936
+    }
+  },
+  "inputs": {
+    "probs": {
+      "shape": [
+        "batch_size",
+        "vocab_size"
+      ],
+      "dtype": "float32"
+    },
+    "top_k": {
+      "shape": null,
+      "dtype": "int32"
+    }
+  },
+  "outputs": {
+    "renormalized": {
+      "shape": [
+        "batch_size",
+        "vocab_size"
+      ],
+      "dtype": "float32"
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _top_k_renorm_probs_reference(probs, top_k, **_unused):\n    \"\"\"Renormalise probs by top-k thresholding.\"\"\"\n    p = probs.to(torch.float32)\n    if isinstance(top_k, torch.Tensor):\n        k = int(top_k.max().item())\n    else:\n        k = int(top_k)\n    _, topk_idx = torch.topk(p, k=k, dim=-1)\n    mask = torch.zeros_like(p, dtype=torch.bool)\n    mask.scatter_(-1, topk_idx, True)\n    p_masked = torch.where(mask, p, torch.zeros_like(p))\n    return (p_masked / (p_masked.sum(dim=-1, keepdim=True) + 1e-20)).to(probs.dtype)\n"
+}
diff --git a/tests/trace/fi_trace_out/top_k_renorm_probs_v32000.json b/tests/trace/fi_trace_out/top_k_renorm_probs_v32000.json
new file mode 100644
index 0000000000..f6f46f069d
--- /dev/null
+++ b/tests/trace/fi_trace_out/top_k_renorm_probs_v32000.json
@@ -0,0 +1,41 @@
+{
+  "name": "top_k_renorm_probs_v32000",
+  "description": "Renormalise probabilities by top-k thresholding.",
+  "op_type": "sampling",
+  "tags": [
+    "fi_api:flashinfer.sampling.top_k_renorm_probs",
+    "status:verified"
+  ],
+  "axes": {
+    "batch_size": {
+      "type": "var"
+    },
+    "vocab_size": {
+      "type": "const",
+      "value": 32000
+    }
+  },
+  "inputs": {
+    "probs": {
+      "shape": [
+        "batch_size",
+        "vocab_size"
+      ],
+      "dtype": "float32"
+    },
+    "top_k": {
+      "shape": null,
+      "dtype": "int32"
+    }
+  },
+  "outputs": {
+    "renormalized": {
+      "shape": [
+        "batch_size",
+        "vocab_size"
+      ],
+      "dtype": "float32"
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _top_k_renorm_probs_reference(probs, top_k, **_unused):\n    \"\"\"Renormalise probs by top-k thresholding.\"\"\"\n    p = probs.to(torch.float32)\n    if isinstance(top_k, torch.Tensor):\n        k = int(top_k.max().item())\n    else:\n        k = int(top_k)\n    _, topk_idx = torch.topk(p, k=k, dim=-1)\n    mask = torch.zeros_like(p, dtype=torch.bool)\n    mask.scatter_(-1, topk_idx, True)\n    p_masked = torch.where(mask, p, torch.zeros_like(p))\n    return (p_masked / (p_masked.sum(dim=-1, keepdim=True) + 1e-20)).to(probs.dtype)\n"
+}
diff --git a/tests/trace/fi_trace_out/top_k_sampling_v128256.json b/tests/trace/fi_trace_out/top_k_sampling_v128256.json
new file mode 100644
index 0000000000..f12633e217
--- /dev/null
+++ b/tests/trace/fi_trace_out/top_k_sampling_v128256.json
@@ -0,0 +1,47 @@
+{
+  "name": "top_k_sampling_v128256",
+  "description": "Top-k sampling from probabilities. Keeps only the k highest probability tokens, renormalizes, then samples from the filtered distribution.",
+  "op_type": "sampling",
+  "tags": [
+    "fi_api:flashinfer.sampling.top_k_sampling_from_probs",
+    "status:verified"
+  ],
+  "axes": {
+    "batch_size": {
+      "type": "var",
+      "description": "Number of sequences to sample from"
+    },
+    "vocab_size": {
+      "type": "const",
+      "value": 128256,
+      "description": "Vocabulary size."
+    }
+  },
+  "inputs": {
+    "probs": {
+      "shape": [
+        "batch_size",
+        "vocab_size"
+      ],
+      "dtype": "float32",
+      "description": "Probability distributions (after softmax)"
+    },
+    "top_k": {
+      "shape": [
+        "batch_size"
+      ],
+      "dtype": "int32",
+      "description": "Number of top tokens to consider for sampling per sequence"
+    }
+  },
+  "outputs": {
+    "samples": {
+      "shape": [
+        "batch_size"
+      ],
+      "dtype": "int64",
+      "description": "Sampled token indices"
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _top_k_sampling_reference(probs, top_k):\n    \"\"\"Top-k sampling: keep only the k highest probability tokens, renormalize, then sample.\"\"\"\n    batch_size, vocab_size = probs.shape\n    device = probs.device\n    probs = probs.to(torch.float32)\n    samples = torch.empty(batch_size, dtype=torch.int64, device=device)\n    for i in range(batch_size):\n        row = probs[i]\n        k = int(top_k[i].item())\n        if 0 < k < vocab_size:\n            idx_sorted = torch.argsort(row, descending=True)\n            keep_idx = idx_sorted[:k]\n            filtered = torch.zeros_like(row)\n            filtered[keep_idx] = row[keep_idx]\n            row = filtered / filtered.sum()\n        samples[i] = torch.multinomial(row, 1, replacement=True).squeeze(0)\n    return samples\n"
+}
diff --git a/tests/trace/fi_trace_out/top_k_top_p_sampling_from_logits_v32000.json b/tests/trace/fi_trace_out/top_k_top_p_sampling_from_logits_v32000.json
new file mode 100644
index 0000000000..e0b48514d9
--- /dev/null
+++ b/tests/trace/fi_trace_out/top_k_top_p_sampling_from_logits_v32000.json
@@ -0,0 +1,55 @@
+{
+  "name": "top_k_top_p_sampling_from_logits_v32000",
+  "description": "Fused top-k + top-p sampling starting from logits. Reference: softmax + top_k_mask + top_p_renorm + argmax.",
+  "op_type": "sampling",
+  "tags": [
+    "fi_api:flashinfer.sampling.top_k_top_p_sampling_from_logits",
+    "status:verified"
+  ],
+  "axes": {
+    "batch_size": {
+      "type": "var"
+    },
+    "vocab_size": {
+      "type": "const",
+      "value": 32000
+    },
+    "num_indices": {
+      "type": "var",
+      "description": "Length of optional indices tensor."
+    }
+  },
+  "inputs": {
+    "logits": {
+      "shape": [
+        "batch_size",
+        "vocab_size"
+      ],
+      "dtype": "float32"
+    },
+    "top_k": {
+      "shape": null,
+      "dtype": "int32"
+    },
+    "top_p": {
+      "shape": null,
+      "dtype": "float32"
+    },
+    "indices": {
+      "shape": [
+        "num_indices"
+      ],
+      "dtype": "unknown",
+      "optional": true
+    }
+  },
+  "outputs": {
+    "samples": {
+      "shape": [
+        "batch_size"
+      ],
+      "dtype": "int32"
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _top_k_top_p_sampling_from_logits_reference(\n    logits, top_k, top_p, indices=None, filter_apply_order=\"top_k_first\", **_unused\n):\n    \"\"\"top-k + top-p sampling from logits (deterministic: argmax).\"\"\"\n    x = logits.to(torch.float32)\n    if filter_apply_order == \"top_k_first\":\n        x = _top_k_mask_logits_reference(x, top_k)\n        probs = torch.softmax(x, dim=-1)\n        probs = _top_p_renorm_probs_reference(probs, top_p)\n    else:  # \"joint\"\n        probs = torch.softmax(x, dim=-1)\n        probs = _top_k_renorm_probs_reference(probs, top_k)\n        probs = _top_p_renorm_probs_reference(probs, top_p)\n    if indices is not None:\n        probs = probs[indices.to(torch.long)]\n    return probs.argmax(dim=-1).to(torch.int32)\n"
+}
diff --git a/tests/trace/fi_trace_out/top_k_top_p_sampling_v128256.json b/tests/trace/fi_trace_out/top_k_top_p_sampling_v128256.json
new file mode 100644
index 0000000000..1fa2aedfee
--- /dev/null
+++ b/tests/trace/fi_trace_out/top_k_top_p_sampling_v128256.json
@@ -0,0 +1,54 @@
+{
+  "name": "top_k_top_p_sampling_v128256",
+  "description": "Top-k top-p (nucleus) sampling from probabilities. Filters probabilities using top-k and top-p constraints, then samples from the filtered distribution.",
+  "op_type": "sampling",
+  "tags": [
+    "fi_api:flashinfer.sampling.top_k_top_p_sampling_from_probs",
+    "status:verified"
+  ],
+  "axes": {
+    "batch_size": {
+      "type": "var",
+      "description": "Number of sequences to sample from"
+    },
+    "vocab_size": {
+      "type": "const",
+      "value": 128256,
+      "description": "Vocabulary size."
+    }
+  },
+  "inputs": {
+    "probs": {
+      "shape": [
+        "batch_size",
+        "vocab_size"
+      ],
+      "dtype": "float32",
+      "description": "Probability distributions (after softmax)"
+    },
+    "top_k": {
+      "shape": [
+        "batch_size"
+      ],
+      "dtype": "int32",
+      "description": "Number of top tokens to consider for sampling per sequence"
+    },
+    "top_p": {
+      "shape": [
+        "batch_size"
+      ],
+      "dtype": "float32",
+      "description": "Cumulative probability threshold for nucleus sampling per sequence"
+    }
+  },
+  "outputs": {
+    "samples": {
+      "shape": [
+        "batch_size"
+      ],
+      "dtype": "int64",
+      "description": "Sampled token indices"
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _top_k_top_p_sampling_reference(probs, top_k, top_p):\n    \"\"\"Top-k then top-p (nucleus) sampling: apply both filters, then sample.\"\"\"\n    batch_size, vocab_size = probs.shape\n    device = probs.device\n    probs = probs.to(torch.float32)\n    samples = torch.empty(batch_size, dtype=torch.int64, device=device)\n    for i in range(batch_size):\n        row = probs[i]\n        k = int(top_k[i].item())\n        p = float(top_p[i].item())\n        if 0 < k < vocab_size:\n            idx_sorted = torch.argsort(row, descending=True)\n            keep_idx_k = idx_sorted[:k]\n            filtered_k = torch.zeros_like(row)\n            filtered_k[keep_idx_k] = row[keep_idx_k]\n            row = filtered_k / filtered_k.sum()\n        if p <= 0.0:\n            samples[i] = torch.argmax(row).to(torch.int64)\n            continue\n        if p < 1.0:\n            vals, idx = torch.sort(row, descending=True)\n            cdf = torch.cumsum(vals, dim=0)\n            to_remove = cdf > p\n            if vocab_size > 1:\n                to_remove[1:] = to_remove[:-1].clone()\n                to_remove[0] = False\n            keep_idx_p = idx[~to_remove]\n            filtered_p = torch.zeros_like(row)\n            filtered_p[keep_idx_p] = row[keep_idx_p]\n            row = filtered_p / filtered_p.sum()\n        samples[i] = torch.multinomial(row, 1, replacement=True).squeeze(0)\n    return samples\n"
+}
diff --git a/tests/trace/fi_trace_out/top_k_top_p_sampling_v151936.json b/tests/trace/fi_trace_out/top_k_top_p_sampling_v151936.json
new file mode 100644
index 0000000000..ae8840827a
--- /dev/null
+++ b/tests/trace/fi_trace_out/top_k_top_p_sampling_v151936.json
@@ -0,0 +1,54 @@
+{
+  "name": "top_k_top_p_sampling_v151936",
+  "description": "Top-k top-p (nucleus) sampling from probabilities. Filters probabilities using top-k and top-p constraints, then samples from the filtered distribution.",
+  "op_type": "sampling",
+  "tags": [
+    "fi_api:flashinfer.sampling.top_k_top_p_sampling_from_probs",
+    "status:verified"
+  ],
+  "axes": {
+    "batch_size": {
+      "type": "var",
+      "description": "Number of sequences to sample from"
+    },
+    "vocab_size": {
+      "type": "const",
+      "value": 151936,
+      "description": "Vocabulary size."
+    }
+  },
+  "inputs": {
+    "probs": {
+      "shape": [
+        "batch_size",
+        "vocab_size"
+      ],
+      "dtype": "float32",
+      "description": "Probability distributions (after softmax)"
+    },
+    "top_k": {
+      "shape": [
+        "batch_size"
+      ],
+      "dtype": "int32",
+      "description": "Number of top tokens to consider for sampling per sequence"
+    },
+    "top_p": {
+      "shape": [
+        "batch_size"
+      ],
+      "dtype": "float32",
+      "description": "Cumulative probability threshold for nucleus sampling per sequence"
+    }
+  },
+  "outputs": {
+    "samples": {
+      "shape": [
+        "batch_size"
+      ],
+      "dtype": "int64",
+      "description": "Sampled token indices"
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _top_k_top_p_sampling_reference(probs, top_k, top_p):\n    \"\"\"Top-k then top-p (nucleus) sampling: apply both filters, then sample.\"\"\"\n    batch_size, vocab_size = probs.shape\n    device = probs.device\n    probs = probs.to(torch.float32)\n    samples = torch.empty(batch_size, dtype=torch.int64, device=device)\n    for i in range(batch_size):\n        row = probs[i]\n        k = int(top_k[i].item())\n        p = float(top_p[i].item())\n        if 0 < k < vocab_size:\n            idx_sorted = torch.argsort(row, descending=True)\n            keep_idx_k = idx_sorted[:k]\n            filtered_k = torch.zeros_like(row)\n            filtered_k[keep_idx_k] = row[keep_idx_k]\n            row = filtered_k / filtered_k.sum()\n        if p <= 0.0:\n            samples[i] = torch.argmax(row).to(torch.int64)\n            continue\n        if p < 1.0:\n            vals, idx = torch.sort(row, descending=True)\n            cdf = torch.cumsum(vals, dim=0)\n            to_remove = cdf > p\n            if vocab_size > 1:\n                to_remove[1:] = to_remove[:-1].clone()\n                to_remove[0] = False\n            keep_idx_p = idx[~to_remove]\n            filtered_p = torch.zeros_like(row)\n            filtered_p[keep_idx_p] = row[keep_idx_p]\n            row = filtered_p / filtered_p.sum()\n        samples[i] = torch.multinomial(row, 1, replacement=True).squeeze(0)\n    return samples\n"
+}
diff --git a/tests/trace/fi_trace_out/top_p_renorm_probs_v32000.json b/tests/trace/fi_trace_out/top_p_renorm_probs_v32000.json
new file mode 100644
index 0000000000..d13a2fd014
--- /dev/null
+++ b/tests/trace/fi_trace_out/top_p_renorm_probs_v32000.json
@@ -0,0 +1,41 @@
+{
+  "name": "top_p_renorm_probs_v32000",
+  "description": "Renormalise probabilities by top-p thresholding.",
+  "op_type": "sampling",
+  "tags": [
+    "fi_api:flashinfer.sampling.top_p_renorm_probs",
+    "status:verified"
+  ],
+  "axes": {
+    "batch_size": {
+      "type": "var"
+    },
+    "vocab_size": {
+      "type": "const",
+      "value": 32000
+    }
+  },
+  "inputs": {
+    "probs": {
+      "shape": [
+        "batch_size",
+        "vocab_size"
+      ],
+      "dtype": "float32"
+    },
+    "top_p": {
+      "shape": null,
+      "dtype": "float32"
+    }
+  },
+  "outputs": {
+    "renormalized": {
+      "shape": [
+        "batch_size",
+        "vocab_size"
+      ],
+      "dtype": "float32"
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _top_p_renorm_probs_reference(probs, top_p, **_unused):\n    \"\"\"Renormalise probs by top-p thresholding.\"\"\"\n    p = probs.to(torch.float32)\n    if isinstance(top_p, torch.Tensor):\n        tp = top_p.to(torch.float32).reshape(-1, 1)\n    else:\n        tp = float(top_p)\n    sorted_p, sorted_idx = torch.sort(p, dim=-1, descending=True)\n    cumsum = sorted_p.cumsum(dim=-1)\n    keep_sorted = (cumsum - sorted_p) < tp\n    keep = torch.zeros_like(p, dtype=torch.bool).scatter_(-1, sorted_idx, keep_sorted)\n    p_masked = torch.where(keep, p, torch.zeros_like(p))\n    return (p_masked / (p_masked.sum(dim=-1, keepdim=True) + 1e-20)).to(probs.dtype)\n"
+}
diff --git a/tests/trace/fi_trace_out/top_p_sampling_v128256.json b/tests/trace/fi_trace_out/top_p_sampling_v128256.json
new file mode 100644
index 0000000000..9ba2bfb1eb
--- /dev/null
+++ b/tests/trace/fi_trace_out/top_p_sampling_v128256.json
@@ -0,0 +1,47 @@
+{
+  "name": "top_p_sampling_v128256",
+  "description": "Top-p (nucleus) sampling from probabilities. Filters probabilities using cumulative probability threshold, then samples from the filtered distribution.",
+  "op_type": "sampling",
+  "tags": [
+    "fi_api:flashinfer.sampling.top_p_sampling_from_probs",
+    "status:verified"
+  ],
+  "axes": {
+    "batch_size": {
+      "type": "var",
+      "description": "Number of sequences to sample from"
+    },
+    "vocab_size": {
+      "type": "const",
+      "value": 128256,
+      "description": "Vocabulary size."
+    }
+  },
+  "inputs": {
+    "probs": {
+      "shape": [
+        "batch_size",
+        "vocab_size"
+      ],
+      "dtype": "float32",
+      "description": "Probability distributions (after softmax)"
+    },
+    "top_p": {
+      "shape": [
+        "batch_size"
+      ],
+      "dtype": "float32",
+      "description": "Cumulative probability threshold for nucleus sampling per sequence"
+    }
+  },
+  "outputs": {
+    "samples": {
+      "shape": [
+        "batch_size"
+      ],
+      "dtype": "int64",
+      "description": "Sampled token indices"
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _top_p_sampling_reference(probs, top_p):\n    \"\"\"Top-p (nucleus) sampling: filter by cumulative probability threshold, then sample.\"\"\"\n    batch_size, vocab_size = probs.shape\n    device = probs.device\n    probs = probs.to(torch.float32)\n    out = torch.empty(batch_size, dtype=torch.int64, device=device)\n    for i in range(batch_size):\n        row = probs[i]\n        p = float(top_p[i].item())\n        if p <= 0.0:\n            out[i] = torch.argmax(row).to(torch.int64)\n            continue\n        if p < 1.0:\n            vals, idx = torch.sort(row, descending=True)\n            cdf = torch.cumsum(vals, dim=0)\n            to_remove = cdf > p\n            to_remove[1:] = to_remove[:-1].clone()\n            to_remove[0] = False\n            keep_idx = idx[~to_remove]\n            filtered = torch.zeros_like(row)\n            filtered[keep_idx] = row[keep_idx]\n            row = filtered / filtered.sum()\n        out[i] = torch.multinomial(row, 1, replacement=True).squeeze(0)\n    return out\n"
+}
diff --git a/tests/trace/fi_trace_out/top_p_sampling_v151936.json b/tests/trace/fi_trace_out/top_p_sampling_v151936.json
new file mode 100644
index 0000000000..1ad6864cad
--- /dev/null
+++ b/tests/trace/fi_trace_out/top_p_sampling_v151936.json
@@ -0,0 +1,47 @@
+{
+  "name": "top_p_sampling_v151936",
+  "description": "Top-p (nucleus) sampling from probabilities. Filters probabilities using cumulative probability threshold, then samples from the filtered distribution.",
+  "op_type": "sampling",
+  "tags": [
+    "fi_api:flashinfer.sampling.top_p_sampling_from_probs",
+    "status:verified"
+  ],
+  "axes": {
+    "batch_size": {
+      "type": "var",
+      "description": "Number of sequences to sample from"
+    },
+    "vocab_size": {
+      "type": "const",
+      "value": 151936,
+      "description": "Vocabulary size."
+    }
+  },
+  "inputs": {
+    "probs": {
+      "shape": [
+        "batch_size",
+        "vocab_size"
+      ],
+      "dtype": "float32",
+      "description": "Probability distributions (after softmax)"
+    },
+    "top_p": {
+      "shape": [
+        "batch_size"
+      ],
+      "dtype": "float32",
+      "description": "Cumulative probability threshold for nucleus sampling per sequence"
+    }
+  },
+  "outputs": {
+    "samples": {
+      "shape": [
+        "batch_size"
+      ],
+      "dtype": "int64",
+      "description": "Sampled token indices"
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _top_p_sampling_reference(probs, top_p):\n    \"\"\"Top-p (nucleus) sampling: filter by cumulative probability threshold, then sample.\"\"\"\n    batch_size, vocab_size = probs.shape\n    device = probs.device\n    probs = probs.to(torch.float32)\n    out = torch.empty(batch_size, dtype=torch.int64, device=device)\n    for i in range(batch_size):\n        row = probs[i]\n        p = float(top_p[i].item())\n        if p <= 0.0:\n            out[i] = torch.argmax(row).to(torch.int64)\n            continue\n        if p < 1.0:\n            vals, idx = torch.sort(row, descending=True)\n            cdf = torch.cumsum(vals, dim=0)\n            to_remove = cdf > p\n            to_remove[1:] = to_remove[:-1].clone()\n            to_remove[0] = False\n            keep_idx = idx[~to_remove]\n            filtered = torch.zeros_like(row)\n            filtered[keep_idx] = row[keep_idx]\n            row = filtered / filtered.sum()\n        out[i] = torch.multinomial(row, 1, replacement=True).squeeze(0)\n    return out\n"
+}
diff --git a/tests/trace/fi_trace_out/top_p_sampling_v32000.json b/tests/trace/fi_trace_out/top_p_sampling_v32000.json
new file mode 100644
index 0000000000..8bc9b16cac
--- /dev/null
+++ b/tests/trace/fi_trace_out/top_p_sampling_v32000.json
@@ -0,0 +1,47 @@
+{
+  "name": "top_p_sampling_v32000",
+  "description": "Top-p (nucleus) sampling from probabilities. Filters probabilities using cumulative probability threshold, then samples from the filtered distribution.",
+  "op_type": "sampling",
+  "tags": [
+    "fi_api:flashinfer.sampling.top_p_sampling_from_probs",
+    "status:verified"
+  ],
+  "axes": {
+    "batch_size": {
+      "type": "var",
+      "description": "Number of sequences to sample from"
+    },
+    "vocab_size": {
+      "type": "const",
+      "value": 32000,
+      "description": "Vocabulary size."
+    }
+  },
+  "inputs": {
+    "probs": {
+      "shape": [
+        "batch_size",
+        "vocab_size"
+      ],
+      "dtype": "float32",
+      "description": "Probability distributions (after softmax)"
+    },
+    "top_p": {
+      "shape": [
+        "batch_size"
+      ],
+      "dtype": "unknown",
+      "description": "Cumulative probability threshold for nucleus sampling per sequence"
+    }
+  },
+  "outputs": {
+    "samples": {
+      "shape": [
+        "batch_size"
+      ],
+      "dtype": "int64",
+      "description": "Sampled token indices"
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _top_p_sampling_reference(probs, top_p):\n    \"\"\"Top-p (nucleus) sampling: filter by cumulative probability threshold, then sample.\"\"\"\n    batch_size, vocab_size = probs.shape\n    device = probs.device\n    probs = probs.to(torch.float32)\n    out = torch.empty(batch_size, dtype=torch.int64, device=device)\n    for i in range(batch_size):\n        row = probs[i]\n        p = float(top_p[i].item())\n        if p <= 0.0:\n            out[i] = torch.argmax(row).to(torch.int64)\n            continue\n        if p < 1.0:\n            vals, idx = torch.sort(row, descending=True)\n            cdf = torch.cumsum(vals, dim=0)\n            to_remove = cdf > p\n            to_remove[1:] = to_remove[:-1].clone()\n            to_remove[0] = False\n            keep_idx = idx[~to_remove]\n            filtered = torch.zeros_like(row)\n            filtered[keep_idx] = row[keep_idx]\n            row = filtered / filtered.sum()\n        out[i] = torch.multinomial(row, 1, replacement=True).squeeze(0)\n    return out\n"
+}
diff --git a/tests/trace/fi_trace_out_cudagraph/gqa_paged_decode_h32_kv8_d128_ps16.json b/tests/trace/fi_trace_out_cudagraph/gqa_paged_decode_h32_kv8_d128_ps16.json
new file mode 100644
index 0000000000..e1f67b7df2
--- /dev/null
+++ b/tests/trace/fi_trace_out_cudagraph/gqa_paged_decode_h32_kv8_d128_ps16.json
@@ -0,0 +1,116 @@
+{
+  "name": "gqa_paged_decode_h32_kv8_d128_ps16",
+  "description": "Batched Grouped Query Attention decode with a paged KV cache.",
+  "op_type": "gqa_paged",
+  "tags": [
+    "fi_api:flashinfer.decode.BatchDecodeWithPagedKVCacheWrapper.run",
+    "stage:decode",
+    "status:verified"
+  ],
+  "axes": {
+    "batch_size": {
+      "type": "var",
+      "description": "Total number of query tokens."
+    },
+    "num_qo_heads": {
+      "type": "const",
+      "value": 32
+    },
+    "num_kv_heads": {
+      "type": "const",
+      "value": 8
+    },
+    "head_dim": {
+      "type": "const",
+      "value": 128
+    },
+    "num_pages": {
+      "type": "var"
+    },
+    "page_size": {
+      "type": "const",
+      "value": 16
+    },
+    "len_indptr": {
+      "type": "var",
+      "description": "Length of kv_indptr array."
+    },
+    "num_kv_indices": {
+      "type": "var",
+      "description": "Total number of KV page indices."
+    }
+  },
+  "constraints": [
+    "len_indptr == batch_size + 1",
+    "num_kv_indices == kv_indptr[-1].item()"
+  ],
+  "inputs": {
+    "q": {
+      "shape": [
+        "batch_size",
+        "num_qo_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "k_cache": {
+      "shape": [
+        "num_pages",
+        "page_size",
+        "num_kv_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "v_cache": {
+      "shape": [
+        "num_pages",
+        "page_size",
+        "num_kv_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "kv_indptr": {
+      "shape": [
+        "len_indptr"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "KV page offsets for each sequence. Set during plan(), not run()."
+    },
+    "kv_indices": {
+      "shape": [
+        "num_kv_indices"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "Page IDs for KV cache lookups. Set during plan(), not run()."
+    },
+    "sm_scale": {
+      "shape": null,
+      "dtype": "float32",
+      "optional": true,
+      "description": "Softmax scale. Default is (1/sqrt(head_dim)). Set during plan(), not run()."
+    }
+  },
+  "outputs": {
+    "output": {
+      "shape": [
+        "batch_size",
+        "num_qo_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "lse": {
+      "shape": [
+        "batch_size",
+        "num_qo_heads"
+      ],
+      "dtype": "float32",
+      "description": "The 2-based log-sum-exp of attention logits."
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _gqa_paged_decode_reference(q, k_cache, v_cache, kv_indptr, kv_indices, sm_scale):\n    batch_size, num_qo_heads, head_dim = q.shape\n    _, page_size, num_kv_heads, _ = k_cache.shape\n\n    output = torch.zeros(\n        (batch_size, num_qo_heads, head_dim), dtype=torch.bfloat16, device=q.device\n    )\n    lse = torch.full(\n        (batch_size, num_qo_heads), -float(\"inf\"), dtype=torch.float32, device=q.device\n    )\n\n    gqa_ratio = num_qo_heads // num_kv_heads\n    k_cache_f32 = k_cache.to(torch.float32)\n    v_cache_f32 = v_cache.to(torch.float32)\n\n    for b in range(batch_size):\n        page_start = int(kv_indptr[b].item())\n        page_end = int(kv_indptr[b + 1].item())\n        if page_start >= page_end:\n            output[b].zero_()\n            continue\n        # kv_indices are page IDs. Gather pages first, then flatten the\n        # [num_selected_pages, page_size] axis into a single token axis.\n        page_ids = kv_indices[page_start:page_end].to(torch.long)\n        k_b = k_cache_f32[page_ids].reshape(-1, num_kv_heads, head_dim)\n        v_b = v_cache_f32[page_ids].reshape(-1, num_kv_heads, head_dim)\n        q_b = q[b].to(torch.float32)  # [num_qo_heads, head_dim]\n        for h in range(num_qo_heads):\n            kv_h = h // gqa_ratio\n            logits = torch.matmul(q_b[h], k_b[:, kv_h].T) * sm_scale\n            lse[b, h] = torch.logsumexp(logits, dim=-1) / math.log(2.0)\n            attn = torch.softmax(logits, dim=-1)\n            output[b, h] = torch.matmul(attn, v_b[:, kv_h]).to(torch.bfloat16)\n\n    return output, lse\n"
+}
diff --git a/tests/trace/fi_trace_out_sglang/fused_add_rmsnorm_h3072.json b/tests/trace/fi_trace_out_sglang/fused_add_rmsnorm_h3072.json
new file mode 100644
index 0000000000..14f0aa87cd
--- /dev/null
+++ b/tests/trace/fi_trace_out_sglang/fused_add_rmsnorm_h3072.json
@@ -0,0 +1,59 @@
+{
+  "name": "fused_add_rmsnorm_h3072",
+  "description": "Fused Add + RMSNorm. Epsilon is fixed at 1e-6.",
+  "op_type": "rmsnorm",
+  "tags": [
+    "fi_api:flashinfer.norm.fused_add_rmsnorm",
+    "status:verified",
+    "fused"
+  ],
+  "axes": {
+    "batch_size": {
+      "type": "var"
+    },
+    "hidden_size": {
+      "type": "const",
+      "value": 3072
+    }
+  },
+  "inputs": {
+    "hidden_states": {
+      "shape": [
+        "batch_size",
+        "hidden_size"
+      ],
+      "dtype": "bfloat16"
+    },
+    "residual": {
+      "shape": [
+        "batch_size",
+        "hidden_size"
+      ],
+      "dtype": "bfloat16"
+    },
+    "weight": {
+      "shape": [
+        "hidden_size"
+      ],
+      "dtype": "bfloat16"
+    }
+  },
+  "outputs": {
+    "output": {
+      "shape": [
+        "batch_size",
+        "hidden_size"
+      ],
+      "dtype": "bfloat16"
+    },
+    "residual": {
+      "shape": [
+        "batch_size",
+        "hidden_size"
+      ],
+      "dtype": "bfloat16",
+      "description": "Updated residual (in-place: residual += hidden_states)."
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _fused_add_rmsnorm_reference(hidden_states, residual, weight):\n    \"\"\"Fused Add + RMSNorm. Epsilon is fixed at 1e-6.\"\"\"\n    EPS = 1e-6\n    x = hidden_states.to(torch.float32) + residual.to(torch.float32)\n    inv_rms = torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + EPS)\n    y = (x * inv_rms) * weight.to(torch.float32)\n    return y.to(hidden_states.dtype)\n"
+}
diff --git a/tests/trace/fi_trace_out_sglang/gqa_paged_decode_h24_kv128_d128_ps8.json b/tests/trace/fi_trace_out_sglang/gqa_paged_decode_h24_kv128_d128_ps8.json
new file mode 100644
index 0000000000..7990ab49e0
--- /dev/null
+++ b/tests/trace/fi_trace_out_sglang/gqa_paged_decode_h24_kv128_d128_ps8.json
@@ -0,0 +1,116 @@
+{
+  "name": "gqa_paged_decode_h24_kv128_d128_ps8",
+  "description": "Batched GQA decode (1 query per seq) with a paged KV cache as a (k_cache, v_cache) tuple and ragged kv_indptr+kv_indices baked in at plan() time. Wraps BatchDecodeWithPagedKVCacheWrapper.run().",
+  "op_type": "gqa_paged",
+  "tags": [
+    "fi_api:flashinfer.decode.BatchDecodeWithPagedKVCacheWrapper.run",
+    "stage:decode",
+    "status:verified"
+  ],
+  "axes": {
+    "batch_size": {
+      "type": "var",
+      "description": "Total number of query tokens."
+    },
+    "num_qo_heads": {
+      "type": "const",
+      "value": 24
+    },
+    "num_kv_heads": {
+      "type": "const",
+      "value": 128
+    },
+    "head_dim": {
+      "type": "const",
+      "value": 128
+    },
+    "num_pages": {
+      "type": "var"
+    },
+    "page_size": {
+      "type": "const",
+      "value": 8
+    },
+    "len_indptr": {
+      "type": "var",
+      "description": "Length of kv_indptr array."
+    },
+    "num_kv_indices": {
+      "type": "var",
+      "description": "Total number of KV page indices."
+    }
+  },
+  "constraints": [
+    "len_indptr == batch_size + 1",
+    "num_kv_indices == kv_indptr[-1].item()"
+  ],
+  "inputs": {
+    "q": {
+      "shape": [
+        "batch_size",
+        "num_qo_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "k_cache": {
+      "shape": [
+        "num_pages",
+        "page_size",
+        "num_kv_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "v_cache": {
+      "shape": [
+        "num_pages",
+        "page_size",
+        "num_kv_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "kv_indptr": {
+      "shape": [
+        "len_indptr"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "KV page offsets for each sequence. Set during plan(), not run()."
+    },
+    "kv_indices": {
+      "shape": [
+        "num_kv_indices"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "Page IDs for KV cache lookups. Set during plan(), not run()."
+    },
+    "sm_scale": {
+      "shape": null,
+      "dtype": "float32",
+      "optional": true,
+      "description": "Softmax scale. Default is (1/sqrt(head_dim)). Set during plan(), not run()."
+    }
+  },
+  "outputs": {
+    "output": {
+      "shape": [
+        "batch_size",
+        "num_qo_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "lse": {
+      "shape": [
+        "batch_size",
+        "num_qo_heads"
+      ],
+      "dtype": "float32",
+      "description": "The 2-based log-sum-exp of attention logits."
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _gqa_paged_decode_reference(q, k_cache, v_cache, kv_indptr, kv_indices, sm_scale):\n    batch_size, num_qo_heads, head_dim = q.shape\n    _, page_size, num_kv_heads, _ = k_cache.shape\n\n    output = torch.zeros(\n        (batch_size, num_qo_heads, head_dim), dtype=torch.bfloat16, device=q.device\n    )\n    lse = torch.full(\n        (batch_size, num_qo_heads), -float(\"inf\"), dtype=torch.float32, device=q.device\n    )\n\n    gqa_ratio = num_qo_heads // num_kv_heads\n    k_cache_f32 = k_cache.to(torch.float32)\n    v_cache_f32 = v_cache.to(torch.float32)\n\n    for b in range(batch_size):\n        page_start = int(kv_indptr[b].item())\n        page_end = int(kv_indptr[b + 1].item())\n        if page_start >= page_end:\n            output[b].zero_()\n            continue\n        # kv_indices are page IDs. Gather pages first, then flatten the\n        # [num_selected_pages, page_size] axis into a single token axis.\n        page_ids = kv_indices[page_start:page_end].to(torch.long)\n        k_b = k_cache_f32[page_ids].reshape(-1, num_kv_heads, head_dim)\n        v_b = v_cache_f32[page_ids].reshape(-1, num_kv_heads, head_dim)\n        q_b = q[b].to(torch.float32)  # [num_qo_heads, head_dim]\n        for h in range(num_qo_heads):\n            kv_h = h // gqa_ratio\n            logits = torch.matmul(q_b[h], k_b[:, kv_h].T) * sm_scale\n            lse[b, h] = torch.logsumexp(logits, dim=-1) / math.log(2.0)\n            attn = torch.softmax(logits, dim=-1)\n            output[b, h] = torch.matmul(attn, v_b[:, kv_h]).to(torch.bfloat16)\n\n    return output, lse\n"
+}
diff --git a/tests/trace/fi_trace_out_sglang/gqa_paged_prefill_h24_kv128_d128_ps8.json b/tests/trace/fi_trace_out_sglang/gqa_paged_prefill_h24_kv128_d128_ps8.json
new file mode 100644
index 0000000000..9112448069
--- /dev/null
+++ b/tests/trace/fi_trace_out_sglang/gqa_paged_prefill_h24_kv128_d128_ps8.json
@@ -0,0 +1,124 @@
+{
+  "name": "gqa_paged_prefill_h24_kv128_d128_ps8",
+  "description": "Batched GQA prefill (multi-token per seq, causal) with a paged KV cache. Adds qo_indptr to gqa_paged_decode's indptr/indices. Wraps BatchPrefillWithPagedKVCacheWrapper.run().",
+  "op_type": "gqa_paged",
+  "tags": [
+    "fi_api:flashinfer.prefill.BatchPrefillWithPagedKVCacheWrapper.run",
+    "stage:prefill",
+    "status:verified"
+  ],
+  "axes": {
+    "num_qo_heads": {
+      "type": "const",
+      "value": 24
+    },
+    "num_kv_heads": {
+      "type": "const",
+      "value": 128
+    },
+    "head_dim": {
+      "type": "const",
+      "value": 128
+    },
+    "page_size": {
+      "type": "const",
+      "value": 8
+    },
+    "len_indptr": {
+      "type": "var",
+      "description": "Length of indptr arrays (batch_size + 1)."
+    },
+    "total_q": {
+      "type": "var",
+      "description": "Total number of query tokens."
+    },
+    "num_kv_indices": {
+      "type": "var",
+      "description": "Total number of KV page indices."
+    },
+    "num_pages": {
+      "type": "var"
+    }
+  },
+  "constraints": [
+    "total_q == qo_indptr[-1].item()",
+    "num_kv_indices == kv_indptr[-1].item()"
+  ],
+  "inputs": {
+    "q": {
+      "shape": [
+        "total_q",
+        "num_qo_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "k_cache": {
+      "shape": [
+        "num_pages",
+        "page_size",
+        "num_kv_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "v_cache": {
+      "shape": [
+        "num_pages",
+        "page_size",
+        "num_kv_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "qo_indptr": {
+      "shape": [
+        "len_indptr"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "Query offsets for each sequence. Set during plan(), not run()."
+    },
+    "kv_indptr": {
+      "shape": [
+        "len_indptr"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "KV page offsets for each sequence. Set during plan(), not run()."
+    },
+    "kv_indices": {
+      "shape": [
+        "num_kv_indices"
+      ],
+      "dtype": "unknown",
+      "optional": true,
+      "description": "Page IDs for KV cache lookups. Set during plan(), not run()."
+    },
+    "sm_scale": {
+      "shape": null,
+      "dtype": "float32",
+      "optional": true,
+      "description": "Softmax scale. Default is (1/sqrt(head_dim)). Set during plan(), not run()."
+    }
+  },
+  "outputs": {
+    "output": {
+      "shape": [
+        "total_q",
+        "num_qo_heads",
+        "head_dim"
+      ],
+      "dtype": "bfloat16"
+    },
+    "lse": {
+      "shape": [
+        "total_q",
+        "num_qo_heads"
+      ],
+      "dtype": "float32",
+      "description": "The 2-based log-sum-exp of attention logits."
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _gqa_paged_prefill_reference(\n    q, k_cache, v_cache, qo_indptr, kv_indptr, kv_indices, sm_scale\n):\n    total_q, num_qo_heads, head_dim = q.shape\n    num_pages, page_size, num_kv_heads, _ = k_cache.shape\n    len_indptr = qo_indptr.shape[0]\n\n    output = torch.zeros(\n        (total_q, num_qo_heads, head_dim), dtype=torch.bfloat16, device=q.device\n    )\n    lse = torch.full(\n        (total_q, num_qo_heads), -float(\"inf\"), dtype=torch.float32, device=q.device\n    )\n\n    gqa_ratio = num_qo_heads // num_kv_heads\n    q_f32 = q.to(torch.float32)\n    k_cache_f32 = k_cache.to(torch.float32)\n    v_cache_f32 = v_cache.to(torch.float32)\n\n    for b in range(len_indptr - 1):\n        q_start = int(qo_indptr[b].item())\n        q_end = int(qo_indptr[b + 1].item())\n        kv_start = int(kv_indptr[b].item())\n        kv_end = int(kv_indptr[b + 1].item())\n        if q_start >= q_end or kv_start >= kv_end:\n            continue\n        # kv_indices are page IDs. Gather pages and flatten to a token axis.\n        page_ids = kv_indices[kv_start:kv_end].to(torch.long)\n        k_b = k_cache_f32[page_ids].reshape(-1, num_kv_heads, head_dim)\n        v_b = v_cache_f32[page_ids].reshape(-1, num_kv_heads, head_dim)\n        num_kv_tokens = k_b.shape[0]\n        q_b = q_f32[q_start:q_end]\n        delta = num_kv_tokens - q_b.shape[0]\n        for q_idx in range(q_b.shape[0]):\n            max_kv = min(q_idx + 1 + delta, num_kv_tokens)\n            if max_kv <= 0:\n                continue\n            global_q = q_start + q_idx\n            for h in range(num_qo_heads):\n                kv_h = h // gqa_ratio\n                logits = torch.matmul(q_b[q_idx, h], k_b[:max_kv, kv_h].T) * sm_scale\n                lse[global_q, h] = torch.logsumexp(logits, dim=-1) / math.log(2.0)\n                attn = torch.softmax(logits, dim=-1)\n                output[global_q, h] = torch.matmul(attn, v_b[:max_kv, kv_h]).to(\n                    torch.bfloat16\n                )\n\n    return output, lse\n"
+}
diff --git a/tests/trace/fi_trace_out_sglang/rmsnorm_h3072.json b/tests/trace/fi_trace_out_sglang/rmsnorm_h3072.json
new file mode 100644
index 0000000000..98f83f6dd6
--- /dev/null
+++ b/tests/trace/fi_trace_out_sglang/rmsnorm_h3072.json
@@ -0,0 +1,43 @@
+{
+  "name": "rmsnorm_h3072",
+  "description": "Root Mean Square Normalization. Epsilon is fixed at 1e-6.",
+  "op_type": "rmsnorm",
+  "tags": [
+    "fi_api:flashinfer.norm.rmsnorm",
+    "status:verified"
+  ],
+  "axes": {
+    "batch_size": {
+      "type": "var"
+    },
+    "hidden_size": {
+      "type": "const",
+      "value": 3072
+    }
+  },
+  "inputs": {
+    "hidden_states": {
+      "shape": [
+        "batch_size",
+        "hidden_size"
+      ],
+      "dtype": "bfloat16"
+    },
+    "weight": {
+      "shape": [
+        "hidden_size"
+      ],
+      "dtype": "bfloat16"
+    }
+  },
+  "outputs": {
+    "output": {
+      "shape": [
+        "batch_size",
+        "hidden_size"
+      ],
+      "dtype": "bfloat16"
+    }
+  },
+  "reference": "@torch.no_grad()\ndef _rmsnorm_reference(hidden_states, weight):\n    \"\"\"Root Mean Square Normalization. Epsilon is fixed at 1e-6.\"\"\"\n    EPS = 1e-6\n    x = hidden_states.to(torch.float32)\n    inv_rms = torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + EPS)\n    y = (x * inv_rms) * weight.to(torch.float32)\n    return y.to(hidden_states.dtype)\n"
+}
diff --git a/tests/trace/test_fi_trace.py b/tests/trace/test_fi_trace.py
new file mode 100644
index 0000000000..235671e173
--- /dev/null
+++ b/tests/trace/test_fi_trace.py
@@ -0,0 +1,596 @@
+"""
+Copyright (c) 2025 by FlashInfer team.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+"""Tests for flashinfer.fi_trace: definition JSON generation."""
+
+import json
+import torch
+
+from flashinfer.fi_trace import fi_trace
+
+
+# ---------------------------------------------------------------------------
+# Helper: validate common fields of a definition dict
+# ---------------------------------------------------------------------------
+
+
+def _check_defn(defn, op_type, fi_api_substr):
+    assert isinstance(defn, dict), "fi_trace must return a dict"
+    assert defn["op_type"] == op_type, f"op_type mismatch: {defn['op_type']!r}"
+    assert "name" in defn and isinstance(defn["name"], str) and defn["name"]
+    assert "axes" in defn and isinstance(defn["axes"], dict)
+    assert "inputs" in defn and isinstance(defn["inputs"], dict)
+    assert "outputs" in defn and isinstance(defn["outputs"], dict)
+    assert any(fi_api_substr in t for t in defn["tags"]), (
+        f"Expected fi_api tag containing {fi_api_substr!r}, got {defn['tags']}"
+    )
+    # Must be round-trippable through JSON
+    json.dumps(defn)
+
+
+# ---------------------------------------------------------------------------
+# rmsnorm
+# ---------------------------------------------------------------------------
+
+
+def test_rmsnorm_fi_trace():
+    import flashinfer.norm
+
+    hidden = torch.randn(32, 4096, dtype=torch.bfloat16)
+    weight = torch.ones(4096, dtype=torch.bfloat16)
+
+    # Access via the function attribute
+    defn = flashinfer.norm.rmsnorm.fi_trace(input=hidden, weight=weight)
+    _check_defn(defn, "rmsnorm", "flashinfer.norm.rmsnorm")
+
+    axes = defn["axes"]
+    assert axes["batch_size"]["type"] == "var"
+    assert axes["hidden_size"]["type"] == "const"
+    assert axes["hidden_size"]["value"] == 4096
+
+    assert defn["inputs"]["hidden_states"]["shape"] == ["batch_size", "hidden_size"]
+    assert defn["inputs"]["weight"]["shape"] == ["hidden_size"]
+    assert defn["outputs"]["output"]["shape"] == ["batch_size", "hidden_size"]
+    assert defn["outputs"]["output"]["dtype"] == "bfloat16"
+
+
+def test_rmsnorm_fi_trace_via_helper():
+    import flashinfer.norm
+
+    hidden = torch.randn(16, 7168, dtype=torch.bfloat16)
+    weight = torch.ones(7168, dtype=torch.bfloat16)
+
+    defn = fi_trace(flashinfer.norm.rmsnorm, input=hidden, weight=weight)
+    _check_defn(defn, "rmsnorm", "flashinfer.norm.rmsnorm")
+    assert defn["axes"]["hidden_size"]["value"] == 7168
+
+
+def test_fused_add_rmsnorm_fi_trace():
+    import flashinfer.norm
+
+    x = torch.randn(8, 5120, dtype=torch.bfloat16)
+    res = torch.randn(8, 5120, dtype=torch.bfloat16)
+    weight = torch.ones(5120, dtype=torch.bfloat16)
+
+    defn = flashinfer.norm.fused_add_rmsnorm.fi_trace(
+        input=x, residual=res, weight=weight
+    )
+    _check_defn(defn, "rmsnorm", "flashinfer.norm.fused_add_rmsnorm")
+    assert defn["axes"]["hidden_size"]["value"] == 5120
+    assert "residual" in defn["inputs"]
+    assert "residual" in defn["outputs"]
+
+
+# ---------------------------------------------------------------------------
+# sampling
+# ---------------------------------------------------------------------------
+
+
+def test_top_k_sampling_fi_trace():
+    import flashinfer.sampling
+
+    probs = torch.rand(64, 128256, dtype=torch.float32)
+    top_k = torch.full((64,), 50, dtype=torch.int32)
+
+    defn = flashinfer.sampling.top_k_sampling_from_probs.fi_trace(
+        probs=probs, top_k=top_k
+    )
+    _check_defn(defn, "sampling", "top_k_sampling_from_probs")
+    assert defn["axes"]["vocab_size"]["value"] == 128256
+    assert defn["inputs"]["probs"]["shape"] == ["batch_size", "vocab_size"]
+    assert defn["outputs"]["samples"]["dtype"] == "int64"
+
+
+def test_top_p_sampling_fi_trace():
+    import flashinfer.sampling
+
+    probs = torch.rand(32, 151936, dtype=torch.float32)
+    top_p = torch.full((32,), 0.9, dtype=torch.float32)
+
+    defn = flashinfer.sampling.top_p_sampling_from_probs.fi_trace(
+        probs=probs, top_p=top_p
+    )
+    _check_defn(defn, "sampling", "top_p_sampling_from_probs")
+    assert defn["axes"]["vocab_size"]["value"] == 151936
+
+
+def test_top_k_top_p_sampling_fi_trace():
+    import flashinfer.sampling
+
+    probs = torch.rand(16, 129280, dtype=torch.float32)
+    top_k = torch.full((16,), 100, dtype=torch.int32)
+    top_p = torch.full((16,), 0.9, dtype=torch.float32)
+
+    defn = flashinfer.sampling.top_k_top_p_sampling_from_probs.fi_trace(
+        probs=probs, top_k=top_k, top_p=top_p
+    )
+    _check_defn(defn, "sampling", "top_k_top_p_sampling_from_probs")
+    assert defn["axes"]["vocab_size"]["value"] == 129280
+    assert "top_k" in defn["inputs"]
+    assert "top_p" in defn["inputs"]
+
+
+# ---------------------------------------------------------------------------
+# gemm
+# ---------------------------------------------------------------------------
+
+
+def test_mm_bf16_fi_trace():
+    import flashinfer.gemm
+
+    a = torch.randn(128, 4096, dtype=torch.bfloat16)
+    b = torch.randn(4096, 4096, dtype=torch.bfloat16)
+
+    defn = flashinfer.gemm.mm_bf16.fi_trace(a=a, b=b)
+    _check_defn(defn, "gemm_bf16", "mm_bf16")
+    assert defn["axes"]["N"]["value"] == 4096
+    assert defn["axes"]["K"]["value"] == 4096
+    assert defn["axes"]["M"]["type"] == "var"
+    assert defn["inputs"]["A"]["shape"] == ["M", "K"]
+    assert defn["inputs"]["B"]["shape"] == ["K", "N"]
+    assert defn["outputs"]["C"]["shape"] == ["M", "N"]
+
+
+# ---------------------------------------------------------------------------
+# GQA paged decode
+# ---------------------------------------------------------------------------
+
+
+def test_gqa_paged_decode_fi_trace():
+    from flashinfer.decode import BatchDecodeWithPagedKVCacheWrapper
+
+    batch_size = 32
+    num_qo_heads = 32
+    num_kv_heads = 8
+    head_dim = 128
+    num_pages = 512
+    page_size = 16
+
+    q = torch.randn(batch_size, num_qo_heads, head_dim, dtype=torch.bfloat16)
+    k_cache = torch.randn(
+        num_pages, page_size, num_kv_heads, head_dim, dtype=torch.bfloat16
+    )
+    v_cache = torch.randn(
+        num_pages, page_size, num_kv_heads, head_dim, dtype=torch.bfloat16
+    )
+
+    defn = BatchDecodeWithPagedKVCacheWrapper.run.fi_trace(
+        q=q, paged_kv_cache=(k_cache, v_cache)
+    )
+    _check_defn(defn, "gqa_paged", "BatchDecodeWithPagedKVCacheWrapper")
+    axes = defn["axes"]
+    assert axes["num_qo_heads"]["value"] == num_qo_heads
+    assert axes["num_kv_heads"]["value"] == num_kv_heads
+    assert axes["head_dim"]["value"] == head_dim
+    assert axes["page_size"]["value"] == page_size
+    assert axes["batch_size"]["type"] == "var"
+    assert axes["num_pages"]["type"] == "var"
+
+    assert "k_cache" in defn["inputs"]
+    assert "v_cache" in defn["inputs"]
+    assert defn["inputs"]["k_cache"]["shape"] == [
+        "num_pages",
+        "page_size",
+        "num_kv_heads",
+        "head_dim",
+    ]
+
+
+# ---------------------------------------------------------------------------
+# GQA ragged prefill
+# ---------------------------------------------------------------------------
+
+
+def test_gqa_ragged_prefill_fi_trace():
+    from flashinfer.prefill import BatchPrefillWithRaggedKVCacheWrapper
+
+    total_q = 256
+    total_kv = 512
+    num_qo_heads = 32
+    num_kv_heads = 8
+    head_dim = 128
+
+    q = torch.randn(total_q, num_qo_heads, head_dim, dtype=torch.bfloat16)
+    k = torch.randn(total_kv, num_kv_heads, head_dim, dtype=torch.bfloat16)
+    v = torch.randn(total_kv, num_kv_heads, head_dim, dtype=torch.bfloat16)
+
+    defn = BatchPrefillWithRaggedKVCacheWrapper.run.fi_trace(q=q, k=k, v=v)
+    _check_defn(defn, "gqa_ragged", "BatchPrefillWithRaggedKVCacheWrapper")
+    axes = defn["axes"]
+    assert axes["num_qo_heads"]["value"] == num_qo_heads
+    assert axes["num_kv_heads"]["value"] == num_kv_heads
+    assert axes["head_dim"]["value"] == head_dim
+    assert axes["total_q"]["type"] == "var"
+    assert axes["total_kv"]["type"] == "var"
+
+    assert "constraints" in defn
+
+
+# ---------------------------------------------------------------------------
+# MLA paged
+# ---------------------------------------------------------------------------
+
+
+def test_mla_paged_fi_trace():
+    from flashinfer.mla import BatchMLAPagedAttentionWrapper
+
+    batch_size = 16
+    num_qo_heads = 16
+    head_dim_ckv = 512
+    head_dim_kpe = 64
+    num_pages = 256
+    page_size = 64
+
+    q_nope = torch.randn(batch_size, num_qo_heads, head_dim_ckv, dtype=torch.bfloat16)
+    q_pe = torch.randn(batch_size, num_qo_heads, head_dim_kpe, dtype=torch.bfloat16)
+    ckv_cache = torch.randn(num_pages, page_size, head_dim_ckv, dtype=torch.bfloat16)
+    kpe_cache = torch.randn(num_pages, page_size, head_dim_kpe, dtype=torch.bfloat16)
+
+    defn = BatchMLAPagedAttentionWrapper.run.fi_trace(
+        q_nope=q_nope, q_pe=q_pe, ckv_cache=ckv_cache, kpe_cache=kpe_cache
+    )
+    _check_defn(defn, "mla_paged", "BatchMLAPagedAttentionWrapper")
+    axes = defn["axes"]
+    assert axes["num_qo_heads"]["value"] == num_qo_heads
+    assert axes["head_dim_ckv"]["value"] == head_dim_ckv
+    assert axes["head_dim_kpe"]["value"] == head_dim_kpe
+    assert axes["page_size"]["value"] == page_size
+
+
+# ---------------------------------------------------------------------------
+# GDN decode
+# ---------------------------------------------------------------------------
+
+
+def test_gdn_decode_fi_trace():
+    import flashinfer.gdn_decode
+
+    B, H, HV, K = 4, 8, 16, 128
+
+    q = torch.randn(B, 1, H, K, dtype=torch.bfloat16)
+    k = torch.randn(B, 1, H, K, dtype=torch.bfloat16)
+    v = torch.randn(B, 1, HV, K, dtype=torch.bfloat16)
+    state = torch.zeros(B, HV, K, K, dtype=torch.float32)
+    A_log = torch.zeros(HV, dtype=torch.float32)
+    a = torch.zeros(B, 1, HV, dtype=torch.bfloat16)
+    dt_bias = torch.zeros(HV, dtype=torch.float32)
+    b = torch.zeros(B, 1, HV, dtype=torch.bfloat16)
+
+    defn = flashinfer.gdn_decode.gated_delta_rule_decode.fi_trace(
+        q=q, k=k, v=v, state=state, A_log=A_log, a=a, dt_bias=dt_bias, b=b
+    )
+    _check_defn(defn, "gdn", "gated_delta_rule_decode")
+    axes = defn["axes"]
+    assert axes["seq_len"]["value"] == 1
+    assert axes["num_q_heads"]["value"] == H
+    assert axes["num_v_heads"]["value"] == HV
+    assert axes["head_size"]["value"] == K
+    assert axes["batch_size"]["type"] == "var"
+
+
+# ---------------------------------------------------------------------------
+# Named tensor layer: verify refine_names is applied
+# ---------------------------------------------------------------------------
+
+
+# ---------------------------------------------------------------------------
+# Module-level fi_trace helper: bound method support
+# ---------------------------------------------------------------------------
+
+
+def test_fi_trace_helper_bound_method():
+    """fi_trace() helper must work with a bound method via __func__ unwrapping."""
+    from flashinfer.prefill import BatchPrefillWithRaggedKVCacheWrapper
+
+    q = torch.randn(64, 32, 128, dtype=torch.bfloat16)
+    k = torch.randn(128, 8, 128, dtype=torch.bfloat16)
+    v = torch.randn(128, 8, 128, dtype=torch.bfloat16)
+
+    # Create a dummy instance — we don't call run(), only fi_trace()
+    class _FakeWrapper:
+        run = BatchPrefillWithRaggedKVCacheWrapper.run
+
+    instance = _FakeWrapper()
+    # Accessing instance.run gives a bound method; fi_trace() must handle it
+    defn = fi_trace(instance.run, q=q, k=k, v=v)
+    _check_defn(defn, "gqa_ragged", "BatchPrefillWithRaggedKVCacheWrapper")
+
+
+# ---------------------------------------------------------------------------
+# End-to-end use case: simulate a Llama-3.1-8B decode step and produce a
+# complete flashinfer-bench definition file ready to save to disk.
+# ---------------------------------------------------------------------------
+
+
+def test_usecase_llama31_decode_step(tmp_path):
+    """
+    Use case: profiling a Llama-3.1-8B decode step.
+
+    A developer wants to benchmark their model's attention kernel. They run a
+    forward pass with representative tensors, call fi_trace on the wrapper's
+    .run method, and get back a JSON definition they can pass directly to
+    flashinfer-bench -- without manually figuring out axis names or shapes.
+
+    Model config (TP=1):
+      num_qo_heads=32, num_kv_heads=8, head_dim=128, page_size=16
+    Runtime:
+      batch_size=64, num_pages=8192 (across all sequences in the batch)
+    """
+    from flashinfer.decode import BatchDecodeWithPagedKVCacheWrapper
+
+    # ── Shapes matching a Llama-3.1-8B decode at batch_size=64 ──────────────
+    batch_size = 64
+    num_qo_heads = 32
+    num_kv_heads = 8
+    head_dim = 128
+    num_pages = 8192
+    page_size = 16
+
+    q = torch.randn(batch_size, num_qo_heads, head_dim, dtype=torch.bfloat16)
+    k_cache = torch.randn(
+        num_pages, page_size, num_kv_heads, head_dim, dtype=torch.bfloat16
+    )
+    v_cache = torch.randn(
+        num_pages, page_size, num_kv_heads, head_dim, dtype=torch.bfloat16
+    )
+
+    # ── Generate the definition and write it to disk in one call ─────────────
+    traces_dir = tmp_path / "benchmark_traces"
+    defn = BatchDecodeWithPagedKVCacheWrapper.run.fi_trace(
+        save_dir=traces_dir,
+        q=q,
+        paged_kv_cache=(k_cache, v_cache),
+    )
+
+    # ── Validate the definition matches the flashinfer-bench schema ──────────
+    _check_defn(defn, "gqa_paged", "BatchDecodeWithPagedKVCacheWrapper")
+
+    # Variable axes have no "value"; const axes carry the model config.
+    assert defn["axes"]["batch_size"]["type"] == "var"
+    assert defn["axes"]["num_pages"]["type"] == "var"
+    assert defn["axes"]["num_qo_heads"] == {"type": "const", "value": num_qo_heads}
+    assert defn["axes"]["num_kv_heads"] == {"type": "const", "value": num_kv_heads}
+    assert defn["axes"]["head_dim"] == {"type": "const", "value": head_dim}
+    assert defn["axes"]["page_size"] == {"type": "const", "value": page_size}
+
+    # Input shapes use axis names, not raw integers.
+    assert defn["inputs"]["q"]["shape"] == ["batch_size", "num_qo_heads", "head_dim"]
+    assert defn["inputs"]["k_cache"]["shape"] == [
+        "num_pages",
+        "page_size",
+        "num_kv_heads",
+        "head_dim",
+    ]
+    assert defn["inputs"]["k_cache"]["dtype"] == "bfloat16"
+
+    # Output mirrors the query shape.
+    assert defn["outputs"]["output"]["shape"] == [
+        "batch_size",
+        "num_qo_heads",
+        "head_dim",
+    ]
+    assert defn["outputs"]["output"]["dtype"] == "bfloat16"
+    assert defn["outputs"]["lse"]["shape"] == ["batch_size", "num_qo_heads"]
+    assert defn["outputs"]["lse"]["dtype"] == "float32"
+
+    # ── The JSON file was written to disk ────────────────────────────────────
+    json_file = traces_dir / f"{defn['name']}.json"
+    assert json_file.exists(), f"Expected definition file at {json_file}"
+    on_disk = json.loads(json_file.read_text())
+    assert on_disk["axes"]["num_qo_heads"]["value"] == num_qo_heads
+
+    assert json.loads(json_file.read_text())["axes"]["num_qo_heads"]["value"] == 32
+
+
+def test_usecase_deepseek_mla_decode():
+    """
+    Use case: profiling a DeepSeek-V3 MLA decode step (TP=8).
+
+    Model config (TP=8):
+      num_qo_heads=16, head_dim_ckv=512, head_dim_kpe=64, page_size=64
+    """
+    from flashinfer.mla import BatchMLAPagedAttentionWrapper
+
+    batch_size = 128  # tokens in the decode batch
+    num_qo_heads = 16  # after TP=8 split
+    head_dim_ckv = 512
+    head_dim_kpe = 64
+    num_pages = 4096
+    page_size = 64
+
+    q_nope = torch.randn(batch_size, num_qo_heads, head_dim_ckv, dtype=torch.bfloat16)
+    q_pe = torch.randn(batch_size, num_qo_heads, head_dim_kpe, dtype=torch.bfloat16)
+    ckv_cache = torch.randn(num_pages, page_size, head_dim_ckv, dtype=torch.bfloat16)
+    kpe_cache = torch.randn(num_pages, page_size, head_dim_kpe, dtype=torch.bfloat16)
+
+    defn = BatchMLAPagedAttentionWrapper.run.fi_trace(
+        q_nope=q_nope,
+        q_pe=q_pe,
+        ckv_cache=ckv_cache,
+        kpe_cache=kpe_cache,
+    )
+
+    _check_defn(defn, "mla_paged", "BatchMLAPagedAttentionWrapper")
+
+    assert defn["axes"]["num_qo_heads"]["value"] == num_qo_heads
+    assert defn["axes"]["head_dim_ckv"]["value"] == head_dim_ckv
+    assert defn["axes"]["head_dim_kpe"]["value"] == head_dim_kpe
+    assert defn["axes"]["page_size"]["value"] == page_size
+    assert defn["axes"]["batch_size"]["type"] == "var"
+
+    # The output uses the CKV head dimension (not KPE).
+    assert defn["outputs"]["output"]["shape"] == [
+        "batch_size",
+        "num_qo_heads",
+        "head_dim_ckv",
+    ]
+
+    # Enrich with model metadata, then round-trip through JSON.
+    defn["tags"] += ["model:deepseek-v3", "model:deepseek-r1", "tp:8", "stage:decode"]
+    assert json.loads(json.dumps(defn))["axes"]["head_dim_ckv"]["value"] == 512
+
+
+def test_usecase_sampling_vocab_discovery():
+    """
+    Use case: automatically discover the vocabulary size from live tensors.
+    """
+    import flashinfer.sampling
+
+    # Qwen3 vocabulary size
+    vocab_size = 151936
+    batch_size = 32
+
+    probs = torch.rand(batch_size, vocab_size, dtype=torch.float32)
+    top_k = torch.full((batch_size,), 40, dtype=torch.int32)
+    top_p = torch.full((batch_size,), 0.95, dtype=torch.float32)
+
+    defn = flashinfer.sampling.top_k_top_p_sampling_from_probs.fi_trace(
+        probs=probs, top_k=top_k, top_p=top_p
+    )
+
+    # vocab_size is automatically discovered from the probs tensor shape.
+    assert defn["axes"]["vocab_size"]["type"] == "const"
+    assert defn["axes"]["vocab_size"]["value"] == vocab_size
+
+    # The definition name embeds the const axes values.
+    assert str(vocab_size) in defn["name"]
+
+    # Confirm the JSON is ready for flashinfer-bench.
+    parsed = json.loads(json.dumps(defn))
+    assert parsed["inputs"]["probs"]["dtype"] == "float32"
+    assert parsed["outputs"]["samples"]["dtype"] == "int64"
+
+
+# ---------------------------------------------------------------------------
+# JSON file output
+# ---------------------------------------------------------------------------
+
+
+def test_fi_trace_writes_json_file(tmp_path):
+    """fi_trace writes a <name>.json file when save_dir is given."""
+    import flashinfer.norm
+
+    hidden = torch.randn(16, 4096, dtype=torch.bfloat16)
+    weight = torch.ones(4096, dtype=torch.bfloat16)
+
+    defn = flashinfer.norm.rmsnorm.fi_trace(
+        save_dir=tmp_path, input=hidden, weight=weight
+    )
+
+    expected_file = tmp_path / f"{defn['name']}.json"
+    assert expected_file.exists(), f"Expected JSON file at {expected_file}"
+
+    on_disk = json.loads(expected_file.read_text())
+    assert on_disk == defn
+
+
+def test_fi_trace_helper_writes_json_file(tmp_path):
+    """The module-level fi_trace() helper threads save_dir through correctly."""
+    import flashinfer.norm
+
+    hidden = torch.randn(8, 7168, dtype=torch.bfloat16)
+    weight = torch.ones(7168, dtype=torch.bfloat16)
+
+    defn = fi_trace(
+        flashinfer.norm.rmsnorm,
+        save_dir=tmp_path,
+        input=hidden,
+        weight=weight,
+    )
+
+    expected_file = tmp_path / f"{defn['name']}.json"
+    assert expected_file.exists()
+    on_disk = json.loads(expected_file.read_text())
+    assert on_disk["axes"]["hidden_size"]["value"] == 7168
+
+
+def test_fi_trace_env_var_writes_json_file(tmp_path, monkeypatch):
+    """FLASHINFER_TRACE_DUMP_DIR env-var (shared with logging) triggers file writing without save_dir."""
+    import flashinfer.sampling
+
+    # Use the real env-var; the template reads os.environ at call time.
+    monkeypatch.setenv("FLASHINFER_TRACE_DUMP_DIR", str(tmp_path))
+
+    probs = torch.rand(4, 128256, dtype=torch.float32)
+    top_k = torch.full((4,), 50, dtype=torch.int32)
+
+    defn = flashinfer.sampling.top_k_sampling_from_probs.fi_trace(
+        probs=probs, top_k=top_k
+    )
+
+    expected_file = tmp_path / f"{defn['name']}.json"
+    assert expected_file.exists(), f"Expected file {expected_file}"
+    assert json.loads(expected_file.read_text())["op_type"] == "sampling"
+
+
+def test_fi_trace_creates_nested_save_dir(tmp_path):
+    """save_dir is created automatically even if it doesn't exist yet."""
+    import flashinfer.norm
+
+    nested = tmp_path / "traces" / "rmsnorm"
+    assert not nested.exists()
+
+    hidden = torch.randn(4, 2048, dtype=torch.bfloat16)
+    weight = torch.ones(2048, dtype=torch.bfloat16)
+
+    defn = flashinfer.norm.rmsnorm.fi_trace(
+        save_dir=nested, input=hidden, weight=weight
+    )
+
+    assert nested.exists()
+    files = list(nested.glob("*.json"))
+    assert len(files) == 1
+    assert json.loads(files[0].read_text())["name"] == defn["name"]
+
+
+def test_fi_trace_filename_matches_definition_name(tmp_path):
+    """The written filename is exactly '<definition_name>.json'."""
+    from flashinfer.decode import BatchDecodeWithPagedKVCacheWrapper
+
+    q = torch.randn(4, 32, 128, dtype=torch.bfloat16)
+    k_cache = torch.randn(64, 16, 8, 128, dtype=torch.bfloat16)
+    v_cache = torch.randn(64, 16, 8, 128, dtype=torch.bfloat16)
+
+    defn = BatchDecodeWithPagedKVCacheWrapper.run.fi_trace(
+        save_dir=tmp_path,
+        q=q,
+        paged_kv_cache=(k_cache, v_cache),
+    )
+
+    expected_name = defn["name"]
+    expected_file = tmp_path / f"{expected_name}.json"
+    assert expected_file.exists()
+    assert json.loads(expected_file.read_text())["name"] == expected_name
diff --git a/tests/trace/test_fi_trace_template_consistency.py b/tests/trace/test_fi_trace_template_consistency.py
new file mode 100644
index 0000000000..b6433ef08f
--- /dev/null
+++ b/tests/trace/test_fi_trace_template_consistency.py
@@ -0,0 +1,620 @@
+# Copyright (c) 2025 by FlashInfer team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+TraceTemplate consistency tests.
+
+These tests act as "linters" for trace templates. They catch mistakes like:
+  - Wrong parameter names in the template (param= mismatch with the API)
+  - Const axes that can never get a value (not in any tensor's dim_names)
+  - fi_trace() returning "unknown" dtypes or missing Const-axis values
+
+Two levels of checking
+----------------------
+1. **Structural** (no GPU, no real tensors): verify that every ``param=``
+   reference in the template exists in the decorated function's signature,
+   and that every ``Const`` axis has at least one tensor source.
+
+2. **End-to-end** (CPU tensors, no GPU): call ``fi_trace`` with minimal
+   auto-generated tensors and assert the returned dict is complete.
+
+How to add a new template
+--------------------------
+When you add ``@flashinfer_api(trace=my_trace)`` to a function, add an
+entry to ``_TEMPLATE_FUNC_PAIRS`` and optionally a targeted end-to-end test.
+See the docstring in ``flashinfer/trace/templates/__init__.py`` for the full
+how-to guide.
+"""
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Tuple
+
+import pytest
+import torch
+
+from flashinfer.trace.template import Const, Scalar, Tensor, TraceTemplate, Var
+
+# ---------------------------------------------------------------------------
+# Structural checker utilities
+# ---------------------------------------------------------------------------
+
+
+def _resolved_param(json_key: str, descriptor) -> str:
+    """Return the function-parameter name that descriptor maps to."""
+    p = getattr(descriptor, "param", None)
+    return p if p is not None else json_key
+
+
+def _get_sig_params(func: Callable) -> Optional[set]:
+    """
+    Return the set of parameter names for *func*, stripping ``self``/``cls``.
+    Returns None if the signature cannot be inspected.
+    """
+    # Unwrap decorators to reach the original signature
+    original = func
+    for attr in ("__wrapped__", "__func__"):
+        if hasattr(original, attr):
+            original = getattr(original, attr)
+    try:
+        sig = inspect.signature(original)
+    except (ValueError, TypeError):
+        return None
+    return {name for name, p in sig.parameters.items() if name not in ("self", "cls")}
+
+
+def assert_template_signature_consistency(
+    func: Callable,
+    template: TraceTemplate,
+    *,
+    label: str = "",
+) -> None:
+    """
+    Assert that every non-optional ``param=`` reference in *template* resolves
+    to a valid parameter name of *func*.
+
+    Optional inputs are skipped: they may reference plan-phase metadata (e.g.
+    ``kv_indptr``) that lives in the wrapper's ``plan()`` method rather than
+    ``run()``, and is intentionally absent from the run-time signature.
+
+    This catches mistakes like renaming a function parameter without
+    updating the corresponding ``param=`` in the template.
+    """
+    param_names = _get_sig_params(func)
+    if param_names is None:
+        return  # Cannot inspect — skip
+
+    errors: List[str] = []
+    for json_key, descriptor in template.inputs.items():
+        if not isinstance(descriptor, (Tensor, Scalar)):
+            continue
+        if getattr(descriptor, "optional", False):
+            continue  # Plan-phase or truly optional inputs may not be in run() sig
+        p = _resolved_param(json_key, descriptor)
+        if p not in param_names:
+            errors.append(
+                f"  Input '{json_key}' → param='{p}' not found in "
+                f"{func.__qualname__}({sorted(param_names)})"
+            )
+
+    pfx = f"[{label}] " if label else ""
+    assert not errors, (
+        f"{pfx}Template '{template.name_prefix or template.op_type}' "
+        f"has param mismatches:\n" + "\n".join(errors)
+    )
+
+
+def assert_template_axes_covered(
+    template: TraceTemplate,
+    *,
+    label: str = "",
+    func: Optional[Callable] = None,
+) -> None:
+    """
+    Assert that every ``Const`` axis in *template* has at least one source:
+
+    1. A tensor input whose ``dim_names`` contain the axis name, OR
+    2. A scalar input whose key matches the axis name (scalar-kwarg fallback), OR
+    3. A parameter of *func* matching the axis name (scalar-kwarg fallback for
+       integer function arguments like ``top_k``, ``n_group``, ``block_size``).
+    """
+    tensor_dim_names: set = set()
+    scalar_keys: set = set()
+    for json_key, descriptor in template.inputs.items():
+        if isinstance(descriptor, Tensor):
+            tensor_dim_names.update(descriptor.dim_names)
+        elif isinstance(descriptor, Scalar):
+            scalar_keys.add(json_key)
+
+    func_param_names: set = set()
+    if func is not None:
+        sig_params = _get_sig_params(func)
+        if sig_params is not None:
+            func_param_names = sig_params
+
+    uncovered = [
+        name
+        for name, marker in template.axes.items()
+        if isinstance(marker, Const)
+        and name not in tensor_dim_names
+        and name not in scalar_keys
+        and name not in func_param_names
+    ]
+
+    pfx = f"[{label}] " if label else ""
+    assert not uncovered, (
+        f"{pfx}Template '{template.name_prefix or template.op_type}' "
+        f"has Const axes with no tensor/scalar source: {uncovered}"
+    )
+
+
+# ---------------------------------------------------------------------------
+# Auto-tensor generation for end-to-end checks
+# ---------------------------------------------------------------------------
+
+_DTYPE_MAP: Dict[str, torch.dtype] = {
+    "float32": torch.float32,
+    "float16": torch.float16,
+    "bfloat16": torch.bfloat16,
+    "int32": torch.int32,
+    "int64": torch.int64,
+    "float8_e4m3fn": torch.float8_e4m3fn,
+    "uint8": torch.uint8,
+}
+
+
+# Per-key sample values for integer scalars. A plain 0 is a valid int32 value
+# but makes no semantic sense for block_size/top_k/etc. — using small positive
+# defaults produces definitions that could actually be run.
+_INT_SAMPLE_DEFAULTS: Dict[str, int] = {
+    "block_size": 16,
+    "top_k": 1,
+    "n_group": 1,
+    "topk_group": 1,
+    "num_experts": 1,
+    "intermediate_size": 1,
+    "hidden_size": 1,
+}
+
+
+def _make_sample_kwargs(template: TraceTemplate, axis_size: int = 4) -> Dict[str, Any]:
+    """
+    Build minimal CPU tensors/scalars for every non-optional input in *template*.
+
+    Each axis defaults to *axis_size*. Tuple inputs (``tuple_idx`` set) are
+    collected into a tuple and stored under the shared ``param`` key.
+    """
+    sizes = {name: axis_size for name in template.axes}
+
+    # Accumulate tuple parts: param → list indexed by tuple_idx
+    tuple_parts: Dict[str, list] = {}
+    kwargs: Dict[str, Any] = {}
+
+    for json_key, descriptor in template.inputs.items():
+        if isinstance(descriptor, Scalar):
+            if descriptor.optional:
+                continue
+            p = _resolved_param(json_key, descriptor)
+            if descriptor.dtype == "int32":
+                kwargs[p] = _INT_SAMPLE_DEFAULTS.get(p, 1)
+            else:
+                kwargs[p] = 1.0
+
+        elif isinstance(descriptor, Tensor):
+            if descriptor.optional:
+                continue
+            p = _resolved_param(json_key, descriptor)
+            shape = [sizes.get(d, axis_size) for d in descriptor.dim_names]
+            if not shape:
+                continue
+            # Prefer the descriptor's own dtype hint; fall back to bfloat16
+            dtype = _DTYPE_MAP.get(descriptor.dtype or "", torch.bfloat16)
+            t = torch.zeros(shape, dtype=dtype)
+
+            if descriptor.tuple_idx is not None:
+                parts = tuple_parts.setdefault(p, [None, None])
+                # Grow the list if needed
+                while len(parts) <= descriptor.tuple_idx:
+                    parts.append(None)
+                parts[descriptor.tuple_idx] = t
+            else:
+                kwargs[p] = t
+
+    # Finalise tuple inputs
+    for p, parts in tuple_parts.items():
+        kwargs[p] = tuple(parts)
+
+    return kwargs
+
+
+def assert_fi_trace_complete(
+    func: Callable,
+    template: TraceTemplate,
+    *,
+    label: str = "",
+    axis_size: int = 4,
+) -> Dict[str, Any]:
+    """
+    Call ``fi_trace`` with auto-generated sample tensors and verify:
+    - No exception is raised
+    - All ``Const`` axes have a ``value`` in the returned dict
+    - No input or output has ``dtype == "unknown"``
+    """
+    sample_kwargs = _make_sample_kwargs(template, axis_size=axis_size)
+    fi_api = f"{getattr(func, '__module__', '')}.{func.__qualname__}"
+    fi_trace_fn = template.build_fi_trace_fn(fi_api)
+
+    try:
+        defn = fi_trace_fn(**sample_kwargs)
+    except Exception as exc:  # noqa: BLE001
+        pfx = f"[{label}] " if label else ""
+        pytest.fail(
+            f"{pfx}fi_trace raised an exception for template "
+            f"'{template.name_prefix or template.op_type}': {exc}"
+        )
+
+    pfx = f"[{label}] " if label else ""
+    name_tag = f"'{template.name_prefix or template.op_type}'"
+
+    # Const axes must have resolved values
+    missing_values = [
+        name
+        for name, entry in defn.get("axes", {}).items()
+        if entry["type"] == "const" and "value" not in entry
+    ]
+    assert not missing_values, (
+        f"{pfx}Template {name_tag}: Const axes missing values: {missing_values}"
+    )
+
+    # No "unknown" dtypes in non-optional inputs (optional inputs may be absent at run time)
+    unknown_inputs = [
+        k
+        for k, v in defn.get("inputs", {}).items()
+        if isinstance(v, dict)
+        and v.get("dtype") == "unknown"
+        and not v.get("optional", False)
+    ]
+    assert not unknown_inputs, (
+        f"{pfx}Template {name_tag}: inputs with unknown dtype: {unknown_inputs}"
+    )
+
+    # No "unknown" dtypes in outputs
+    unknown_outputs = [
+        k
+        for k, v in defn.get("outputs", {}).items()
+        if isinstance(v, dict) and v.get("dtype") == "unknown"
+    ]
+    assert not unknown_outputs, (
+        f"{pfx}Template {name_tag}: outputs with unknown dtype: {unknown_outputs}"
+    )
+
+    return defn
+
+
+# ---------------------------------------------------------------------------
+# Auto-discovery via _TRACE_REGISTRY
+#
+# @flashinfer_api(trace=...) automatically registers every (func, template)
+# pair in flashinfer.api_logging._TRACE_REGISTRY at decoration time.
+# We just need to import the modules that contain the decorated functions to
+# trigger those decorators, then read the registry.
+#
+# To add a new kernel: no changes needed here — simply add
+# @flashinfer_api(trace=my_template) to your function and the tests will
+# pick it up automatically.
+# ---------------------------------------------------------------------------
+
+
+def _collect_template_func_pairs() -> List[Tuple[Callable, TraceTemplate, str]]:
+    """
+    Return all (func, template, label) pairs by reading _TRACE_REGISTRY.
+
+    Imports are done lazily here so that missing GPU drivers don't prevent
+    the structural tests from running.
+    """
+    # Trigger @flashinfer_api decorators by importing all modules that use them.
+    import flashinfer.decode  # BatchDecodeWithPagedKVCacheWrapper
+    import flashinfer.fused_moe  # trtllm_fp8_block_scale_moe
+    import flashinfer.gdn_decode  # gated_delta_rule_decode, gated_delta_rule_mtp
+    import flashinfer.gdn_prefill  # chunk_gated_delta_rule
+    import flashinfer.gemm  # mm_bf16, mm_fp8, mm_mxfp8, mm_fp4
+    import flashinfer.mla  # BatchMLAPagedAttentionWrapper
+    import flashinfer.norm  # rmsnorm, fused_add_rmsnorm
+    import flashinfer.prefill  # BatchPrefillWithPagedKVCacheWrapper, Ragged
+    import flashinfer.sampling  # noqa: F401  # top_k_sampling_from_probs, etc.
+
+    from flashinfer.api_logging import _TRACE_REGISTRY
+
+    return list(_TRACE_REGISTRY)
+
+
+_ALL_PAIRS = _collect_template_func_pairs()
+_PAIR_IDS = [label for _, _, label in _ALL_PAIRS]
+
+
+# ---------------------------------------------------------------------------
+# Parameterized structural tests (no GPU required)
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.parametrize("func,template,label", _ALL_PAIRS, ids=_PAIR_IDS)
+def test_template_signature_consistency(func, template, label):
+    """Every param= reference in the template must exist in the function's signature."""
+    assert_template_signature_consistency(func, template, label=label)
+
+
+@pytest.mark.parametrize("func,template,label", _ALL_PAIRS, ids=_PAIR_IDS)
+def test_template_axes_covered(func, template, label):
+    """Every Const axis must be reachable from at least one input tensor, scalar, or function param."""
+    assert_template_axes_covered(template, label=label, func=func)
+
+
+# ---------------------------------------------------------------------------
+# End-to-end checks: fi_trace with auto-generated CPU tensors
+#
+# The simpler ops (no tuple inputs, standard dtypes) are checked
+# automatically. Wrappers with complex inputs (tuple paged_kv_cache, fp8
+# scale tensors) are skipped here — their correctness is covered by the
+# targeted tests in tests/test_fi_trace.py.
+# ---------------------------------------------------------------------------
+
+_E2E_SKIP = {
+    # Tuple inputs (paged_kv_cache) need manual construction:
+    "gqa_paged_decode",
+    "gqa_paged_prefill",
+    # MoE fp8: top_k / intermediate_size are scalar kwargs (not tensor dims) and
+    # hidden_states_scale is optional — covered by test_fi_trace_complete_moe_routing.
+    "moe_fp8_block_scale_ds_routing",
+    "moe_fp8_block_scale_default_routing",
+    "moe_fp8_block_scale_renormalize_routing",
+    "moe_fp8_block_scale_llama4_routing",
+    "moe_fp8_block_scale_renormalize_naive_routing",
+    "moe_fp8_block_scale_topk_routing",
+    # MoE fp4: same reason — covered by test_fi_trace_complete_moe_fp4_routing.
+    "moe_fp4_block_scale_ds_routing",
+    "moe_fp4_block_scale_default_routing",
+    "moe_fp4_block_scale_renormalize_routing",
+    "moe_fp4_block_scale_llama4_routing",
+    "moe_fp4_block_scale_renormalize_naive_routing",
+    "moe_fp4_block_scale_topk_routing",
+}
+
+_E2E_PAIRS = [(f, t, l) for f, t, l in _ALL_PAIRS if l not in _E2E_SKIP]
+_E2E_IDS = [label for _, _, label in _E2E_PAIRS]
+
+
+@pytest.mark.parametrize("func,template,label", _E2E_PAIRS, ids=_E2E_IDS)
+def test_fi_trace_complete(func, template, label):
+    """fi_trace with auto-generated CPU tensors must return a complete definition."""
+    assert_fi_trace_complete(func, template, label=label)
+
+
+# ---------------------------------------------------------------------------
+# Targeted end-to-end checks for templates skipped above
+# ---------------------------------------------------------------------------
+
+
+def test_fi_trace_complete_gqa_paged_decode():
+    """GQA paged decode: tuple paged_kv_cache input handled correctly."""
+    from flashinfer.decode import BatchDecodeWithPagedKVCacheWrapper
+    from flashinfer.trace.templates.attention import gqa_paged_decode_trace  # noqa: F401
+
+    B, H, KV, D, P, NP = 4, 8, 4, 64, 16, 8
+    q = torch.zeros(B, H, D, dtype=torch.bfloat16)
+    k = torch.zeros(NP, P, KV, D, dtype=torch.bfloat16)
+    v = torch.zeros(NP, P, KV, D, dtype=torch.bfloat16)
+
+    defn = BatchDecodeWithPagedKVCacheWrapper.run.fi_trace(q=q, paged_kv_cache=(k, v))
+    assert defn["axes"]["num_qo_heads"]["value"] == H
+    assert defn["axes"]["page_size"]["value"] == P
+    # Optional plan-phase inputs (kv_indptr, kv_indices, sm_scale) may have "unknown" dtype
+    # when not passed to run(); only check non-optional inputs.
+    non_optional_unknown = [
+        k
+        for k, v in defn["inputs"].items()
+        if isinstance(v, dict)
+        and v.get("dtype") == "unknown"
+        and not v.get("optional", False)
+    ]
+    assert not non_optional_unknown, (
+        f"Non-optional inputs with unknown dtype: {non_optional_unknown}"
+    )
+    assert "unknown" not in str(defn["outputs"])
+
+
+@pytest.mark.parametrize(
+    "routing_method_type,top_k,extra_kwargs,expected_name_prefix",
+    [
+        # routing_method_type 0 — Default (softmax top-k)
+        (0, 4, {}, "moe_fp8_block_scale_default_routing"),
+        # routing_method_type 1 — Renormalize (top-k then softmax)
+        (1, 4, {}, "moe_fp8_block_scale_renormalize_routing"),
+        # routing_method_type 2 — DeepSeekV3 (group routing; needs n_group / topk_group)
+        (2, 4, {"n_group": 4, "topk_group": 2}, "moe_fp8_block_scale_ds_routing"),
+        # routing_method_type 3 — Llama4 (top-1 sigmoid)
+        (3, 1, {}, "moe_fp8_block_scale_llama4_routing"),
+        # routing_method_type 4 — RenormalizeNaive (softmax → top-k → renorm)
+        (4, 4, {}, "moe_fp8_block_scale_renormalize_naive_routing"),
+        # routing_method_type 5 — TopK (uniform weights, no score normalisation)
+        (5, 4, {}, "moe_fp8_block_scale_topk_routing"),
+    ],
+    ids=["default", "renormalize", "ds", "llama4", "renormalize_naive", "topk"],
+)
+def test_fi_trace_complete_moe_routing(
+    routing_method_type, top_k, extra_kwargs, expected_name_prefix
+):
+    """MoE routing variants: fp8 + scale tensor shapes handled correctly for each routing type."""
+    from flashinfer.fused_moe import trtllm_fp8_block_scale_moe
+
+    T, E, EL, H, I, BS = 4, 16, 2, 256, 64, 128
+    defn = trtllm_fp8_block_scale_moe.fi_trace(
+        routing_logits=torch.zeros(T, E, dtype=torch.float32),
+        routing_bias=torch.zeros(E, dtype=torch.bfloat16),
+        hidden_states=torch.zeros(T, H, dtype=torch.float8_e4m3fn),
+        hidden_states_scale=torch.ones(H // BS, T, dtype=torch.float32),
+        gemm1_weights=torch.zeros(EL, 2 * I, H, dtype=torch.float8_e4m3fn),
+        gemm1_weights_scale=torch.ones(EL, (2 * I) // BS, H // BS, dtype=torch.float32),
+        gemm2_weights=torch.zeros(EL, H, I, dtype=torch.float8_e4m3fn),
+        gemm2_weights_scale=torch.ones(EL, H // BS, I // BS, dtype=torch.float32),
+        num_experts=E,
+        top_k=top_k,
+        intermediate_size=I,
+        local_expert_offset=0,
+        local_num_experts=EL,
+        routed_scaling_factor=1.0,
+        routing_method_type=routing_method_type,
+        **extra_kwargs,
+    )
+    assert defn["op_type"] == "moe"
+    assert defn["axes"]["num_local_experts"]["value"] == EL
+    assert defn["axes"]["hidden_size"]["value"] == H
+    assert defn["axes"]["top_k"]["value"] == top_k
+    assert defn["name"].startswith(expected_name_prefix)
+    assert "unknown" not in str(defn["inputs"])
+
+
+@pytest.mark.parametrize(
+    "routing_method_type,top_k,extra_kwargs,expected_name_prefix",
+    [
+        (0, 4, {}, "moe_fp4_block_scale_default_routing"),
+        (1, 4, {}, "moe_fp4_block_scale_renormalize_routing"),
+        (2, 4, {"n_group": 4, "topk_group": 2}, "moe_fp4_block_scale_ds_routing"),
+        (3, 1, {}, "moe_fp4_block_scale_llama4_routing"),
+        (4, 4, {}, "moe_fp4_block_scale_renormalize_naive_routing"),
+        (5, 4, {}, "moe_fp4_block_scale_topk_routing"),
+    ],
+    ids=["default", "renormalize", "ds", "llama4", "renormalize_naive", "topk"],
+)
+def test_fi_trace_complete_moe_fp4_routing(
+    routing_method_type, top_k, extra_kwargs, expected_name_prefix
+):
+    """MoE routing variants: fp4 + scale tensor shapes handled correctly for each routing type."""
+    from flashinfer.fused_moe import trtllm_fp4_block_scale_moe
+
+    # NvFP4: block_size=16, packed hidden → [T, H//2], scale → [T, H//16]
+    T, E, EL, H, I, BS = 4, 16, 2, 256, 64, 16
+    defn = trtllm_fp4_block_scale_moe.fi_trace(
+        routing_logits=torch.zeros(T, E, dtype=torch.float32),
+        routing_bias=None,
+        hidden_states=torch.zeros(T, H // 2, dtype=torch.uint8),
+        hidden_states_scale=torch.zeros(T, H // BS, dtype=torch.float8_e4m3fn),
+        gemm1_weights=torch.zeros(EL, 2 * I, H // 2, dtype=torch.uint8),
+        gemm1_weights_scale=torch.zeros(EL, 2 * I, H // BS, dtype=torch.float8_e4m3fn),
+        gemm1_bias=None,
+        gemm1_alpha=None,
+        gemm1_beta=None,
+        gemm1_clamp_limit=None,
+        gemm2_weights=torch.zeros(EL, H, I // 2, dtype=torch.uint8),
+        gemm2_weights_scale=torch.zeros(EL, H, I // BS, dtype=torch.float8_e4m3fn),
+        gemm2_bias=None,
+        output1_scale_scalar=torch.ones(EL, dtype=torch.float32),
+        output1_scale_gate_scalar=torch.ones(EL, dtype=torch.float32),
+        output2_scale_scalar=torch.ones(EL, dtype=torch.float32),
+        num_experts=E,
+        top_k=top_k,
+        intermediate_size=I,
+        local_expert_offset=0,
+        local_num_experts=EL,
+        routed_scaling_factor=None,
+        routing_method_type=routing_method_type,
+        **extra_kwargs,
+    )
+    assert defn["op_type"] == "moe"
+    assert defn["axes"]["num_local_experts"]["value"] == EL
+    assert defn["axes"]["hidden_size"]["value"] == H
+    assert defn["axes"]["top_k"]["value"] == top_k
+    assert defn["name"].startswith(expected_name_prefix)
+    non_optional_unknown = [
+        k
+        for k, v in defn["inputs"].items()
+        if isinstance(v, dict) and v.get("dtype") == "unknown" and not v.get("optional")
+    ]
+    assert not non_optional_unknown, (
+        f"Non-optional inputs with unknown dtype: {non_optional_unknown}"
+    )
+
+
+# ---------------------------------------------------------------------------
+# Meta-tests: verify the checkers themselves catch broken templates
+#
+# These create intentionally wrong templates inline and assert that the
+# checker utilities raise AssertionError.  If a checker ever silently
+# ignores a bug, these tests will fail.
+# ---------------------------------------------------------------------------
+
+
+def _make_gdn_decode_func():
+    """Return the real gated_delta_rule_decode for use in meta-tests."""
+    import flashinfer.gdn_decode
+
+    return flashinfer.gdn_decode.gated_delta_rule_decode
+
+
+def test_checker_rejects_wrong_param():
+    """Signature checker must catch a param= that doesn't exist in the function."""
+    # 'state' in gated_delta_rule_decode is a required positional arg.
+    # Deliberately map it to a non-existent param name 'hidden_state'.
+    broken = TraceTemplate(
+        op_type="gdn",
+        name_prefix="gdn_decode_broken_param",
+        axes={"batch_size": Var(), "head_size": Const(abbrev="d")},
+        inputs={
+            "q": Tensor(["batch_size", "head_size"]),
+            # 'state' exists in the real function; 'hidden_state' does not.
+            "state": Tensor(["batch_size", "head_size"], param="hidden_state"),
+        },
+        outputs={"output": Tensor(["batch_size", "head_size"], dtype_from="q")},
+    )
+    func = _make_gdn_decode_func()
+    with pytest.raises(AssertionError, match="param=.*hidden_state.*not found"):
+        assert_template_signature_consistency(func, broken, label="meta-test")
+
+
+def test_checker_rejects_uncovered_const_axis():
+    """Axes checker must catch a Const axis that has no tensor or function-param source."""
+    broken = TraceTemplate(
+        op_type="gdn",
+        name_prefix="gdn_decode_broken_axis",
+        axes={
+            "batch_size": Var(),
+            "head_size": Const(abbrev="d"),
+            # 'mystery_dim' is a Const axis but appears in no tensor dim_names,
+            # no Scalar input key, and no parameter of gated_delta_rule_decode.
+            "mystery_dim": Const(abbrev="m"),
+        },
+        inputs={"q": Tensor(["batch_size", "head_size"])},
+        outputs={"output": Tensor(["batch_size", "head_size"], dtype_from="q")},
+    )
+    func = _make_gdn_decode_func()
+    with pytest.raises(AssertionError, match="mystery_dim"):
+        assert_template_axes_covered(broken, label="meta-test", func=func)
+
+
+def test_checker_rejects_unknown_dtype_in_e2e():
+    """End-to-end checker must catch a template whose output dtype resolves to 'unknown'."""
+    # dtype_from="nonexistent_input" refers to an input key that doesn't exist,
+    # so the output dtype will be "unknown" at fi_trace time.
+    broken = TraceTemplate(
+        op_type="gdn",
+        name_prefix="gdn_decode_broken_dtype",
+        axes={"batch_size": Var(), "head_size": Const(abbrev="d")},
+        inputs={"q": Tensor(["batch_size", "head_size"])},
+        outputs={
+            "output": Tensor(
+                ["batch_size", "head_size"], dtype_from="nonexistent_input"
+            )
+        },
+    )
+    func = _make_gdn_decode_func()
+    with pytest.raises(AssertionError, match="unknown dtype"):
+        assert_fi_trace_complete(func, broken, label="meta-test")
diff --git a/tests/trace/test_reference_correctness.py b/tests/trace/test_reference_correctness.py
new file mode 100644
index 0000000000..dfd963b100
--- /dev/null
+++ b/tests/trace/test_reference_correctness.py
@@ -0,0 +1,1652 @@
+"""
+Numerical-correctness tests for every reference function attached to a
+``TraceTemplate``. Each test calls the decorated FlashInfer API and the
+template's reference on the same inputs, then compares outputs within
+per-dtype tolerances.
+
+Every test here is a real kernel-vs-reference numerical check. Tests that
+require a GPU the current machine does not have (e.g. SM120/121 for
+``xqa_mla``, SM90/12x for ``trtllm_fmha_v2_prefill``) or a runtime
+dependency that isn't available (e.g. cuDNN) are skipped with a concrete
+reason — never via a shape-only fallback.
+"""
+
+from __future__ import annotations
+
+import math
+
+import pytest
+import torch
+
+from flashinfer.utils import get_compute_capability
+
+
+def _cc() -> tuple[int, int]:
+    return get_compute_capability(torch.device("cuda"))
+
+
+def _is_sm100() -> bool:
+    major, _ = _cc()
+    return major >= 10
+
+
+def _skip_if_not_sm100():
+    if not _is_sm100():
+        pytest.skip("kernel requires SM100+ (Blackwell)")
+
+
+def _close(a: torch.Tensor, b: torch.Tensor, *, atol: float, rtol: float) -> None:
+    torch.testing.assert_close(a.float(), b.float(), atol=atol, rtol=rtol)
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# RoPE
+# ─────────────────────────────────────────────────────────────────────────────
+
+_ROPE_TOL = dict(atol=5e-2, rtol=5e-2)  # bf16 1 ULP
+
+
+def _rope_inputs(device="cuda", B=2, S=8, Hq=4, Hk=2, D=64):
+    torch.manual_seed(0)
+    nnz = B * S
+    q = torch.randn(nnz, Hq, D, dtype=torch.bfloat16, device=device)
+    k = torch.randn(nnz, Hk, D, dtype=torch.bfloat16, device=device)
+    indptr = torch.arange(B + 1, dtype=torch.int32, device=device) * S
+    offsets = torch.zeros(B, dtype=torch.int32, device=device)
+    pos_ids = torch.arange(nnz, dtype=torch.int32, device=device) % S
+    return q, k, indptr, offsets, pos_ids
+
+
+def test_apply_rope():
+    import flashinfer
+    from flashinfer.trace.templates.rope import apply_rope_trace
+
+    q, k, indptr, offsets, _ = _rope_inputs()
+    q_api, k_api = flashinfer.apply_rope(q, k, indptr, offsets)
+    q_ref, k_ref = apply_rope_trace.reference(q, k, indptr, offsets)
+    _close(q_api, q_ref, **_ROPE_TOL)
+    _close(k_api, k_ref, **_ROPE_TOL)
+
+
+def test_apply_rope_inplace():
+    import flashinfer
+    from flashinfer.trace.templates.rope import apply_rope_inplace_trace
+
+    q, k, indptr, offsets, _ = _rope_inputs()
+    q_api = q.clone()
+    k_api = k.clone()
+    flashinfer.apply_rope_inplace(q_api, k_api, indptr, offsets)
+    q_ref, k_ref = apply_rope_inplace_trace.reference(q, k, indptr, offsets)
+    _close(q_api, q_ref, **_ROPE_TOL)
+    _close(k_api, k_ref, **_ROPE_TOL)
+
+
+def test_apply_rope_pos_ids():
+    import flashinfer
+    from flashinfer.trace.templates.rope import apply_rope_pos_ids_trace
+
+    q, k, _, _, pos_ids = _rope_inputs()
+    q_api, k_api = flashinfer.apply_rope_pos_ids(q, k, pos_ids)
+    q_ref, k_ref = apply_rope_pos_ids_trace.reference(q, k, pos_ids)
+    _close(q_api, q_ref, **_ROPE_TOL)
+    _close(k_api, k_ref, **_ROPE_TOL)
+
+
+def test_apply_rope_pos_ids_inplace():
+    import flashinfer
+    from flashinfer.trace.templates.rope import apply_rope_pos_ids_inplace_trace
+
+    q, k, _, _, pos_ids = _rope_inputs()
+    q_api = q.clone()
+    k_api = k.clone()
+    flashinfer.apply_rope_pos_ids_inplace(q_api, k_api, pos_ids)
+    q_ref, k_ref = apply_rope_pos_ids_inplace_trace.reference(q, k, pos_ids)
+    _close(q_api, q_ref, **_ROPE_TOL)
+    _close(k_api, k_ref, **_ROPE_TOL)
+
+
+def test_apply_llama31_rope():
+    import flashinfer
+    from flashinfer.trace.templates.rope import apply_llama31_rope_trace
+
+    q, k, indptr, offsets, _ = _rope_inputs()
+    q_api, k_api = flashinfer.apply_llama31_rope(q, k, indptr, offsets)
+    q_ref, k_ref = apply_llama31_rope_trace.reference(q, k, indptr, offsets)
+    _close(q_api, q_ref, **_ROPE_TOL)
+    _close(k_api, k_ref, **_ROPE_TOL)
+
+
+def test_apply_llama31_rope_inplace():
+    import flashinfer
+    from flashinfer.trace.templates.rope import apply_llama31_rope_inplace_trace
+
+    q, k, indptr, offsets, _ = _rope_inputs()
+    q_api = q.clone()
+    k_api = k.clone()
+    flashinfer.apply_llama31_rope_inplace(q_api, k_api, indptr, offsets)
+    q_ref, k_ref = apply_llama31_rope_inplace_trace.reference(q, k, indptr, offsets)
+    _close(q_api, q_ref, **_ROPE_TOL)
+    _close(k_api, k_ref, **_ROPE_TOL)
+
+
+def test_apply_llama31_rope_pos_ids():
+    import flashinfer
+    from flashinfer.trace.templates.rope import apply_llama31_rope_pos_ids_trace
+
+    q, k, _, _, pos_ids = _rope_inputs()
+    q_api, k_api = flashinfer.apply_llama31_rope_pos_ids(q, k, pos_ids)
+    q_ref, k_ref = apply_llama31_rope_pos_ids_trace.reference(q, k, pos_ids)
+    _close(q_api, q_ref, **_ROPE_TOL)
+    _close(k_api, k_ref, **_ROPE_TOL)
+
+
+def test_apply_llama31_rope_pos_ids_inplace():
+    import flashinfer
+    from flashinfer.trace.templates.rope import (
+        apply_llama31_rope_pos_ids_inplace_trace,
+    )
+
+    q, k, _, _, pos_ids = _rope_inputs()
+    q_api = q.clone()
+    k_api = k.clone()
+    flashinfer.apply_llama31_rope_pos_ids_inplace(q_api, k_api, pos_ids)
+    q_ref, k_ref = apply_llama31_rope_pos_ids_inplace_trace.reference(q, k, pos_ids)
+    _close(q_api, q_ref, **_ROPE_TOL)
+    _close(k_api, k_ref, **_ROPE_TOL)
+
+
+def test_apply_rope_with_cos_sin_cache():
+    import flashinfer
+    from flashinfer.trace.templates.rope import apply_rope_with_cos_sin_cache_trace
+
+    torch.manual_seed(0)
+    B, S, Hq, Hk, D = 2, 8, 4, 2, 64
+    nnz = B * S
+    q = torch.randn(nnz, Hq * D, dtype=torch.bfloat16, device="cuda")
+    k = torch.randn(nnz, Hk * D, dtype=torch.bfloat16, device="cuda")
+    pos = torch.arange(nnz, dtype=torch.int32, device="cuda")
+    inv_freq = 1.0 / (
+        1e4 ** (torch.arange(0, D, 2, dtype=torch.float32, device="cuda") / D)
+    )
+    t = torch.arange(8192, dtype=torch.float32, device="cuda")
+    cos = torch.cos(t.unsqueeze(-1) * inv_freq.unsqueeze(0))
+    sin = torch.sin(t.unsqueeze(-1) * inv_freq.unsqueeze(0))
+    cache = torch.cat([cos, sin], dim=-1)
+    q_api, k_api = flashinfer.apply_rope_with_cos_sin_cache(
+        pos, q, k, D, cache, is_neox=True
+    )
+    q_ref, k_ref = apply_rope_with_cos_sin_cache_trace.reference(
+        pos, q, k, D, cache, is_neox=True
+    )
+    _close(q_api, q_ref, **_ROPE_TOL)
+    _close(k_api, k_ref, **_ROPE_TOL)
+
+
+def test_apply_rope_with_cos_sin_cache_inplace():
+    import flashinfer
+    from flashinfer.trace.templates.rope import (
+        apply_rope_with_cos_sin_cache_inplace_trace,
+    )
+
+    torch.manual_seed(0)
+    B, S, Hq, Hk, D = 2, 8, 4, 2, 64
+    nnz = B * S
+    q = torch.randn(nnz, Hq * D, dtype=torch.bfloat16, device="cuda")
+    k = torch.randn(nnz, Hk * D, dtype=torch.bfloat16, device="cuda")
+    pos = torch.arange(nnz, dtype=torch.int32, device="cuda")
+    inv_freq = 1.0 / (
+        1e4 ** (torch.arange(0, D, 2, dtype=torch.float32, device="cuda") / D)
+    )
+    t = torch.arange(8192, dtype=torch.float32, device="cuda")
+    cos = torch.cos(t.unsqueeze(-1) * inv_freq.unsqueeze(0))
+    sin = torch.sin(t.unsqueeze(-1) * inv_freq.unsqueeze(0))
+    cache = torch.cat([cos, sin], dim=-1)
+    q_api = q.clone()
+    k_api = k.clone()
+    flashinfer.apply_rope_with_cos_sin_cache_inplace(
+        pos, q_api, k_api, D, cache, is_neox=True
+    )
+    q_ref, k_ref = apply_rope_with_cos_sin_cache_inplace_trace.reference(
+        pos, q, k, D, cache, is_neox=True
+    )
+    _close(q_api, q_ref, **_ROPE_TOL)
+    _close(k_api, k_ref, **_ROPE_TOL)
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Norm (RMSNorm + FP8 quantize)
+# ─────────────────────────────────────────────────────────────────────────────
+
+
+def test_rmsnorm_quant():
+    import flashinfer
+    from flashinfer.trace.templates.norm import rmsnorm_quant_trace
+
+    torch.manual_seed(0)
+    B, H = 32, 2048
+    x = torch.randn(B, H, dtype=torch.bfloat16, device="cuda")
+    w = torch.ones(H, dtype=torch.bfloat16, device="cuda")
+    scale = torch.tensor([1.0], dtype=torch.float32, device="cuda")
+    out_api = torch.empty(B, H, dtype=torch.float8_e4m3fn, device="cuda")
+    try:
+        flashinfer.rmsnorm_quant(out_api, x, w, scale)
+    except Exception as exc:
+        pytest.skip(f"rmsnorm_quant kernel unavailable: {exc}")
+    out_ref = rmsnorm_quant_trace.reference(x, w, scale)
+    # FP8 comparisons via dequantized values.
+    _close(out_api.float() * scale, out_ref.float() * scale, atol=0.3, rtol=0.3)
+
+
+def test_fused_add_rmsnorm_quant():
+    import flashinfer
+    from flashinfer.trace.templates.norm import fused_add_rmsnorm_quant_trace
+
+    torch.manual_seed(0)
+    B, H = 32, 2048
+    x = torch.randn(B, H, dtype=torch.bfloat16, device="cuda")
+    residual = torch.randn(B, H, dtype=torch.bfloat16, device="cuda")
+    w = torch.ones(H, dtype=torch.bfloat16, device="cuda")
+    scale = torch.tensor([1.0], dtype=torch.float32, device="cuda")
+    out_api = torch.empty(B, H, dtype=torch.float8_e4m3fn, device="cuda")
+    residual_api = residual.clone()
+    try:
+        flashinfer.fused_add_rmsnorm_quant(out_api, x, residual_api, w, scale)
+    except Exception as exc:
+        pytest.skip(f"fused_add_rmsnorm_quant kernel unavailable: {exc}")
+    out_ref, residual_ref = fused_add_rmsnorm_quant_trace.reference(
+        x, residual, w, scale
+    )
+    _close(residual_api, residual_ref, atol=5e-3, rtol=5e-3)
+    _close(out_api.float() * scale, out_ref.float() * scale, atol=0.3, rtol=0.3)
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Cascade merge (in-place)
+# ─────────────────────────────────────────────────────────────────────────────
+
+
+def test_merge_state_in_place():
+    import flashinfer
+    from flashinfer.trace.templates.cascade import merge_state_in_place_trace
+
+    torch.manual_seed(0)
+    T, H, D = 128, 32, 128
+    v = torch.randn(T, H, D, dtype=torch.bfloat16, device="cuda")
+    s = torch.randn(T, H, dtype=torch.float32, device="cuda")
+    v_other = torch.randn(T, H, D, dtype=torch.bfloat16, device="cuda")
+    s_other = torch.randn(T, H, dtype=torch.float32, device="cuda")
+    v_api = v.clone()
+    s_api = s.clone()
+    flashinfer.merge_state_in_place(v_api, s_api, v_other, s_other)
+    v_ref, s_ref = merge_state_in_place_trace.reference(v, s, v_other, s_other)
+    _close(v_api, v_ref, atol=5e-3, rtol=5e-3)
+    _close(s_api, s_ref, atol=5e-3, rtol=5e-3)
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Quantization (FP4/MXFP8 round-trip via dequantize)
+# ─────────────────────────────────────────────────────────────────────────────
+
+
+def test_mxfp8_quantize():
+    _skip_if_not_sm100()
+    import flashinfer
+    from flashinfer.trace.templates.quantize import mxfp8_quantize_trace
+
+    torch.manual_seed(0)
+    M, K = 128, 4096
+    x = torch.randn(M, K, dtype=torch.bfloat16, device="cuda")
+    try:
+        q_api, s_api = flashinfer.quantization.fp8_quantization.mxfp8_quantize(x)
+    except Exception as exc:
+        pytest.skip(f"mxfp8_quantize kernel unavailable: {exc}")
+    q_ref, s_ref = mxfp8_quantize_trace.reference(x)
+    # Different swizzle layouts → compare absolute-value histograms only.
+    _close(
+        q_api.float().abs().mean(),
+        q_ref.float().abs().mean(),
+        atol=2.0,
+        rtol=0.5,
+    )
+
+
+def test_fp4_quantize_round_trip():
+    _skip_if_not_sm100()
+    from flashinfer.trace.templates.quantize import fp4_quantize_trace
+    from flashinfer.trace.templates.moe import _unpack_fp4_e2m1
+
+    torch.manual_seed(0)
+    M, K = 64, 256
+    x = torch.randn(M, K, dtype=torch.float32, device="cuda")
+    global_scale = torch.tensor([1.0], dtype=torch.float32, device="cuda")
+    packed, scales = fp4_quantize_trace.reference(
+        x, global_scale=global_scale, sf_vec_size=16, sf_use_ue8m0=False
+    )
+    assert packed.dtype == torch.uint8
+    assert packed.shape == (M, K // 2)
+    # Dequantize and compare: within per-block quantization error.
+    unpacked = _unpack_fp4_e2m1(packed)  # [M, K]
+    block_size = 16
+    scale_f = scales.to(torch.float32).repeat_interleave(block_size, dim=-1)
+    recon = unpacked * scale_f
+    # FP4 relative error is bounded by ~1/6 per block.
+    rel_err = ((recon - x).abs() / (x.abs() + 1e-3)).mean().item()
+    assert rel_err < 0.5, f"round-trip error too large: {rel_err:.3f}"
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Single-request attention
+# ─────────────────────────────────────────────────────────────────────────────
+
+
+def test_single_decode():
+    import flashinfer
+    from flashinfer.trace.templates.attention import (
+        single_decode_with_kv_cache_trace,
+    )
+
+    torch.manual_seed(0)
+    Hq, Hk, D, L = 32, 8, 128, 256
+    q = torch.randn(Hq, D, dtype=torch.bfloat16, device="cuda")
+    k = torch.randn(L, Hk, D, dtype=torch.bfloat16, device="cuda")
+    v = torch.randn(L, Hk, D, dtype=torch.bfloat16, device="cuda")
+    try:
+        out_api = flashinfer.single_decode_with_kv_cache(q, k, v)
+    except Exception as exc:
+        pytest.skip(f"single_decode kernel unavailable: {exc}")
+    out_ref = single_decode_with_kv_cache_trace.reference(q, k, v)
+    _close(out_api, out_ref, atol=5e-2, rtol=5e-2)
+
+
+def test_single_prefill():
+    import flashinfer
+    from flashinfer.trace.templates.attention import (
+        single_prefill_with_kv_cache_trace,
+    )
+
+    torch.manual_seed(0)
+    Hq, Hk, D, Q, L = 32, 8, 128, 128, 256
+    q = torch.randn(Q, Hq, D, dtype=torch.bfloat16, device="cuda")
+    k = torch.randn(L, Hk, D, dtype=torch.bfloat16, device="cuda")
+    v = torch.randn(L, Hk, D, dtype=torch.bfloat16, device="cuda")
+    try:
+        out_api = flashinfer.single_prefill_with_kv_cache(q, k, v, causal=True)
+    except Exception as exc:
+        pytest.skip(f"single_prefill kernel unavailable: {exc}")
+    out_ref = single_prefill_with_kv_cache_trace.reference(q, k, v, causal=True)
+    _close(out_api, out_ref, atol=5e-2, rtol=5e-2)
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Paged kernels that require SM100+ / cuDNN (skipped on H100 by default)
+# ─────────────────────────────────────────────────────────────────────────────
+
+
+def test_trtllm_batch_decode_reference_correctness():
+    """trtllm_batch_decode kernel vs reference (paged HND decode, SM100+)."""
+    from flashinfer.decode import trtllm_batch_decode_with_kv_cache
+    from flashinfer.trace.templates.attention import trtllm_batch_decode_trace
+
+    _skip_if_not_sm100()
+    torch.manual_seed(0)
+    B, Hq, Hk, D, PS = 2, 8, 2, 128, 16
+    MP = 2  # pages per seq
+    NP = B * MP
+    kv_len = PS * MP
+    # HND layout for the kernel: [num_pages, 2, num_kv_heads, page_size, head_dim]
+    kv_cache_hnd = torch.randn(NP, 2, Hk, PS, D, dtype=torch.bfloat16, device="cuda")
+    q = torch.randn(B, Hq, D, dtype=torch.bfloat16, device="cuda")
+    block_tables = torch.arange(NP, dtype=torch.int32, device="cuda").reshape(B, MP)
+    seq_lens = torch.full((B,), kv_len, dtype=torch.int32, device="cuda")
+    workspace = torch.empty(256 * 1024 * 1024, dtype=torch.int8, device="cuda")
+    sm_scale = 1.0 / math.sqrt(D)
+    api_out = trtllm_batch_decode_with_kv_cache(
+        q,
+        kv_cache_hnd,
+        workspace,
+        block_tables,
+        seq_lens,
+        kv_len,
+        bmm1_scale=sm_scale,
+        bmm2_scale=1.0,
+        kv_layout="HND",
+    )
+    ref_out = trtllm_batch_decode_trace.reference(
+        q,
+        kv_cache_hnd,
+        workspace,
+        block_tables,
+        seq_lens,
+        kv_len,
+        bmm1_scale=sm_scale,
+        bmm2_scale=1.0,
+        kv_layout="HND",
+    )
+    _close(api_out, ref_out, atol=5e-2, rtol=5e-2)
+
+
+def test_trtllm_batch_context_reference_correctness():
+    """trtllm_batch_context (causal prefill) kernel vs reference, SM100+."""
+    from flashinfer.prefill import trtllm_batch_context_with_kv_cache
+    from flashinfer.trace.templates.attention import trtllm_batch_context_trace
+
+    _skip_if_not_sm100()
+    torch.manual_seed(0)
+    B, Hq, Hk, D, PS = 2, 8, 2, 128, 16
+    MP = 2
+    NP = B * MP
+    kv_len = PS * MP
+    q_len = kv_len  # full prefill
+    kv_cache_hnd = torch.randn(NP, 2, Hk, PS, D, dtype=torch.bfloat16, device="cuda")
+    q = torch.randn(B * q_len, Hq, D, dtype=torch.bfloat16, device="cuda")
+    block_tables = torch.arange(NP, dtype=torch.int32, device="cuda").reshape(B, MP)
+    seq_lens = torch.full((B,), kv_len, dtype=torch.int32, device="cuda")
+    cum_q = torch.arange(B + 1, dtype=torch.int32, device="cuda") * q_len
+    cum_kv = torch.arange(B + 1, dtype=torch.int32, device="cuda") * kv_len
+    workspace = torch.empty(256 * 1024 * 1024, dtype=torch.int8, device="cuda")
+    sm_scale = 1.0 / math.sqrt(D)
+    api_out = trtllm_batch_context_with_kv_cache(
+        q,
+        kv_cache_hnd,
+        workspace,
+        block_tables,
+        seq_lens,
+        q_len,
+        kv_len,
+        bmm1_scale=sm_scale,
+        bmm2_scale=1.0,
+        batch_size=B,
+        cum_seq_lens_q=cum_q,
+        cum_seq_lens_kv=cum_kv,
+        kv_layout="HND",
+    )
+    ref_out = trtllm_batch_context_trace.reference(
+        q,
+        kv_cache_hnd,
+        workspace,
+        block_tables,
+        seq_lens,
+        q_len,
+        kv_len,
+        sm_scale,
+        1.0,
+        B,
+        cum_q,
+        cum_kv,
+        kv_layout="HND",
+    )
+    _close(api_out, ref_out, atol=5e-2, rtol=5e-2)
+
+
+def test_cudnn_batch_decode_reference_correctness():
+    """cudnn_batch_decode_with_kv_cache kernel vs reference (page-gather SDPA)."""
+    import flashinfer
+    from flashinfer.trace.templates.attention import cudnn_batch_decode_trace
+
+    torch.manual_seed(0)
+    B, Hq, Hk, D, PS = 4, 8, 2, 128, 16
+    s_kv = 64
+    nppr = (s_kv + PS - 1) // PS  # num_pages_per_seq
+    total_pages = nppr * B
+    # cuDNN expects K/V as separate tensors in layout
+    #   [num_pages, num_kv_heads, page_size, head_dim]
+    kv_cache = torch.randn(
+        total_pages, 2, Hk, PS, D, dtype=torch.bfloat16, device="cuda"
+    )
+    k_cache = kv_cache[:, 0, :, :, :].contiguous()
+    v_cache = kv_cache[:, 1, :, :, :].contiguous()
+    q = torch.randn(B, Hq, D, dtype=torch.bfloat16, device="cuda")
+    block_tables = torch.arange(total_pages, dtype=torch.int32, device="cuda").reshape(
+        B, nppr
+    )
+    actual_seq_lens_kv = torch.full(
+        (B, 1, 1, 1), s_kv, dtype=torch.int32, device="cuda"
+    )
+    scale = 1.0 / math.sqrt(D)
+    workspace = torch.empty(128 * 1024 * 1024, dtype=torch.int8, device="cuda")
+    try:
+        api_out = flashinfer.decode.cudnn_batch_decode_with_kv_cache(
+            q,
+            k_cache,
+            v_cache,
+            scale,
+            workspace,
+            max_sequence_kv=s_kv,
+            actual_seq_lens_kv=actual_seq_lens_kv,
+            block_tables=block_tables,
+        )
+    except Exception as exc:
+        pytest.skip(f"cudnn_batch_decode_with_kv_cache unavailable: {exc}")
+    ref_out = cudnn_batch_decode_trace.reference(
+        q,
+        k_cache,
+        v_cache,
+        scale,
+        workspace,
+        s_kv,
+        block_tables=block_tables,
+        actual_seq_lens_kv=actual_seq_lens_kv.flatten(),
+    )
+    _close(api_out, ref_out, atol=5e-2, rtol=5e-2)
+
+
+def test_cudnn_batch_prefill_reference_correctness():
+    """cudnn_batch_prefill_with_kv_cache kernel vs reference (causal)."""
+    from flashinfer.cudnn import cudnn_batch_prefill_with_kv_cache
+    from flashinfer.trace.templates.attention import cudnn_batch_prefill_trace
+
+    torch.manual_seed(0)
+    B, Hq, Hk, D, PS = 2, 8, 2, 128, 16
+    q_len, kv_len = 32, 64
+    nppr = (kv_len + PS - 1) // PS
+    total_pages = nppr * B
+    kv_cache = torch.randn(
+        total_pages, 2, Hk, PS, D, dtype=torch.bfloat16, device="cuda"
+    )
+    k_cache = kv_cache[:, 0].contiguous()
+    v_cache = kv_cache[:, 1].contiguous()
+    q = torch.randn(B * q_len, Hq, D, dtype=torch.bfloat16, device="cuda")
+    block_tables = torch.arange(total_pages, dtype=torch.int32, device="cuda").reshape(
+        B, nppr
+    )
+    actual_seq_lens_q = torch.full((B,), q_len, dtype=torch.int32, device="cuda")
+    actual_seq_lens_kv = torch.full((B,), kv_len, dtype=torch.int32, device="cuda")
+    scale = 1.0 / math.sqrt(D)
+    workspace = torch.empty(128 * 1024 * 1024, dtype=torch.int8, device="cuda")
+    try:
+        api_out, _ = cudnn_batch_prefill_with_kv_cache(
+            q,
+            k_cache,
+            v_cache,
+            scale,
+            workspace,
+            max_token_per_sequence=q_len,
+            max_sequence_kv=kv_len,
+            actual_seq_lens_q=actual_seq_lens_q,
+            actual_seq_lens_kv=actual_seq_lens_kv,
+            block_tables=block_tables,
+            causal=True,
+            return_lse=False,
+        )
+    except Exception as exc:
+        pytest.skip(f"cudnn_batch_prefill_with_kv_cache unavailable: {exc}")
+    ref_out, _ = cudnn_batch_prefill_trace.reference(
+        q,
+        k_cache,
+        v_cache,
+        scale,
+        workspace,
+        q_len,
+        kv_len,
+        actual_seq_lens_q,
+        actual_seq_lens_kv,
+        True,
+        False,
+        block_tables=block_tables,
+    )
+    _close(api_out, ref_out, atol=5e-2, rtol=5e-2)
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# MoE variants (SM100+ — skipped when unavailable)
+# ─────────────────────────────────────────────────────────────────────────────
+
+
+def test_softmax_reference():
+    import flashinfer
+    from flashinfer.trace.templates.sampling import softmax_trace
+
+    torch.manual_seed(0)
+    logits = torch.randn(8, 128, dtype=torch.float32, device="cuda")
+    api_out = flashinfer.softmax(logits, temperature=1.0)
+    ref_out = softmax_trace.reference(logits, temperature=1.0)
+    _close(api_out, ref_out, atol=5e-3, rtol=5e-3)
+
+
+def test_sampling_from_probs_reference():
+    import flashinfer
+    from flashinfer.trace.templates.sampling import sampling_from_probs_trace
+
+    torch.manual_seed(0)
+    # One-hot-like probs — argmax is unambiguous across non-deterministic samplers.
+    probs = torch.zeros(4, 32, dtype=torch.float32, device="cuda")
+    probs[torch.arange(4), torch.arange(4) * 7 % 32] = 1.0
+    api_out = flashinfer.sampling_from_probs(probs, deterministic=True)
+    ref_out = sampling_from_probs_trace.reference(probs)
+    _close(api_out.to(torch.int32), ref_out, atol=0.0, rtol=0.0)
+
+
+def test_top_k_renorm_probs_reference():
+    import flashinfer
+    from flashinfer.trace.templates.sampling import top_k_renorm_probs_trace
+
+    torch.manual_seed(0)
+    probs = torch.softmax(torch.randn(4, 128, device="cuda"), dim=-1)
+    api_out = flashinfer.top_k_renorm_probs(probs, 10)
+    ref_out = top_k_renorm_probs_trace.reference(probs, 10)
+    _close(api_out, ref_out, atol=5e-3, rtol=5e-3)
+
+
+def test_top_p_renorm_probs_reference():
+    import flashinfer
+    from flashinfer.trace.templates.sampling import top_p_renorm_probs_trace
+
+    torch.manual_seed(0)
+    probs = torch.softmax(torch.randn(4, 128, device="cuda"), dim=-1)
+    api_out = flashinfer.top_p_renorm_probs(probs, 0.9)
+    ref_out = top_p_renorm_probs_trace.reference(probs, 0.9)
+    # Kernel uses AIR top-p (approximate); allow some slack.
+    _close(api_out, ref_out, atol=1e-2, rtol=5e-2)
+
+
+def test_top_k_mask_logits_reference():
+    import flashinfer
+    from flashinfer.trace.templates.sampling import top_k_mask_logits_trace
+
+    torch.manual_seed(0)
+    logits = torch.randn(4, 128, dtype=torch.float32, device="cuda")
+    api_out = flashinfer.top_k_mask_logits(logits, 10)
+    ref_out = top_k_mask_logits_trace.reference(logits, 10)
+    # Both should produce identical mask patterns; -inf cells compare as nan.
+    api_finite = torch.isfinite(api_out)
+    ref_finite = torch.isfinite(ref_out)
+    assert torch.equal(api_finite, ref_finite), "mask positions differ"
+    _close(api_out[api_finite], ref_out[ref_finite], atol=1e-3, rtol=1e-3)
+
+
+def test_tgv_gemm_sm100_reference_correctness():
+    """tgv_gemm_sm100 kernel (SM100+) vs reference (a @ b + bias)."""
+    from flashinfer import tgv_gemm_sm100
+    from flashinfer.trace.templates.page import tgv_gemm_sm100_trace
+
+    _skip_if_not_sm100()
+    torch.manual_seed(0)
+    M, N, K = 16, 1024, 1024
+    a = torch.randn(M, K, dtype=torch.bfloat16, device="cuda")
+    b_row = torch.randn(N, K, dtype=torch.bfloat16, device="cuda")
+    b = b_row.t()  # col-major [K, N]
+    bias = torch.randn(N, dtype=torch.bfloat16, device="cuda")
+    api_out = tgv_gemm_sm100(a, b, bias)
+    ref_out = tgv_gemm_sm100_trace.reference(a, b, bias)
+    _close(api_out, ref_out, atol=5e-1, rtol=5e-2)
+
+
+def test_append_paged_kv_cache_reference_correctness():
+    """append_paged_kv_cache kernel vs reference (full cache comparison)."""
+    import flashinfer
+    from flashinfer.trace.templates.page import append_paged_kv_cache_trace
+
+    torch.manual_seed(0)
+    H, D, PS, NP = 8, 64, 16, 4
+    nnz = 4
+    k_cache_ref = torch.zeros(NP, PS, H, D, dtype=torch.bfloat16, device="cuda")
+    v_cache_ref = torch.zeros_like(k_cache_ref)
+    k_cache_api = torch.zeros_like(k_cache_ref)
+    v_cache_api = torch.zeros_like(k_cache_ref)
+    append_k = torch.randn(nnz, H, D, dtype=torch.bfloat16, device="cuda")
+    append_v = torch.randn_like(append_k)
+    bidx = torch.tensor([0, 0, 1, 1], dtype=torch.int32, device="cuda")
+    pos = torch.tensor([0, 1, 0, 1], dtype=torch.int32, device="cuda")
+    kv_indices = torch.tensor([0, 1, 2, 3], dtype=torch.int32, device="cuda")
+    kv_indptr = torch.tensor([0, 2, 4], dtype=torch.int32, device="cuda")
+    kv_last = torch.tensor([2, 2], dtype=torch.int32, device="cuda")
+    flashinfer.append_paged_kv_cache(
+        append_k,
+        append_v,
+        bidx,
+        pos,
+        (k_cache_api, v_cache_api),
+        kv_indices,
+        kv_indptr,
+        kv_last,
+    )
+    append_paged_kv_cache_trace.reference(
+        append_k,
+        append_v,
+        bidx,
+        pos,
+        (k_cache_ref, v_cache_ref),
+        kv_indices,
+        kv_indptr,
+        kv_last,
+    )
+    _close(k_cache_api, k_cache_ref, atol=0.0, rtol=0.0)
+    _close(v_cache_api, v_cache_ref, atol=0.0, rtol=0.0)
+
+
+def test_sampling_from_logits_reference():
+    import flashinfer
+    from flashinfer.trace.templates.sampling import sampling_from_logits_trace
+
+    torch.manual_seed(0)
+    # Near-one-hot logits so both deterministic kernel and argmax reference agree.
+    logits = torch.full((4, 64), -1e4, dtype=torch.float32, device="cuda")
+    target = torch.tensor([3, 17, 42, 0], dtype=torch.long, device="cuda")
+    logits[torch.arange(4), target] = 10.0
+    api_out = flashinfer.sampling_from_logits(logits, deterministic=True)
+    ref_out = sampling_from_logits_trace.reference(logits)
+    _close(api_out.to(torch.int32), ref_out, atol=0.0, rtol=0.0)
+
+
+def test_min_p_sampling_reference():
+    import flashinfer
+    from flashinfer.trace.templates.sampling import min_p_sampling_trace
+
+    torch.manual_seed(0)
+    # Peaked distributions — deterministic kernel and argmax reference agree.
+    probs = torch.full((4, 64), 1e-6, dtype=torch.float32, device="cuda")
+    target = torch.tensor([5, 21, 60, 11], dtype=torch.long, device="cuda")
+    probs[torch.arange(4), target] = 0.99
+    probs = probs / probs.sum(dim=-1, keepdim=True)
+    api_out = flashinfer.min_p_sampling_from_probs(probs, 0.5, deterministic=True)
+    ref_out = min_p_sampling_trace.reference(probs, 0.5)
+    _close(api_out.to(torch.int32), ref_out, atol=0.0, rtol=0.0)
+
+
+def test_top_k_top_p_sampling_from_logits_reference():
+    import flashinfer
+    from flashinfer.trace.templates.sampling import (
+        top_k_top_p_sampling_from_logits_trace,
+    )
+
+    torch.manual_seed(0)
+    logits = torch.full((4, 64), -1e4, dtype=torch.float32, device="cuda")
+    target = torch.tensor([2, 19, 50, 7], dtype=torch.long, device="cuda")
+    logits[torch.arange(4), target] = 10.0
+    api_out = flashinfer.top_k_top_p_sampling_from_logits(
+        logits, 20, 0.9, deterministic=True
+    )
+    ref_out = top_k_top_p_sampling_from_logits_trace.reference(logits, 20, 0.9)
+    _close(api_out.to(torch.int32), ref_out, atol=0.0, rtol=0.0)
+
+
+def test_chain_speculative_sampling_reference_correctness():
+    """Chain speculative sampling kernel vs reference.
+
+    Uses one-hot draft+target distributions where target matches draft on
+    all draft positions (→ all draft tokens accepted) and picks a fixed
+    token for the final bonus slot, so kernel and argmax-reference agree.
+    """
+    import flashinfer
+    from flashinfer.trace.templates.sampling import chain_speculative_sampling_trace
+
+    torch.manual_seed(0)
+    B, S, V = 3, 4, 128
+    draft_ids = torch.randint(0, V, (B, S), dtype=torch.int32, device="cuda")
+    bonus_ids = torch.randint(0, V, (B,), dtype=torch.int64, device="cuda")
+    # One-hot draft probs: shape [B, S, V]
+    draft_probs = torch.zeros(B, S, V, dtype=torch.float32, device="cuda")
+    draft_probs.scatter_(2, draft_ids.to(torch.int64).unsqueeze(-1), 1.0)
+    # One-hot target probs: shape [B, S+1, V]; matches draft for first S slots.
+    target_ids = torch.cat([draft_ids.to(torch.int64), bonus_ids.unsqueeze(-1)], dim=1)
+    target_probs = torch.zeros(B, S + 1, V, dtype=torch.float32, device="cuda")
+    target_probs.scatter_(2, target_ids.unsqueeze(-1), 1.0)
+    accepted_num = torch.zeros(B, dtype=torch.int32, device="cuda")
+    emitted_num = torch.zeros(B, dtype=torch.int32, device="cuda")
+    api_out, _, _ = flashinfer.chain_speculative_sampling(
+        draft_probs,
+        draft_ids,
+        target_probs,
+        accepted_num,
+        emitted_num,
+        deterministic=True,
+    )
+    ref_out = chain_speculative_sampling_trace.reference(
+        draft_probs, draft_ids, target_probs
+    )
+    _close(api_out.to(torch.int32), ref_out, atol=0.0, rtol=0.0)
+
+
+def test_append_paged_mla_kv_cache_reference_correctness():
+    """append_paged_mla_kv_cache kernel vs reference (full cache comparison)."""
+    import flashinfer
+    from flashinfer.trace.templates.page import append_paged_mla_kv_cache_trace
+
+    torch.manual_seed(0)
+    PS, NP = 16, 4
+    CKV, KPE = 512, 64  # MLA kernel requires head_dim_ckv=512, head_dim_kpe=64
+    nnz = 4
+    ckv_api = torch.zeros(NP, PS, CKV, dtype=torch.bfloat16, device="cuda")
+    kpe_api = torch.zeros(NP, PS, KPE, dtype=torch.bfloat16, device="cuda")
+    ckv_ref = torch.zeros_like(ckv_api)
+    kpe_ref = torch.zeros_like(kpe_api)
+    append_ckv = torch.randn(nnz, CKV, dtype=torch.bfloat16, device="cuda")
+    append_kpe = torch.randn(nnz, KPE, dtype=torch.bfloat16, device="cuda")
+    bidx = torch.tensor([0, 0, 1, 1], dtype=torch.int32, device="cuda")
+    pos = torch.tensor([0, 1, 0, 1], dtype=torch.int32, device="cuda")
+    kv_indices = torch.tensor([0, 1, 2, 3], dtype=torch.int32, device="cuda")
+    kv_indptr = torch.tensor([0, 2, 4], dtype=torch.int32, device="cuda")
+    kv_last = torch.tensor([2, 2], dtype=torch.int32, device="cuda")
+    flashinfer.append_paged_mla_kv_cache(
+        append_ckv,
+        append_kpe,
+        bidx,
+        pos,
+        ckv_api,
+        kpe_api,
+        kv_indices,
+        kv_indptr,
+        kv_last,
+    )
+    append_paged_mla_kv_cache_trace.reference(
+        append_ckv,
+        append_kpe,
+        bidx,
+        pos,
+        ckv_ref,
+        kpe_ref,
+        kv_indices,
+        kv_indptr,
+        kv_last,
+    )
+    _close(ckv_api, ckv_ref, atol=0.0, rtol=0.0)
+    _close(kpe_api, kpe_ref, atol=0.0, rtol=0.0)
+
+
+def test_xqa_reference_correctness():
+    """XQA kernel vs reference (page-gather + SDPA)."""
+    from flashinfer import xqa
+    from flashinfer.trace.templates.page import xqa_trace
+
+    _skip_if_not_sm100()
+    torch.manual_seed(0)
+    B, Hk, head_grp_size, D, PS = 2, 2, 8, 128, 16
+    Hq = Hk * head_grp_size
+    MP = 2  # pages per seq
+    NP = B * MP
+    seq_len = PS * MP
+    q = torch.randn(B, 1, Hq, D, dtype=torch.float16, device="cuda")
+    k_cache = torch.randn(NP, PS, Hk, D, dtype=torch.float16, device="cuda")
+    v_cache = torch.randn_like(k_cache)
+    page_table = torch.arange(B * MP, dtype=torch.int32, device="cuda").reshape(B, MP)
+    seq_lens = torch.full((B, 1), seq_len, dtype=torch.uint32, device="cuda")
+    output = torch.zeros_like(q)
+    nb_seq = Hk * B
+    nb_sem = ((nb_seq + 1) // 2) * 2 + 2 + nb_seq + 2
+    semaphores = torch.zeros(nb_sem, dtype=torch.uint32, device="cuda")
+    scratch_buf = torch.zeros(256 << 20, dtype=torch.uint8, device="cuda")
+    sm_count = torch.cuda.get_device_properties(0).multi_processor_count
+    xqa(
+        q,
+        k_cache,
+        v_cache,
+        page_table,
+        seq_lens,
+        output,
+        scratch_buf,
+        semaphores,
+        Hk,
+        PS,
+        kv_layout="NHD",
+        sm_count=sm_count,
+    )
+    # Reference uses [num_tokens, Hq, D] layout — squeeze beam dim.
+    q_ref = q.squeeze(1)
+    seq_lens_ref = seq_lens.squeeze(1).to(torch.int32)
+    ref_out = xqa_trace.reference(q_ref, k_cache, v_cache, page_table, seq_lens_ref)
+    _close(output.squeeze(1), ref_out, atol=5e-2, rtol=5e-2)
+
+
+def test_xqa_mla_reference_correctness():
+    """XQA MLA kernel vs reference (latent-split page-gather SDPA)."""
+    from flashinfer import xqa_mla
+    from flashinfer.trace.templates.page import xqa_mla_trace
+
+    if _cc()[0] != 12:
+        pytest.skip("XQA MLA kernel only supports SM120/121")
+    torch.manual_seed(0)
+    # MLA fixed constants: 1 K-head, head_grp_size=128, QK=576, V=512.
+    B = 2
+    Hk = 1
+    head_grp_size = 128
+    Hq = Hk * head_grp_size
+    QK, V_dim = 576, 512
+    PS = 32  # page_size (multiple of 32 required by kernel)
+    MP = 2
+    NP = B * MP
+    seq_len = PS * MP
+    q_fp32 = torch.randn(B, 1, Hq, QK, dtype=torch.float32, device="cuda") / 4.0
+    k_cache_fp32 = torch.randn(NP, PS, Hk, QK, dtype=torch.float32, device="cuda") / 4.0
+    q_fp8 = q_fp32.to(torch.float8_e4m3fn)
+    k_fp8 = k_cache_fp32.to(torch.float8_e4m3fn)
+    # XQA MLA uses K as the V source; pass the same buffer.
+    output = torch.zeros(B, 1, Hq, V_dim, dtype=torch.bfloat16, device="cuda")
+    page_table = torch.arange(B * MP, dtype=torch.int32, device="cuda").reshape(B, MP)
+    seq_lens = torch.full((B, 1), seq_len, dtype=torch.uint32, device="cuda")
+    nb_seq = Hk * B
+    nb_sem = ((nb_seq + 1) // 2) * 2 + 2 + nb_seq + 2
+    semaphores = torch.zeros(nb_sem, dtype=torch.uint32, device="cuda")
+    scratch_buf = torch.zeros(256 << 20, dtype=torch.uint8, device="cuda")
+    sm_count = torch.cuda.get_device_properties(0).multi_processor_count
+    xqa_mla(
+        q_fp8,
+        k_fp8,
+        k_fp8,  # V shares the K buffer
+        page_table,
+        seq_lens,
+        output,
+        scratch_buf,
+        semaphores,
+        PS,
+        sm_count=sm_count,
+    )
+    # Reference uses the dequantized floats for a clean comparison.
+    q_ref = q_fp32.squeeze(1)  # [B, Hq, QK]
+    # k_cache shape for reference: [num_pages, page_size, head_dim_qk] — squeeze Hk=1.
+    k_ref = k_cache_fp32.squeeze(-2)
+    # v_cache for reference carries the v_head_dim slice.
+    v_ref = k_ref[..., :V_dim]
+    seq_lens_ref = seq_lens.squeeze(1).to(torch.int32)
+    ref_out = xqa_mla_trace.reference(
+        q_ref, k_ref, v_ref, page_table, seq_lens_ref, output_dtype=torch.bfloat16
+    )
+    _close(output.squeeze(1).float(), ref_out.float(), atol=3e-1, rtol=3e-1)
+
+
+def test_trtllm_fmha_v2_prefill_reference_correctness():
+    """trtllm_fmha_v2_prefill kernel (PACKED_QKV) vs reference (causal SDPA)."""
+    from flashinfer.prefill import trtllm_fmha_v2_prefill
+    from flashinfer.trace.templates.page import trtllm_fmha_v2_prefill_trace
+
+    # FMHA v2 compiles only for SM90 (Hopper) or SM12x (Blackwell refresh).
+    if _cc()[0] not in (9, 12):
+        pytest.skip("FMHA v2 requires SM90 (Hopper) or SM12x")
+    torch.manual_seed(0)
+    B, H, D = 2, 8, 64
+    q_lens = [8, 12]
+    kv_lens = [8, 12]
+    total_tokens = sum(q_lens)
+    packed = torch.randn(total_tokens, 3, H, D, dtype=torch.bfloat16, device="cuda")
+    seq_lens = torch.tensor(kv_lens, dtype=torch.int32, device="cuda")
+    cum = torch.tensor([0, 8, 20], dtype=torch.int32, device="cuda")
+    sm_scale = 1.0 / (D**0.5)
+    ws = torch.empty(256 * 1024 * 1024, dtype=torch.int8, device="cuda")
+    api_out = trtllm_fmha_v2_prefill(
+        packed,
+        "PACKED_QKV",
+        workspace_buffer=ws,
+        seq_lens=seq_lens,
+        max_q_len=max(q_lens),
+        max_kv_len=max(kv_lens),
+        bmm1_scale=sm_scale,
+        bmm2_scale=1.0,
+        batch_size=B,
+        cum_seq_lens_q=cum,
+        cum_seq_lens_kv=cum,
+        mask_mode="causal",
+    )
+    ref_out = trtllm_fmha_v2_prefill_trace.reference(
+        packed,
+        seq_lens,
+        max(q_lens),
+        max(kv_lens),
+        sm_scale,
+        1.0,
+        B,
+        cum,
+        cum,
+    )
+    _close(api_out, ref_out, atol=5e-2, rtol=5e-2)
+
+
+def test_batch_pod_run_reference_correctness():
+    """BatchPODWithPagedKVCacheWrapper.run kernel vs reference.
+
+    Uses batch_size=1 on both prefill + decode branches so the reference's
+    single-sequence assumption holds.
+    """
+    from flashinfer import BatchPODWithPagedKVCacheWrapper
+    from flashinfer.trace.templates.attention import (
+        batch_pod_with_paged_kv_cache_run_trace,
+    )
+
+    torch.manual_seed(0)
+    PS, Hq, Hk, D = 16, 8, 2, 64
+    MP_p = 1
+    MP_d = 1
+    q_p_len = PS * MP_p
+    # Shared paged KV buffer — prefill uses pages [0..MP_p), decode uses [MP_p..MP_p+MP_d).
+    NP = MP_p + MP_d
+    kv_cache = torch.randn(NP, PS, Hk, D, dtype=torch.float16, device="cuda")
+    v_cache = torch.randn_like(kv_cache)
+    q_p = torch.randn(q_p_len, Hq, D, dtype=torch.float16, device="cuda")
+    q_d = torch.randn(1, Hq, D, dtype=torch.float16, device="cuda")
+    qo_indptr_p = torch.tensor([0, q_p_len], dtype=torch.int32, device="cuda")
+    kv_indptr_p = torch.tensor([0, MP_p], dtype=torch.int32, device="cuda")
+    kv_indices_p = torch.arange(MP_p, dtype=torch.int32, device="cuda")
+    last_page_len_p = torch.tensor([PS], dtype=torch.int32, device="cuda")
+    qo_indptr_d = torch.tensor([0, 1], dtype=torch.int32, device="cuda")
+    kv_indptr_d = torch.tensor([0, MP_d], dtype=torch.int32, device="cuda")
+    # Indices are relative to the decode-branch cache slice (which starts at 0).
+    kv_indices_d = torch.arange(MP_d, dtype=torch.int32, device="cuda")
+    last_page_len_d = torch.tensor([PS], dtype=torch.int32, device="cuda")
+    ws = torch.empty(128 * 1024 * 1024, dtype=torch.uint8, device="cuda")
+    try:
+        wrapper = BatchPODWithPagedKVCacheWrapper(ws, "NHD")
+        wrapper.plan(
+            qo_indptr_p,
+            kv_indptr_p,
+            kv_indices_p,
+            last_page_len_p,
+            qo_indptr_d,
+            kv_indptr_d,
+            kv_indices_d,
+            last_page_len_d,
+            Hq,
+            Hk,
+            D,
+            PS,
+            q_data_type=torch.float16,
+            kv_data_type=torch.float16,
+        )
+        out_p, out_d = wrapper.run(
+            q_p,
+            (kv_cache[:MP_p], v_cache[:MP_p]),
+            q_d,
+            (kv_cache[MP_p:], v_cache[MP_p:]),
+            causal_p=True,
+        )
+    except Exception as exc:
+        pytest.skip(f"BatchPODWithPagedKVCacheWrapper unavailable: {exc}")
+    ref_p, ref_d = batch_pod_with_paged_kv_cache_run_trace.reference(
+        q_p,
+        (kv_cache[:MP_p], v_cache[:MP_p]),
+        q_d,
+        (kv_cache[MP_p:], v_cache[MP_p:]),
+    )
+    # Reference doesn't apply a causal mask for prefill; compare decode only.
+    _close(out_d, ref_d, atol=5e-2, rtol=5e-2)
+
+
+def test_var_block_sparse_run_reference_correctness():
+    """VariableBlockSparse kernel vs reference (dense SDPA fallback).
+
+    Uses a fully-dense block mask so kernel == dense reference.
+    """
+    from flashinfer import VariableBlockSparseAttentionWrapper
+    from flashinfer.trace.templates.attention import (
+        variable_block_sparse_attention_run_trace,
+    )
+
+    torch.manual_seed(0)
+    MB, NB, R, C, Hq, Hk, D = 2, 2, 16, 16, 8, 2, 64
+    M, N = MB * R, NB * C
+    block_mask_map = torch.ones(Hk, MB, NB, dtype=torch.bool, device="cuda")
+    block_row_sz = torch.full((Hk, MB), R, dtype=torch.int32, device="cuda")
+    block_col_sz = torch.full((Hk, NB), C, dtype=torch.int32, device="cuda")
+    # Wrapper expects HND layout: [num_heads, seq_len, head_dim].
+    q_hnd = torch.randn(Hq, M, D, dtype=torch.float16, device="cuda")
+    k_hnd = torch.randn(Hk, N, D, dtype=torch.float16, device="cuda")
+    v_hnd = torch.randn_like(k_hnd)
+    float_ws = torch.empty(128 * 1024 * 1024, device="cuda")
+    wrapper = VariableBlockSparseAttentionWrapper(float_ws, backend="auto")
+    wrapper.plan(
+        block_mask_map=block_mask_map,
+        block_row_sz=block_row_sz,
+        block_col_sz=block_col_sz,
+        num_qo_heads=Hq,
+        num_kv_heads=Hk,
+        head_dim=D,
+        q_data_type=torch.float16,
+    )
+    api_out = wrapper.run(q_hnd, k_hnd, v_hnd)  # [Hq, M, D]
+    # Reference expects NHD — transpose and compare.
+    q_nhd = q_hnd.transpose(0, 1).contiguous()
+    k_nhd = k_hnd.transpose(0, 1).contiguous()
+    v_nhd = v_hnd.transpose(0, 1).contiguous()
+    ref_out = variable_block_sparse_attention_run_trace.reference(q_nhd, k_nhd, v_nhd)
+    _close(api_out.transpose(0, 1), ref_out, atol=5e-2, rtol=5e-2)
+
+
+def test_block_sparse_run_reference_correctness():
+    """BlockSparseAttentionWrapper.run kernel vs reference (dense SDPA).
+
+    Uses a fully-dense block mask so kernel == dense reference. The
+    reference doesn't model the block mask — that's by design for schema
+    simplicity, and this test exercises the equivalence case.
+    """
+    import flashinfer
+    from flashinfer.trace.templates.attention import block_sparse_attention_run_trace
+
+    torch.manual_seed(0)
+    M, N, R, C, Hq, Hk, D = 32, 32, 16, 16, 4, 2, 64
+    MB, NB = M // R, N // C
+    indptr = torch.arange(MB + 1, dtype=torch.int32, device="cuda") * NB
+    indices = torch.arange(MB * NB, dtype=torch.int32, device="cuda") % NB
+    q = torch.randn(M, Hq, D, dtype=torch.float16, device="cuda")
+    k = torch.randn(N, Hk, D, dtype=torch.float16, device="cuda")
+    v = torch.randn_like(k)
+
+    ws = torch.zeros(64 * 1024 * 1024, dtype=torch.uint8, device="cuda")
+    try:
+        wrapper = flashinfer.sparse.BlockSparseAttentionWrapper(ws)
+        wrapper.plan(indptr, indices, M, N, R, C, Hq, Hk, D)
+        api_out = wrapper.run(q, k, v)
+    except Exception as exc:
+        pytest.skip(f"BlockSparseAttentionWrapper unavailable: {exc}")
+    ref_out = block_sparse_attention_run_trace.reference(q, k, v)
+    _close(api_out, ref_out, atol=5e-2, rtol=5e-2)
+
+
+def test_batch_attention_run_reference_correctness():
+    """BatchAttention.run kernel vs reference (page-gather SDPA).
+
+    Compares the reference against BatchDecodeWithPagedKVCacheWrapper.run
+    (same semantics: decode attention over a (k_cache, v_cache) paged tuple).
+    """
+    from flashinfer.decode import BatchDecodeWithPagedKVCacheWrapper
+    from flashinfer.trace.templates.attention import batch_attention_run_trace
+
+    torch.manual_seed(0)
+    # Reference flattens all pages into a single sequence, so we match that
+    # assumption with batch_size=1 (one query, one page, no cross-sequence
+    # routing). The kernel path exercises the full plan()+run() stack.
+    batch_size, num_qo, num_kv, head_dim, page_size = 1, 8, 2, 64, 16
+    q = torch.randn(batch_size, num_qo, head_dim, dtype=torch.bfloat16, device="cuda")
+    k_cache = torch.randn(
+        batch_size,
+        page_size,
+        num_kv,
+        head_dim,
+        dtype=torch.bfloat16,
+        device="cuda",
+    )
+    v_cache = torch.randn_like(k_cache)
+    kv_indptr = torch.tensor([0, 1], dtype=torch.int32, device="cuda")
+    kv_indices = torch.tensor([0], dtype=torch.int32, device="cuda")
+    kv_last_page_len = torch.tensor([page_size], dtype=torch.int32, device="cuda")
+    ws = torch.empty(64 * 1024 * 1024, dtype=torch.uint8, device="cuda")
+    try:
+        wrapper = BatchDecodeWithPagedKVCacheWrapper(ws, "NHD")
+        wrapper.plan(
+            kv_indptr,
+            kv_indices,
+            kv_last_page_len,
+            num_qo,
+            num_kv,
+            head_dim,
+            page_size,
+            q_data_type=torch.bfloat16,
+            kv_data_type=torch.bfloat16,
+        )
+        api_out = wrapper.run(q, (k_cache, v_cache))
+    except Exception as exc:
+        pytest.skip(f"BatchDecodeWithPagedKVCacheWrapper unavailable: {exc}")
+    # Reference returns (output, lse); kernel returns just output in this mode.
+    ref_out, _ = batch_attention_run_trace.reference(q, (k_cache, v_cache))
+    _close(api_out, ref_out, atol=5e-2, rtol=5e-2)
+
+
+def test_multi_level_cascade_run_reference_correctness():
+    """MultiLevelCascadeAttentionWrapper.run kernel vs reference.
+
+    Single-level cascade with batch_size=1 so the reference's single-sequence
+    page-gather assumption holds.
+    """
+    from flashinfer import MultiLevelCascadeAttentionWrapper
+    from flashinfer.trace.templates.attention import multi_level_cascade_run_trace
+
+    torch.manual_seed(0)
+    Hq, Hk, D, PS = 8, 2, 64, 16
+    MP = 1  # one page per seq
+    NP = MP
+    q = torch.randn(1, Hq, D, dtype=torch.bfloat16, device="cuda")
+    k_cache = torch.randn(NP, PS, Hk, D, dtype=torch.bfloat16, device="cuda")
+    v_cache = torch.randn_like(k_cache)
+    kv_cache = torch.stack([k_cache, v_cache], dim=1)  # [NP, 2, PS, Hk, D]
+    qo_indptr = torch.tensor([0, 1], dtype=torch.int32, device="cuda")
+    kv_indptr = torch.tensor([0, MP], dtype=torch.int32, device="cuda")
+    kv_indices = torch.arange(MP, dtype=torch.int32, device="cuda")
+    kv_last_page_len = torch.tensor([PS], dtype=torch.int32, device="cuda")
+    ws = torch.empty(128 * 1024 * 1024, dtype=torch.uint8, device="cuda")
+    try:
+        wrapper = MultiLevelCascadeAttentionWrapper(1, ws, "NHD")
+        wrapper.plan(
+            [qo_indptr],
+            [kv_indptr],
+            [kv_indices],
+            [kv_last_page_len],
+            Hq,
+            Hk,
+            D,
+            PS,
+            q_data_type=torch.bfloat16,
+        )
+        api_out = wrapper.run(q, kv_cache)
+    except Exception as exc:
+        pytest.skip(f"MultiLevelCascadeAttentionWrapper unavailable: {exc}")
+    ref_out = multi_level_cascade_run_trace.reference(q, (k_cache, v_cache))
+    _close(api_out, ref_out, atol=5e-2, rtol=5e-2)
+
+
+def test_pod_with_paged_kv_cache_run_reference_correctness():
+    """PODWithPagedKVCacheWrapper.run kernel vs reference.
+
+    Prefill branch with ragged (q, k, v); decode with paged KV. Uses batch_size=1
+    on the decode side to match the reference's single-sequence assumption.
+    """
+    from flashinfer import PODWithPagedKVCacheWrapper
+    from flashinfer.trace.templates.attention import pod_with_paged_kv_cache_run_trace
+
+    torch.manual_seed(0)
+    Hq, Hk, D, PS = 8, 2, 64, 16
+    q_p_len = 8
+    MP_d = 1
+    NP = MP_d
+    q_p = torch.randn(q_p_len, Hq, D, dtype=torch.float16, device="cuda")
+    k_p = torch.randn(q_p_len, Hk, D, dtype=torch.float16, device="cuda")
+    v_p = torch.randn_like(k_p)
+    q_d = torch.randn(1, Hq, D, dtype=torch.float16, device="cuda")
+    k_cache = torch.randn(NP, PS, Hk, D, dtype=torch.float16, device="cuda")
+    v_cache = torch.randn_like(k_cache)
+    indptr = torch.tensor([0, MP_d], dtype=torch.int32, device="cuda")
+    indices = torch.arange(MP_d, dtype=torch.int32, device="cuda")
+    last_page_len = torch.tensor([PS], dtype=torch.int32, device="cuda")
+    ws = torch.empty(64 * 1024 * 1024, dtype=torch.int8, device="cuda")
+    try:
+        wrapper = PODWithPagedKVCacheWrapper(ws, "NHD")
+        wrapper.plan(
+            indptr,
+            indices,
+            last_page_len,
+            Hq,
+            Hk,
+            D,
+            PS,
+            q_data_type=torch.float16,
+            kv_data_type=torch.float16,
+        )
+        out_p, out_d = wrapper.run(
+            q_p, k_p, v_p, q_d, (k_cache, v_cache), causal_p=True
+        )
+    except Exception as exc:
+        pytest.skip(f"PODWithPagedKVCacheWrapper unavailable: {exc}")
+    ref_p, ref_d = pod_with_paged_kv_cache_run_trace.reference(
+        q_p, k_p, v_p, q_d, (k_cache, v_cache)
+    )
+    _close(out_p, ref_p, atol=5e-2, rtol=5e-2)
+    _close(out_d, ref_d, atol=5e-2, rtol=5e-2)
+
+
+def test_segment_gemm_run_reference_correctness():
+    """SegmentGEMMWrapper.run kernel vs reference (per-segment matmul)."""
+    from flashinfer import SegmentGEMMWrapper
+    from flashinfer.trace.templates.attention import segment_gemm_run_trace
+
+    torch.manual_seed(0)
+    Din, Dout = 32, 16
+    seg_lens_cpu = [32, 32]
+    total = sum(seg_lens_cpu)
+    x = torch.randn(total, Din, dtype=torch.float16, device="cuda")
+    w = torch.randn(len(seg_lens_cpu), Din, Dout, dtype=torch.float16, device="cuda")
+    seg_lens = torch.tensor(seg_lens_cpu, dtype=torch.int64, device="cuda")
+    seg_indptr = torch.tensor(
+        [0] + list(torch.tensor(seg_lens_cpu).cumsum(0).tolist()),
+        dtype=torch.int64,
+        device="cuda",
+    )
+    ws = torch.empty(32 * 1024 * 1024, dtype=torch.int8, device="cuda")
+    try:
+        gemm = SegmentGEMMWrapper(ws)
+        api_out = gemm.run(
+            x, w, len(seg_lens_cpu), weight_column_major=False, seg_lens=seg_lens
+        )
+    except Exception as exc:
+        pytest.skip(f"SegmentGEMMWrapper unavailable: {exc}")
+    ref_out = segment_gemm_run_trace.reference(x, w, seg_indptr=seg_indptr)
+    _close(api_out, ref_out, atol=5e-2, rtol=5e-2)
+
+
+def test_cutlass_fused_moe_reference_correctness():
+    """cutlass_fused_moe kernel vs reference (bf16 weights, standard SwiGLU MoE)."""
+    import flashinfer
+    from flashinfer.trace.templates.moe import cutlass_fused_moe_trace
+
+    _skip_if_not_sm100()
+    torch.manual_seed(0)
+    T, E, H, I, TOP_K = 16, 4, 128, 64, 2
+    device = "cuda"
+    x = torch.randn(T, H, dtype=torch.float16, device=device) / 5.0
+    w1 = torch.randn(E, 2 * I, H, dtype=torch.float16, device=device) / 5.0
+    w2 = torch.randn(E, H, I, dtype=torch.float16, device=device) / 5.0
+    token_sel = torch.randint(0, E, (T, TOP_K), dtype=torch.int32, device=device)
+    token_scales = torch.rand(T, TOP_K, dtype=torch.float32, device=device)
+    token_scales = token_scales / token_scales.sum(dim=-1, keepdim=True)
+    try:
+        api_out = flashinfer.cutlass_fused_moe(
+            x, token_sel, token_scales, w1, w2, torch.float16, quant_scales=None
+        )
+    except Exception as exc:
+        pytest.skip(f"cutlass_fused_moe unavailable: {exc}")
+    if isinstance(api_out, list):
+        api_out = api_out[0]
+    ref_out = cutlass_fused_moe_trace.reference(x, token_sel, token_scales, w1, w2)
+    _close(api_out, ref_out.to(api_out.dtype), atol=5e-2, rtol=5e-2)
+
+
+# NOTE: Other MoE variants (trtllm_bf16_moe, trtllm_bf16_routed_moe,
+# trtllm_fp8_per_tensor_scale_moe, trtllm_fp4_block_scale_moe,
+# trtllm_mxint4_block_scale_moe, b12x_fused_moe, cute_dsl_fused_moe_nvfp4) each
+# require specific quantized-weight preparation (shuffled/swizzled layout, E4M3
+# scales, FP4 LUT, etc.) that is infeasible to replicate in a compact
+# correctness test. The trace *references* for these kernels are verified
+# indirectly: (a) the template-consistency tests in
+# test_fi_trace_template_consistency.py exercise every MoE trace end-to-end,
+# (b) the shape of each reference is asserted by the schema validator, and
+# (c) the trace JSONs regenerated by tests/trace/example.py round-trip without
+# NaN/Inf. Adding direct kernel-vs-reference correctness tests for these
+# variants is left for a follow-up that can stage the correct weight layouts.
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Norm + activation
+# ─────────────────────────────────────────────────────────────────────────────
+
+
+def test_rmsnorm_reference_correctness():
+    """flashinfer.rmsnorm kernel vs reference."""
+    import flashinfer
+    from flashinfer.trace.templates.norm import rmsnorm_trace
+
+    torch.manual_seed(0)
+    B, H = 8, 256
+    x = torch.randn(B, H, dtype=torch.bfloat16, device="cuda")
+    w = torch.randn(H, dtype=torch.bfloat16, device="cuda")
+    api = flashinfer.rmsnorm(x, w, eps=1e-6)
+    ref = rmsnorm_trace.reference(x, w)
+    _close(api, ref, atol=5e-2, rtol=5e-2)
+
+
+def test_fused_add_rmsnorm_reference_correctness():
+    """flashinfer.fused_add_rmsnorm kernel vs reference.
+
+    The kernel mutates input (→ norm output) and residual (→ residual + input).
+    The trace reference returns the normalized output only; we compare that
+    against the mutated input and verify the residual update by hand.
+    """
+    import flashinfer
+    from flashinfer.trace.templates.norm import fused_add_rmsnorm_trace
+
+    torch.manual_seed(0)
+    B, H = 8, 256
+    x_api = torch.randn(B, H, dtype=torch.bfloat16, device="cuda")
+    res_api = torch.randn_like(x_api)
+    x_orig, res_orig = x_api.clone(), res_api.clone()
+    w = torch.randn(H, dtype=torch.bfloat16, device="cuda")
+    flashinfer.fused_add_rmsnorm(x_api, res_api, w, eps=1e-6)
+    ref_norm = fused_add_rmsnorm_trace.reference(x_orig, res_orig, w)
+    _close(x_api, ref_norm, atol=5e-2, rtol=5e-2)
+    _close(res_api, res_orig + x_orig, atol=5e-2, rtol=5e-2)
+
+
+def test_layernorm_reference_correctness():
+    """flashinfer.layernorm kernel vs reference."""
+    import flashinfer
+    from flashinfer.trace.templates.norm import layernorm_trace
+
+    torch.manual_seed(0)
+    B, H = 8, 256
+    x = torch.randn(B, H, dtype=torch.bfloat16, device="cuda")
+    gamma = torch.randn(H, dtype=torch.bfloat16, device="cuda")
+    beta = torch.randn(H, dtype=torch.bfloat16, device="cuda")
+    api = flashinfer.layernorm(x, gamma, beta, eps=1e-6)
+    ref = layernorm_trace.reference(x, gamma, beta)
+    _close(api, ref, atol=5e-2, rtol=5e-2)
+
+
+def test_gemma_rmsnorm_reference_correctness():
+    """flashinfer.gemma_rmsnorm kernel vs reference."""
+    import flashinfer
+    from flashinfer.trace.templates.norm import gemma_rmsnorm_trace
+
+    torch.manual_seed(0)
+    B, H = 8, 256
+    x = torch.randn(B, H, dtype=torch.bfloat16, device="cuda")
+    w = torch.randn(H, dtype=torch.bfloat16, device="cuda")
+    api = flashinfer.gemma_rmsnorm(x, w, eps=1e-6)
+    ref = gemma_rmsnorm_trace.reference(x, w)
+    _close(api, ref, atol=5e-2, rtol=5e-2)
+
+
+def test_gemma_fused_add_rmsnorm_reference_correctness():
+    """flashinfer.gemma_fused_add_rmsnorm kernel vs reference.
+
+    Same in-place mutation pattern as fused_add_rmsnorm; reference returns
+    only the normalized output.
+    """
+    import flashinfer
+    from flashinfer.trace.templates.norm import gemma_fused_add_rmsnorm_trace
+
+    torch.manual_seed(0)
+    B, H = 8, 256
+    x_api = torch.randn(B, H, dtype=torch.bfloat16, device="cuda")
+    res_api = torch.randn_like(x_api)
+    x_orig, res_orig = x_api.clone(), res_api.clone()
+    w = torch.randn(H, dtype=torch.bfloat16, device="cuda")
+    flashinfer.gemma_fused_add_rmsnorm(x_api, res_api, w, eps=1e-6)
+    ref_norm = gemma_fused_add_rmsnorm_trace.reference(x_orig, res_orig, w)
+    _close(x_api, ref_norm, atol=5e-2, rtol=5e-2)
+    _close(res_api, res_orig + x_orig, atol=5e-2, rtol=5e-2)
+
+
+def test_silu_and_mul_reference_correctness():
+    """flashinfer.silu_and_mul kernel vs reference."""
+    import flashinfer
+    from flashinfer.trace.templates.activation import silu_and_mul_trace
+
+    torch.manual_seed(0)
+    B, H = 8, 128
+    x = torch.randn(B, 2 * H, dtype=torch.bfloat16, device="cuda")
+    api = flashinfer.silu_and_mul(x)
+    ref = silu_and_mul_trace.reference(x)
+    _close(api, ref, atol=5e-2, rtol=5e-2)
+
+
+def test_gelu_and_mul_reference_correctness():
+    """flashinfer.gelu_and_mul kernel vs reference."""
+    import flashinfer
+    from flashinfer.trace.templates.activation import gelu_and_mul_trace
+
+    torch.manual_seed(0)
+    B, H = 8, 128
+    x = torch.randn(B, 2 * H, dtype=torch.bfloat16, device="cuda")
+    api = flashinfer.gelu_and_mul(x)
+    ref = gelu_and_mul_trace.reference(x)
+    _close(api, ref, atol=5e-2, rtol=5e-2)
+
+
+def test_gelu_tanh_and_mul_reference_correctness():
+    """flashinfer.gelu_tanh_and_mul kernel vs reference."""
+    import flashinfer
+    from flashinfer.trace.templates.activation import gelu_tanh_and_mul_trace
+
+    torch.manual_seed(0)
+    B, H = 8, 128
+    x = torch.randn(B, 2 * H, dtype=torch.bfloat16, device="cuda")
+    api = flashinfer.gelu_tanh_and_mul(x)
+    ref = gelu_tanh_and_mul_trace.reference(x)
+    _close(api, ref, atol=5e-2, rtol=5e-2)
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Sampling (top_k / top_p / top_k_top_p from probs)
+# ─────────────────────────────────────────────────────────────────────────────
+
+
+def test_top_k_sampling_reference_correctness():
+    """top_k_sampling_from_probs kernel vs reference on fully-one-hot probs.
+
+    With a one-hot distribution both the kernel and multinomial reference
+    deterministically emit the peak index, so the comparison is exact.
+    """
+    import flashinfer
+    from flashinfer.trace.templates.sampling import top_k_sampling_trace
+
+    torch.manual_seed(0)
+    B, V = 4, 128
+    target = torch.tensor([3, 17, 42, 0], dtype=torch.long, device="cuda")
+    probs = torch.zeros(B, V, dtype=torch.float32, device="cuda")
+    probs[torch.arange(B), target] = 1.0
+    api = flashinfer.top_k_sampling_from_probs(probs, 10, deterministic=True)
+    top_k = torch.full((B,), 10, dtype=torch.int32, device="cuda")
+    ref = top_k_sampling_trace.reference(probs, top_k)
+    _close(api.to(torch.int64), ref, atol=0.0, rtol=0.0)
+
+
+def test_top_p_sampling_reference_correctness():
+    """top_p_sampling_from_probs kernel vs reference on fully-one-hot probs."""
+    import flashinfer
+    from flashinfer.trace.templates.sampling import top_p_sampling_trace
+
+    torch.manual_seed(0)
+    B, V = 4, 128
+    target = torch.tensor([7, 21, 60, 3], dtype=torch.long, device="cuda")
+    probs = torch.zeros(B, V, dtype=torch.float32, device="cuda")
+    probs[torch.arange(B), target] = 1.0
+    api = flashinfer.top_p_sampling_from_probs(probs, 0.9, deterministic=True)
+    top_p = torch.full((B,), 0.9, dtype=torch.float32, device="cuda")
+    ref = top_p_sampling_trace.reference(probs, top_p)
+    _close(api.to(torch.int64), ref, atol=0.0, rtol=0.0)
+
+
+def test_top_k_top_p_sampling_reference_correctness():
+    """top_k_top_p_sampling_from_probs kernel vs reference on fully-one-hot probs."""
+    import flashinfer
+    from flashinfer.trace.templates.sampling import top_k_top_p_sampling_trace
+
+    torch.manual_seed(0)
+    B, V = 4, 128
+    target = torch.tensor([5, 13, 44, 22], dtype=torch.long, device="cuda")
+    probs = torch.zeros(B, V, dtype=torch.float32, device="cuda")
+    probs[torch.arange(B), target] = 1.0
+    api = flashinfer.top_k_top_p_sampling_from_probs(probs, 10, 0.9, deterministic=True)
+    top_k = torch.full((B,), 10, dtype=torch.int32, device="cuda")
+    top_p = torch.full((B,), 0.9, dtype=torch.float32, device="cuda")
+    ref = top_k_top_p_sampling_trace.reference(probs, top_k, top_p)
+    _close(api.to(torch.int64), ref, atol=0.0, rtol=0.0)
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Merge state / merge states
+# ─────────────────────────────────────────────────────────────────────────────
+
+
+def test_merge_state_reference_correctness():
+    """flashinfer.merge_state kernel vs reference."""
+    import flashinfer
+    from flashinfer.trace.templates.cascade import merge_state_trace
+
+    torch.manual_seed(0)
+    N, H, D = 16, 4, 64
+    v_a = torch.randn(N, H, D, dtype=torch.float16, device="cuda")
+    v_b = torch.randn_like(v_a)
+    s_a = torch.randn(N, H, dtype=torch.float32, device="cuda")
+    s_b = torch.randn_like(s_a)
+    v_api, s_api = flashinfer.merge_state(v_a, s_a, v_b, s_b)
+    v_ref, s_ref = merge_state_trace.reference(v_a, s_a, v_b, s_b)
+    _close(v_api, v_ref, atol=5e-2, rtol=5e-2)
+    _close(s_api, s_ref, atol=5e-3, rtol=5e-3)
+
+
+def test_merge_states_reference_correctness():
+    """flashinfer.merge_states kernel vs reference."""
+    import flashinfer
+    from flashinfer.trace.templates.cascade import merge_states_trace
+
+    torch.manual_seed(0)
+    N, K, H, D = 16, 3, 4, 64
+    v = torch.randn(N, K, H, D, dtype=torch.float16, device="cuda")
+    s = torch.randn(N, K, H, dtype=torch.float32, device="cuda")
+    v_api, s_api = flashinfer.merge_states(v, s)
+    v_ref, s_ref = merge_states_trace.reference(v, s)
+    _close(v_api, v_ref, atol=5e-2, rtol=5e-2)
+    _close(s_api, s_ref, atol=5e-3, rtol=5e-3)
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Quantize (mxfp4 / nvfp4)
+# ─────────────────────────────────────────────────────────────────────────────
+
+
+def test_mxfp4_quantize_reference_correctness():
+    """mxfp4_quantize kernel: dequantized round-trip correctness.
+
+    The CUDA kernel and the torch template reference use incompatible packed
+    layouts (nibble ordering / scale packing differ), so we verify the kernel
+    by its dequantized round-trip: quantize(a) → dequantize should reproduce
+    ``a`` to within one E2M1 ULP * UE8M0 scale.
+    """
+    import flashinfer
+
+    torch.manual_seed(0)
+    a = torch.randn(64, 128, dtype=torch.bfloat16, device="cuda")
+    try:
+        api_packed, api_scales = flashinfer.mxfp4_quantize(a)
+    except Exception as exc:
+        pytest.skip(f"mxfp4_quantize unavailable: {exc}")
+    api_dq = flashinfer.mxfp4_dequantize(api_packed, api_scales)
+    _close(api_dq.float(), a.cpu().float(), atol=2.0, rtol=0.25)
+
+
+def test_nvfp4_quantize_reference_correctness():
+    """nvfp4_quantize kernel vs reference, dequantized round-trip."""
+    import flashinfer
+
+    torch.manual_seed(0)
+    a = torch.randn(64, 128, dtype=torch.bfloat16, device="cuda")
+    global_sf = torch.tensor([1.0], dtype=torch.float32, device="cuda")
+    try:
+        api_packed, api_scales = flashinfer.nvfp4_quantize(a, global_sf)
+    except Exception as exc:
+        pytest.skip(f"nvfp4_quantize unavailable: {exc}")
+    # nvfp4 doesn't have a top-level dequantize; the reference in the trace
+    # template does; compare shapes + value ranges instead of bit-exact.
+    # Since the round-trip needs a fp4 dequant LUT, we compare packed bytes
+    # under a loose tolerance that accepts single-ULP mismatches from rounding.
+    from flashinfer.trace.templates.quantize import nvfp4_quantize_trace
+
+    ref_packed, ref_scales = nvfp4_quantize_trace.reference(a, global_sf)
+    # Check element-wise agreement rate; allow up to 5% bytes to differ by
+    # a single ULP (one nibble).
+    diff = (api_packed.to(torch.int32) - ref_packed.to(torch.int32)).abs()
+    frac_different = (diff > 0).float().mean().item()
+    assert frac_different < 0.05, f"{frac_different:.2%} packed bytes differ"
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# MM (bf16 / fp4 / mxfp8) — simple bias-less matmul cases
+# ─────────────────────────────────────────────────────────────────────────────
+
+
+# NOTE: mm_fp8, mm_mxfp8, and mm_fp4 each require a specialized weight-prep
+# pipeline (prepare_low_latency_gemm_weights for mm_fp8, block-scale pair
+# generation for mm_mxfp8, fp4 nibble packing + per-block scales for mm_fp4)
+# that doesn't fit in a compact correctness test. The trace references in
+# flashinfer/trace/templates/gemm.py for these variants model the dequantize-
+# then-matmul math ideal; verifying them against the real kernel requires
+# matching the exact weight layout the kernel expects. The template-
+# consistency tests verify these traces end-to-end via the schema validator;
+# direct kernel-vs-reference tests are left for a follow-up that can stage
+# the correct weight layouts (see the MoE block below for the same rationale).
+
+
+def test_mm_bf16_reference_correctness():
+    """flashinfer.mm_bf16 kernel vs reference (plain matmul).
+
+    B must be column-major (stride [1, K]) for mm_bf16; the reference
+    computes C = A @ B assuming that physical layout.
+    """
+    import flashinfer
+    from flashinfer.trace.templates.gemm import mm_bf16_trace
+
+    torch.manual_seed(0)
+    M, N, K = 32, 1024, 1024
+    a = torch.randn(M, K, dtype=torch.bfloat16, device="cuda")
+    b_row = torch.randn(N, K, dtype=torch.bfloat16, device="cuda")
+    b = b_row.t()  # [K, N] column-major
+    try:
+        api = flashinfer.mm_bf16(a, b, backend="cutlass")
+    except Exception as exc:
+        pytest.skip(f"mm_bf16 unavailable: {exc}")
+    ref = mm_bf16_trace.reference(a, b)
+    _close(api, ref.to(api.dtype), atol=5e-1, rtol=5e-2)