Tencent
diff --git a/‎CMakeLists.txt‎
Lines changed: 3 additions & 0 deletions b/‎CMakeLists.txt‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎bench/fused_moe/README.md‎
Lines changed: 97 additions & 0 deletions b/‎bench/fused_moe/README.md‎
Lines changed: 97 additions & 0 deletions
diff --git a/‎bench/fused_moe/backends/__init__.py‎
Lines changed: 41 additions & 0 deletions b/‎bench/fused_moe/backends/__init__.py‎
Lines changed: 41 additions & 0 deletions
diff --git a/‎bench/fused_moe/backends/base.py‎
Lines changed: 208 additions & 0 deletions b/‎bench/fused_moe/backends/base.py‎
Lines changed: 208 additions & 0 deletions
@@ -8,6 +8,9 @@ find_package(CUDAToolkit REQUIRED)
 find_package(Python3 COMPONENTS Interpreter Development.Module REQUIRED)
 
 file(GLOB_RECURSE SOURCES "src/*/*.cu" "src/*/*.cc")
+file(GLOB CP_ASYNC_SOURCES "src/fuse_moe/cp_async/*.cu" "src/group_gemm/cp_async/*.cu"
+                           "src/group_gemm/cp_async/*.cc")
+list(APPEND SOURCES ${CP_ASYNC_SOURCES})
 list(FILTER SOURCES EXCLUDE REGEX ".*test.*")
 
 
 
@@ -0,0 +1,97 @@
+# FusedMoE Benchmark
+
+This directory contains a per-tensor FP8 FusedMoE benchmark for HPC-Ops,
+vLLM Triton, vLLM CUTLASS, and SGLang.
+
+## Requirements
+
+- NVIDIA GPU with FP8 support.
+- CUDA, PyTorch, Triton, NumPy, the `nvtx` Python package, and the Nsight Systems CLI (`nsys`).
+- Built HPC-Ops, vLLM, and SGLang checkouts.
+
+Set checkout roots before running:
+
+```bash
+export HPCOPS_ROOT=/path/to/hpc-ops
+export VLLM_ROOT=/path/to/vllm
+export SGLANG_ROOT=/path/to/sglang
+```
+
+## Usage
+
+Run TP mode:
+
+```bash
+python3 bench.py \
+  --tp 8 --ep 1 \
+  --gpu 0 \
+  --backends hpcops vllm vllm_cutlass sglang
+```
+
+Run EP mode:
+
+```bash
+python3 bench.py \
+  --tp 1 --ep 8 \
+  --gpu 0 \
+  --backends hpcops vllm vllm_cutlass sglang
+```
+
+Run a smaller smoke test:
+
+```bash
+python3 bench.py \
+  --tp 8 --ep 1 \
+  --models qwen3-235b \
+  --bs 16 32 \
+  --backends hpcops vllm_cutlass \
+  --gpu 0
+```
+
+By default, outputs are written under `./log/<tag>/`. Override this with:
+
+```bash
+python3 bench.py --output-dir /path/to/output ...
+```
+
+## Defaults
+
+Models:
+
+| Model | Experts | topk | Hidden | Intermediate |
+|---|---:|---:|---:|---:|
+| `qwen3-235b` | 128 | 8 | 4096 | 1536 |
+| `hunyuan-v3` | 192 | 8 | 4096 | 1536 |
+| `deepseek-v3` | 256 | 8 | 7168 | 2048 |
+
+Shape semantics:
+
+- `bs` is the kernel-visible sequence count on the measured rank.
+- `TP` partitions the intermediate dimension only, so `intermediate_per_rank = intermediate / TP`.
+- `EP` partitions experts, so `experts_per_rank = experts / EP`.
+- The reported `avg/group` is `bs * topk / experts_per_rank`.
+
+For `TP=8 EP=1`, experts are not partitioned and the benchmark keeps the full expert set
+visible to the measured rank:
+
+```text
+avg/group = bs * topk / experts
+```
+
+For `TP=1 EP=8`, the benchmark measures one EP rank with local experts only. Routing is
+sampled within that local expert set, so:
+
+```text
+experts_per_rank = experts / 8
+avg/group = bs * topk / experts_per_rank
+```
+
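+The EP batch range is shorter than the TP range because `experts_per_rank` is 8x smaller under
+`TP=1 EP=8`: the same `avg/group` is reached at 1/8 of the batch size (for example, `bs=2048`
+under EP matches `bs=16384` under TP), so both modes cover the same per-rank operator regime at
+comparable `avg/group` values.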
+
+Batch sizes:
+
+| Mode | Batch sizes |
+|---|---|
+| `TP=8 EP=1` | `4 16 32 64 128 256 512 1024 2048 4096 8192 16384` |
+| `TP=1 EP=8` | `4 8 16 32 64 128 256 512 1024 2048` |
@@ -0,0 +1,41 @@
+# Copyright (C) 2026 Tencent.
+
+"""Backend registry."""
+from __future__ import annotations
+
+from typing import Callable, Dict
+
+from .base import Backend, BenchSpec  # re-export for convenience
+
+_REGISTRY: Dict[str, Callable[[], Backend]] = {}
+
+
+def register(name: str, factory: Callable[[], Backend]) -> None:
+    """Record *factory* in the backend registry under *name*.
+
+    Raises:
+        ValueError: if *name* already has a registered factory.
+    """
+    already_present = name in _REGISTRY
+    if already_present:
+        raise ValueError(f"backend already registered: {name}")
+    _REGISTRY[name] = factory
+
+
+def make(name: str) -> Backend:
+    """Instantiate the backend registered under *name*.
+
+    Raises:
+        KeyError: if *name* is not registered; the message lists the
+            registered names in sorted order.
+    """
+    if name in _REGISTRY:
+        factory = _REGISTRY[name]
+        return factory()
+    known_names = sorted(_REGISTRY.keys())
+    raise KeyError(f"unknown backend: {name!r} (known: {known_names})")
+
+
+def known() -> list[str]:
+    """Return the registered backend names in sorted order."""
+    names = list(_REGISTRY)
+    names.sort()
+    return names
+
+
+def _import_all():
+    """Import every backend module so its registration side effects run.
+
+    The imported names are not used here, hence the ``noqa: F401`` marker.
+    Modules are imported in the order listed.
+    """
+    from . import (  # noqa: F401
+        hpcops,
+        vllm,
+        vllm_cutlass,
+        sglang,
+    )
+
+
+_import_all()
+
+
+__all__ = ["Backend", "BenchSpec", "register", "make", "known"]
@@ -0,0 +1,208 @@
+# Copyright (C) 2026 Tencent.
+
+"""Shared base classes and helpers for all FusedMoE backends."""
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, asdict
+from typing import Callable, Optional
+
+import torch
+
+
+# ---------------------------------------------------------------------------
+# Per-cell shape spec
+# ---------------------------------------------------------------------------
+@dataclass(frozen=True)
+class BenchSpec:
+    """Per-cell input shape that backends consume.
+
+    Fields match the kernel-visible shape (post TP/EP simulation), not the
+    full-model shape.  The driver derives these from the user's --tp/--ep
+    args and stores them in the JSONL output for traceability.
+
+    The dataclass is frozen, so instances are immutable after construction.
+    The first six fields are required; ``model``, ``tp`` and ``ep`` have
+    defaults.
+    """
+    num_seq: int                # batch_size
+    hidden: int                 # K of Gate-Up; N of Down
+    intermediate_per_rank: int  # N of one of {Gate, Up}; K of Down
+    num_expert_local: int       # experts visible to this rank
+    num_expert_total: int       # for sampling topk_ids; equals local under EP-rank-0 sim
+    num_topk: int               # experts selected per sequence (columns of topk_ids)
+
+    # Presumably the model name this cell was derived from (label only) -- confirm with driver.
+    model: str = ""
+    # Presumably the --tp / --ep values the shape was derived from -- confirm with driver.
+    tp: int = 1
+    ep: int = 1
+
+
+# ---------------------------------------------------------------------------
+# Backend ABC
+# ---------------------------------------------------------------------------
+class Backend(ABC):
+    """Interface implemented by every registered benchmark backend."""
+
+    # Registry key identifying the backend, e.g. "hpcops".
+    name: str
+
+    @abstractmethod
+    def setup(self, spec: BenchSpec) -> Callable[[], None]:
+        """Prepare tensors for *spec* and return the zero-argument call to time."""
+        raise NotImplementedError
+
+    def cleanup(self) -> None:
+        """Default teardown: call ``torch.cuda.empty_cache()``."""
+        torch.cuda.empty_cache()
+
+    def extra_metadata(self) -> dict:
+        """Return backend-specific metadata; the base implementation has none."""
+        return dict()
+
+
+# ---------------------------------------------------------------------------
+# Shared tensor builders
+# ---------------------------------------------------------------------------
+DTYPE_FP8 = torch.float8_e4m3fn
+DTYPE_HALF = torch.half
+
+
+def build_fp8_weights(
+    num_expert_local: int,
+    intermediate_per_rank: int,
+    hidden: int,
+    *,
+    seed: int = 0,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    """Create random fp8 expert weights plus one scale per expert.
+
+    With E = num_expert_local, N = intermediate_per_rank, K = hidden, the
+    layouts (Triton / CUTLASS / sglang convention) are:
+        w1: [E, 2N, K]  (fp8)   w1_scale: [E, 1, 1] (fp32)
+        w2: [E, K,  N]  (fp8)   w2_scale: [E, 1, 1] (fp32)
+
+    w1 is gate+up fused along N and w2 is the down projection.  Weights are
+    drawn in fp32 from a CUDA generator seeded with ``seed`` (w1 first, then
+    w2), cast to half, and quantized one expert at a time with vLLM's
+    ``scaled_fp8_quant``.
+
+    Returns ``(w1_fp8, w2_fp8, w1_scale, w2_scale)`` regardless of backend;
+    HPC reshapes/views these to its own per-expert layout in its own backend
+    module.
+    """
+    from vllm import _custom_ops as ops
+
+    rng = torch.Generator(device="cuda").manual_seed(seed)
+    n_expert = num_expert_local
+    n_inter = intermediate_per_rank
+
+    def _random_half(shape):
+        # Sample in fp32, then narrow to half.
+        sample = torch.randn(
+            shape, dtype=torch.float, device="cuda", generator=rng,
+        )
+        return sample.to(DTYPE_HALF)
+
+    def _quantize_per_expert(weights_half):
+        # Quantize each expert slice separately so every expert gets its own scale.
+        quantized = torch.empty_like(weights_half, dtype=DTYPE_FP8)
+        scales = torch.empty(
+            (n_expert, 1, 1), device="cuda", dtype=torch.float32,
+        )
+        for expert_idx in range(n_expert):
+            quantized[expert_idx], scale = ops.scaled_fp8_quant(
+                weights_half[expert_idx])
+            scales[expert_idx, 0, 0] = scale
+        return quantized, scales
+
+    # Draw order (w1 before w2) fixes which generator samples feed which weight.
+    w1_half = _random_half((n_expert, 2 * n_inter, hidden))
+    w2_half = _random_half((n_expert, hidden, n_inter))
+
+    w1_fp8, w1_scale = _quantize_per_expert(w1_half)
+    w2_fp8, w2_scale = _quantize_per_expert(w2_half)
+    return w1_fp8, w2_fp8, w1_scale, w2_scale
+
+
+def build_routing(
+    num_seq: int,
+    num_expert_total: int,
+    num_topk: int,
+    *,
+    seed: int = 0,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Sample random expert routing for ``num_seq`` rows.
+
+    Each row of ``topk_ids`` holds ``num_topk`` distinct expert ids: the first
+    ``num_topk`` entries of a random permutation of
+    ``range(num_expert_total)``, sorted ascending.  ``topk_w`` is a softmax
+    over standard-normal draws along the topk axis.  Both tensors live on the
+    CUDA device and come from one generator seeded with ``seed``.
+
+    Returns:
+        topk_ids   : (num_seq, num_topk) int32, sorted along topk axis
+        topk_w     : (num_seq, num_topk) float32, softmax-normalized
+
+    Raises:
+        ValueError: if ``num_seq`` is negative, or ``num_topk`` is outside
+            ``[0, num_expert_total]``.
+    """
+    # Validate before touching CUDA.  Slicing a permutation with
+    # num_topk > num_expert_total would silently yield fewer columns than
+    # topk_w has, giving the caller two tensors of mismatched shape.
+    if num_seq < 0:
+        raise ValueError(f"num_seq must be non-negative, got {num_seq}")
+    if not 0 <= num_topk <= num_expert_total:
+        raise ValueError(
+            f"num_topk must be in [0, num_expert_total={num_expert_total}], "
+            f"got {num_topk}")
+
+    g = torch.Generator(device="cuda").manual_seed(seed)
+    rows = [
+        torch.sort(
+            torch.randperm(
+                num_expert_total, dtype=torch.int32, device="cuda",
+                generator=g,
+            )[:num_topk]
+        ).values
+        for _ in range(num_seq)
+    ]
+    if rows:
+        topk_ids = torch.stack(rows)
+    else:
+        # torch.stack rejects an empty list; build the empty batch directly.
+        topk_ids = torch.empty(
+            (0, num_topk), dtype=torch.int32, device="cuda")
+    topk_w = torch.softmax(
+        torch.randn((num_seq, num_topk), dtype=torch.float32, device="cuda",
+                    generator=g),
+        dim=-1,
+    )
+    return topk_ids, topk_w
+
+
+def build_activation(
+    num_seq: int, hidden: int, *, seed: int = 0,
+) -> torch.Tensor:
+    """Return a ``(num_seq, hidden)`` half tensor of normal samples divided by 10.
+
+    Samples are drawn on the CUDA device from a generator seeded with ``seed``.
+    """
+    rng = torch.Generator(device="cuda").manual_seed(seed)
+    shape = (num_seq, hidden)
+    samples = torch.randn(
+        shape, dtype=DTYPE_HALF, device="cuda", generator=rng,
+    )
+    return samples / 10
+
+
+A_SCALE_VALUE = 1e-2
+
+
+def build_a_scale() -> torch.Tensor:
+    """Return a 0-dim fp32 CUDA tensor holding ``A_SCALE_VALUE``."""
+    scalar_shape = ()
+    return torch.full(
+        scalar_shape,
+        fill_value=A_SCALE_VALUE,
+        dtype=torch.float32,
+        device="cuda",
+    )
+
+
+# ---------------------------------------------------------------------------
+# Method C timing harness
+# ---------------------------------------------------------------------------
+def run_method_c(
+    call_fn: Callable[[], None],
+    *,
+    n_timed: int = 52,
+    n_warmup: int = 3,
+):
+    """Run warmup, graph capture, replay warmup, and timed graph replays.
+
+    Steps, in order:
+      1. Call ``call_fn`` eagerly ``n_warmup`` times, then synchronize.
+      2. Capture a single ``call_fn`` invocation into a CUDA graph.
+      3. Replay the graph ``n_warmup`` times, then synchronize.
+      4. Between ``cudaProfilerStart`` and ``cudaProfilerStop``, replay the
+         graph ``n_timed`` times; each replay plus its synchronize is wrapped
+         in an NVTX range named ``"step"``.
+
+    Nothing is timed here and nothing is returned; presumably an external
+    profiler reads the ``"step"`` ranges -- confirm against the driver.
+
+    Args:
+        call_fn: zero-argument callable that launches the work to measure.
+        n_timed: number of profiled graph replays.
+        n_warmup: number of eager calls before capture and of graph replays
+            before profiling starts.
+    """
+    import nvtx  # imported lazily so backends that error during setup don't fail on import
+
+    for _ in range(n_warmup):
+        call_fn()
+    torch.cuda.synchronize()
+
+    graph = torch.cuda.CUDAGraph()
+    with torch.cuda.graph(graph):
+        call_fn()
+
+    for _ in range(n_warmup):
+        graph.replay()
+    torch.cuda.synchronize()
+
+    cudart = torch.cuda.cudart()
+    cudart.cudaProfilerStart()
+    try:
+        for _ in range(n_timed):
+            with nvtx.annotate("step"):
+                graph.replay()
+                torch.cuda.synchronize()
+    finally:
+        # Always close the profiler range, even if a replay raises.
+        cudart.cudaProfilerStop()
+
+
+# ---------------------------------------------------------------------------
+# Spec serialization (driver argv <-> worker)
+# ---------------------------------------------------------------------------
+def spec_to_argv(spec: BenchSpec) -> list[str]:
+    """Flatten a BenchSpec into ``--flag value`` argv pairs.
+
+    Used by bench.py to invoke worker.py.  Integer fields are converted with
+    ``str()``; ``spec.model`` is passed through unchanged.
+    """
+    leading_ints = (
+        ("--num-seq", spec.num_seq),
+        ("--hidden", spec.hidden),
+        ("--intermediate-per-rank", spec.intermediate_per_rank),
+        ("--num-expert-local", spec.num_expert_local),
+        ("--num-expert-total", spec.num_expert_total),
+        ("--num-topk", spec.num_topk),
+    )
+    trailing_ints = (
+        ("--tp", spec.tp),
+        ("--ep", spec.ep),
+    )
+
+    argv = []
+    for flag, value in leading_ints:
+        argv.extend((flag, str(value)))
+    argv.extend(("--model", spec.model))
+    for flag, value in trailing_ints:
+        argv.extend((flag, str(value)))
+    return argv
+
+
+def spec_from_args(args) -> BenchSpec:
+    """Build a BenchSpec from an object exposing one attribute per spec field.
+
+    ``args`` is presumably the argparse namespace parsed from the flags that
+    ``spec_to_argv`` emits -- confirm against the worker's parser.
+    """
+    field_names = (
+        "num_seq",
+        "hidden",
+        "intermediate_per_rank",
+        "num_expert_local",
+        "num_expert_total",
+        "num_topk",
+        "model",
+        "tp",
+        "ep",
+    )
+    values = {field: getattr(args, field) for field in field_names}
+    return BenchSpec(**values)