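WIP: run bench_collective via `python3 -m` in CI and docs; remove the standalone mscclpp_benchmark.tuner CLI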

Binyang2014 · Binyang2014 · commit 493e3b3cc8dc · 2026-05-29T21:11:25.000Z
diff --git a/.azure-pipelines/templates/nccl-test.yml b/.azure-pipelines/templates/nccl-test.yml
@@ -79,8 +79,8 @@ steps:
     name: PyBench
     displayName: Run collective benchmarks
     remoteScript: |
-      mpirun --allow-run-as-root -x GPU_MAX_HW_QUEUES=8 -np 8 python3 ./python/mscclpp_benchmark/bench_collective.py --collective allreduce --dtype float8_e4m3b15 --accum-type float32 --autotune
-      mpirun --allow-run-as-root -x GPU_MAX_HW_QUEUES=8 -np 8 python3 ./python/mscclpp_benchmark/bench_collective.py --collective allreduce --dtype float16 --symmetric-memory --autotune
+      mpirun --allow-run-as-root -np 8 python3 -m mscclpp_benchmark.bench_collective --collective allreduce --dtype float8_e4m3b15 --accum-type float32 --autotune
+      mpirun --allow-run-as-root -np 8 python3 -m mscclpp_benchmark.bench_collective --collective allreduce --dtype float16 --symmetric-memory --autotune
 
 - template: stop.yml
   parameters:
diff --git a/.azure-pipelines/templates/rccl-test.yml b/.azure-pipelines/templates/rccl-test.yml
@@ -62,8 +62,8 @@ steps:
     name: PyBench
     displayName: Run collective benchmarks
     remoteScript: |
-      mpirun --allow-run-as-root -x GPU_MAX_HW_QUEUES=8 -np 8 python3 ./python/mscclpp_benchmark/bench_collective.py --collective allreduce --dtype float8_e4m3b15 --accum-type float32 --autotune
-      mpirun --allow-run-as-root -x GPU_MAX_HW_QUEUES=8 -np 8 python3 ./python/mscclpp_benchmark/bench_collective.py --collective allgather --dtype float8_e4m3b15 --autotune
+      mpirun --allow-run-as-root -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m mscclpp_benchmark.bench_collective --collective allreduce --dtype float8_e4m3b15 --accum-type float32 --autotune
+      mpirun --allow-run-as-root -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m mscclpp_benchmark.bench_collective --collective allgather --dtype float8_e4m3b15 --autotune
 
 - template: stop.yml
   parameters:
diff --git a/docs/quickstart.md b/docs/quickstart.md
@@ -218,36 +218,30 @@ $ mpirun -np 16 -npernode 8 -hostfile hostfile ./bin/mp_unit_tests -ip_port 10.0
 # Replace `cuda12` with your platform: cuda11, cuda12, cuda13, or rocm6.
 $ python3 -m pip install ".[cuda12,benchmark,test]"
 
-# Run a benchmark.
-$ PYTHONPATH=$PWD/python mpirun -np 8 --allow-run-as-root \
-    python3 ./python/mscclpp_benchmark/bench_collective.py \
-    --collective allreduce --dtype float16
 ```
 
 To autotune launch parameters and save a tuned config:
 
 ```bash
 $ PYTHONPATH=$PWD/python mpirun -np 8 --allow-run-as-root \
-    python3 -m mscclpp_benchmark.tuner \
+    python3 -m mscclpp_benchmark.bench_collective \
     --collective allreduce \
-    --dim 5120 \
     --dtype float16 \
-    --scale 8 \
     --batch-sizes 1,2,4,8 \
-    --output /tmp/mscclpp_tuned_configs.json
+    --autotune \
+    --write-config /tmp/mscclpp_tuned_configs.json
 ```
 
 Use the tuned config in a benchmark:
 
 ```bash
 $ PYTHONPATH=$PWD/python mpirun -np 8 --allow-run-as-root \
-    python3 ./python/mscclpp_benchmark/bench_collective.py \
+    python3 -m mscclpp_benchmark.bench_collective \
     --collective allreduce \
     --dtype float16 \
     --config-path /tmp/mscclpp_tuned_configs.json
 ```
 
-
 (nccl-benchmark)=
 ### NCCL/RCCL Benchmark over MSCCL++
 
diff --git a/python/mscclpp_benchmark/tuner.py b/python/mscclpp_benchmark/tuner.py
@@ -3,9 +3,7 @@
 
 from __future__ import annotations
 
-import argparse
-from pathlib import Path
-from typing import Any, Callable, Iterable, Sequence
+from typing import Any, Callable, Iterable
 
 from mscclpp_benchmark.tuning_config import TunedConfig
 
@@ -84,109 +82,3 @@ def tune(self, case: Any) -> TunedConfig | None:
         if best_config is None:
             return self.comm.resolve_config(case)
         return best_config
-
-
-def _normalize_name(name: str | None) -> str:
-    if not name:
-        return "native"
-    return name.strip().lower().replace("-", "_")
-
-
-def _build_parser() -> argparse.ArgumentParser:
-    parser = argparse.ArgumentParser(description="Generate offline MSCCL++ tuned configs")
-    parser.add_argument("--collective", choices=("allreduce", "allgather"), default="allreduce")
-    parser.add_argument("--dim", type=int, required=True)
-    parser.add_argument("--dtype", required=True)
-    parser.add_argument("--accum-type")
-    parser.add_argument("--sku", default="runtime", help="Used only for the default output filename")
-    parser.add_argument("--scale", type=int, help="Expected MPI world size")
-    parser.add_argument("--batch-sizes")
-    parser.add_argument("--output")
-    parser.add_argument("--scratch-buffer-size", type=int, default=1 << 27)
-    parser.add_argument("--warmup", type=int, default=5, help="Warmup graph replays during tuning")
-    parser.add_argument("--graph-launches", type=int, default=10, help="Timed graph replays during tuning")
-    parser.add_argument(
-        "--ops-per-graph", type=int, default=100, help="Collective ops captured per graph during tuning"
-    )
-    parser.add_argument("--candidate-nblocks")
-    parser.add_argument("--candidate-nthreads")
-    parser.add_argument("--symmetric-memory", action="store_true")
-    parser.add_argument("--skip-correctness", action="store_true")
-    return parser
-
-
-def _default_output_path(args: argparse.Namespace) -> str:
-    accum = _normalize_name(args.accum_type)
-    return (
-        "mscclpp_tuned_"
-        f"{_normalize_name(args.collective)}_"
-        f"{_normalize_name(args.sku)}_"
-        f"s{args.scale or 'runtime'}_"
-        f"d{args.dim}_"
-        f"dtype_{_normalize_name(args.dtype)}_"
-        f"accum_{accum}.json"
-    )
-
-
-def _bench_collective_args(args: argparse.Namespace) -> list[str]:
-    output = args.output or _default_output_path(args)
-    bench_args = [
-        "--collective",
-        args.collective,
-        "--d-model",
-        str(args.dim),
-        "--dtype",
-        args.dtype,
-        "--autotune",
-        "--write-config",
-        output,
-        "--scratch-buffer-size",
-        str(args.scratch_buffer_size),
-        "--tune-warmup",
-        str(args.warmup),
-        "--tune-graph-launches",
-        str(args.graph_launches),
-        "--tune-iterations",
-        str(args.ops_per_graph),
-        "--warmup",
-        "0",
-        "--graph-launches",
-        "1",
-        "--iterations",
-        "1",
-    ]
-    if args.batch_sizes:
-        bench_args += ["--batch-sizes", args.batch_sizes]
-    if args.accum_type:
-        bench_args += ["--accum-type", args.accum_type]
-    if args.candidate_nblocks:
-        bench_args += ["--candidate-nblocks", args.candidate_nblocks]
-    if args.candidate_nthreads:
-        bench_args += ["--candidate-nthreads", args.candidate_nthreads]
-    if args.symmetric_memory:
-        bench_args.append("--symmetric-memory")
-    if args.skip_correctness:
-        bench_args.append("--skip-correctness")
-    return bench_args
-
-
-def main(argv: Sequence[str] | None = None) -> None:
-    parser = _build_parser()
-    args = parser.parse_args(argv)
-
-    if args.scale is not None:
-        from mpi4py import MPI
-
-        world_size = MPI.COMM_WORLD.Get_size()
-        if world_size != args.scale:
-            raise ValueError(f"MSCCL++ tuning scale mismatch: expected MPI world size {args.scale}, got {world_size}")
-
-    from mscclpp_benchmark.bench_collective import main as bench_collective_main
-
-    bench_collective_main(_bench_collective_args(args))
-    if args.output is None:
-        print(f"Wrote tuned config to {Path(_default_output_path(args)).resolve()}", flush=True)
-
-
-if __name__ == "__main__":
-    main()