Skip to content

Commit 493e3b3

Browse files
committed
WIP
1 parent c8a49fa commit 493e3b3

4 files changed

Lines changed: 9 additions & 123 deletions

File tree

.azure-pipelines/templates/nccl-test.yml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -79,8 +79,8 @@ steps:
7979
name: PyBench
8080
displayName: Run collective benchmarks
8181
remoteScript: |
82-
mpirun --allow-run-as-root -x GPU_MAX_HW_QUEUES=8 -np 8 python3 ./python/mscclpp_benchmark/bench_collective.py --collective allreduce --dtype float8_e4m3b15 --accum-type float32 --autotune
83-
mpirun --allow-run-as-root -x GPU_MAX_HW_QUEUES=8 -np 8 python3 ./python/mscclpp_benchmark/bench_collective.py --collective allreduce --dtype float16 --symmetric-memory --autotune
82+
mpirun --allow-run-as-root -np 8 python3 -m mscclpp_benchmark.bench_collective --collective allreduce --dtype float8_e4m3b15 --accum-type float32 --autotune
83+
mpirun --allow-run-as-root -np 8 python3 -m mscclpp_benchmark.bench_collective --collective allreduce --dtype float16 --symmetric-memory --autotune
8484
8585
- template: stop.yml
8686
parameters:

.azure-pipelines/templates/rccl-test.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,8 @@ steps:
6262
name: PyBench
6363
displayName: Run collective benchmarks
6464
remoteScript: |
65-
mpirun --allow-run-as-root -x GPU_MAX_HW_QUEUES=8 -np 8 python3 ./python/mscclpp_benchmark/bench_collective.py --collective allreduce --dtype float8_e4m3b15 --accum-type float32 --autotune
66-
mpirun --allow-run-as-root -x GPU_MAX_HW_QUEUES=8 -np 8 python3 ./python/mscclpp_benchmark/bench_collective.py --collective allgather --dtype float8_e4m3b15 --autotune
65+
mpirun --allow-run-as-root -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m mscclpp_benchmark.bench_collective --collective allreduce --dtype float8_e4m3b15 --accum-type float32 --autotune
66+
mpirun --allow-run-as-root -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m mscclpp_benchmark.bench_collective --collective allgather --dtype float8_e4m3b15 --autotune
6767
6868
- template: stop.yml
6969
parameters:

docs/quickstart.md

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -218,36 +218,30 @@ $ mpirun -np 16 -npernode 8 -hostfile hostfile ./bin/mp_unit_tests -ip_port 10.0
218218
# Replace `cuda12` with your platform: cuda11, cuda12, cuda13, or rocm6.
219219
$ python3 -m pip install ".[cuda12,benchmark,test]"
220220
221-
# Run a benchmark.
222-
$ PYTHONPATH=$PWD/python mpirun -np 8 --allow-run-as-root \
223-
python3 ./python/mscclpp_benchmark/bench_collective.py \
224-
--collective allreduce --dtype float16
225221
```
226222
227223
To autotune launch parameters and save a tuned config:
228224
229225
```bash
230226
$ PYTHONPATH=$PWD/python mpirun -np 8 --allow-run-as-root \
231-
python3 -m mscclpp_benchmark.tuner \
227+
python3 -m mscclpp_benchmark.bench_collective \
232228
--collective allreduce \
233-
--dim 5120 \
234229
--dtype float16 \
235-
--scale 8 \
236230
--batch-sizes 1,2,4,8 \
237-
--output /tmp/mscclpp_tuned_configs.json
231+
--autotune \
232+
--write-config /tmp/mscclpp_tuned_configs.json
238233
```
239234
240235
Use the tuned config in a benchmark:
241236
242237
```bash
243238
$ PYTHONPATH=$PWD/python mpirun -np 8 --allow-run-as-root \
244-
python3 ./python/mscclpp_benchmark/bench_collective.py \
239+
python3 -m mscclpp_benchmark.bench_collective \
245240
--collective allreduce \
246241
--dtype float16 \
247242
--config-path /tmp/mscclpp_tuned_configs.json
248243
```
249244
250-
251245
(nccl-benchmark)=
252246
### NCCL/RCCL Benchmark over MSCCL++
253247

python/mscclpp_benchmark/tuner.py

Lines changed: 1 addition & 109 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,7 @@
33

44
from __future__ import annotations
55

6-
import argparse
7-
from pathlib import Path
8-
from typing import Any, Callable, Iterable, Sequence
6+
from typing import Any, Callable, Iterable
97

108
from mscclpp_benchmark.tuning_config import TunedConfig
119

@@ -84,109 +82,3 @@ def tune(self, case: Any) -> TunedConfig | None:
8482
if best_config is None:
8583
return self.comm.resolve_config(case)
8684
return best_config
87-
88-
89-
def _normalize_name(name: str | None) -> str:
90-
if not name:
91-
return "native"
92-
return name.strip().lower().replace("-", "_")
93-
94-
95-
def _build_parser() -> argparse.ArgumentParser:
96-
parser = argparse.ArgumentParser(description="Generate offline MSCCL++ tuned configs")
97-
parser.add_argument("--collective", choices=("allreduce", "allgather"), default="allreduce")
98-
parser.add_argument("--dim", type=int, required=True)
99-
parser.add_argument("--dtype", required=True)
100-
parser.add_argument("--accum-type")
101-
parser.add_argument("--sku", default="runtime", help="Used only for the default output filename")
102-
parser.add_argument("--scale", type=int, help="Expected MPI world size")
103-
parser.add_argument("--batch-sizes")
104-
parser.add_argument("--output")
105-
parser.add_argument("--scratch-buffer-size", type=int, default=1 << 27)
106-
parser.add_argument("--warmup", type=int, default=5, help="Warmup graph replays during tuning")
107-
parser.add_argument("--graph-launches", type=int, default=10, help="Timed graph replays during tuning")
108-
parser.add_argument(
109-
"--ops-per-graph", type=int, default=100, help="Collective ops captured per graph during tuning"
110-
)
111-
parser.add_argument("--candidate-nblocks")
112-
parser.add_argument("--candidate-nthreads")
113-
parser.add_argument("--symmetric-memory", action="store_true")
114-
parser.add_argument("--skip-correctness", action="store_true")
115-
return parser
116-
117-
118-
def _default_output_path(args: argparse.Namespace) -> str:
119-
accum = _normalize_name(args.accum_type)
120-
return (
121-
"mscclpp_tuned_"
122-
f"{_normalize_name(args.collective)}_"
123-
f"{_normalize_name(args.sku)}_"
124-
f"s{args.scale or 'runtime'}_"
125-
f"d{args.dim}_"
126-
f"dtype_{_normalize_name(args.dtype)}_"
127-
f"accum_{accum}.json"
128-
)
129-
130-
131-
def _bench_collective_args(args: argparse.Namespace) -> list[str]:
132-
output = args.output or _default_output_path(args)
133-
bench_args = [
134-
"--collective",
135-
args.collective,
136-
"--d-model",
137-
str(args.dim),
138-
"--dtype",
139-
args.dtype,
140-
"--autotune",
141-
"--write-config",
142-
output,
143-
"--scratch-buffer-size",
144-
str(args.scratch_buffer_size),
145-
"--tune-warmup",
146-
str(args.warmup),
147-
"--tune-graph-launches",
148-
str(args.graph_launches),
149-
"--tune-iterations",
150-
str(args.ops_per_graph),
151-
"--warmup",
152-
"0",
153-
"--graph-launches",
154-
"1",
155-
"--iterations",
156-
"1",
157-
]
158-
if args.batch_sizes:
159-
bench_args += ["--batch-sizes", args.batch_sizes]
160-
if args.accum_type:
161-
bench_args += ["--accum-type", args.accum_type]
162-
if args.candidate_nblocks:
163-
bench_args += ["--candidate-nblocks", args.candidate_nblocks]
164-
if args.candidate_nthreads:
165-
bench_args += ["--candidate-nthreads", args.candidate_nthreads]
166-
if args.symmetric_memory:
167-
bench_args.append("--symmetric-memory")
168-
if args.skip_correctness:
169-
bench_args.append("--skip-correctness")
170-
return bench_args
171-
172-
173-
def main(argv: Sequence[str] | None = None) -> None:
174-
parser = _build_parser()
175-
args = parser.parse_args(argv)
176-
177-
if args.scale is not None:
178-
from mpi4py import MPI
179-
180-
world_size = MPI.COMM_WORLD.Get_size()
181-
if world_size != args.scale:
182-
raise ValueError(f"MSCCL++ tuning scale mismatch: expected MPI world size {args.scale}, got {world_size}")
183-
184-
from mscclpp_benchmark.bench_collective import main as bench_collective_main
185-
186-
bench_collective_main(_bench_collective_args(args))
187-
if args.output is None:
188-
print(f"Wrote tuned config to {Path(_default_output_path(args)).resolve()}", flush=True)
189-
190-
191-
if __name__ == "__main__":
192-
main()

0 commit comments

Comments
 (0)