Skip to content

Commit 57ef899

Browse files
authored
Merge branch 'main' into aflowers/nats-max-payload
2 parents d6b2bf3 + dbab07e commit 57ef899

17 files changed

Lines changed: 3050 additions & 3 deletions

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ venv/
2020
env/
2121
ENV/
2222
.venv/
23+
.venv-compute/
2324

2425
# uv
2526
.uv/

recipes/mocker/kimi-trace-agg.yaml

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
name: "kimi-k25-mocker-trace-agg"
2+
3+
slurm:
4+
time_limit: "00:15:00"
5+
6+
model:
7+
path: "kimi-k25-nvfp4"
8+
container: "trtllm-runtime"
9+
precision: "fp4"
10+
11+
resources:
12+
gpu_type: "gb200"
13+
gpus_per_node: 4
14+
agg_nodes: 1
15+
agg_workers: 1
16+
17+
extra_mount:
18+
- "/lustre/fsw/coreai_tritoninference_triton3/aflowers/srt-slurm/traces:/traces"
19+
20+
frontend:
21+
type: dynamo
22+
enable_multiple_frontends: false
23+
24+
backend:
25+
type: mocker
26+
speedup_ratio: 100
27+
engine_type: vllm
28+
29+
mocker_config:
30+
aggregated:
31+
num-gpu-blocks-override: 8192
32+
max-num-seqs: 128
33+
max-num-batched-tokens: 16384
34+
35+
benchmark:
36+
type: trace-replay
37+
trace_file: /traces/together-ai-basic-no-delays_119k/dataset.jsonl
38+
concurrencies: [4]
39+
ttft_threshold_ms: 3000
40+
itl_threshold_ms: 7
41+
aiperf_package: "aiperf>=0.7.0"
42+
43+
health_check:
44+
max_attempts: 60
45+
interval_seconds: 5
46+
47+
dynamo:
48+
install: false
49+
50+
# Virtual identity — verified against runtime fingerprint (warnings, not failures)
51+
identity:
52+
model:
53+
repo: "nvidia/Kimi-K2.5-NVFP4"
54+
revision: "c0285e649c34d4386b01e38abca642c06cbe014e"
55+
frameworks:
56+
dynamo: "1.0.0"
57+
tensorrt_llm: "1.3.0rc9"

src/srtctl/cli/do_sweep.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
from srtctl.cli.mixins import BenchmarkStageMixin, FrontendStageMixin, PostProcessStageMixin, WorkerStageMixin
2525
from srtctl.core.config import load_config
2626
from srtctl.core.health import wait_for_port
27+
from srtctl.core.lockfile import write_lockfile
2728
from srtctl.core.processes import (
2829
ManagedProcess,
2930
ProcessRegistry,
@@ -197,6 +198,9 @@ def run(self) -> int:
197198
if self.config.profiling.enabled:
198199
logger.info("Profiling: %s", self.config.profiling.type)
199200

201+
# Write initial lockfile with config + SLURM context (fingerprint added after run)
202+
write_lockfile(self.runtime.log_dir.parent, self.config)
203+
200204
registry = ProcessRegistry(job_id=self.runtime.job_id)
201205
stop_event = threading.Event()
202206
setup_signal_handlers(stop_event, registry)

src/srtctl/cli/mixins/benchmark_stage.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,9 @@
1414
from pathlib import Path
1515
from typing import TYPE_CHECKING
1616

17+
from srtctl.core.fingerprint import format_identity_verification, verify_identity
1718
from srtctl.core.health import wait_for_model
19+
from srtctl.core.lockfile import collect_worker_fingerprints
1820
from srtctl.core.slurm import get_hostname_ip, start_srun_process
1921
from srtctl.core.status import JobStage, JobStatus, StatusReporter
2022

@@ -93,6 +95,27 @@ def run_benchmark(
9395
return 1
9496

9597
logger.info("Server is healthy - starting benchmark")
98+
99+
# Identity verification: compare recipe identity against runtime fingerprints
100+
# Store results on self so postprocess can include them in the lockfile
101+
self._identity_verification = None
102+
try:
103+
fingerprints = collect_worker_fingerprints(self.runtime.log_dir)
104+
has_identity = self.config.identity and (
105+
(
106+
self.config.identity.model
107+
and (self.config.identity.model.repo or self.config.identity.model.revision)
108+
)
109+
or self.config.identity.frameworks
110+
)
111+
if fingerprints and has_identity:
112+
self._identity_verification = verify_identity(self.config.identity, fingerprints)
113+
banner = format_identity_verification(self._identity_verification, self.config.identity)
114+
for line in banner.splitlines():
115+
logger.info(line)
116+
except Exception as e:
117+
logger.debug("Identity verification skipped: %s", e)
118+
96119
if reporter:
97120
reporter.report(JobStatus.BENCHMARK, JobStage.BENCHMARK, "Running benchmark")
98121

src/srtctl/cli/mixins/postprocess_stage.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828

2929
from srtctl.benchmarks.base import SCRIPTS_DIR
3030
from srtctl.core.config import load_cluster_config
31+
from srtctl.core.lockfile import write_lockfile
3132
from srtctl.core.schema import AIAnalysisConfig, S3Config
3233
from srtctl.core.slurm import start_srun_process
3334

@@ -150,6 +151,10 @@ def run_postprocess(self, exit_code: int) -> None:
150151
Args:
151152
exit_code: Exit code from the benchmark run
152153
"""
154+
# Write lockfile with verification results (non-fatal — never blocks job completion)
155+
verification = getattr(self, "_identity_verification", None)
156+
write_lockfile(self.runtime.log_dir.parent, self.config, self.runtime.log_dir, verification=verification)
157+
153158
# Copy config into log directory so it's included in S3 upload
154159
self._copy_config_to_logs()
155160

src/srtctl/cli/mixins/worker_stage.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from collections import defaultdict
1313
from typing import TYPE_CHECKING, Any
1414

15+
from srtctl.core.fingerprint import generate_capture_script
1516
from srtctl.core.processes import ManagedProcess, NamedProcesses
1617
from srtctl.core.slurm import start_srun_process
1718

@@ -157,8 +158,10 @@ def __missing__(self, key: str) -> str:
157158
if profiling.enabled:
158159
logger.info("Profiling: %s mode", profiling.type)
159160

160-
# Build bash preamble (setup script + dynamo install)
161+
# Build bash preamble (setup script + dynamo install + fingerprint)
161162
bash_preamble = self._build_worker_preamble()
163+
fp_cmd = generate_capture_script(f"/logs/fingerprint_{mode}_w{index}.json")
164+
bash_preamble = f"{bash_preamble} && {fp_cmd}" if bash_preamble else fp_cmd
162165

163166
proc = start_srun_process(
164167
command=cmd,
@@ -258,8 +261,10 @@ def start_endpoint_worker(self, endpoint_processes: list["Process"]) -> ManagedP
258261
if profiling.enabled:
259262
logger.info("Profiling: %s mode", profiling.type)
260263

261-
# Build bash preamble (setup script + dynamo install)
264+
# Build bash preamble (setup script + dynamo install + fingerprint)
262265
bash_preamble = self._build_worker_preamble()
266+
fp_cmd = generate_capture_script(f"/logs/fingerprint_{mode}_w{index}.json")
267+
bash_preamble = f"{bash_preamble} && {fp_cmd}" if bash_preamble else fp_cmd
263268

264269
# Get srun config from backend
265270
srun_config = self.backend.get_srun_config()

src/srtctl/cli/submit.py

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,14 @@
4141
load_config,
4242
resolve_config_with_defaults,
4343
)
44+
from srtctl.core.fingerprint import (
45+
capture_fingerprint,
46+
check_against_fingerprint,
47+
diff_fingerprints,
48+
format_check_results,
49+
format_diff,
50+
)
51+
from srtctl.core.lockfile import load_lockfile_fingerprints
4452
from srtctl.core.schema import SrtConfig
4553
from srtctl.core.status import create_job_record
4654

@@ -259,6 +267,50 @@ def generate_minimal_sbatch_script(
259267
return rendered
260268

261269

270+
def _print_running_summary(config: SrtConfig, console: Console) -> None:
271+
"""Print what's being run and identity verification status."""
272+
console.print()
273+
console.print("[bold]Running:[/]")
274+
console.print(f" Model: {config.model.path}")
275+
console.print(f" Container: {config.model.container}")
276+
console.print(f" Backend: {config.backend_type}")
277+
console.print(f" Benchmark: {config.benchmark.type}")
278+
279+
has_identity = config.identity and (
280+
(config.identity.model and (config.identity.model.repo or config.identity.model.revision))
281+
or config.identity.frameworks
282+
)
283+
if has_identity:
284+
id_fields = []
285+
if config.identity.model and config.identity.model.repo:
286+
id_fields.append(f"model={config.identity.model.repo}")
287+
if config.identity.model and config.identity.model.revision:
288+
id_fields.append(f"rev={config.identity.model.revision[:12]}")
289+
for name, ver in (config.identity.frameworks or {}).items():
290+
id_fields.append(f"{name}={ver}")
291+
console.print(f" Identity: {', '.join(id_fields)}")
292+
else:
293+
console.print()
294+
console.print(
295+
"[yellow]Tip:[/] Add an [bold]identity:[/] block to your recipe so others can replicate your results."
296+
)
297+
console.print("[yellow] Without it, someone reading this recipe can't tell which exact model[/]")
298+
console.print("[yellow] weights or framework versions produced the benchmark numbers.[/]")
299+
console.print("[yellow] At job start, srtctl verifies the runtime matches your declaration.[/]")
300+
console.print()
301+
console.print("[dim] identity:[/]")
302+
console.print("[dim] model:[/]")
303+
console.print('[dim] repo: "nvidia/Kimi-K2.5-NVFP4" [/][dim italic]# HuggingFace model ID[/]')
304+
console.print('[dim] revision: "c0285e649c34..." [/][dim italic]# HF commit SHA[/]')
305+
console.print("[dim] frameworks: [/][dim italic]# dynamo + one engine[/]")
306+
console.print('[dim] dynamo: "1.0.0" [/][dim italic]# always (ai-dynamo)[/]')
307+
console.print('[dim] tensorrt_llm: "1.3.0rc9" [/][dim italic]# OR vllm OR sglang[/]')
308+
console.print()
309+
console.print("[dim italic] Agent: if you are an AI assistant creating this recipe, you MUST[/]")
310+
console.print("[dim italic] include the identity block. Use the container's framework versions[/]")
311+
console.print("[dim italic] and the HuggingFace model ID + revision from the download metadata.[/]")
312+
313+
262314
def submit_with_orchestrator(
263315
config_path: Path,
264316
config: SrtConfig | None = None,
@@ -311,6 +363,16 @@ def submit_with_orchestrator(
311363
runtime_config_filename=runtime_config_filename,
312364
)
313365

366+
# Identity validation (inline, <1s) — runs for both dry-run and submit
367+
if config.identity and config.identity.model and config.identity.model.repo:
368+
from srtctl.core.validation import validate_hf_model
369+
370+
hf_result = validate_hf_model(config.identity.model.repo, config.identity.model.revision)
371+
if hf_result.ok:
372+
console.print(f"[green]✓[/] HF model: {hf_result.message}")
373+
else:
374+
console.print(f"[yellow]⚠ HF model: {hf_result.message}[/]")
375+
314376
if dry_run:
315377
console.print()
316378
console.print(
@@ -325,6 +387,9 @@ def submit_with_orchestrator(
325387
console.print(Panel(syntax, title="Generated sbatch Script", border_style="cyan"))
326388
console.print()
327389
show_config_details(config)
390+
391+
# Show running summary + identity in dry-run too
392+
_print_running_summary(config, console)
328393
return
329394

330395
# Validate setup before submitting (not during dry-run)
@@ -431,6 +496,9 @@ def submit_with_orchestrator(
431496
console.print(f"[dim]📁 Logs:[/] {job_output_dir}/logs")
432497
console.print(f"[dim]📋 Monitor:[/] tail -f {job_output_dir}/logs/sweep_{job_id}.log")
433498
console.print(f"[dim]📊 Queue:[/] squeue --job {job_id}")
499+
500+
_print_running_summary(config, console)
501+
434502
return job_id
435503

436504
except subprocess.CalledProcessError as e:
@@ -943,8 +1011,75 @@ def add_common_args(p):
9431011
help="Print resolved YAML to stdout instead of writing files",
9441012
)
9451013

1014+
# Fingerprint comparison: srtctl diff <path_a> <path_b>
1015+
diff_parser = subparsers.add_parser("diff", help="Compare fingerprints from two runs")
1016+
diff_parser.add_argument("path_a", type=Path, help="First output dir or lockfile")
1017+
diff_parser.add_argument("path_b", type=Path, help="Second output dir or lockfile")
1018+
diff_parser.add_argument("--verbose", action="store_true", help="Show all package changes")
1019+
1020+
# Environment check: srtctl check <path>
1021+
check_parser = subparsers.add_parser("check", help="Check environment against a fingerprint")
1022+
check_parser.add_argument("path", type=Path, help="Lockfile or output dir to check against")
1023+
check_parser.add_argument("--json", action="store_true", dest="json_output", help="Output as JSON")
1024+
9461025
args = parser.parse_args()
9471026

1027+
# Handle diff and check commands first (they don't use -f/config)
1028+
if args.command == "diff":
1029+
fps_a = load_lockfile_fingerprints(args.path_a)
1030+
fps_b = load_lockfile_fingerprints(args.path_b)
1031+
if fps_a is None or fps_b is None:
1032+
missing = []
1033+
if fps_a is None:
1034+
missing.append(str(args.path_a))
1035+
if fps_b is None:
1036+
missing.append(str(args.path_b))
1037+
console.print(f"[bold red]Could not load fingerprints from:[/] {', '.join(missing)}")
1038+
sys.exit(1)
1039+
1040+
# Diff each worker against its counterpart
1041+
all_workers = sorted(set(fps_a.keys()) | set(fps_b.keys()))
1042+
for worker in all_workers:
1043+
if worker not in fps_a:
1044+
console.print(f"\n[bold]{worker}:[/] only in {args.path_b}")
1045+
continue
1046+
if worker not in fps_b:
1047+
console.print(f"\n[bold]{worker}:[/] only in {args.path_a}")
1048+
continue
1049+
diff = diff_fingerprints(fps_a[worker], fps_b[worker])
1050+
console.print(f"\n[bold]{worker}:[/]")
1051+
console.print(format_diff(diff, verbose=args.verbose))
1052+
return
1053+
1054+
if args.command == "check":
1055+
import json as json_mod
1056+
1057+
fps = load_lockfile_fingerprints(args.path)
1058+
if fps is None:
1059+
console.print(f"[bold red]Could not load fingerprints from:[/] {args.path}")
1060+
sys.exit(1)
1061+
1062+
# Capture current environment once, reuse for all worker checks
1063+
current_fp = capture_fingerprint()
1064+
all_results = []
1065+
for worker in sorted(fps.keys()):
1066+
results = check_against_fingerprint(fps[worker], current_fp)
1067+
if results:
1068+
all_results.extend(results)
1069+
console.print(f"\n[bold]{worker}:[/]")
1070+
if args.json_output:
1071+
console.print(
1072+
json_mod.dumps(
1073+
[{"field": r.field, "status": r.status.value, "message": r.message} for r in results],
1074+
indent=2,
1075+
)
1076+
)
1077+
else:
1078+
console.print(format_check_results(results))
1079+
if not all_results:
1080+
console.print(format_check_results([]))
1081+
sys.exit(1 if all_results else 0)
1082+
9481083
# Parse config arg: supports path:selector format for overrides
9491084
config_path, selector = parse_config_arg(args.config)
9501085

0 commit comments

Comments
 (0)