Skip to content

Commit e3df22f

Browse files
committed
refactor: replace torch_version with frameworks dict, drop unverifiable container identity
- Removed container_identity() probe — Pyxis/enroot stores zero provenance metadata inside containers (confirmed by inspection on ptyche GB200) - Removed torch_version as a standalone field — torch version is now captured inside the frameworks dict alongside vllm, sglang, tensorrt_llm, dynamo - frameworks dict only includes detected frameworks (sparse) - Added model_identity() probe for HF repo/revision from download metadata - Updated pip freeze to use python3 -m pip freeze for better container compat - Updated all tests to use new schema
1 parent b135e7c commit e3df22f

2 files changed

Lines changed: 51 additions & 59 deletions

File tree

src/srtctl/core/fingerprint.py

Lines changed: 27 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -117,8 +117,11 @@ class FingerprintDiff:
117117
# Core versions
118118
"python_version",
119119
"cuda_version",
120-
"torch_version",
121120
"nccl_version",
121+
# Frameworks (vllm, sglang, trtllm, dynamo, torch)
122+
"frameworks",
123+
# Model identity (HF repo, revision)
124+
"model",
122125
# Full package list (always last)
123126
"pip_packages",
124127
]
@@ -234,14 +237,6 @@ def probe_cuda_version() -> ProbeResult:
234237
return ProbeResult.failure("nvcc not found")
235238

236239

237-
def probe_torch_version() -> ProbeResult:
238-
"""Get PyTorch version."""
239-
out = _run_cmd('python3 -c "import torch; print(torch.__version__)"')
240-
if out:
241-
return ProbeResult.success(out)
242-
return ProbeResult.failure("torch not importable")
243-
244-
245240
def probe_nccl_version() -> ProbeResult:
246241
"""Get NCCL version via PyTorch."""
247242
out = _run_cmd('python3 -c "import torch; print(torch.cuda.nccl.version())"')
@@ -250,9 +245,25 @@ def probe_nccl_version() -> ProbeResult:
250245
return ProbeResult.failure("nccl version unavailable")
251246

252247

248+
def probe_frameworks() -> ProbeResult:
249+
"""Get versions of inference frameworks (only detected ones)."""
250+
versions: dict[str, str] = {}
251+
for name, cmd in [
252+
("vllm", 'python3 -c "import vllm; print(vllm.__version__)"'),
253+
("sglang", 'python3 -c "import sglang; print(sglang.__version__)"'),
254+
("tensorrt_llm", 'python3 -c "import tensorrt_llm; print(tensorrt_llm.__version__)"'),
255+
("dynamo", "python3 -c \"import importlib.metadata; print(importlib.metadata.version('ai-dynamo'))\""),
256+
("torch", 'python3 -c "import torch; print(torch.__version__)"'),
257+
]:
258+
v = _run_cmd(cmd)
259+
if v:
260+
versions[name] = v
261+
return ProbeResult.success(versions)
262+
263+
253264
def probe_pip_packages() -> ProbeResult:
254265
"""Get installed pip packages, sorted alphabetically (case-insensitive)."""
255-
out = _run_cmd("pip freeze")
266+
out = _run_cmd("python3 -m pip freeze 2>/dev/null") or _run_cmd("pip freeze")
256267
if out is None:
257268
return ProbeResult.failure("pip freeze failed")
258269

@@ -276,8 +287,8 @@ def probe_pip_packages() -> ProbeResult:
276287
"gpu": probe_gpu,
277288
"python_version": probe_python_version,
278289
"cuda_version": probe_cuda_version,
279-
"torch_version": probe_torch_version,
280290
"nccl_version": probe_nccl_version,
291+
"frameworks": probe_frameworks,
281292
"pip_packages": probe_pip_packages,
282293
}
283294

@@ -348,7 +359,6 @@ def load_fingerprint(path: Path) -> dict[str, Any] | None:
348359
"os",
349360
"python_version",
350361
"cuda_version",
351-
"torch_version",
352362
"nccl_version",
353363
]
354364

@@ -605,33 +615,17 @@ def framework_versions():
605615
('vllm', 'python3 -c "import vllm; print(vllm.__version__)"'),
606616
('sglang', 'python3 -c "import sglang; print(sglang.__version__)"'),
607617
('tensorrt_llm', 'python3 -c "import tensorrt_llm; print(tensorrt_llm.__version__)"'),
608-
('dynamo', 'python3 -c "import dynamo; print(dynamo.__version__)"'),
618+
('dynamo', 'python3 -c "import importlib.metadata; print(importlib.metadata.version(\\\"ai-dynamo\\\"))"'),
609619
]:
610620
v = run(cmd)
611621
if v:
612622
versions[name] = v
623+
# torch embeds a git hash in the version string (e.g. 2.10.0a0+b4e4ee81d3.nv25.12)
624+
torch_v = run('python3 -c "import torch; print(torch.__version__)"')
625+
if torch_v:
626+
versions['torch'] = torch_v
613627
return versions
614628
615-
def container_identity():
616-
info = {}
617-
# Pyxis/enroot: /etc/enroot stores import metadata
618-
for p in ['/etc/enroot/image.env', '/etc/enroot/.env']:
619-
if Path(p).exists():
620-
for line in Path(p).read_text().splitlines():
621-
if '=' in line:
622-
k, _, v = line.partition('=')
623-
info[k.strip()] = v.strip().strip('"')
624-
break
625-
# Docker: inspect labels via /proc/1/cpuset or /.dockerenv
626-
if not info and Path('/.dockerenv').exists():
627-
info['runtime'] = 'docker'
628-
# OCI image digest from /etc/enroot/image-digest if available
629-
for p in ['/etc/enroot/image-digest', '/etc/enroot/.image-digest']:
630-
if Path(p).exists():
631-
info['digest'] = Path(p).read_text().strip()
632-
break
633-
return info or None
634-
635629
def model_identity(model_path):
636630
info = {}
637631
mp = Path(model_path) if model_path else None
@@ -672,10 +666,8 @@ def model_identity(model_path):
672666
'gpu': gpu_info(),
673667
'python_version': platform.python_version(),
674668
'cuda_version': run('nvcc --version 2>/dev/null | grep release') or 'unavailable',
675-
'torch_version': run('python3 -c "import torch; print(torch.__version__)"') or 'unavailable',
676669
'nccl_version': run('python3 -c "import torch; print(torch.cuda.nccl.version())"') or 'unavailable',
677670
'frameworks': framework_versions(),
678-
'container': container_identity(),
679671
'model': model_identity('/model'),
680672
'pip_packages': pip_pkgs(),
681673
}

tests/test_fingerprint.py

Lines changed: 24 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,8 @@ def _make_fingerprint(**overrides) -> dict:
5252
"gpu": {"available": True, "driver": "570.86.15", "gpus": [{"name": "GB200", "driver": "570.86.15", "memory": "192 GiB"}]},
5353
"python_version": "3.11.9",
5454
"cuda_version": "12.8",
55-
"torch_version": "2.6.0+cu128",
5655
"nccl_version": "2.25.1",
56+
"frameworks": {"torch": "2.6.0+cu128", "sglang": "0.4.6.post1", "dynamo": "0.8.1"},
5757
"pip_packages": [
5858
"ai-dynamo==0.8.1",
5959
"numpy==1.26.4",
@@ -85,8 +85,8 @@ def test_field_order_is_canonical(self):
8585
"gpu": {},
8686
"os": "Ubuntu",
8787
"cuda_version": "12.8",
88-
"torch_version": "2.6.0",
8988
"nccl_version": "2.25",
89+
"frameworks": {"torch": "2.6.0"},
9090
}
9191
ordered = _ordered_fingerprint(data)
9292
keys = list(ordered.keys())
@@ -157,8 +157,8 @@ def test_capture_with_all_probes_mocked(self):
157157
"gpu": lambda: ProbeResult.success({"available": False}),
158158
"python_version": lambda: ProbeResult.success("3.11.9"),
159159
"cuda_version": lambda: ProbeResult.success("12.8"),
160-
"torch_version": lambda: ProbeResult.success("2.6.0"),
161160
"nccl_version": lambda: ProbeResult.success("2.25.1"),
161+
"frameworks": lambda: ProbeResult.success({"torch": "2.6.0", "dynamo": "0.8.1"}),
162162
"pip_packages": lambda: ProbeResult.success(["numpy==1.0", "torch==2.6.0"]),
163163
}
164164

@@ -273,14 +273,14 @@ def test_identical_fingerprints(self):
273273
assert diff.packages_removed == 0
274274

275275
def test_scalar_field_change(self):
276-
"""Changed torch version shows up in field_changes."""
277-
a = _make_fingerprint(torch_version="2.6.0+cu128")
278-
b = _make_fingerprint(torch_version="2.7.0+cu128")
276+
"""Changed CUDA version shows up in field_changes."""
277+
a = _make_fingerprint(cuda_version="12.8")
278+
b = _make_fingerprint(cuda_version="13.1")
279279

280280
diff = diff_fingerprints(a, b)
281281

282-
assert "torch_version" in diff.field_changes
283-
assert diff.field_changes["torch_version"] == ("2.6.0+cu128", "2.7.0+cu128")
282+
assert "cuda_version" in diff.field_changes
283+
assert diff.field_changes["cuda_version"] == ("12.8", "13.1")
284284

285285
def test_gpu_driver_change(self):
286286
"""GPU driver change detected from nested structure."""
@@ -372,17 +372,17 @@ def test_matching_environment(self):
372372
assert results == []
373373

374374
def test_version_mismatch_reported(self):
375-
"""Torch version change appears in results."""
376-
ref = _make_fingerprint(torch_version="2.6.0")
377-
cur = _make_fingerprint(torch_version="2.7.0")
375+
"""CUDA version change appears in results."""
376+
ref = _make_fingerprint(cuda_version="12.8")
377+
cur = _make_fingerprint(cuda_version="13.1")
378378

379379
results = check_against_fingerprint(ref, cur)
380380

381-
torch_results = [r for r in results if r.field == "torch_version"]
382-
assert len(torch_results) == 1
383-
assert torch_results[0].status == CheckStatus.MISMATCH
384-
assert torch_results[0].expected == "2.6.0"
385-
assert torch_results[0].actual == "2.7.0"
381+
cuda_results = [r for r in results if r.field == "cuda_version"]
382+
assert len(cuda_results) == 1
383+
assert cuda_results[0].status == CheckStatus.MISMATCH
384+
assert cuda_results[0].expected == "12.8"
385+
assert cuda_results[0].actual == "13.1"
386386

387387
def test_missing_package_reported(self):
388388
"""Package in reference but not current is MISSING."""
@@ -429,8 +429,8 @@ def test_captures_fresh_if_current_is_none(self):
429429
"gpu": lambda: ProbeResult.success({"available": True, "driver": "570.86.15", "gpus": []}),
430430
"python_version": lambda: ProbeResult.success("3.11.9"),
431431
"cuda_version": lambda: ProbeResult.success("12.8"),
432-
"torch_version": lambda: ProbeResult.success("2.6.0+cu128"),
433432
"nccl_version": lambda: ProbeResult.success("2.25.1"),
433+
"frameworks": lambda: ProbeResult.success(ref["frameworks"]),
434434
"pip_packages": lambda: ProbeResult.success(ref["pip_packages"]),
435435
}
436436

@@ -620,8 +620,8 @@ def test_format_diff_identical(self):
620620

621621
def test_format_diff_with_changes(self):
622622
"""Changes are clearly shown."""
623-
a = _make_fingerprint(torch_version="2.6.0", pip_packages=["torch==2.6.0"])
624-
b = _make_fingerprint(torch_version="2.7.0", pip_packages=["torch==2.7.0"])
623+
a = _make_fingerprint(cuda_version="12.8", pip_packages=["torch==2.6.0"])
624+
b = _make_fingerprint(cuda_version="13.1", pip_packages=["torch==2.7.0"])
625625

626626
diff = diff_fingerprints(a, b)
627627
output = format_diff(diff)
@@ -664,11 +664,11 @@ def test_format_check_results_with_mismatches(self):
664664
"""Mismatches are clearly reported."""
665665
results = [
666666
CheckResult(
667-
field="torch_version",
667+
field="cuda_version",
668668
status=CheckStatus.MISMATCH,
669-
message="torch_version: 2.6.0 -> 2.7.0",
670-
expected="2.6.0",
671-
actual="2.7.0",
669+
message="cuda_version: 12.8 -> 13.1",
670+
expected="12.8",
671+
actual="13.1",
672672
),
673673
CheckResult(
674674
field="pip:sglang",
@@ -680,5 +680,5 @@ def test_format_check_results_with_mismatches(self):
680680
output = format_check_results(results)
681681

682682
assert "2 mismatches" in output
683-
assert "torch_version" in output
683+
assert "cuda_version" in output
684684
assert "sglang" in output

0 commit comments

Comments
 (0)