Skip to content

Commit ff3e4f0

Browse files
alec-flowersclaude
andcommitted
fix: use heredoc for fingerprint capture script to avoid escaping bugs
The inline python3 -c approach produced literal \n characters instead of newlines when passing through bash → srun → bash → python. This caused a SyntaxError that was silently swallowed by || true, so fingerprints were never actually collected. Fix: write the capture script via a bash heredoc (cat <<'EOF') and pipe to python3 via process substitution. This is immune to quoting/escaping issues in the srun chain. Also add two new tests: - test_embedded_python_is_syntactically_valid: ast.parse() the extracted Python source to catch syntax errors at test time - test_embedded_python_produces_json: actually execute the script in a subprocess to verify it runs end-to-end Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent a4b943c commit ff3e4f0

2 files changed

Lines changed: 104 additions & 54 deletions

File tree

src/srtctl/core/fingerprint.py

Lines changed: 50 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -434,14 +434,10 @@ def diff_fingerprints(a: dict[str, Any], b: dict[str, Any]) -> FingerprintDiff:
434434
)
435435
elif in_a and not in_b:
436436
removed += 1
437-
package_diffs.append(
438-
PackageDiff(package=pkg, status=CheckStatus.MISSING, version_a=pkgs_a[pkg])
439-
)
437+
package_diffs.append(PackageDiff(package=pkg, status=CheckStatus.MISSING, version_a=pkgs_a[pkg]))
440438
else:
441439
added += 1
442-
package_diffs.append(
443-
PackageDiff(package=pkg, status=CheckStatus.EXTRA, version_b=pkgs_b[pkg])
444-
)
440+
package_diffs.append(PackageDiff(package=pkg, status=CheckStatus.EXTRA, version_b=pkgs_b[pkg]))
445441

446442
return FingerprintDiff(
447443
field_changes=field_changes,
@@ -573,51 +569,54 @@ def generate_capture_script(output_path: str) -> str:
573569
Returns:
574570
Bash command string safe for inclusion in a preamble chain.
575571
"""
576-
# We inline the capture as a Python -c script to avoid depending on
577-
# srtctl being installed inside the container
578-
return (
579-
f'python3 -c "'
580-
"import json, subprocess, platform, socket, sys;"
581-
"from pathlib import Path;"
582-
"from datetime import datetime, timezone;"
583-
""
584-
"def run(cmd, timeout=5):\\n"
585-
" try:\\n"
586-
" r = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=timeout)\\n"
587-
" return r.stdout.strip() if r.returncode == 0 else None\\n"
588-
" except Exception:\\n"
589-
" return None\\n"
590-
""
591-
"def pip_pkgs():\\n"
592-
" out = run('pip freeze')\\n"
593-
" if not out: return []\\n"
594-
" return sorted([l.strip() for l in out.splitlines() if l.strip()], key=lambda s: s.lower())\\n"
595-
""
596-
"def gpu_info():\\n"
597-
" out = run('nvidia-smi --query-gpu=name,driver_version,memory.total --format=csv,noheader')\\n"
598-
" if not out: return {'available': False}\\n"
599-
" gpus = []\\n"
600-
" for line in out.splitlines():\\n"
601-
" parts = [p.strip() for p in line.split(',')]\\n"
602-
" if len(parts) >= 3: gpus.append({'name': parts[0], 'driver': parts[1], 'memory': parts[2]})\\n"
603-
" return {'available': True, 'driver': gpus[0]['driver'] if gpus else 'unknown', 'gpus': gpus}\\n"
604-
""
605-
"fp = {"
606-
"'hostname': socket.gethostname(),"
607-
"'timestamp': datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),"
608-
"'arch': platform.machine(),"
609-
"'os': next((l.split('=',1)[1].strip('\\\"') for l in Path('/etc/os-release').read_text().splitlines() if l.startswith('PRETTY_NAME=')), platform.platform()) if Path('/etc/os-release').exists() else platform.platform(),"
610-
"'gpu': gpu_info(),"
611-
"'python_version': platform.python_version(),"
612-
"'cuda_version': run('nvcc --version 2>/dev/null | grep release') or 'unavailable',"
613-
"'torch_version': run('python3 -c \\\"import torch; print(torch.__version__)\\\"') or 'unavailable',"
614-
"'nccl_version': run('python3 -c \\\"import torch; print(torch.cuda.nccl.version())\\\"') or 'unavailable',"
615-
"'pip_packages': pip_pkgs(),"
616-
"};"
617-
f"Path('{output_path}').parent.mkdir(parents=True, exist_ok=True);"
618-
f"Path('{output_path}').write_text(json.dumps(fp, indent=2) + '\\n')"
619-
f'" || true'
620-
)
572+
# We write a temp Python script and execute it, rather than using
573+
# python3 -c with inline code. This avoids escaping nightmares when
574+
# the command passes through bash → srun → bash → python.
575+
script = f"""\
576+
import json, subprocess, platform, socket, sys
577+
from pathlib import Path
578+
from datetime import datetime, timezone
579+
580+
def run(cmd, timeout=5):
581+
try:
582+
r = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=timeout)
583+
return r.stdout.strip() if r.returncode == 0 else None
584+
except Exception:
585+
return None
586+
587+
def pip_pkgs():
588+
out = run('pip freeze')
589+
if not out: return []
590+
return sorted([l.strip() for l in out.splitlines() if l.strip()], key=lambda s: s.lower())
591+
592+
def gpu_info():
593+
out = run('nvidia-smi --query-gpu=name,driver_version,memory.total --format=csv,noheader')
594+
if not out: return {{'available': False}}
595+
gpus = []
596+
for line in out.splitlines():
597+
parts = [p.strip() for p in line.split(',')]
598+
if len(parts) >= 3:
599+
gpus.append({{'name': parts[0], 'driver': parts[1], 'memory': parts[2]}})
600+
return {{'available': True, 'driver': gpus[0]['driver'] if gpus else 'unknown', 'gpus': gpus}}
601+
602+
fp = {{
603+
'hostname': socket.gethostname(),
604+
'timestamp': datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
605+
'arch': platform.machine(),
606+
'os': next((l.split('=',1)[1].strip('"') for l in Path('/etc/os-release').read_text().splitlines() if l.startswith('PRETTY_NAME=')), platform.platform()) if Path('/etc/os-release').exists() else platform.platform(),
607+
'gpu': gpu_info(),
608+
'python_version': platform.python_version(),
609+
'cuda_version': run('nvcc --version 2>/dev/null | grep release') or 'unavailable',
610+
'torch_version': run('python3 -c "import torch; print(torch.__version__)"') or 'unavailable',
611+
'nccl_version': run('python3 -c "import torch; print(torch.cuda.nccl.version())"') or 'unavailable',
612+
'pip_packages': pip_pkgs(),
613+
}}
614+
Path('{output_path}').parent.mkdir(parents=True, exist_ok=True)
615+
Path('{output_path}').write_text(json.dumps(fp, indent=2) + '\\n')
616+
"""
617+
# Use a heredoc to write the script to a temp file, then execute it.
618+
# This is immune to quoting/escaping issues in the bash → srun chain.
619+
return f"python3 <(cat <<'__FINGERPRINT_EOF__'\n{script}__FINGERPRINT_EOF__\n) || true"
621620

622621

623622
# ============================================================================

tests/test_fingerprint.py

Lines changed: 54 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -527,17 +527,17 @@ class TestBashGeneration:
527527
def test_script_ends_with_or_true(self):
528528
"""Script is wrapped in || true so it never blocks the worker."""
529529
script = generate_capture_script("/logs/fingerprint.json")
530-
assert script.endswith('|| true')
530+
assert script.rstrip().endswith('|| true')
531531

532532
def test_script_contains_output_path(self):
533533
"""Output path appears in the generated script."""
534534
script = generate_capture_script("/logs/fingerprint_prefill_w0.json")
535535
assert "/logs/fingerprint_prefill_w0.json" in script
536536

537537
def test_script_starts_with_python(self):
538-
"""Script runs as python3 -c."""
538+
"""Script invokes python3."""
539539
script = generate_capture_script("/logs/fp.json")
540-
assert script.startswith('python3 -c "')
540+
assert script.startswith('python3')
541541

542542
def test_script_includes_pip_freeze(self):
543543
"""Script captures pip packages."""
@@ -549,6 +549,57 @@ def test_script_includes_sorted(self):
549549
script = generate_capture_script("/logs/fp.json")
550550
assert "sorted" in script
551551

552+
def test_embedded_python_is_syntactically_valid(self):
553+
"""The Python code inside the script must parse without SyntaxError.
554+
555+
This is the test that would have caught the \\n escaping bug where
556+
literal backslash-n characters were passed to python3 -c instead of
557+
real newlines, causing a SyntaxError at runtime.
558+
"""
559+
import ast
560+
import re
561+
562+
script = generate_capture_script("/logs/fingerprint.json")
563+
# Extract the Python source from between the heredoc markers
564+
match = re.search(
565+
r"<<'__FINGERPRINT_EOF__'\n(.+?)__FINGERPRINT_EOF__",
566+
script,
567+
re.DOTALL,
568+
)
569+
assert match, f"Could not find heredoc Python source in script:\n{script[:200]}"
570+
python_source = match.group(1)
571+
# ast.parse raises SyntaxError if the code is invalid
572+
ast.parse(python_source)
573+
574+
def test_embedded_python_produces_json(self):
575+
"""The embedded script runs and produces valid JSON output."""
576+
import re
577+
import subprocess
578+
import tempfile
579+
580+
script = generate_capture_script("/tmp/test_fingerprint_output.json")
581+
match = re.search(
582+
r"<<'__FINGERPRINT_EOF__'\n(.+?)__FINGERPRINT_EOF__",
583+
script,
584+
re.DOTALL,
585+
)
586+
assert match
587+
python_source = match.group(1)
588+
589+
# Run the script in a subprocess — probes will return "unavailable"
590+
# on a dev machine (no GPU, etc.) but the script must not crash
591+
with tempfile.NamedTemporaryFile(suffix=".py", mode="w", delete=False) as f:
592+
# Rewrite output path to a temp location
593+
f.write(python_source.replace("/tmp/test_fingerprint_output.json", f.name + ".out"))
594+
f.flush()
595+
result = subprocess.run(
596+
["python3", f.name],
597+
capture_output=True,
598+
text=True,
599+
timeout=15,
600+
)
601+
assert result.returncode == 0, f"Fingerprint script failed:\n{result.stderr}"
602+
552603

553604
# ============================================================================
554605
# Formatting

0 commit comments

Comments
 (0)