Skip to content

Commit c044da9

Browse files
alec-flowers and claude committed
fix: support cross-arch clusters (x86_64 login, aarch64 compute)
On clusters like Lyris, where the login node is x86_64 but the compute nodes are aarch64 (GB200/Grace), the uv binary in ~/.local/bin is the wrong architecture for sbatch scripts running on compute nodes.

Changes:
- make setup now downloads a compute-arch uv binary to bin/
- sbatch template prepends $SRTCTL_SOURCE/bin to PATH so the compute-arch uv takes precedence, while the login node still uses ~/.local/bin/uv
- make setup is now idempotent (skips downloads if already present)
- .gitignore excludes bin/ (compute-arch binaries)

Regression from 452bb8d, which replaced container pip install + python3 with uv run --python 3.12, assuming uv would be the same arch on both login and compute nodes.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 31c3e59 commit c044da9

6 files changed

Lines changed: 209 additions & 36 deletions

File tree

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,9 @@ Thumbs.db
4040
srtslurm.yaml
4141
srtslurm.toml
4242

43+
# Compute-arch uv binary (installed by make setup)
44+
bin/
45+
4346
# Configs - ignore downloaded files but keep deepep_config.json
4447
configs/nats-server
4548
configs/etcd

Makefile

Lines changed: 54 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -45,43 +45,67 @@ setup:
4545
@mkdir -p logs
4646
@echo "🖥️ Using architecture: $(ARCH)"
4747
@case "$(ARCH)" in \
48-
x86_64) ARCH_SHORT="amd64" ;; \
49-
aarch64) ARCH_SHORT="arm64" ;; \
48+
x86_64) ARCH_SHORT="amd64"; ARCH_FILE_PATTERN="x86-64" ;; \
49+
aarch64) ARCH_SHORT="arm64"; ARCH_FILE_PATTERN="aarch64" ;; \
5050
*) echo "❌ Unsupported architecture: $(ARCH)"; exit 1 ;; \
5151
esac; \
5252
echo "ℹ️ Dynamo 0.8.0 will be installed from PyPI when workers start"; \
53-
echo "⬇️ Downloading NATS ($(NATS_VERSION)) for $$ARCH_SHORT..."; \
54-
NATS_DEB="nats-server-$(NATS_VERSION)-$$ARCH_SHORT.deb"; \
55-
NATS_URL="https://github.com/nats-io/nats-server/releases/download/$(NATS_VERSION)/$$NATS_DEB"; \
56-
wget -q --show-progress --tries=3 --waitretry=5 "$$NATS_URL" -O "configs/$$NATS_DEB"; \
57-
echo "📁 Extracting NATS binary..."; \
58-
TMP_DIR=$$(mktemp -d); \
59-
dpkg-deb -x "configs/$$NATS_DEB" "$$TMP_DIR"; \
60-
if [ -f "$$TMP_DIR/usr/local/bin/nats-server" ]; then \
61-
cp "$$TMP_DIR/usr/local/bin/nats-server" configs/; \
62-
elif [ -f "$$TMP_DIR/usr/bin/nats-server" ]; then \
63-
cp "$$TMP_DIR/usr/bin/nats-server" configs/; \
53+
echo ""; \
54+
echo "--- NATS $(NATS_VERSION) ---"; \
55+
if [ -f configs/nats-server ] && file configs/nats-server | grep -q "$$ARCH_FILE_PATTERN"; then \
56+
echo "✅ NATS already installed at configs/nats-server ($(ARCH))"; \
6457
else \
65-
echo "❌ Could not find nats-server binary inside the .deb package"; \
66-
ls -R "$$TMP_DIR" | head -n 50; \
67-
exit 1; \
58+
echo "⬇️ Downloading NATS ($(NATS_VERSION)) for $$ARCH_SHORT..."; \
59+
NATS_DEB="nats-server-$(NATS_VERSION)-$$ARCH_SHORT.deb"; \
60+
NATS_URL="https://github.com/nats-io/nats-server/releases/download/$(NATS_VERSION)/$$NATS_DEB"; \
61+
wget -q --show-progress --tries=3 --waitretry=5 "$$NATS_URL" -O "configs/$$NATS_DEB"; \
62+
echo "📁 Extracting NATS binary..."; \
63+
TMP_DIR=$$(mktemp -d); \
64+
dpkg-deb -x "configs/$$NATS_DEB" "$$TMP_DIR"; \
65+
if [ -f "$$TMP_DIR/usr/local/bin/nats-server" ]; then \
66+
cp "$$TMP_DIR/usr/local/bin/nats-server" configs/; \
67+
elif [ -f "$$TMP_DIR/usr/bin/nats-server" ]; then \
68+
cp "$$TMP_DIR/usr/bin/nats-server" configs/; \
69+
else \
70+
echo "❌ Could not find nats-server binary inside the .deb package"; \
71+
ls -R "$$TMP_DIR" | head -n 50; \
72+
exit 1; \
73+
fi; \
74+
chmod +x configs/nats-server; \
75+
rm -rf "$$TMP_DIR" "configs/$$NATS_DEB"; \
76+
echo "✅ NATS installed to configs/nats-server"; \
77+
fi; \
78+
echo ""; \
79+
echo "--- ETCD $(ETCD_VERSION) ---"; \
80+
if [ -f configs/etcd ] && [ -f configs/etcdctl ] && file configs/etcd | grep -q "$$ARCH_FILE_PATTERN"; then \
81+
echo "✅ ETCD already installed at configs/etcd ($(ARCH))"; \
82+
else \
83+
echo "⬇️ Downloading ETCD ($(ETCD_VERSION)) for $$ARCH_SHORT..."; \
84+
ETCD_TAR="etcd-$(ETCD_VERSION)-linux-$$ARCH_SHORT.tar.gz"; \
85+
ETCD_URL="https://github.com/etcd-io/etcd/releases/download/$(ETCD_VERSION)/$$ETCD_TAR"; \
86+
wget -q --show-progress --tries=3 --waitretry=5 "$$ETCD_URL" -O "configs/$$ETCD_TAR"; \
87+
echo "📁 Extracting ETCD binaries..."; \
88+
tar -xzf "configs/$$ETCD_TAR" --strip-components=1 -C configs etcd-$(ETCD_VERSION)-linux-$$ARCH_SHORT/etcd etcd-$(ETCD_VERSION)-linux-$$ARCH_SHORT/etcdctl; \
89+
chmod +x configs/etcd configs/etcdctl; \
90+
rm "configs/$$ETCD_TAR"; \
91+
echo "✅ ETCD installed to configs/etcd"; \
92+
fi; \
93+
echo ""; \
94+
echo "--- uv (compute node arch: $(ARCH)) ---"; \
95+
if [ -f bin/uv ] && file bin/uv | grep -q "$$ARCH_FILE_PATTERN"; then \
96+
echo "✅ uv already installed at bin/uv ($(ARCH))"; \
97+
else \
98+
echo "⬇️ Downloading uv for $(ARCH)..."; \
99+
mkdir -p bin; \
100+
UV_URL="https://github.com/astral-sh/uv/releases/latest/download/uv-$(ARCH)-unknown-linux-gnu.tar.gz"; \
101+
curl -LsSf "$$UV_URL" | tar -xz --strip-components=1 -C bin; \
102+
chmod +x bin/uv bin/uvx 2>/dev/null; \
103+
echo "✅ uv installed to bin/uv ($$(file bin/uv | grep -o 'ARM aarch64\|x86-64'))"; \
68104
fi; \
69-
chmod +x configs/nats-server; \
70-
rm -rf "$$TMP_DIR" "configs/$$NATS_DEB"; \
71-
echo "⬇️ Downloading ETCD ($(ETCD_VERSION)) for $$ARCH_SHORT..."; \
72-
ETCD_TAR="etcd-$(ETCD_VERSION)-linux-$$ARCH_SHORT.tar.gz"; \
73-
ETCD_URL="https://github.com/etcd-io/etcd/releases/download/$(ETCD_VERSION)/$$ETCD_TAR"; \
74-
wget -q --show-progress --tries=3 --waitretry=5 "$$ETCD_URL" -O "configs/$$ETCD_TAR"; \
75-
echo "📁 Extracting ETCD binaries..."; \
76-
tar -xzf "configs/$$ETCD_TAR" --strip-components=1 -C configs etcd-$(ETCD_VERSION)-linux-$$ARCH_SHORT/etcd etcd-$(ETCD_VERSION)-linux-$$ARCH_SHORT/etcdctl; \
77-
chmod +x configs/etcd configs/etcdctl; \
78-
rm "configs/$$ETCD_TAR"; \
79-
echo "✅ Done. Contents of configs directory:"; \
80-
ls -lh configs/; \
81105
echo ""; \
82-
echo "⚙️ Setting up srtslurm.yaml..."; \
106+
echo "--- srtslurm.yaml ---"; \
83107
if [ -f srtslurm.yaml ]; then \
84-
echo "ℹ️ srtslurm.yaml already exists, skipping..."; \
108+
echo "srtslurm.yaml already exists"; \
85109
else \
86110
echo "Creating srtslurm.yaml with your cluster settings..."; \
87111
echo ""; \

src/srtctl/cli/submit.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,33 @@ def show_config_details(config: SrtConfig) -> None:
151151
console.print(f"[dim]srun options:[/] {opts}")
152152

153153

154+
def validate_setup(srtctl_source: Path) -> None:
155+
"""Validate that make setup has been run and required binaries exist.
156+
157+
Checks for NATS, etcd, and compute-arch uv binaries. Raises SystemExit
158+
with a clear error message if anything is missing.
159+
"""
160+
missing = []
161+
162+
configs_dir = srtctl_source / "configs"
163+
if not (configs_dir / "nats-server").exists():
164+
missing.append("configs/nats-server")
165+
if not (configs_dir / "etcd").exists():
166+
missing.append("configs/etcd")
167+
if not (srtctl_source / "bin" / "uv").exists():
168+
missing.append("bin/uv (compute-arch uv)")
169+
170+
if missing:
171+
console.print(f"\n[red bold]ERROR:[/] Required binaries not found in {srtctl_source}:")
172+
for m in missing:
173+
console.print(f" [red]✗[/] {m}")
174+
console.print("\nRun [bold]make setup ARCH=<compute_arch>[/] first:")
175+
console.print(f" cd {srtctl_source}")
176+
console.print(" make setup ARCH=aarch64 [dim]# for GB200/Grace compute nodes[/]")
177+
console.print(" make setup ARCH=x86_64 [dim]# for x86_64 compute nodes[/]\n")
178+
raise SystemExit(1)
179+
180+
154181
def generate_minimal_sbatch_script(
155182
config: SrtConfig,
156183
config_path: Path,
@@ -300,6 +327,11 @@ def submit_with_orchestrator(
300327
show_config_details(config)
301328
return
302329

330+
# Validate setup before submitting (not during dry-run)
331+
srtctl_root = get_srtslurm_setting("srtctl_root")
332+
srtctl_source = Path(srtctl_root) if srtctl_root else Path(__file__).parent.parent.parent.parent
333+
validate_setup(srtctl_source)
334+
303335
# Write script to temp file
304336
fd, script_path = tempfile.mkstemp(suffix=".slurm", prefix="srtctl_", text=True)
305337
with os.fdopen(fd, "w") as f:

src/srtctl/templates/job_script_minimal.j2

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -78,16 +78,17 @@ export SRTCTL_SOURCE_DIR="${SRTCTL_SOURCE}"
7878
echo ""
7979
echo "Preparing srtctl environment..."
8080

81-
# Ensure ~/.local/bin is in PATH (uv installs here)
82-
export PATH="$HOME/.local/bin:$PATH"
81+
# Use compute-arch uv from srt-slurm/bin (installed by make setup ARCH=<compute_arch>)
82+
# This avoids arch mismatch when login node (x86_64) != compute node (aarch64)
83+
export PATH="${SRTCTL_SOURCE}/bin:$HOME/.local/bin:$PATH"
8384

84-
# Install uv if not present (single binary, no dependencies)
8585
if ! command -v uv &> /dev/null; then
86-
echo "Installing uv package manager..."
87-
curl -LsSf https://astral.sh/uv/install.sh | sh
86+
echo "ERROR: uv not found. Run 'make setup ARCH=<compute_arch>' first."
87+
echo " e.g., make setup ARCH=aarch64 (for GB200/Grace compute nodes)"
88+
exit 1
8889
fi
8990

90-
echo "Using uv with Python 3.12..."
91+
echo "Using uv $(uv --version)..."
9192

9293
{% if setup_script %}
9394
# Custom setup script override from CLI

tests/test_override.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -391,6 +391,7 @@ def sbatch_count(selector: str | None = None) -> int:
391391
patch("subprocess.run", return_value=mock_result) as mock_run,
392392
patch("srtctl.cli.submit.get_srtslurm_setting", return_value=None),
393393
patch("srtctl.cli.submit.create_job_record"),
394+
patch("srtctl.cli.submit.validate_setup"),
394395
):
395396
submit_override(cfg, selector=selector, output_dir=tmp_path)
396397
return sum(1 for c in mock_run.call_args_list if c[0][0][0] == "sbatch")
@@ -417,6 +418,7 @@ def test_selector_submission_keeps_source_and_executes_resolved_variant(self, tm
417418
patch("subprocess.run", return_value=mock_result),
418419
patch("srtctl.cli.submit.get_srtslurm_setting", return_value=None),
419420
patch("srtctl.cli.submit.create_job_record"),
421+
patch("srtctl.cli.submit.validate_setup"),
420422
):
421423
submit_override(cfg, selector="zip_override_tp[1]", output_dir=tmp_path)
422424

@@ -467,6 +469,7 @@ def test_selector_submission_preserves_comments_in_resolved_variant(self, tmp_pa
467469
patch("subprocess.run", return_value=mock_result),
468470
patch("srtctl.cli.submit.get_srtslurm_setting", return_value=None),
469471
patch("srtctl.cli.submit.create_job_record"),
472+
patch("srtctl.cli.submit.validate_setup"),
470473
):
471474
submit_override(cfg, selector="override_lowmem", output_dir=tmp_path)
472475

@@ -489,6 +492,7 @@ def test_plain_config_submission_still_uses_config_yaml(self, tmp_path: Path) ->
489492
patch("subprocess.run", return_value=mock_result),
490493
patch("srtctl.cli.submit.get_srtslurm_setting", return_value=None),
491494
patch("srtctl.cli.submit.create_job_record"),
495+
patch("srtctl.cli.submit.validate_setup"),
492496
):
493497
submit_single(config_path=cfg, output_dir=tmp_path)
494498

tests/test_validate_setup.py

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
"""Tests for validate_setup pre-flight check and Makefile arch detection."""
5+
6+
import subprocess
7+
from pathlib import Path
8+
9+
import pytest
10+
11+
from srtctl.cli.submit import validate_setup
12+
13+
14+
class TestValidateSetup:
15+
"""Tests for the validate_setup function."""
16+
17+
def test_passes_when_all_binaries_exist(self, tmp_path: Path):
18+
"""validate_setup succeeds when all required binaries are present."""
19+
(tmp_path / "configs").mkdir()
20+
(tmp_path / "configs" / "nats-server").touch()
21+
(tmp_path / "configs" / "etcd").touch()
22+
(tmp_path / "bin").mkdir()
23+
(tmp_path / "bin" / "uv").touch()
24+
25+
# Should not raise
26+
validate_setup(tmp_path)
27+
28+
def test_fails_when_nats_missing(self, tmp_path: Path):
29+
"""validate_setup fails when nats-server is missing."""
30+
(tmp_path / "configs").mkdir()
31+
(tmp_path / "configs" / "etcd").touch()
32+
(tmp_path / "bin").mkdir()
33+
(tmp_path / "bin" / "uv").touch()
34+
35+
with pytest.raises(SystemExit):
36+
validate_setup(tmp_path)
37+
38+
def test_fails_when_etcd_missing(self, tmp_path: Path):
39+
"""validate_setup fails when etcd is missing."""
40+
(tmp_path / "configs").mkdir()
41+
(tmp_path / "configs" / "nats-server").touch()
42+
(tmp_path / "bin").mkdir()
43+
(tmp_path / "bin" / "uv").touch()
44+
45+
with pytest.raises(SystemExit):
46+
validate_setup(tmp_path)
47+
48+
def test_fails_when_uv_missing(self, tmp_path: Path):
49+
"""validate_setup fails when bin/uv is missing."""
50+
(tmp_path / "configs").mkdir()
51+
(tmp_path / "configs" / "nats-server").touch()
52+
(tmp_path / "configs" / "etcd").touch()
53+
54+
with pytest.raises(SystemExit):
55+
validate_setup(tmp_path)
56+
57+
def test_fails_when_all_missing(self, tmp_path: Path):
58+
"""validate_setup fails when nothing has been set up."""
59+
with pytest.raises(SystemExit):
60+
validate_setup(tmp_path)
61+
62+
63+
class TestMakefileArchDetection:
64+
"""Test that the file | grep pattern used in Makefile matches correctly.
65+
66+
The Makefile uses `file <binary> | grep -q "$ARCH_FILE_PATTERN"` to check
67+
if an existing binary matches the requested architecture. These tests verify
68+
the pattern works by creating minimal ELF binaries and checking `file` output.
69+
"""
70+
71+
# Minimal ELF headers: just enough for `file` to identify the architecture
72+
# ELF magic + class(64-bit) + data(little-endian) + version + OS/ABI + padding + type + machine
73+
ELF_X86_64 = b"\x7fELF\x02\x01\x01\x00" + b"\x00" * 8 + b"\x02\x00\x3e\x00"
74+
ELF_AARCH64 = b"\x7fELF\x02\x01\x01\x00" + b"\x00" * 8 + b"\x02\x00\xb7\x00"
75+
76+
@staticmethod
77+
def _file_description(path: Path) -> str:
78+
"""Get just the description part of file(1) output (after the colon)."""
79+
result = subprocess.run(["file", str(path)], capture_output=True, text=True)
80+
return result.stdout.split(":", 1)[1] if ":" in result.stdout else result.stdout
81+
82+
def test_file_detects_x86_64(self, tmp_path: Path):
83+
"""file(1) description for x86_64 ELF contains 'x86-64' (hyphen, not underscore)."""
84+
binary = tmp_path / "fake_bin"
85+
binary.write_bytes(self.ELF_X86_64 + b"\x00" * 44)
86+
desc = self._file_description(binary)
87+
assert "x86-64" in desc, f"Expected 'x86-64' in: {desc}"
88+
assert "x86_64" not in desc, f"file uses hyphen not underscore: {desc}"
89+
90+
def test_file_detects_aarch64(self, tmp_path: Path):
91+
"""file(1) description for aarch64 ELF contains 'aarch64'."""
92+
binary = tmp_path / "fake_bin"
93+
binary.write_bytes(self.ELF_AARCH64 + b"\x00" * 44)
94+
desc = self._file_description(binary)
95+
assert "aarch64" in desc, f"Expected 'aarch64' in: {desc}"
96+
97+
def test_x86_64_not_matched_by_aarch64_pattern(self, tmp_path: Path):
98+
"""An x86_64 binary must not match the aarch64 pattern."""
99+
binary = tmp_path / "fake_bin"
100+
binary.write_bytes(self.ELF_X86_64 + b"\x00" * 44)
101+
desc = self._file_description(binary)
102+
assert "aarch64" not in desc
103+
104+
def test_aarch64_not_matched_by_x86_pattern(self, tmp_path: Path):
105+
"""An aarch64 binary must not match the x86-64 pattern."""
106+
binary = tmp_path / "fake_bin"
107+
binary.write_bytes(self.ELF_AARCH64 + b"\x00" * 44)
108+
desc = self._file_description(binary)
109+
assert "x86-64" not in desc

0 commit comments

Comments
 (0)