def add_build_args(subparsers: Subparsers) -> argparse.ArgumentParser:
    """Register the ``build`` subcommand and return its parser.

    The subcommand takes a positional cluster name plus the optional
    ``--extra`` and ``--no-sync`` flags, and dispatches to ``build`` through
    ``set_defaults(func=...)``.

    Args:
        subparsers: The subparsers action the new parser is added to.

    Returns:
        The parser created for the ``build`` subcommand.
    """
    build_parser = subparsers.add_parser(
        "build",
        help="Build an Apptainer container on a remote cluster.",
        formatter_class=rich_argparse.RichHelpFormatter,
    )
    # The metavar is what argparse prints in the usage line and in
    # "the following arguments are required" errors; an empty string leaves
    # a blank where the argument name should be.
    build_parser.add_argument(
        "cluster",
        metavar="<cluster>",
        help="The cluster to build the container on.",
    )
    build_parser.add_argument(
        "--extra",
        metavar="<extra>",
        default=None,
        help="Optional extras group to include (e.g. 'runtime').",
    )
    build_parser.add_argument(
        "--no-sync",
        action="store_true",
        default=False,
        help="Skip syncing the project before building.",
    )
    build_parser.set_defaults(func=build)
    return build_parser
+""" + +from __future__ import annotations + +import logging +from pathlib import Path, PurePosixPath + +from cluv.cli.login import login +from cluv.cli.sync import sync +from cluv.config import ContainerConfig, find_pyproject, get_config +from cluv.utils import console + +logger = logging.getLogger(__name__) + +__all__ = ["build"] + + +def generate_def(base_image: str, extra_apt: list[str], extra_pip_args: str) -> str: + post_lines = [] + if extra_apt: + pkgs = " ".join(extra_apt) + post_lines.append( + f"apt-get update && apt-get install -y --no-install-recommends {pkgs} " + "&& rm -rf /var/lib/apt/lists/*" + ) + pip_cmd = "pip install --no-cache-dir" + if extra_pip_args: + pip_cmd += f" {extra_pip_args}" + pip_cmd += " -r /build/requirements.txt" + post_lines.append(pip_cmd) + post_lines.append("mv /build/requirements.txt /opt/requirements.txt") + post_lines.append("rm -rf /build") + + post_body = "\n ".join(post_lines) + + return ( + f"Bootstrap: docker\n" + f"From: {base_image}\n" + f"\n" + f"%files\n" + f" /tmp/cluv-build/requirements.txt /build/requirements.txt\n" + f"\n" + f"%post\n" + f" {post_body}\n" + f"\n" + f"%test\n" + f' python -c "import importlib.metadata; print(\'container OK\')"\n' + ) + + +async def build(cluster: str, extra: str | None = None, no_sync: bool = False) -> str | None: + """Build an Apptainer container on the given cluster. + + Returns the remote path to the built .sif, or None on failure. + """ + config = get_config() + cluster_config = config.clusters.get(cluster) + if not cluster_config or not cluster_config.container: + console.print( + f"[red]No container config for cluster '{cluster}'.[/red]\n" + f"Add [tool.cluv.clusters.{cluster}.container] to pyproject.toml." 
+ ) + return None + + container: ContainerConfig = cluster_config.container + + if not no_sync: + remotes = await sync(clusters=[cluster]) + else: + remotes = await login([cluster]) + + remote = remotes[0] + project_path = PurePosixPath(find_pyproject().parent.relative_to(Path.home())) + + console.print("[bold]Exporting pinned requirements from uv.lock...[/bold]") + export_parts = [ + "uv export --locked --no-dev --no-hashes --no-annotate --no-header --no-emit-project", + ] + if extra: + export_parts.append(f"--extra {extra}") + export_parts.append("--format requirements-txt") + export_cmd = f"bash -l -c 'cd ~/{project_path} && {' '.join(export_parts)}'" + result = await remote.run(export_cmd, display=True, hide="out") + if result.returncode != 0: + stderr = result.stderr.strip() + if "locked" in stderr.lower() or "lock" in stderr.lower(): + console.print( + "[red]uv.lock is out of sync with pyproject.toml. " + "Run 'uv lock' locally, commit, and try again.[/red]" + ) + else: + console.print(f"[red]uv export failed: {stderr}[/red]") + return None + requirements = result.stdout + + console.print("[bold]Uploading build context...[/bold]") + await remote.run("mkdir -p /tmp/cluv-build", hide=True) + await remote.run( + "cat > /tmp/cluv-build/requirements.txt", + input=requirements, + hide=True, + ) + + def_content = generate_def(container.base_image, container.extra_apt, container.extra_pip_args) + await remote.run("cat > /tmp/cluv-build/container.def", input=def_content, hide=True) + + git_sha = await remote.get_output( + f"git -C ~/{project_path} rev-parse --short HEAD", + ) + project_name = find_pyproject().parent.name + sif_name = f"{project_name}-{git_sha}.sif" + deploy_path = container.deploy_path + + console.print("[bold]Building container (this may take several minutes)...[/bold]") + + # GOMAXPROCS=1 prevents pids.max cgroup kills on DRAC login nodes. 
+ # Their user.slice cgroup has pids.max=512; Go's default thread-per-CPU + # overshoots during OCI fetch, killing the build with EAGAIN. + build_cmd = ( + f"bash -l -c '" + f"export GOMAXPROCS=${{GOMAXPROCS:-1}} GOMEMLIMIT=${{GOMEMLIMIT:-2GiB}}; " + f"module load apptainer 2>/dev/null || true; " + f"cd /tmp/cluv-build && " + f"apptainer build {sif_name} container.def" + f"'" + ) + result = await remote.run(build_cmd, display=True, hide=False) + if result.returncode != 0: + console.print("[red]Container build failed.[/red]") + await _cleanup_build_dir(remote) + return None + + # Verify the image loads before deploying. + console.print("[bold]Verifying container...[/bold]") + verify_script = "import sys; sys.exit(0)" + verify_cmd = ( + f"bash -l -c '" + f"module load apptainer 2>/dev/null || true; " + f"apptainer exec /tmp/cluv-build/{sif_name} python -c \"{verify_script}\"'" + ) + result = await remote.run(verify_cmd, display=True, hide="out") + if result.returncode != 0: + console.print("[red]Container verification failed.[/red]") + await _cleanup_build_dir(remote) + return None + + console.print(f"[bold]Deploying to {deploy_path}...[/bold]") + deploy_cmd = ( + f"bash -l -c '" + f"mkdir -p {deploy_path} && " + f"cp /tmp/cluv-build/{sif_name} {deploy_path}/{sif_name} && " + f"chmod 640 {deploy_path}/{sif_name} && " + f"ln -sfn {sif_name} {deploy_path}/current.sif" + f"'" + ) + result = await remote.run(deploy_cmd, display=True, hide=True) + if result.returncode != 0: + console.print("[red]Deploy failed.[/red]") + await _cleanup_build_dir(remote) + return None + + await _cleanup_build_dir(remote) + + sif_path = f"{deploy_path}/{sif_name}" + console.print(f"[green]Container deployed: {sif_path}[/green]") + console.print(f"[green]Symlink: {deploy_path}/current.sif -> {sif_name}[/green]") + return sif_path + + +async def _cleanup_build_dir(remote) -> None: + await remote.run("rm -rf /tmp/cluv-build", warn=True, hide=True, display=False) diff --git a/cluv/cli/submit.py 
class ContainerConfig(BaseModel):
    """Per-cluster settings for building and deploying an Apptainer image."""

    deploy_path: str
    """Absolute remote directory that holds built .sif images (e.g. '/project/acct/containers')."""

    base_image: str = "python:3.12-slim"
    """Docker image the Apptainer definition bootstraps from."""

    extra_apt: list[str] = []
    """apt packages to install in the container on top of the base image."""

    extra_pip_args: str = ""
    """Additional arguments for pip install inside the container (e.g. '--extra-index-url ...')."""

    @field_validator("deploy_path")
    @classmethod
    def deploy_path_must_be_absolute(cls, v: str) -> str:
        """Return *v* unchanged when it starts with '/'; raise ValueError otherwise."""
        is_absolute = v.startswith("/")
        if is_absolute:
            return v
        raise ValueError(f"deploy_path must be an absolute path, got '{v}'")


class ClusterConfig(BaseModel):
    """Options that apply to a single cluster."""

    env: dict[str, str] = {}
    """Environment variables set when Slurm commands are run on this cluster."""

    container: ContainerConfig | None = None
    """Apptainer container settings for this cluster, or None when no container section is
    configured. When present, `cluv build` can build on this cluster and `cluv submit` exports
    CONTAINER_PATH pointing at `<deploy_path>/current.sif` unless it is already set."""
class TestGenerateDef:
    """Checks on the Apptainer definition text returned by ``generate_def``."""

    def test_minimal(self):
        definition = generate_def("python:3.12-slim", [], "")
        for expected in (
            "Bootstrap: docker",
            "From: python:3.12-slim",
            "pip install --no-cache-dir -r /build/requirements.txt",
        ):
            assert expected in definition
        assert "apt-get" not in definition

    def test_with_apt_packages(self):
        packages = ["gcc", "libgomp1"]
        definition = generate_def("python:3.12-slim", packages, "")
        assert "apt-get update" in definition
        assert "gcc libgomp1" in definition

    def test_with_extra_pip_args(self):
        pip_args = "--extra-index-url https://download.pytorch.org/whl/cu126"
        definition = generate_def("python:3.12-slim", [], pip_args)
        assert "--extra-index-url https://download.pytorch.org/whl/cu126" in definition

    def test_custom_base_image(self):
        image = "nvidia/cuda:12.6.0-runtime-ubuntu22.04"
        definition = generate_def(image, [], "")
        assert "From: nvidia/cuda:12.6.0-runtime-ubuntu22.04" in definition

    def test_all_options(self):
        definition = generate_def(
            "python:3.12-slim",
            ["gcc", "libc6-dev"],
            "--extra-index-url https://download.pytorch.org/whl/cu126",
        )
        for expected in ("apt-get", "gcc libc6-dev", "--extra-index-url", "%test"):
            assert expected in definition


class TestContainerConfig:
    """Checks on ``ContainerConfig`` / ``ClusterConfig`` construction and validation."""

    def test_defaults(self):
        container = ContainerConfig(deploy_path="/project/acct/containers")
        assert container.base_image == "python:3.12-slim"
        assert container.extra_apt == []
        assert container.extra_pip_args == ""

    def test_custom_values(self):
        overrides = {
            "deploy_path": "/project/acct/containers",
            "base_image": "nvidia/cuda:12.6.0-runtime-ubuntu22.04",
            "extra_apt": ["gcc", "libgomp1"],
            "extra_pip_args": "--extra-index-url https://download.pytorch.org/whl/cu126",
        }
        container = ContainerConfig(**overrides)
        assert container.deploy_path == "/project/acct/containers"
        assert container.base_image == "nvidia/cuda:12.6.0-runtime-ubuntu22.04"
        assert container.extra_apt == ["gcc", "libgomp1"]

    def test_cluster_without_container(self):
        cluster_cfg = ClusterConfig(env={"SBATCH_ACCOUNT": "def-me"})
        assert cluster_cfg.container is None

    def test_relative_deploy_path_rejected(self):
        with pytest.raises(ValueError, match="absolute path"):
            ContainerConfig(deploy_path="relative/path")


class TestContainerConfigFromToml:
    """Checks that container settings are read from a pyproject.toml on disk."""

    def test_parse_container_config(self, tmp_path: Path):
        toml_text = """\
[tool.cluv]
results_path = "logs"

[tool.cluv.clusters.mila]

[tool.cluv.clusters.rorqual.env]
SBATCH_ACCOUNT = "def-bengioy"

[tool.cluv.clusters.rorqual.container]
deploy_path = "/project/acct/containers"
base_image = "python:3.12-slim"
extra_apt = ["gcc", "libgomp1"]
extra_pip_args = "--extra-index-url https://download.pytorch.org/whl/cu126"
"""
        pyproject = tmp_path / "pyproject.toml"
        pyproject.write_text(toml_text)

        clusters = load_cluv_config(pyproject).clusters
        assert clusters["mila"].container is None
        rorqual = clusters["rorqual"].container
        assert rorqual is not None
        assert rorqual.deploy_path == "/project/acct/containers"
        assert rorqual.extra_apt == ["gcc", "libgomp1"]

    def test_minimal_container_config(self, tmp_path: Path):
        toml_text = """\
[tool.cluv]
results_path = "logs"

[tool.cluv.clusters.narval.container]
deploy_path = "/project/acct/containers"
"""
        pyproject = tmp_path / "pyproject.toml"
        pyproject.write_text(toml_text)

        narval = load_cluv_config(pyproject).clusters["narval"].container
        assert narval is not None
        assert narval.deploy_path == "/project/acct/containers"
        assert narval.base_image == "python:3.12-slim"
        assert narval.extra_apt == []
deploy_path = "/project/acct/containers" + """ + ) + ) + + sbatch_command = get_sbatch_command( + cluster="rorqual", + job_script=Path("scripts/container_job.sh"), + sbatch_args=[], + program_args=[], + git_commit="abc1234", + ) + + assert "CONTAINER_PATH=/custom/path.sif" in sbatch_command + assert "CONTAINER_PATH=/project/acct/containers/current.sif" not in sbatch_command + + class TestEnsureCleanGitState: def test_prefers_branch_tip_in_github_actions_detached_head( self, monkeypatch: pytest.MonkeyPatch