Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions cluv/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import rich_argparse
import simple_parsing

from .cli.build import build
from .cli.init import init
from .cli.login import login
from .cli.run import run
Expand Down Expand Up @@ -57,6 +58,9 @@ def main(argv: list[str] | None = None) -> None:
subparsers = parser.add_subparsers(dest="<command>", required=True)

# add -v/--verbose to each subparser as well.
build_parser = add_build_args(subparsers)
_add_v_arg(build_parser)

init_parser = add_init_args(subparsers)
_add_v_arg(init_parser)

Expand Down Expand Up @@ -136,6 +140,33 @@ def add_submit_args(
return submit_parser


def add_build_args(subparsers: Subparsers) -> argparse.ArgumentParser:
    """Register the ``build`` subcommand on *subparsers* and return its parser.

    The subcommand takes a positional cluster name plus the optional
    ``--extra`` and ``--no-sync`` flags, and stores ``build`` as the ``func``
    default so the caller can dispatch to it.
    """
    parser = subparsers.add_parser(
        "build",
        formatter_class=rich_argparse.RichHelpFormatter,
        help="Build an Apptainer container on a remote cluster.",
    )

    # (flags, keyword options) for every argument, registered in this order.
    argument_specs = (
        (
            ("cluster",),
            {
                "metavar": "<cluster>",
                "help": "The cluster to build the container on.",
            },
        ),
        (
            ("--extra",),
            {
                "metavar": "<group>",
                "default": None,
                "help": "Optional extras group to include (e.g. 'runtime').",
            },
        ),
        (
            ("--no-sync",),
            {
                "action": "store_true",
                "default": False,
                "help": "Skip syncing the project before building.",
            },
        ),
    )
    for flags, options in argument_specs:
        parser.add_argument(*flags, **options)

    parser.set_defaults(func=build)
    return parser


def add_status_args(subparsers: Subparsers) -> argparse.ArgumentParser:
status_parser = subparsers.add_parser(
"status",
Expand Down
175 changes: 175 additions & 0 deletions cluv/cli/build.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
"""Build an Apptainer container on a remote cluster.

Generates pinned requirements from uv.lock, uploads an Apptainer definition,
builds a .sif image, and deploys it to the configured path.
"""

from __future__ import annotations

import logging
from pathlib import Path, PurePosixPath

from cluv.cli.login import login
from cluv.cli.sync import sync
from cluv.config import ContainerConfig, find_pyproject, get_config
from cluv.utils import console

logger = logging.getLogger(__name__)

__all__ = ["build"]


def generate_def(base_image: str, extra_apt: list[str], extra_pip_args: str) -> str:
    """Render the text of the Apptainer definition file for a container build.

    Args:
        base_image: Docker image reference written to the ``From:`` header.
        extra_apt: apt packages to install in ``%post``; when empty, no
            apt-get step is emitted.
        extra_pip_args: Extra arguments inserted into the ``pip install``
            command; an empty string adds nothing.

    Returns:
        The definition file content, ending with a newline.
    """
    # Commands executed in %post, in order.
    steps = []
    if extra_apt:
        steps.append(
            "apt-get update && apt-get install -y --no-install-recommends "
            + " ".join(extra_apt)
            + " && rm -rf /var/lib/apt/lists/*"
        )

    # Only non-empty pieces take part, so no stray separator is added when
    # there are no extra pip arguments.
    pip_pieces = ("pip install --no-cache-dir", extra_pip_args, "-r /build/requirements.txt")
    steps.append(" ".join(piece for piece in pip_pieces if piece))

    # Keep the pinned requirements inside the image, then drop the build dir.
    steps.append("mv /build/requirements.txt /opt/requirements.txt")
    steps.append("rm -rf /build")

    definition_lines = [
        "Bootstrap: docker",
        f"From: {base_image}",
        "",
        "%files",
        " /tmp/cluv-build/requirements.txt /build/requirements.txt",
        "",
        "%post",
        *(f" {step}" for step in steps),
        "",
        "%test",
        " python -c \"import importlib.metadata; print('container OK')\"",
    ]
    return "\n".join(definition_lines) + "\n"


async def build(cluster: str, extra: str | None = None, no_sync: bool = False) -> str | None:
    """Build an Apptainer container on the given cluster and deploy it.

    The steps are: connect to the cluster (syncing the project first unless
    ``no_sync`` is set), export pinned requirements with ``uv export``, upload
    the requirements and a generated definition file to ``/tmp/cluv-build``,
    run ``apptainer build``, check that the image can execute Python, then copy
    it to the configured deploy path and point ``current.sif`` at it.

    Args:
        cluster: Name of the cluster; it must have a ``container`` section in
            the cluv configuration.
        extra: Optional extras group passed to ``uv export --extra``.
        no_sync: When true, only log in to the cluster instead of syncing the
            project first.

    Returns:
        The remote path to the built .sif, or None on failure.
    """
    # Local import: shell quoting is only needed by this function.
    import shlex

    def login_shell(script: str) -> str:
        """Return a command running *script* in a bash login shell.

        The whole script is quoted as one argument, so values interpolated
        into it (already quoted individually) cannot terminate the outer
        quoting.
        """
        return f"bash -l -c {shlex.quote(script)}"

    config = get_config()
    cluster_config = config.clusters.get(cluster)
    if not cluster_config or not cluster_config.container:
        console.print(
            f"[red]No container config for cluster '{cluster}'.[/red]\n"
            f"Add [tool.cluv.clusters.{cluster}.container] to pyproject.toml."
        )
        return None

    container: ContainerConfig = cluster_config.container

    if not no_sync:
        remotes = await sync(clusters=[cluster])
    else:
        remotes = await login([cluster])

    # Report a missing connection instead of failing with an IndexError.
    if not remotes:
        console.print(f"[red]Could not connect to cluster '{cluster}'.[/red]")
        return None
    remote = remotes[0]

    # The project is addressed on the remote relative to the home directory,
    # so it has to live under the local home directory as well.
    project_dir = find_pyproject().parent
    try:
        project_path = PurePosixPath(project_dir.relative_to(Path.home()))
    except ValueError:
        console.print(
            f"[red]Project directory '{project_dir}' is not inside your home directory.[/red]"
        )
        return None
    # Keep '~/' outside the quotes so the remote shell still expands it.
    remote_project = f"~/{shlex.quote(str(project_path))}"

    console.print("[bold]Exporting pinned requirements from uv.lock...[/bold]")
    export_parts = [
        "uv export --locked --no-dev --no-hashes --no-annotate --no-header --no-emit-project",
    ]
    if extra:
        export_parts.append(f"--extra {shlex.quote(extra)}")
    export_parts.append("--format requirements-txt")
    export_cmd = login_shell(f"cd {remote_project} && {' '.join(export_parts)}")
    result = await remote.run(export_cmd, display=True, hide="out")
    if result.returncode != 0:
        stderr = result.stderr.strip()
        # Heuristic: any mention of "lock" is treated as a stale lockfile.
        if "lock" in stderr.lower():
            console.print(
                "[red]uv.lock is out of sync with pyproject.toml. "
                "Run 'uv lock' locally, commit, and try again.[/red]"
            )
        else:
            console.print(f"[red]uv export failed: {stderr}[/red]")
        return None
    requirements = result.stdout

    # Everything below works inside /tmp/cluv-build on the remote; the
    # `finally` clause removes it on success, on failure, and when a remote
    # command raises.
    try:
        console.print("[bold]Uploading build context...[/bold]")
        await remote.run("mkdir -p /tmp/cluv-build", hide=True)
        await remote.run(
            "cat > /tmp/cluv-build/requirements.txt",
            input=requirements,
            hide=True,
        )

        def_content = generate_def(
            container.base_image, container.extra_apt, container.extra_pip_args
        )
        await remote.run("cat > /tmp/cluv-build/container.def", input=def_content, hide=True)

        git_sha = await remote.get_output(
            f"git -C {remote_project} rev-parse --short HEAD",
        )
        project_name = project_dir.name
        sif_name = f"{project_name}-{git_sha}.sif"
        deploy_path = container.deploy_path

        # Shell-quoted forms of every path interpolated into remote commands.
        quoted_sif_name = shlex.quote(sif_name)
        quoted_built_sif = shlex.quote(f"/tmp/cluv-build/{sif_name}")
        quoted_deploy_dir = shlex.quote(deploy_path)
        quoted_target = shlex.quote(f"{deploy_path}/{sif_name}")
        quoted_current_link = shlex.quote(f"{deploy_path}/current.sif")

        console.print("[bold]Building container (this may take several minutes)...[/bold]")

        # GOMAXPROCS=1 prevents pids.max cgroup kills on DRAC login nodes.
        # Their user.slice cgroup has pids.max=512; Go's default thread-per-CPU
        # overshoots during OCI fetch, killing the build with EAGAIN.
        build_cmd = login_shell(
            "export GOMAXPROCS=${GOMAXPROCS:-1} GOMEMLIMIT=${GOMEMLIMIT:-2GiB}; "
            "module load apptainer 2>/dev/null || true; "
            "cd /tmp/cluv-build && "
            f"apptainer build {quoted_sif_name} container.def"
        )
        result = await remote.run(build_cmd, display=True, hide=False)
        if result.returncode != 0:
            console.print("[red]Container build failed.[/red]")
            return None

        # Verify the image loads before deploying.
        console.print("[bold]Verifying container...[/bold]")
        verify_script = "import sys; sys.exit(0)"
        verify_cmd = login_shell(
            "module load apptainer 2>/dev/null || true; "
            f'apptainer exec {quoted_built_sif} python -c "{verify_script}"'
        )
        result = await remote.run(verify_cmd, display=True, hide="out")
        if result.returncode != 0:
            console.print("[red]Container verification failed.[/red]")
            return None

        console.print(f"[bold]Deploying to {deploy_path}...[/bold]")
        # The symlink target is relative so current.sif keeps working if the
        # deploy directory is moved as a whole.
        deploy_cmd = login_shell(
            f"mkdir -p {quoted_deploy_dir} && "
            f"cp {quoted_built_sif} {quoted_target} && "
            f"chmod 640 {quoted_target} && "
            f"ln -sfn {quoted_sif_name} {quoted_current_link}"
        )
        result = await remote.run(deploy_cmd, display=True, hide=True)
        if result.returncode != 0:
            console.print("[red]Deploy failed.[/red]")
            return None
    finally:
        await _cleanup_build_dir(remote)

    sif_path = f"{deploy_path}/{sif_name}"
    console.print(f"[green]Container deployed: {sif_path}[/green]")
    console.print(f"[green]Symlink: {deploy_path}/current.sif -> {sif_name}[/green]")
    return sif_path


async def _cleanup_build_dir(remote) -> None:
    """Delete the temporary ``/tmp/cluv-build`` directory on *remote*.

    The command is run with ``warn=True`` and with output and display
    suppressed.
    """
    removal_cmd = "rm -rf /tmp/cluv-build"
    run_options = {"warn": True, "hide": True, "display": False}
    await remote.run(removal_cmd, **run_options)
5 changes: 5 additions & 0 deletions cluv/cli/submit.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,11 @@ def get_sbatch_command(
env_vars["SBATCH_JOB_NAME"] = f"cluv-{base_name}"
env_vars["GIT_COMMIT"] = git_commit

# Inject CONTAINER_PATH when the cluster has container config.
cluster_cfg = config.clusters.get(cluster, ClusterConfig())
if cluster_cfg.container and "CONTAINER_PATH" not in env_vars:
env_vars["CONTAINER_PATH"] = f"{cluster_cfg.container.deploy_path}/current.sif"

env_vars_prefix = " ".join(f"{k}={shlex.quote(str(v))}" for k, v in env_vars.items())
sbatch_args_str = " ".join(shlex.quote(f) for f in sbatch_args)
program_args_str = shlex.join(program_args)
Expand Down
29 changes: 28 additions & 1 deletion cluv/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,44 @@
import logging
import tomllib
from pathlib import Path
from pydantic import BaseModel
from pydantic import BaseModel, field_validator

logger = logging.getLogger(__name__)


class ContainerConfig(BaseModel):
    """Settings for building and deploying an Apptainer image on a cluster."""

    deploy_path: str
    """Remote path where built .sif images are stored (e.g. '/project/acct/containers')."""

    base_image: str = "python:3.12-slim"
    """Docker base image for the Apptainer definition."""

    extra_apt: list[str] = []
    """Additional apt packages to install in the container."""

    extra_pip_args: str = ""
    """Extra arguments passed to pip install inside the container (e.g. '--extra-index-url ...')."""

    @field_validator("deploy_path")
    @classmethod
    def deploy_path_must_be_absolute(cls, v: str) -> str:
        """Accept *v* only when it starts with '/', otherwise raise ValueError."""
        if v.startswith("/"):
            return v
        raise ValueError(f"deploy_path must be an absolute path, got '{v}'")


class ClusterConfig(BaseModel):
    """Per-cluster configuration options.

    Groups the settings that apply to one named cluster: extra environment
    variables and, optionally, the Apptainer container settings.
    """

    # Defaults to no extra variables.
    env: dict[str, str] = {}
    """Environment variables to set when running Slurm commands on this cluster."""

    # None means no container is configured for this cluster.
    container: ContainerConfig | None = None
    """Optional Apptainer container configuration. When set, `cluv build` can build containers on
    this cluster and `cluv submit` will use the container instead of a venv."""


class CluvConfig(BaseModel):
"""Configuration options for Cluv, loaded from the pyproject.toml file."""
Expand Down
34 changes: 34 additions & 0 deletions scripts/container_job.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/bin/bash
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --mem-per-cpu=4G
#SBATCH --time=0:05:00
#SBATCH --output=logs/%j/slurm-%j.out

# Runs the command given as arguments inside an Apptainer container.
# Requires GIT_COMMIT and CONTAINER_PATH in the environment; the messages
# below point to 'cluv submit' and 'cluv build' as the way to provide them.

project_name="cluv"
project_root="$HOME/repos/$project_name"

# ${VAR:?message} aborts the script with the message when VAR is unset or empty.
echo "GIT_COMMIT=${GIT_COMMIT:?GIT_COMMIT is not set. Use 'cluv submit' to submit this job script.}"
echo "CONTAINER_PATH=${CONTAINER_PATH:?CONTAINER_PATH is not set. Run 'cluv build' first or set it in your cluv config.}"

if [ ! -f "$CONTAINER_PATH" ]; then
    echo "FATAL: container not found at $CONTAINER_PATH" >&2
    echo "Run 'cluv build <cluster>' to build one." >&2
    exit 1
fi

# Best effort: not every cluster provides apptainer through environment modules.
module load apptainer 2>/dev/null || true

# "$*" joins the arguments into one word for display; "$@" inside a longer
# quoted string mixes a string with an array (ShellCheck SC2145).
echo "Running command: apptainer exec $CONTAINER_PATH $*"

# The ${VAR:+...} expansions add a --bind option only when VAR is set and
# non-empty; they are left unquoted on purpose so that '--bind' and its value
# become separate words.
srun apptainer exec --nv \
    --env PYTHONUNBUFFERED=1 \
    --env "PYTHONPATH=$project_root" \
    --env MPLCONFIGDIR=/tmp/mpl \
    --env TORCHDYNAMO_DISABLE=1 \
    --bind "$project_root":"$project_root" \
    --bind /dev/shm:/dev/shm \
    ${SLURM_TMPDIR:+--bind "$SLURM_TMPDIR":"$SLURM_TMPDIR"} \
    ${SCRATCH:+--bind "$SCRATCH":"$SCRATCH"} \
    ${PROJECT:+--bind /project:/project} \
    "$CONTAINER_PATH" \
    "$@"
Loading