Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/ISSUE_TEMPLATE/bug_report.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ assignees: ''
<!-- If applicable, add screenshots to help explain your problem. -->

### Setup
<!-- Please provide details about your setup : OS, Cluv config, Clusters... -->:
<!-- Please provide details about your setup : OS, Cluv config, Clusters... -->

### Additional context
<!-- Add any other context about the problem here. -->
18 changes: 18 additions & 0 deletions .github/ISSUE_TEMPLATE/feature_proposal.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
---

name: Feature proposal
about: Propose a new feature or improvement
title: '[FEAT]: '
labels: enhancement
assignees: ''

---

### Context
<!-- Why is this feature needed? What problem does it solve or what use case does it enable? -->

### Proposed solution
<!-- Describe how you'd like this to work. Include example commands or expected output if applicable. -->

### Additional context
<!-- Any other context, references, or screenshots that might be helpful. -->
5 changes: 5 additions & 0 deletions .github/PULL_REQUEST_TEMPLATE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
## Summary
<!-- A clear and concise description of the changes in this pull request. -->

## Issues
<!-- List any related issues here. If this is a new feature, you can create an issue first and link it here. -->
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,8 @@ my_project/
├── logs -> $SCRATCH/logs/my_project # symlink to $SCRATCH
├── pyproject.toml # includes [tool.cluv] config
├── scripts/
│ └── job.sh # Slurm job script template
│ ├── job.sh # Slurm job script template
│ └── safe_job.sh # Slurm job script template (copies .venv and prior results)
└── src/
└── my_project/
└── __init__.py
Expand Down
211 changes: 107 additions & 104 deletions cluv/cli/init.py
Original file line number Diff line number Diff line change
@@ -1,34 +1,19 @@
import os
import re
import subprocess
from pathlib import Path
import textwrap

from milatools.cli.init_command import DRAC_CLUSTERS

from cluv.config import find_pyproject, has_cluv_config, load_cluv_config
from cluv.utils import console
from cluv.ssh import get_ssh_hostnames
from cluv.utils import console

__all__ = ["init"]

JOB_SCRIPT_PATH = "scripts/job.sh"
SCRIPTS_DIR_PATH = "scripts"
DEFAULT_RESULTS_PATH = "logs"

CLUV_DEFAULT_CONFIG = textwrap.dedent(f"""\
[tool.cluv]
results_path = "{DEFAULT_RESULTS_PATH}"

[tool.cluv.env]
# Environment variables applied when using Slurm commands on all clusters.
UV_OFFLINE = "1"
WANDB_MODE = "offline"
"""
)

CLUV_CLUSTER_MILA_DEFAULT_ARGUMENTS = [
'UV_OFFLINE = "0"',
'WANDB_MODE = "online"',
]
PACKAGE_ROOT = Path(__file__).resolve().parents[1]
# Repository root when running cluv from a source checkout.
REPO_ROOT = Path(__file__).resolve().parents[2]


def init() -> None:
Expand Down Expand Up @@ -138,10 +123,7 @@ def check_cluv_config(pyproject_path: Path) -> None:
)
console.print("Adding config for cluv tool :")

cluv_config = CLUV_DEFAULT_CONFIG
cluv_config += generate_cluster_config("mila", CLUV_CLUSTER_MILA_DEFAULT_ARGUMENTS)
for cluster in DRAC_CLUSTERS:
cluv_config += generate_cluster_config(cluster)
cluv_config = _load_cluv_config_template()
add_cluv_config_section(pyproject_path, cluv_config)


Expand All @@ -154,15 +136,6 @@ def add_cluv_config_section(pyproject_path: Path, section_lines: str) -> None:
f.write("\n" + section_lines)


def generate_cluster_config(cluster: str, config_lines: list[str] = []) -> str:
"""
Generate a cluster config section for the given cluster, with the given variables.
"""
if config_lines:
return f"\n[tool.cluv.clusters.{cluster}.env]\n" + "\n".join(config_lines) + "\n"
return f"\n[tool.cluv.clusters.{cluster}]\n"


def check_git() -> None:
"""
Check if the current project is in a git repository. If not, raise an error and exit.
Expand All @@ -182,17 +155,11 @@ def check_git() -> None:
raise RuntimeError("Error when checking git remote: ", git_remote.stderr)


def check_symlink_to_scratch(project_root: Path, results_path: str | None) -> None:
def check_symlink_to_scratch(project_root: Path, results_path: str) -> None:
"""
Check if a symlink from the results_path in the project in $HOME to the corresponding path in $SCRATCH already exists. If not, create it.
The symlink should be like : $HOME/<project>/<results_path> -> $SCRATCH/<results_path>/<project_name>
"""
if results_path is None:
console.print(
"[yellow]⚠️ Warning: Results path is not configured. Skipping symlink creation.[/yellow]"
)
return

if "SCRATCH" not in os.environ:
console.print(
"[yellow]⚠️ Warning: $SCRATCH variable not set. Skipping symlink creation.[/yellow]"
Expand Down Expand Up @@ -228,78 +195,114 @@ def check_ssh_hostnames(clusters: list[str]) -> None:
missing_clusters = set(clusters).difference(ssh_hostnames)

if len(missing_clusters) > 0:
console.print(f"[yellow]⚠️ Warning: Missing SSH config for {len(missing_clusters)} clusters. Try to run [bold]mila init[/bold] to add all available clusters.[/yellow]")
console.print(
f"[yellow]⚠️ Warning: Missing SSH config for {len(missing_clusters)} clusters. Try to run [bold]mila init[/bold] to add all available clusters.[/yellow]"
)
for cluster in missing_clusters:
console.print(f"[yellow] - {cluster}[/yellow]")
else:
console.print("[green]✅ All clusters in the cluv config are present in your SSH config.[/green]")
console.print(
"[green]✅ All clusters in the cluv config are present in your SSH config.[/green]"
)


def check_job_script(project_root: Path, results_path: str | None) -> None:
def check_job_script(project_root: Path, results_path: str) -> None:
"""
Check if the job script template exists. If not, create it.
The job script is a template for users to submit jobs to Slurm with cluv.
Check if job script templates exist. If not, create them.
The scripts are templates for users to submit jobs to Slurm with cluv.
"""
job_script_path = project_root / JOB_SCRIPT_PATH
try:
project_root_relative_to_home = project_root.relative_to(Path.home())
project_root_for_script = f"$HOME/{project_root_relative_to_home}"
except ValueError:
project_root_for_script = str(project_root)
scripts_dir = project_root / SCRIPTS_DIR_PATH
script_templates_path = _get_script_templates_path()
script_templates = sorted(script_templates_path.glob("*.sh"))

if not script_templates:
console.print("[yellow]⚠️ Warning: No script templates found.[/yellow]")
return

if job_script_path.exists():
console.print(
f"[green]✅ Job template script already exists at '{job_script_path}'.[/green]"
scripts_dir.mkdir(parents=True, exist_ok=True)
for script_template in script_templates:
script_path = scripts_dir / script_template.name
if script_path.exists():
console.print(
f"[green]✅ Job template script already exists at '{script_path}'.[/green]"
)
continue
script_content = script_template.read_text()
script_content = re.sub(
r"^#SBATCH --output=.*$",
f"#SBATCH --output={results_path}/%j/slurm-%j.out",
script_content,
flags=re.MULTILINE,
)
return
script_content = re.sub(
r"^project_name=.*$",
f'project_name="{project_root.name}"',
script_content,
flags=re.MULTILINE,
)
script_content = re.sub(
r"^project_root=.*$",
f'project_root="{project_root_for_script}"',
script_content,
flags=re.MULTILINE,
)
script_content = re.sub(
r"^results_(?:dir|path)=.*$",
f'results_path="{results_path}"',
script_content,
flags=re.MULTILINE,
)
script_content = re.sub(
r"\$(?:\{results_dir\}|results_dir\b)", "$results_path", script_content
)
script_path.write_text(script_content)
console.print(f"Adding job template script at '{script_path}'.")

if results_path is None:
console.print(
"[yellow]⚠️ Warning: Results path is not configured. Skipping job template script generation.[/yellow]"

# Extract the cluv configuration block from the pyproject.toml template.
#
# Reads the file returned by _get_pyproject_template_path(), looks for the line that
# is exactly "[tool.cluv]", and returns the lines from there up to (not including)
# the next line that starts with "[" but not with "[tool.cluv" -- or up to the end
# of the file when there is no such line. The returned text is stripped and ends
# with a single newline.
#
# Raises RuntimeError when the template contains no "[tool.cluv]" line.
def _load_cluv_config_template() -> str:
pyproject_template_path = _get_pyproject_template_path()
pyproject_lines = pyproject_template_path.read_text().splitlines()
# Index of the "[tool.cluv]" header line, or None when it is absent.
start = next(
(line_index for line_index, line in enumerate(pyproject_lines) if line == "[tool.cluv]"),
None,
)
if start is None:
raise RuntimeError(
f"Template file {pyproject_template_path} is missing required [tool.cluv] section."
)
# NOTE(review): the bare `return` below looks like a leftover line from the removed
# side of the diff rather than part of this function -- confirm before relying on it.
return
# Index of the first later line that opens a section outside "[tool.cluv...";
# enumerate is offset so the index refers to pyproject_lines, not the slice.
end = next(
(
line_index
for line_index, line in enumerate(pyproject_lines[start + 1 :], start=start + 1)
if line.startswith("[") and not line.startswith("[tool.cluv")
),
len(pyproject_lines),
)
return "\n".join(pyproject_lines[start:end]).strip() + "\n"


console.print(f"Adding job template script at '{job_script_path}'.")

project_name = project_root.name
project_root = str(project_root.relative_to(Path.home()))

script_content = f"""#!/bin/bash
#SBATCH --output={results_path}/%j/slurm-%j.out
#SBATCH --ntasks=1
#SBATCH --mem=8G
#SBATCH --time=0:05:00

project_name="{project_name}"
results_path="{results_path}"
project_root="{project_root}"
"""

script_content += """
# Minimal test job for cluv submit.
echo "hostname: $(hostname)"
echo "GIT_COMMIT=${GIT_COMMIT:?GIT_COMMIT is not set. Use 'cluv submit' to submit this job script.}"

# Setup the repo in $SLURM_TMPDIR, so the code can change in the project without affecting the job.
echo "Preparing the repo and virtual environment in $SLURM_TMPDIR"
srun --ntasks-per-node=1 --ntasks=$SLURM_NNODES --input=all bash -e <<END
cd $SLURM_TMPDIR
git clone $project_root
cd $SLURM_TMPDIR/$project_name
git checkout --detach $GIT_COMMIT
exec uv sync
END

# Run the actual job command passed as an argument ('python main.py' for example)
echo "Running command: $@"
# Note: This `--gres-flags=allow-task-sharing` is required to allow tasks on the same node to access
# GPUs allocated to other tasks on that node. Without this flag, --gpus-per-task=1 would isolate
# each task to only see its own GPU, which can cause some mysterious NCCL errors.
srun --gres-flags=allow-task-sharing uv --directory=$SLURM_TMPDIR/$project_name run "$@"

# Copy results (if any) from the local storage back to the results dir (eg in $SCRATCH)
echo "Copying logs from $SLURM_TMPDIR/$project_name/$results_path to $project_root/$results_path"
if [ -d "$SLURM_TMPDIR/$project_name/$results_path/$SLURM_JOB_ID" ]; then
srun --ntasks-per-node=1 \
rsync --update --recursive "$SLURM_TMPDIR/$project_name/$results_path/$SLURM_JOB_ID" "$project_root/$results_path/"
fi
"""

job_script_path.parent.mkdir(exist_ok=True)
with open(job_script_path, "w") as sh_file:
sh_file.write(script_content)
def _get_script_templates_path() -> Path:
    """
    Locate the folder holding the job script templates.

    Candidates are tried in order: the `scripts` folder under REPO_ROOT, then the
    `templates/scripts` folder under PACKAGE_ROOT. The first one that exists is returned.

    Raises:
        RuntimeError: If none of the candidate folders exists.
    """
    candidates = [REPO_ROOT / "scripts", PACKAGE_ROOT / "templates" / "scripts"]
    found = next((candidate for candidate in candidates if candidate.exists()), None)
    if found is not None:
        return found
    checked_paths_text = ", ".join(map(str, candidates))
    raise RuntimeError(
        f"Couldn't find the script templates folder. Checked: {checked_paths_text}."
    )


def _get_pyproject_template_path() -> Path:
    """
    Locate the pyproject.toml file used as the template for `cluv init`.

    Candidates are tried in order: `pyproject.toml` under REPO_ROOT, then
    `templates/pyproject.toml` under PACKAGE_ROOT. The first one that exists is returned.

    Raises:
        RuntimeError: If none of the candidate files exists.
    """
    candidates = [
        REPO_ROOT / "pyproject.toml",
        PACKAGE_ROOT / "templates" / "pyproject.toml",
    ]
    found = next((candidate for candidate in candidates if candidate.exists()), None)
    if found is not None:
        return found
    checked_paths_text = ", ".join(map(str, candidates))
    raise RuntimeError(
        f"Couldn't find pyproject.toml template for cluv init. Checked: {checked_paths_text}."
    )
4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,10 @@ build-backend = "hatchling.build"
[tool.hatch.build.targets.wheel]
packages = ["cluv"]

[tool.hatch.build.targets.wheel.force-include]
"scripts" = "cluv/templates/scripts"
"pyproject.toml" = "cluv/templates/pyproject.toml"

[tool.hatch.version]
source = "uv-dynamic-versioning"

Expand Down
14 changes: 7 additions & 7 deletions scripts/safe_job.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

project_name="cluv" # to be replaced with the user's project name.
project_root="$HOME/repos/$project_name" # to be replaced with the path to the user's project in their $HOME.
results_dir="logs" # to be replaced with the path to the results dir name. (--output flag above too)
results_path="logs" # to be replaced with the results path of the user's project (update the --output flag above too).


echo "GIT_COMMIT=${GIT_COMMIT:?GIT_COMMIT is not set. Use 'cluv submit' to submit this job script.}"
Expand All @@ -28,9 +28,9 @@ srun --ntasks-per-node=1 --ntasks=$SLURM_JOB_NUM_NODES bash -e <<END
uv sync

# Copy any existing results for this job from the project's results path (e.g. in $SCRATCH) into the temporary copy of the project ($project_root_in_tmpdir).
mkdir -p $project_root_in_tmpdir/$results_dir
if [ -d "$project_root/$results_dir/$SLURM_JOB_ID" ]; then
rsync --update --recursive $project_root/$results_dir/$SLURM_JOB_ID $project_root_in_tmpdir/$results_dir/
mkdir -p $project_root_in_tmpdir/$results_path
if [ -d "$project_root/$results_path/$SLURM_JOB_ID" ]; then
rsync --update --recursive $project_root/$results_path/$SLURM_JOB_ID $project_root_in_tmpdir/$results_path/
fi
END

Expand All @@ -39,8 +39,8 @@ echo "Running command: 'uv run $@' in $project_root_in_tmpdir"
srun uv --directory=$project_root_in_tmpdir run "$@"

# Copy results (if any) from the local storage back to the results dir (eg in $SCRATCH)
echo "Copying logs from $project_root_in_tmpdir/$results_dir to $project_root/$results_dir"
if [ -d "$project_root_in_tmpdir/$results_dir/$SLURM_JOB_ID" ]; then
echo "Copying logs from $project_root_in_tmpdir/$results_path to $project_root/$results_path"
if [ -d "$project_root_in_tmpdir/$results_path/$SLURM_JOB_ID" ]; then
srun --ntasks-per-node=1 --ntasks=$SLURM_JOB_NUM_NODES \
rsync --update --recursive $project_root_in_tmpdir/$results_dir/$SLURM_JOB_ID $project_root/$results_dir/
rsync --update --recursive $project_root_in_tmpdir/$results_path/$SLURM_JOB_ID $project_root/$results_path/
fi
Loading