Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
15aed72
fix: move auto-export to separate CPU-only sbatch job
AdamRajfer Apr 9, 2026
5231377
Merge branch 'main' into arajfer/fix-export-cpu-partition
AdamRajfer Apr 11, 2026
985d78a
fix: handle empty export config and pip install conflicts in auto-export
AdamRajfer Apr 11, 2026
af58520
refactor: restructure auto-export to match main code style
AdamRajfer Apr 11, 2026
0e838cd
feat: add auto_export config fields to slurm executor defaults
AdamRajfer Apr 11, 2026
2f2f666
merge: resolve conflicts with main, accept upstream changes
AdamRajfer Apr 14, 2026
01c5325
fix: add missing account/partition fields to auto-export test fixtures
AdamRajfer Apr 14, 2026
0b123b0
fix: move auto-export to separate CPU-only sbatch job
AdamRajfer Apr 14, 2026
0471b09
fix: remove duplicate config fields from default.yaml
AdamRajfer Apr 15, 2026
d526b57
fix: move auto-export to separate CPU-only sbatch job
AdamRajfer Apr 15, 2026
7709df9
fix: use cpu_partition field instead of auto_export.partition
AdamRajfer Apr 15, 2026
d328784
fix: restore no-container-mount-home comment from main
AdamRajfer Apr 15, 2026
ff03e58
refactor: simplify cpu_partition fallback with get default
AdamRajfer Apr 15, 2026
366b707
fix: add --gpus 0 to export srun to avoid gres inheritance on CPU par…
AdamRajfer Apr 15, 2026
d27878d
fix: default cpu_partition to null, fallback to partition in code
AdamRajfer Apr 15, 2026
4a0a043
fix: request 1 GPU when falling back to batch partition for export
AdamRajfer Apr 15, 2026
da94f3f
fix: add --gpus-per-node=1 to sbatch header when falling back to GPU …
AdamRajfer Apr 15, 2026
bd1a1eb
fix: use --gpus 0 on CPU partition srun, --gpus 1 on GPU fallback
AdamRajfer Apr 15, 2026
38d5565
fix: remove GPU fallback logic, always use --gpus 0 in export srun
AdamRajfer Apr 15, 2026
855903b
refactor: hardcode --gpus 0 in export srun, remove variable
AdamRajfer Apr 15, 2026
6198dd3
docs: add cpu_partition and export_image to auto-export docs
AdamRajfer Apr 15, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions docs/libraries/nemo-evaluator-launcher/exporters/mlflow.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,16 @@ execution:
MLFLOW_TRACKING_URI: MLFLOW_TRACKING_URI
```

Auto-export runs as a separate CPU-only sbatch job that is submitted after the GPU evaluation completes, so GPU nodes are released immediately. To run export jobs on a dedicated CPU partition (recommended on clusters like HSG that require GPU specs on the `batch` partition), set `execution.cpu_partition`:

```yaml
execution:
cpu_partition: cpu # CPU-only partition for export jobs (defaults to execution.partition)
auto_export:
destinations: ["mlflow"]
export_image: /path/to/python-3.12.sqsh # optional, defaults to python:3.12.7-slim
```

Set optional fields to customize your export:

```yaml
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ username: ${oc.env:USER} # Defaults to $USER env var
account: ??? # SLURM account allocation (required)
output_dir: ??? # Absolute path accessible on compute nodes (required)
partition: batch
cpu_partition: null # CPU-only partition for export jobs. Falls back to execution.partition when null.
num_nodes: 1 # Total SLURM nodes (num_nodes_per_instance = num_nodes / num_instances)
num_instances: 1 # Number of independent deployment instances
ntasks_per_node: 1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1168,32 +1168,65 @@ def _generate_auto_export_section(
if not launcher_install_cmd:
launcher_install_cmd = "pip install nemo-evaluator-launcher[all]"

s += " # export\n"
s += " srun --mpi pmix --overlap "
s += '--nodelist "${PRIMARY_NODE}" --nodes 1 --ntasks 1 '
s += "--container-image {} ".format(export_image)
if env_var_names:
s += "--container-env {} ".format(",".join(env_var_names))
# never mount home directory for export jobs - this is error prone
# and there's no use-case for mounting it
s += "--no-container-mount-home "
cpu_partition = cfg.execution.get("cpu_partition")
export_partition = cpu_partition or cfg.execution.partition
output_dir = cfg.execution.output_dir
invocation_dir = remote_task_subdir.parent

# --- Build export sbatch script (CPU-only, no GPUs) ---
export_sbatch = "#!/bin/bash\n"
export_sbatch += f"#SBATCH --job-name=nel-export-{remote_task_subdir.name}\n"
export_sbatch += "#SBATCH --nodes=1\n"
export_sbatch += "#SBATCH --ntasks=1\n"
export_sbatch += "#SBATCH --time=00:30:00\n"
export_sbatch += f"#SBATCH --account {cfg.execution.account}\n"
export_sbatch += f"#SBATCH --partition {export_partition}\n"
export_sbatch += "#SBATCH --no-requeue\n"
export_sbatch += (
f"#SBATCH --output {remote_task_subdir / 'logs' / 'export-%A.log'}\n"
)
export_sbatch += "\nset -uo pipefail\n"
secrets_path = remote_task_subdir / ".secrets.env"
export_sbatch += f'[ -f "{secrets_path}" ] && source "{secrets_path}"\n'
if secrets:
export_sbatch += f"{build_reexport_commands('export', secrets)}\n"

mounts = [
f"{remote_task_subdir}/artifacts:{remote_task_subdir}/artifacts",
f"{remote_task_subdir}/logs:{remote_task_subdir}/logs",
f"{invocation_dir}:{invocation_dir}",
f"{output_dir}:{output_dir}",
]
for host_path, container_path in export_mounts.items():
mounts.append(f"{host_path}:{container_path}")
s += "--container-mounts {} ".format(",".join(mounts))
s += "--output {} ".format(remote_task_subdir / "logs" / "export-%A.log")
s += " bash -c '\n"
s += f" {launcher_install_cmd}\n"
s += f" cd {remote_task_subdir}/artifacts\n"

export_sbatch += (
f"\nsrun --nodes 1 --ntasks 1 --gpus 0 --container-image {export_image} "
)
if env_var_names:
export_sbatch += "--container-env {} ".format(",".join(env_var_names))
# never mount home directory for export jobs - this is error prone
# and there's no use-case for mounting it
export_sbatch += "--no-container-mount-home "
export_sbatch += "--container-mounts {} ".format(",".join(mounts))
export_sbatch += "bash -c '\n"
export_sbatch += f" {launcher_install_cmd}\n"
export_sbatch += f" cd {remote_task_subdir}/artifacts\n"
for dest in destinations:
s += f' echo "Exporting to {dest}..."\n'
s += f' nemo-evaluator-launcher export {job_id} --dest {dest} --config export_config.yml --job-dirs {cfg.execution.output_dir} || echo "Export to {dest} failed"\n'
s += "'\n"
s += " echo 'Auto-export completed.'\n"
export_sbatch += f' echo "Exporting to {dest}..."\n'
export_sbatch += f' nemo-evaluator-launcher export {job_id} --dest {dest} --config {remote_task_subdir}/artifacts/export_config.yml --job-dirs {output_dir} || echo "Export to {dest} failed"\n'
export_sbatch += "'\n"

# --- Write and submit the export sbatch script ---
export_script_path = remote_task_subdir / "export.sbatch"
s += f" cat > {export_script_path} << 'EXPORT_EOF'\n"
s += export_sbatch
s += "EXPORT_EOF\n"
s += f' _export_out=$(sbatch "{export_script_path}" 2>&1)\n'
s += " _export_id=$(echo \"$_export_out\" | grep -oE '[0-9]+')\n"
s += ' if [ -n "$_export_id" ]; then\n'
s += ' echo "Export job submitted: $_export_id"\n'
s += " else\n"
s += ' echo "WARNING: Failed to submit export job: $_export_out"\n'
s += " fi\n"
s += "else\n"
s += " echo 'Evaluation failed with exit code $EVAL_EXIT_CODE. Skipping auto-export.'\n"
s += "fi\n"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1358,6 +1358,8 @@ def test_generate_auto_export_section_skips_marker_interrupted_runs(self):
cfg = OmegaConf.create(
{
"execution": {
"account": "test_account",
"partition": "batch",
"output_dir": "/tmp/out",
"auto_export": {"destinations": ["wandb"]},
},
Expand All @@ -1383,6 +1385,8 @@ def test_generate_auto_export_section_with_export_mounts(self):
cfg = OmegaConf.create(
{
"execution": {
"account": "test_account",
"partition": "gpu_partition",
"output_dir": "/tmp/out",
"auto_export": {
"destinations": ["mlflow"],
Expand All @@ -1405,15 +1409,16 @@ def test_generate_auto_export_section_with_export_mounts(self):
remote_task_subdir=Path("/tmp/out/test_task"),
)

assert "/tmp/out/test_task/artifacts:/tmp/out/test_task/artifacts" in section
assert "/tmp/out/test_task/logs:/tmp/out/test_task/logs" in section
assert "/tmp/out:/tmp/out" in section
assert "/lustre/cache/uv:/cache/uv" in section
assert "/lustre/data:/data" in section

def test_generate_auto_export_section_with_custom_image(self):
cfg = OmegaConf.create(
{
"execution": {
"account": "test_account",
"partition": "gpu_partition",
"output_dir": "/tmp/out",
"auto_export": {
"destinations": ["mlflow"],
Expand Down
Loading