NVIDIA-NeMo · marta-sd · Apr 15, 2026 · Apr 9, 2026 · Apr 11, 2026 · Apr 11, 2026
@@ -88,6 +88,16 @@ execution:
       MLFLOW_TRACKING_URI: MLFLOW_TRACKING_URI
 ```
 
+Auto-export runs as a separate, CPU-only sbatch job that is submitted after the GPU evaluation completes, so the GPU nodes are released immediately. To run export jobs on a dedicated CPU partition, set `execution.cpu_partition` (recommended on clusters such as HSG, where the `batch` partition requires a GPU specification):
+
+```yaml
+execution:
+  cpu_partition: cpu  # CPU-only partition for export jobs (defaults to execution.partition)
+  auto_export:
+    destinations: ["mlflow"]
+    export_image: /path/to/python-3.12.sqsh  # optional, defaults to python:3.12.7-slim
+```
+
 Set optional fields to customize your export:
 
 ```yaml

@@ -20,6 +20,7 @@ username: ${oc.env:USER} # Defaults to $USER env var
 account: ??? # SLURM account allocation (required)
 output_dir: ??? # Absolute path accessible on compute nodes (required)
 partition: batch
+cpu_partition: null  # CPU-only partition for export jobs. Falls back to execution.partition when null.
 num_nodes: 1             # Total SLURM nodes (num_nodes_per_instance = num_nodes / num_instances)
 num_instances: 1         # Number of independent deployment instances
 ntasks_per_node: 1

@@ -1168,32 +1168,65 @@ def _generate_auto_export_section(
     if not launcher_install_cmd:
         launcher_install_cmd = "pip install nemo-evaluator-launcher[all]"
 
-    s += "    # export\n"
-    s += "    srun --mpi pmix --overlap "
-    s += '--nodelist "${PRIMARY_NODE}" --nodes 1 --ntasks 1 '
-    s += "--container-image {} ".format(export_image)
-    if env_var_names:
-        s += "--container-env {} ".format(",".join(env_var_names))
-    # never mount home directory for export jobs - this is error prone
-    # and there's no use-case for mounting it
-    s += "--no-container-mount-home "
+    cpu_partition = cfg.execution.get("cpu_partition")
+    export_partition = cpu_partition or cfg.execution.partition
+    output_dir = cfg.execution.output_dir
+    invocation_dir = remote_task_subdir.parent
+
+    # --- Build export sbatch script (CPU-only, no GPUs) ---
+    export_sbatch = "#!/bin/bash\n"
+    export_sbatch += f"#SBATCH --job-name=nel-export-{remote_task_subdir.name}\n"
+    export_sbatch += "#SBATCH --nodes=1\n"
+    export_sbatch += "#SBATCH --ntasks=1\n"
+    export_sbatch += "#SBATCH --time=00:30:00\n"
+    export_sbatch += f"#SBATCH --account {cfg.execution.account}\n"
+    export_sbatch += f"#SBATCH --partition {export_partition}\n"
+    export_sbatch += "#SBATCH --no-requeue\n"
+    export_sbatch += (
+        f"#SBATCH --output {remote_task_subdir / 'logs' / 'export-%A.log'}\n"
+    )
+    export_sbatch += "\nset -uo pipefail\n"
+    secrets_path = remote_task_subdir / ".secrets.env"
+    export_sbatch += f'[ -f "{secrets_path}" ] && source "{secrets_path}"\n'
+    if secrets:
+        export_sbatch += f"{build_reexport_commands('export', secrets)}\n"
 
     mounts = [
-        f"{remote_task_subdir}/artifacts:{remote_task_subdir}/artifacts",
-        f"{remote_task_subdir}/logs:{remote_task_subdir}/logs",
+        f"{invocation_dir}:{invocation_dir}",
+        f"{output_dir}:{output_dir}",
     ]
     for host_path, container_path in export_mounts.items():
         mounts.append(f"{host_path}:{container_path}")
-    s += "--container-mounts {} ".format(",".join(mounts))
-    s += "--output {} ".format(remote_task_subdir / "logs" / "export-%A.log")
-    s += "    bash -c '\n"
-    s += f"        {launcher_install_cmd}\n"
-    s += f"        cd {remote_task_subdir}/artifacts\n"
+
+    export_sbatch += (
+        f"\nsrun --nodes 1 --ntasks 1 --gpus 0 --container-image {export_image} "
+    )
+    if env_var_names:
+        export_sbatch += "--container-env {} ".format(",".join(env_var_names))
+    # never mount home directory for export jobs - this is error prone
+    # and there's no use-case for mounting it
+    export_sbatch += "--no-container-mount-home "
+    export_sbatch += "--container-mounts {} ".format(",".join(mounts))
+    export_sbatch += "bash -c '\n"
+    export_sbatch += f"    {launcher_install_cmd}\n"
+    export_sbatch += f"    cd {remote_task_subdir}/artifacts\n"
     for dest in destinations:
-        s += f'        echo "Exporting to {dest}..."\n'
-        s += f'        nemo-evaluator-launcher export {job_id} --dest {dest} --config export_config.yml --job-dirs {cfg.execution.output_dir} || echo "Export to {dest} failed"\n'
-    s += "'\n"
-    s += "    echo 'Auto-export completed.'\n"
+        export_sbatch += f'    echo "Exporting to {dest}..."\n'
+        export_sbatch += f'    nemo-evaluator-launcher export {job_id} --dest {dest} --config {remote_task_subdir}/artifacts/export_config.yml --job-dirs {output_dir} || echo "Export to {dest} failed"\n'
+    export_sbatch += "'\n"
+
+    # --- Write and submit the export sbatch script ---
+    export_script_path = remote_task_subdir / "export.sbatch"
+    s += f"    cat > {export_script_path} << 'EXPORT_EOF'\n"
+    s += export_sbatch
+    s += "EXPORT_EOF\n"
+    s += f'    _export_out=$(sbatch "{export_script_path}" 2>&1)\n'
+    s += "    _export_id=$(echo \"$_export_out\" | grep -oE '[0-9]+')\n"
+    s += '    if [ -n "$_export_id" ]; then\n'
+    s += '        echo "Export job submitted: $_export_id"\n'
+    s += "    else\n"
+    s += '        echo "WARNING: Failed to submit export job: $_export_out"\n'
+    s += "    fi\n"
     s += "else\n"
     s += "    echo 'Evaluation failed with exit code $EVAL_EXIT_CODE. Skipping auto-export.'\n"
     s += "fi\n"

@@ -1358,6 +1358,8 @@ def test_generate_auto_export_section_skips_marker_interrupted_runs(self):
         cfg = OmegaConf.create(
             {
                 "execution": {
+                    "account": "test_account",
+                    "partition": "batch",
                     "output_dir": "/tmp/out",
                     "auto_export": {"destinations": ["wandb"]},
                 },
@@ -1383,6 +1385,8 @@ def test_generate_auto_export_section_with_export_mounts(self):
         cfg = OmegaConf.create(
             {
                 "execution": {
+                    "account": "test_account",
+                    "partition": "gpu_partition",
                     "output_dir": "/tmp/out",
                     "auto_export": {
                         "destinations": ["mlflow"],
@@ -1405,15 +1409,16 @@ def test_generate_auto_export_section_with_export_mounts(self):
             remote_task_subdir=Path("/tmp/out/test_task"),
         )
 
-        assert "/tmp/out/test_task/artifacts:/tmp/out/test_task/artifacts" in section
-        assert "/tmp/out/test_task/logs:/tmp/out/test_task/logs" in section
+        assert "/tmp/out:/tmp/out" in section
         assert "/lustre/cache/uv:/cache/uv" in section
         assert "/lustre/data:/data" in section
 
     def test_generate_auto_export_section_with_custom_image(self):
         cfg = OmegaConf.create(
             {
                 "execution": {
+                    "account": "test_account",
+                    "partition": "gpu_partition",
                     "output_dir": "/tmp/out",
                     "auto_export": {
                         "destinations": ["mlflow"],