Fix time-epochs max_time formatting

sgreenbury · sgreenbury · commit 18b4a52a3783 · 2026-04-16T17:30:33.000+01:00
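Correct max_time output to DD:HH:MM:SS and add validation for
budget, margin, and num_epochs. Update docs example to match.
Also extract the training path of time_epochs_command into
_run_time_epochs_training, and make _print_timing_results print an
error and return None for invalid inputs or when max_epochs < 1.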
diff --git a/docs/SCRIPTS_AND_CONFIGS.md b/docs/SCRIPTS_AND_CONFIGS.md
@@ -217,7 +217,7 @@ The output includes recommended Hydra overrides ready to copy-paste:
 ============================================================
 
 Recommended overrides:
-  trainer.max_epochs=564 trainer.max_time=24:00:00:00 optimizer=adamw_half
+  trainer.max_epochs=564 trainer.max_time=01:00:00:00 optimizer=adamw_half
 ```
 
 The calculation is conservative:
diff --git a/src/autocast/scripts/workflow/commands.py b/src/autocast/scripts/workflow/commands.py
@@ -935,6 +935,16 @@ def _compute_max_epochs(
     2. A cosine half-period schedule (``cosine_epochs = max_epochs``)
        reaches exactly zero and never starts increasing again.
     """
+    if seconds_per_epoch <= 0:
+        msg = "seconds_per_epoch must be positive"
+        raise ValueError(msg)
+    if budget_hours <= 0:
+        msg = "budget_hours must be positive"
+        raise ValueError(msg)
+    if not (0.0 <= margin < 1.0):
+        msg = "margin must be in [0, 1)"
+        raise ValueError(msg)
+
     budget_seconds = budget_hours * 3600
     usable_seconds = budget_seconds * (1.0 - margin)
     max_epochs = math.floor(usable_seconds / seconds_per_epoch)
@@ -952,25 +962,42 @@ def _compute_max_epochs(
 
 def _format_max_time(budget_hours: float) -> str:
     """Format *budget_hours* as a ``DD:HH:MM:SS`` string for Lightning."""
-    if budget_hours != int(budget_hours):
-        return f"{int(budget_hours):02d}:{int(budget_hours % 1 * 60):02d}:00:00"
-    return f"{int(budget_hours):02d}:00:00:00"
+    if budget_hours <= 0:
+        msg = "budget_hours must be positive"
+        raise ValueError(msg)
+
+    total_seconds = round(budget_hours * 3600)
+    days, rem = divmod(total_seconds, 24 * 3600)
+    hours, rem = divmod(rem, 3600)
+    minutes, seconds = divmod(rem, 60)
+    return f"{days:02d}:{hours:02d}:{minutes:02d}:{seconds:02d}"
 
 
 def _print_timing_results(
     epoch_times: list[float],
     budget_hours: float,
     margin: float,
-) -> dict:
+) -> dict | None:
     """Compute and print the ``max_epochs`` recommendation from epoch timings."""
     seconds_per_epoch = sum(epoch_times) / len(epoch_times)
     print(
         "\nPer-epoch times (from TrainingTimerCallback): "
         + ", ".join(f"{t:.1f}s" for t in epoch_times)
     )
 
-    result = _compute_max_epochs(seconds_per_epoch, budget_hours, margin)
-    max_time_str = _format_max_time(budget_hours)
+    try:
+        result = _compute_max_epochs(seconds_per_epoch, budget_hours, margin)
+        max_time_str = _format_max_time(budget_hours)
+    except ValueError as exc:
+        print(f"\nERROR: {exc}")
+        return None
+
+    if result["max_epochs"] < 1:
+        print(
+            "\nERROR: Computed max_epochs < 1. Increase the budget, reduce the "
+            "margin, or re-check the epoch timing estimate."
+        )
+        return None
 
     print(f"\n{'=' * 60}")
     print(f"  Seconds/epoch:  {result['seconds_per_epoch']:.1f}s")
@@ -988,83 +1015,39 @@ def _print_timing_results(
     return result
 
 
-def time_epochs_command(
+def _validate_time_epochs_args(
+    *, num_epochs: int, budget_hours: float, margin: float
+) -> None:
+    if num_epochs < 1:
+        msg = "--num-epochs must be >= 1"
+        raise ValueError(msg)
+    if budget_hours <= 0:
+        msg = "--budget must be > 0"
+        raise ValueError(msg)
+    if not (0.0 <= margin < 1.0):
+        msg = "--margin must be in [0, 1)"
+        raise ValueError(msg)
+
+
+def _run_time_epochs_training(
     *,
-    kind: str = "epd",
+    kind: str,
     mode: str,
     dataset: str | None,
     output_base: str,
     overrides: list[str],
-    num_epochs: int = 3,
-    budget_hours: float = 24.0,
-    margin: float = 0.02,
-    run_group: str | None = None,
-    run_id: str | None = None,
-    work_dir: str | None = None,
-    from_checkpoint: str | None = None,
-    runtime_typechecking: bool = False,
-    dry_run: bool = False,
-) -> dict | None:
-    """Run a short training to time per-epoch duration and recommend ``max_epochs``.
-
-    Executes *num_epochs* epochs of training (ae, epd, or processor) with
-    W&B logging and testing disabled, saves a checkpoint so that per-epoch
-    wall-clock times can be extracted from ``TrainingTimerCallback``, and
-    prints the recommended ``trainer.max_epochs`` for a cosine half-period
-    schedule (``optimizer=adamw_half``) that completes within *budget_hours*.
-
-    The calculation is conservative: a *margin* fraction is subtracted
-    from the budget **and** the result is rounded down to a whole epoch,
-    so the schedule will always reach zero before the wall-clock limit.
-    ``trainer.max_time`` is emitted as a hard safety stop equal to the
-    full (un-margined) budget.
-
-    With ``--mode slurm`` the timing run is submitted via sbatch and the
-    command exits immediately, printing a ``--from-checkpoint`` command to
-    retrieve results once the job completes.
-
-    Parameters
-    ----------
-    kind:
-        Training kind: ``"ae"``, ``"epd"``, or ``"processor"``.
-    dataset:
-        Hydra datamodule group name (e.g. ``"advection_diffusion_multichannel"``).
-    output_base:
-        Root output directory (forwarded to ``build_train_overrides``).
-    overrides:
-        Additional Hydra overrides forwarded to the timing run.
-    num_epochs:
-        How many epochs to run for the timing measurement.
-    budget_hours:
-        Target wall-clock budget in hours.
-    margin:
-        Fraction of *budget_hours* held back as safety headroom (default 2 %).
-    from_checkpoint:
-        Path to an existing checkpoint; skips training and computes the
-        recommendation directly.
-    """
-    # ------------------------------------------------------------------
-    # Fast path: compute from an existing checkpoint (no training needed)
-    # ------------------------------------------------------------------
-    if from_checkpoint is not None:
-        ckpt = Path(from_checkpoint)
-        epoch_times = _extract_epoch_times_from_checkpoint(ckpt)
-        if not epoch_times:
-            print(
-                f"ERROR: Could not extract per-epoch times from {ckpt}. "
-                "Check that the checkpoint was produced by a timing run with "
-                "TrainingTimerCallback."
-            )
-            return None
-        return _print_timing_results(epoch_times, budget_hours, margin)
-
-    # ------------------------------------------------------------------
-    # Training path: run a short timing job (local or SLURM)
-    # ------------------------------------------------------------------
+    num_epochs: int,
+    budget_hours: float,
+    margin: float,
+    run_group: str | None,
+    run_id: str | None,
+    work_dir: str | None,
+    runtime_typechecking: bool,
+    dry_run: bool,
+) -> tuple[list[float] | None, bool]:
+    """Run timing training job and return (epoch_times, exit_early)."""
     timing_run_id = run_id or "timing"
 
-    # Local without explicit workdir: use a tempdir (cleaned up after).
-    # SLURM or explicit workdir: use a persistent path so results survive.
     use_tempdir = mode == "local" and work_dir is None
     tmpdir_ctx = (
         tempfile.TemporaryDirectory(prefix="autocast_timing_") if use_tempdir else None
@@ -1074,9 +1057,6 @@ def time_epochs_command(
     try:
         effective_work_dir = tmpdir if use_tempdir else work_dir
 
-        # Build overrides: short run, no wandb, no test, checkpoint for
-        # timer extraction.  Use a relative checkpoint name so the
-        # training script resolves it against its own work_dir.
         timing_overrides = [
             f"++trainer.max_epochs={num_epochs}",
             "++trainer.max_time=null",
@@ -1093,9 +1073,9 @@ def time_epochs_command(
             output_base=output_base,
             run_group=run_group,
             run_id=timing_run_id,
-            work_dir=(
-                str(effective_work_dir) if effective_work_dir is not None else None
-            ),
+            work_dir=str(effective_work_dir)
+            if effective_work_dir is not None
+            else None,
             resume_from=None,
             overrides=[*timing_overrides, *overrides],
         )
@@ -1107,7 +1087,7 @@ def time_epochs_command(
             print(f"DRY-RUN: {format_command(cmd)}")
             print(f"\nWould time {num_epochs} epochs, then compute max_epochs")
             print(f"for a {budget_hours}h budget with {margin:.0%} margin.")
-            return None
+            return None, True
 
         if mode == "slurm":
             run_module(
@@ -1122,8 +1102,6 @@ def time_epochs_command(
                 f"--from-checkpoint {ckpt_path} "
                 f"-b {budget_hours} -m {margin}"
             )
-            # Write retrieval command to workdir so batch results are easy
-            # to collect: for f in outputs/timing/*/retrieve.sh; do bash "$f"; done
             final_work_dir.mkdir(parents=True, exist_ok=True)
             (final_work_dir / "retrieve.sh").write_text(
                 f"#!/usr/bin/env bash\n{retrieve_cmd}\n"
@@ -1133,12 +1111,10 @@ def time_epochs_command(
             print(f"  {retrieve_cmd}")
             print(
                 "\nOr collect all timing results at once:\n"
-                "  for f in outputs/timing/*/retrieve.sh; "
-                'do bash "$f"; done'
+                '  for f in outputs/timing/*/retrieve.sh; do bash "$f"; done'
             )
-            return None
+            return None, True
 
-        # Local execution
         print(f"Timing {num_epochs} epoch(s) to estimate per-epoch duration...")
         run_module(
             TRAIN_MODULES[kind],
@@ -1147,12 +1123,107 @@ def time_epochs_command(
             mode="local",
             runtime_typechecking=runtime_typechecking,
         )
-
-        epoch_times = _extract_epoch_times_from_checkpoint(ckpt_path)
+        return _extract_epoch_times_from_checkpoint(ckpt_path), False
     finally:
         if tmpdir_ctx is not None:
             tmpdir_ctx.__exit__(None, None, None)
 
+
+def time_epochs_command(
+    *,
+    kind: str = "epd",
+    mode: str,
+    dataset: str | None,
+    output_base: str,
+    overrides: list[str],
+    num_epochs: int = 3,
+    budget_hours: float = 24.0,
+    margin: float = 0.02,
+    run_group: str | None = None,
+    run_id: str | None = None,
+    work_dir: str | None = None,
+    from_checkpoint: str | None = None,
+    runtime_typechecking: bool = False,
+    dry_run: bool = False,
+) -> dict | None:
+    """Run a short training to time per-epoch duration and recommend ``max_epochs``.
+
+    Executes *num_epochs* epochs of training (ae, epd, or processor) with
+    W&B logging and testing disabled, saves a checkpoint so that per-epoch
+    wall-clock times can be extracted from ``TrainingTimerCallback``, and
+    prints the recommended ``trainer.max_epochs`` for a cosine half-period
+    schedule (``optimizer=adamw_half``) that completes within *budget_hours*.
+
+    The calculation is conservative: a *margin* fraction is subtracted
+    from the budget **and** the result is rounded down to a whole epoch,
+    so the schedule will always reach zero before the wall-clock limit.
+    ``trainer.max_time`` is emitted as a hard safety stop equal to the
+    full (un-margined) budget.
+
+    With ``--mode slurm`` the timing run is submitted via sbatch and the
+    command exits immediately, printing a ``--from-checkpoint`` command to
+    retrieve results once the job completes.
+
+    Parameters
+    ----------
+    kind:
+        Training kind: ``"ae"``, ``"epd"``, or ``"processor"``.
+    dataset:
+        Hydra datamodule group name (e.g. ``"advection_diffusion_multichannel"``).
+    output_base:
+        Root output directory (forwarded to ``build_train_overrides``).
+    overrides:
+        Additional Hydra overrides forwarded to the timing run.
+    num_epochs:
+        How many epochs to run for the timing measurement.
+    budget_hours:
+        Target wall-clock budget in hours.
+    margin:
+        Fraction of *budget_hours* held back as safety headroom (default 2 %).
+    from_checkpoint:
+        Path to an existing checkpoint; skips training and computes the
+        recommendation directly.
+    """
+    try:
+        _validate_time_epochs_args(
+            num_epochs=num_epochs,
+            budget_hours=budget_hours,
+            margin=margin,
+        )
+    except ValueError as exc:
+        print(f"ERROR: {exc}")
+        return None
+
+    if from_checkpoint is not None:
+        ckpt = Path(from_checkpoint)
+        epoch_times = _extract_epoch_times_from_checkpoint(ckpt)
+        if not epoch_times:
+            print(
+                f"ERROR: Could not extract per-epoch times from {ckpt}. "
+                "Check that the checkpoint was produced by a timing run with "
+                "TrainingTimerCallback."
+            )
+            return None
+        return _print_timing_results(epoch_times, budget_hours, margin)
+
+    epoch_times, exit_early = _run_time_epochs_training(
+        kind=kind,
+        mode=mode,
+        dataset=dataset,
+        output_base=output_base,
+        overrides=overrides,
+        num_epochs=num_epochs,
+        budget_hours=budget_hours,
+        margin=margin,
+        run_group=run_group,
+        run_id=run_id,
+        work_dir=work_dir,
+        runtime_typechecking=runtime_typechecking,
+        dry_run=dry_run,
+    )
+    if exit_early:
+        return None
+
     if not epoch_times:
         print(
             "\nWARNING: Could not extract per-epoch times from checkpoint. "