Fix plateau detection to use absolute threshold mode (#2469)

gitttt-1234 · claude · web-flow · commit 321454a4f3e6 · 2025-11-20T20:50:05.000-08:00
Changed plateau detection from relative to absolute threshold mode so that it matches the `abs` threshold mode of PyTorch's ReduceLROnPlateau. This fixes inconsistencies between early stopping and learning rate scheduling.

Changes:
- Updated the monitor.py plateau check from relative (`< best * (1 - delta)`) to absolute (`< best - delta`) mode
- Changed the training_editor_form.yaml default min_delta from 1e-6 to 1e-8
- Updated all training profile configs to use threshold_mode='abs' with threshold=1e-6
- Reduced early stopping patience from 20 to 10 epochs (baseline.centroid.yaml)
- Reduced reduce_lr_on_plateau patience from 8 to 5 epochs in the baseline_large_rf.bottomup and baseline_medium_rf.bottomup profiles
- Updated code comments to clarify absolute threshold usage

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-authored-by: Claude <noreply@anthropic.com>
diff --git a/sleap/config/training_editor_form.yaml b/sleap/config/training_editor_form.yaml
@@ -487,7 +487,7 @@ optimization:
   label: Stop Training on Plateau
   name: trainer_config.early_stopping.stop_training_on_plateau
   type: bool
-- default: 1e-6
+- default: 1e-8
   help: Minimum absolute decrease in the loss in order to consider an epoch as not in a plateau.
   label: Plateau Min. Delta
   name: trainer_config.early_stopping.min_delta
diff --git a/sleap/gui/widgets/monitor.py b/sleap/gui/widgets/monitor.py
@@ -670,7 +670,7 @@ def reset(
                 corresponds to.
             plateau_patience: Number of epochs to wait in plateau before stopping.
             plateau_min_delta: Minimum change in validation loss to be considered
-                significant.
+                significant (absolute threshold).
         """
         self.canvas = LossPlot(
             width=5,
@@ -935,11 +935,10 @@ def _check_messages(
                                 self.best_epoch_loss = self.last_epoch_val_loss
 
                             if self.plateau_min_delta is not None:
-                                # plateau check according to `rel` thrsh mode in torch.
+                                # Plateau check using absolute threshold mode in torch
                                 is_better = (
                                     self.last_epoch_val_loss
-                                    < self.best_epoch_loss
-                                    * (1.0 - self.plateau_min_delta)
+                                    < self.best_epoch_loss - self.plateau_min_delta
                                 )
                             else:
                                 is_better = (
diff --git a/sleap/training_profiles/baseline.centroid.yaml b/sleap/training_profiles/baseline.centroid.yaml
@@ -126,15 +126,15 @@ trainer_config:
   lr_scheduler:
     step_lr: null
     reduce_lr_on_plateau:
-      threshold: 1.0e-08
-      threshold_mode: rel
+      threshold: 1.0e-06
+      threshold_mode: abs
       cooldown: 3
       patience: 5
       factor: 0.5
       min_lr: 1.0e-08
   early_stopping:
     min_delta: 1.0e-08
-    patience: 20
+    patience: 10
     stop_training_on_plateau: true
   online_hard_keypoint_mining:
     online_mining: false
diff --git a/sleap/training_profiles/baseline.multi_class_bottomup.yaml b/sleap/training_profiles/baseline.multi_class_bottomup.yaml
@@ -122,7 +122,7 @@ trainer_config:
     step_lr: null
     reduce_lr_on_plateau:
       threshold: 1.0e-06
-      threshold_mode: rel
+      threshold_mode: abs
       cooldown: 3
       patience: 5
       factor: 0.5
diff --git a/sleap/training_profiles/baseline.multi_class_topdown.yaml b/sleap/training_profiles/baseline.multi_class_topdown.yaml
@@ -124,7 +124,7 @@ trainer_config:
     step_lr: null
     reduce_lr_on_plateau:
       threshold: 1.0e-06
-      threshold_mode: rel
+      threshold_mode: abs
       cooldown: 3
       patience: 5
       factor: 0.5
diff --git a/sleap/training_profiles/baseline_large_rf.bottomup.yaml b/sleap/training_profiles/baseline_large_rf.bottomup.yaml
@@ -132,10 +132,10 @@ trainer_config:
   lr_scheduler:
     step_lr: null
     reduce_lr_on_plateau:
-      threshold: 1.0e-08
-      threshold_mode: rel
+      threshold: 1.0e-06
+      threshold_mode: abs
       cooldown: 3
-      patience: 8
+      patience: 5
       factor: 0.5
       min_lr: 1.0e-08
   early_stopping:
diff --git a/sleap/training_profiles/baseline_large_rf.single.yaml b/sleap/training_profiles/baseline_large_rf.single.yaml
@@ -126,8 +126,8 @@ trainer_config:
   lr_scheduler:
     step_lr: null
     reduce_lr_on_plateau:
-      threshold: 1.0e-05
-      threshold_mode: rel
+      threshold: 1.0e-06
+      threshold_mode: abs
       cooldown: 3
       patience: 5
       factor: 0.5
diff --git a/sleap/training_profiles/baseline_large_rf.topdown.yaml b/sleap/training_profiles/baseline_large_rf.topdown.yaml
@@ -128,8 +128,8 @@ trainer_config:
   lr_scheduler:
     step_lr: null
     reduce_lr_on_plateau:
-      threshold: 1.0e-08
-      threshold_mode: rel
+      threshold: 1.0e-06
+      threshold_mode: abs
       cooldown: 3
       patience: 5
       factor: 0.5
diff --git a/sleap/training_profiles/baseline_medium_rf.bottomup.yaml b/sleap/training_profiles/baseline_medium_rf.bottomup.yaml
@@ -132,10 +132,10 @@ trainer_config:
   lr_scheduler:
     step_lr: null
     reduce_lr_on_plateau:
-      threshold: 1.0e-08
-      threshold_mode: rel
+      threshold: 1.0e-06
+      threshold_mode: abs
       cooldown: 3
-      patience: 8
+      patience: 5
       factor: 0.5
       min_lr: 1.0e-08
   early_stopping:
diff --git a/sleap/training_profiles/baseline_medium_rf.single.yaml b/sleap/training_profiles/baseline_medium_rf.single.yaml
@@ -126,8 +126,8 @@ trainer_config:
   lr_scheduler:
     step_lr: null
     reduce_lr_on_plateau:
-      threshold: 1.0e-08
-      threshold_mode: rel
+      threshold: 1.0e-06
+      threshold_mode: abs
       cooldown: 3
       patience: 5
       factor: 0.5
diff --git a/sleap/training_profiles/baseline_medium_rf.topdown.yaml b/sleap/training_profiles/baseline_medium_rf.topdown.yaml
@@ -128,8 +128,8 @@ trainer_config:
   lr_scheduler:
     step_lr: null
     reduce_lr_on_plateau:
-      threshold: 1.0e-08
-      threshold_mode: rel
+      threshold: 1.0e-06
+      threshold_mode: abs
       cooldown: 3
       patience: 5
       factor: 0.5