feat: efficiency improvements (#25)

jannisborn · web-flow · commit 61d186a629bd · 2025-04-17T00:10:57.000+02:00
diff --git a/.gitignore b/.gitignore
@@ -133,3 +133,6 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+# macOS files
+*.DS_Store
diff --git a/src/mblm/train/core/trainer.py b/src/mblm/train/core/trainer.py
@@ -54,9 +54,15 @@
     TTrainConfig,
 )
 from mblm.train.core.iter import epoch_cycler
-from mblm.utils.cuda import cuda_memory_snapshot, cuda_properties
+from mblm.utils.cuda import IS_BF16_AVAILABLE, cuda_memory_snapshot, cuda_properties
 from mblm.utils.distributed import ElasticRunVars
-from mblm.utils.io import CSVWriter, StateDict, dump_yml, load_model_state, save_model_state
+from mblm.utils.io import (
+    CSVWriter,
+    StateDict,
+    dump_yml,
+    load_model_state,
+    save_model_state,
+)
 from mblm.utils.logging import create_logger
 from mblm.utils.misc import retry
 from mblm.utils.top_n import TopN
@@ -76,7 +82,7 @@ class CoreTrainerOptions:
     train_prog_min_interval_seconds: int = 1
     valid_prog_min_interval_seconds: int = 1
     track_first_fw_bw_exec_times: int | None = 30  # for 30 first passes, track fw/bw time
-    amp_dtype: torch.dtype = torch.half  # may use bfloat16
+    amp_dtype: torch.dtype = torch.bfloat16 if IS_BF16_AVAILABLE else torch.half
 
 
 class CoreTrainer(ABC, Generic[TModel, TBatch, TModelParams, TTrainConfig, TIoConfig]):
@@ -159,7 +165,9 @@ def __init__(
         )
 
         assert config.io.validate_amount > 0, "Validate amount must be strictly positive"
-        assert config.io.num_models_to_save > 0, "Must save at least 1 model"
+        assert config.io.num_models_to_save >= 0, "num_models_to_save cant be negative"
+        if config.io.num_models_to_save == 0:
+            self._log.warning("No model of this training will be saved!")
 
         if config.io.validate_amount < config.io.num_models_to_save:
             self._log.warning(
@@ -963,7 +971,7 @@ def before_new_epoch(epoch: int) -> None:
 
         best_model = self._unpack_distributed_model(self._model_dist)
 
-        if self._is_main_worker:
+        if self._is_main_worker and self.config.io.num_models_to_save > 0:
             # if, on the main worker, populate the model with the best state
             # non-main workers will simply return the latest model, which won't
             # be used anyway because testing happens only on the main worker
@@ -1003,4 +1011,3 @@ def test(
             avg_grad_clipped=-1,
         )
         self._log.info("Finished testing")
-        return None
diff --git a/src/mblm/utils/cuda.py b/src/mblm/utils/cuda.py
@@ -28,6 +28,9 @@
 import torch.version
 from torch.types import Device
 
+IS_CUDA_AVAILABLE = torch.cuda.is_available()
+IS_BF16_AVAILABLE = IS_CUDA_AVAILABLE and torch.cuda.is_bf16_supported()
+
 
 @dataclass
 class CudaProperties:
diff --git a/src/mblm/utils/top_n.py b/src/mblm/utils/top_n.py
@@ -40,7 +40,7 @@ class TopN(Generic[_T]):
     with the first element being the smallest. A max heap can be specified via `top_largest`.
 
     Args:
-        n (int): Max number of items to store
+        n (int): Max number of items to store. If zero, `add` is a no-op
         deep_copy (bool = `False`): Create a deep copy of elements
         top_largest (bool = `False`): If true, store the `n` largest items instead of
             the smallest items
@@ -77,6 +77,8 @@ def add(self, item: tuple[SupportsFloat, _T]) -> None:
         Add an item to the queue. The first tuple entry is used to
         determine the position of the newly added element
         """
+        if self._max_heap_items == 0:
+            return
         val, data = item
         if self._deep_copy:
             data = copy.deepcopy(data)
diff --git a/tests/e2e/trainer/sample-config-grad-acc-1.yaml b/tests/e2e/trainer/sample-config-grad-acc-1.yaml
@@ -1,7 +1,7 @@
 io:
   name_model: my-model
   output_dir: tests/e2e/trainer/outputs # static
-  num_models_to_save: 2
+  num_models_to_save: 0
   validate_amount: 10
   log_train_loss_amount: 20
   description: >-