Skip to content

Commit

Permalink
Allow snapshot.pickle to be saved for all recipes with profiler support (#1384)
Browse files Browse the repository at this point in the history
  • Loading branch information
janeyx99 authored Aug 22, 2024
1 parent 9304eeb commit f9f75bb
Show file tree
Hide file tree
Showing 3 changed files with 79 additions and 0 deletions.
27 changes: 27 additions & 0 deletions recipes/full_finetune_distributed.py
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,12 @@ def _setup_profiler(
if self._is_rank_zero:
log.info(f" Profiler config after instantiation: {profiler_cfg}")

self.profiler_profile_memory = profiler_cfg.get("profile_memory", False)
if profiler_cfg["enabled"]:
self.profiler_wait_steps = profiler_cfg["wait_steps"]
self.profiler_warmup_steps = profiler_cfg["warmup_steps"]
self.profiler_active_steps = profiler_cfg["active_steps"]

return profiler

def _setup_model(
Expand Down Expand Up @@ -569,6 +575,15 @@ def train(self) -> None:
):
break

# Start tracking CUDA memory for active steps for just the first epoch
if (
self._is_rank_zero
and curr_epoch == 0
and self.profiler_profile_memory
and idx == self.profiler_wait_steps + self.profiler_warmup_steps
):
torch.cuda.memory._record_memory_history()

# Both are shape [b, s]
tokens, labels = batch["tokens"], batch["labels"]
# Get the attention mask and position ids from the dataset if they
Expand Down Expand Up @@ -635,6 +650,18 @@ def train(self) -> None:
num_tokens = 0
t0 = time.perf_counter()

# Stop tracking CUDA memory now that active steps are complete
if (
self._is_rank_zero
and curr_epoch == 0
and self.profiler_profile_memory
and idx
== self.profiler_wait_steps
+ self.profiler_warmup_steps
+ self.profiler_active_steps
):
torch.cuda.memory._record_memory_history(enabled=None)

# Step profiler
# Note that this is called within gradient accumulation block, hence
# will include multiple forward / backward passes if gradient accumulation > 1
Expand Down
25 changes: 25 additions & 0 deletions recipes/full_finetune_single_device.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,12 @@ def _setup_profiler(

log.info(f" Profiler config after instantiation: {profiler_cfg}")

self.profiler_profile_memory = profiler_cfg.get("profile_memory", False)
if profiler_cfg["enabled"]:
self.profiler_wait_steps = profiler_cfg["wait_steps"]
self.profiler_warmup_steps = profiler_cfg["warmup_steps"]
self.profiler_active_steps = profiler_cfg["active_steps"]

return profiler

def _setup_model(
Expand Down Expand Up @@ -519,6 +525,14 @@ def train(self) -> None:
):
break

# Start tracking CUDA memory for active steps for just the first epoch
if (
curr_epoch == 0
and self.profiler_profile_memory
and idx == self.profiler_wait_steps + self.profiler_warmup_steps
):
torch.cuda.memory._record_memory_history()

batch = {k: v.to(self._device) for k, v in batch.items()}
num_tokens += batch["tokens"].numel()

Expand Down Expand Up @@ -567,6 +581,17 @@ def train(self) -> None:
num_tokens = 0
t0 = time.perf_counter()

# Stop tracking CUDA memory now that active steps are complete
if (
curr_epoch == 0
and self.profiler_profile_memory
and idx
== self.profiler_wait_steps
+ self.profiler_warmup_steps
+ self.profiler_active_steps
):
torch.cuda.memory._record_memory_history(enabled=None)

# Step the profiler
# Note we are stepping each batch, which might not include optimizer step in the trace
# if the schedule cycle doesn't align with gradient accumulation.
Expand Down
27 changes: 27 additions & 0 deletions recipes/lora_finetune_distributed.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,12 @@ def _setup_profiler(
if self._is_rank_zero:
log.info(f" Profiler config after instantiation: {profiler_cfg}")

self.profiler_profile_memory = profiler_cfg.get("profile_memory", False)
if profiler_cfg["enabled"]:
self.profiler_wait_steps = profiler_cfg["wait_steps"]
self.profiler_warmup_steps = profiler_cfg["warmup_steps"]
self.profiler_active_steps = profiler_cfg["active_steps"]

return profiler

def _setup_model(
Expand Down Expand Up @@ -654,6 +660,15 @@ def train(self) -> None:
):
break

# Start tracking CUDA memory for active steps for just the first epoch
if (
self._is_rank_zero
and curr_epoch == 0
and self.profiler_profile_memory
and idx == self.profiler_wait_steps + self.profiler_warmup_steps
):
torch.cuda.memory._record_memory_history()

# Both are shape [b, s]
tokens, labels = batch["tokens"], batch["labels"]
# Get the attention mask and position ids from the dataset if they
Expand Down Expand Up @@ -721,6 +736,18 @@ def train(self) -> None:
num_tokens = 0
t0 = time.perf_counter()

# Stop tracking CUDA memory now that active steps are complete
if (
self._is_rank_zero
and curr_epoch == 0
and self.profiler_profile_memory
and idx
== self.profiler_wait_steps
+ self.profiler_warmup_steps
+ self.profiler_active_steps
):
torch.cuda.memory._record_memory_history(enabled=None)

# Step profiler
# Note that this is called within gradient accumulation block, hence
# will include multiple forward / backward passes if gradient accumulation > 1
Expand Down

0 comments on commit f9f75bb

Please sign in to comment.