pytorch · felipemello1 · Nov 1, 2024 · Nov 1, 2024
diff --git a/recipes/full_finetune_single_device.py b/recipes/full_finetune_single_device.py
@@ -767,7 +767,7 @@ def train(self) -> None:
                 self._profiler.step()
 
             self.epochs_run += 1
-            self.save_checkpoint(epoch=curr_epoch)
+            # self.save_checkpoint(epoch=curr_epoch)
 
         self._profiler.stop()
 

diff --git a/torchtune/modules/transformer.py b/torchtune/modules/transformer.py
@@ -481,7 +481,6 @@ def reset_caches(self):
         for layer in self.layers:
             layer.reset_cache()
 
-    @torch.compiler.disable
     def chunked_output(self, last_hidden_state: torch.Tensor) -> List[torch.Tensor]:
         """
         Apply output projection in chunks. This should be applied in conjunction with

diff --git a/torchtune/training/_compile.py b/torchtune/training/_compile.py
@@ -41,6 +41,7 @@ def compile_model(
     """
     backend = os.environ.get("TORCH_COMPILE_BACKEND", "inductor")
     if isinstance(model, DeepFusionModel):
+        # TODO: compile the encoder too. Make sure to also compile decoder.norm, etc.
         model = model.decoder
     if torch_version_ge("2.5.0"):
         if verbose:
@@ -50,6 +51,16 @@ def compile_model(
                 m, TransformerCrossAttentionLayer
             ):
                 m.compile(backend=backend)
+
+        if hasattr(model, "norm"):
+            model.norm.compile(backend=backend)
+
+        if hasattr(model, "chunked_output"):
+            model.chunked_output = torch.compile(model.chunked_output, backend=backend)
+
+        if hasattr(model, "token_embeddings"):
+            model.token_embeddings.compile(backend=backend)
+
     else:
         if verbose:
             log.info(