fix state dict hook for early fusion models (#2317)

acisseJZhong · jessicazhongeee · web-flow · commit be4ff508477b · 2025-01-29T16:07:07.000-08:00
Co-authored-by: JessicaZhong <zhengjesszhong@gmail.com>
diff --git a/torchtune/modules/model_fusion/_early_fusion.py b/torchtune/modules/model_fusion/_early_fusion.py
@@ -137,8 +137,11 @@ def _state_dict_hook(module, state_dict, prefix, *args, **kwargs):
         [!Note] This update changes the order of the OrderedDict
         """
         for n, p in module.tok_embeddings.named_parameters():
-            state_dict[f"{prefix}decoder.tok_embeddings.{n}"] = p
-            del state_dict[f"{prefix}tok_embeddings.{n}"]
+            orig_key = f"{prefix}tok_embeddings.{n}"
+            if orig_key in state_dict:
+                # preserve the original tensor with its requires_grad state
+                state_dict[f"{prefix}decoder.tok_embeddings.{n}"] = state_dict[orig_key]
+                del state_dict[orig_key]
 
     @staticmethod
     def _load_state_dict_hook(module, state_dict, prefix, *args, **kwargs):