huggingface · S1ro1 · May 19, 2025 · May 1, 2025 · May 1, 2025 · May 2, 2025
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
@@ -3491,7 +3491,7 @@ def save_pretrained(
             for name, tensor in state_dict.items():
                 # Sometimes in the state_dict we have non-tensor objects.
                 # e.g. in bitsandbytes we have some `str` objects in the state_dict
-                if isinstance(tensor, torch.Tensor):
+                if isinstance(tensor, torch.Tensor) or isinstance(tensor, torch.distributed.tensor.DTensor):
                     ptrs[id_tensor_storage(tensor)].append(name)
                 else:
                     # In the non-tensor case, fall back to the pointer of the object itself
@@ -3601,7 +3601,10 @@ def save_pretrained(
         for shard_file, tensors in filename_to_tensors:
             shard = {}
             for tensor in tensors:
-                shard[tensor] = state_dict[tensor].contiguous()
+                if isinstance(state_dict[tensor], torch.distributed.tensor.DTensor):
+                    shard[tensor] = state_dict[tensor].full_tensor().contiguous()
+                else:
+                    shard[tensor] = state_dict[tensor].contiguous()
                 # delete reference, see https://github.com/huggingface/transformers/pull/34890
                 del state_dict[tensor]
 

diff --git a/src/transformers/pytorch_utils.py b/src/transformers/pytorch_utils.py
@@ -296,6 +296,13 @@ def id_tensor_storage(tensor: torch.Tensor) -> tuple[torch.device, int, int]:
     guaranteed to be unique and constant for this tensor's storage during its lifetime. Two tensor storages with
     non-overlapping lifetimes may have the same id.
     """
+    if is_torch_greater_or_equal_than_2_0:
+        from torch.distributed.tensor import DTensor
+
+        if isinstance(tensor, DTensor):
+            local_tensor = tensor.to_local()
+            return tensor.device, local_tensor.storage().data_ptr(), tensor.nbytes
+
     if tensor.device.type == "xla" and is_torch_xla_available():
         # NOTE: xla tensors dont have storage
         # use some other unique id to distinguish.

diff --git a/tests/tensor_parallel/test_tensor_parallel.py b/tests/tensor_parallel/test_tensor_parallel.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
 import subprocess
 import tempfile
 import textwrap
@@ -32,15 +33,18 @@
 class TestTensorParallel(TestCasePlus):
     nproc_per_node = 2
 
-    def torchrun(self, script: str):
+    def torchrun(self, script: str, is_torchrun: bool = True):
         """Run the `script` using `torchrun` command for multi-processing in a subprocess. Captures errors as necessary."""
         with tempfile.NamedTemporaryFile(mode="w+", suffix=".py") as tmp:
             tmp.write(script)
             tmp.flush()
             tmp.seek(0)
-            cmd = (
-                f"torchrun --nproc_per_node {self.nproc_per_node} --master_port {get_torch_dist_unique_port()} {tmp.name}"
-            ).split()
+            if is_torchrun:
+                cmd = (
+                    f"torchrun --nproc_per_node {self.nproc_per_node} --master_port {get_torch_dist_unique_port()} {tmp.name}"
+                ).split()
+            else:
+                cmd = ["python", tmp.name]
 
             # Note that the subprocess will be waited for here, and raise an error if not successful
             try:
@@ -88,6 +92,47 @@ def test_model_forward(self):
         )
         self.torchrun(script_to_run)
 
+    def test_model_save(self):
+        """Save the model with and without TP and check both checkpoints hold matching safetensors weights."""
+        from safetensors import safe_open
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            for is_torchrun in [True, False]:
+                script_to_run = textwrap.dedent(
+                    f"""
+                    import torch
+                    import os
+                    from transformers import AutoModelForCausalLM
+
+                    model_id = "JackFram/llama-68m"
+                    kwargs = dict()
+
+                    if os.environ.get("RANK", None) is not None:
+                        kwargs["tp_plan"] = "auto"
+                        result_dir = "{tmp_dir}/tp"
+                    else:
+                        result_dir = "{tmp_dir}/nontp"
+
+                    model = AutoModelForCausalLM.from_pretrained(model_id, **kwargs)
+                    model.save_pretrained(result_dir)
+                    """
+                )
+                self.torchrun(script_to_run, is_torchrun=is_torchrun)
+
+            non_tp_model_path = os.path.join(tmp_dir, "nontp")
+            tp_model_path = os.path.join(tmp_dir, "tp")
+            for filename in os.listdir(non_tp_model_path):
+                if not filename.endswith(".safetensors"):
+                    continue
+                non_tp_file = os.path.join(non_tp_model_path, filename)
+                tp_file = os.path.join(tp_model_path, filename)
+                # Context managers close both safetensors handles before the temporary directory is removed.
+                with safe_open(non_tp_file, device="cpu", framework="pt") as non_tp_model:
+                    with safe_open(tp_file, device="cpu", framework="pt") as tp_model:
+                        for non_tp_key in non_tp_model.keys():
+                            tensors = non_tp_model.get_tensor(non_tp_key), tp_model.get_tensor(non_tp_key)
+                            assert torch.allclose(*tensors), f"Tensor with key: {non_tp_key} does not match"
+
 
 @require_torch_multi_gpu
 class TestTensorParallelCuda(TestTensorParallel):