@@ -4,12 +4,6 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

-try:
-    import bitsandbytes as bnb
-
-    bnb_installed = True
-except ImportError:
-    bnb_installed = False
import pytest
import torch
from torchao.dtypes.nf4tensor import NF4Tensor
@@ -22,19 +16,6 @@ def random():
    set_seed(31)


-def _build_bnb_linear(input_weight):
-    """
-    Builds a bnb.nn.LinearNF4 from a given input weight
-    """
-    param = bnb.nn.Params4bit(input_weight, requires_grad=False, quant_type="nf4")
-    bnb_linear = bnb.nn.LinearNF4(
-        input_weight.size(0), input_weight.size(1), bias=False
-    )
-    bnb_linear.weight = param
-    bnb_linear.cuda()
-    return bnb_linear
-
-
class TestNF4Linear:
    """
    Class for testing our NF4Linear implementation.
@@ -88,18 +69,29 @@ def test_backward_dtype(self, dtype):
        assert inp.grad is not None and inp.grad.dtype == dtype
        assert nf4_linear.weight.grad is None

-    @pytest.mark.skipif(not bnb_installed, reason="bitsandbytes is not installed")
    @pytest.mark.skipif(not torch.cuda.is_available(), reason="Need CUDA available")
    @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float32])
    def test_nf4_reconstruction_vs_bnb(self, dtype):
        """
        Ensures a BNB NF4 linear and our FrozenNF4Linear have low error when
        reconstructing the respective original weights.
        """
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            pytest.skip("bitsandbytes is not installed")
+            return
+
        dim = 512
        nf4_linear = FrozenNF4Linear(dim, dim, device="cuda", dtype=dtype)
        orig_weight = nf4_linear.weight.get_original_weight().clone().detach()
-        bnb_nf4_linear = _build_bnb_linear(input_weight=orig_weight)
+
+        param = bnb.nn.Params4bit(orig_weight, requires_grad=False, quant_type="nf4")
+        bnb_nf4_linear = bnb.nn.LinearNF4(
+            orig_weight.size(0), orig_weight.size(1), bias=False
+        )
+        bnb_nf4_linear.weight = param
+        bnb_nf4_linear.cuda()

        # From https://github.com/drisspg/transformer_nuggets/blob/f05afad68ad9086d342268f46a7f344617a02314/test/test_qlora.py#L65
        bnb_reconstruction = bnb_nf4_linear(
@@ -110,18 +102,30 @@ def test_nf4_reconstruction_vs_bnb(self, dtype):
            bnb_reconstruction.T, nf4_linear.weight.get_original_weight(), 1e-2
        )

-    @pytest.mark.skipif(not bnb_installed, reason="bitsandbytes is not installed")
    @pytest.mark.skipif(not torch.cuda.is_available(), reason="Need CUDA available")
    @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float32])
    def test_nf4_bnb_linear(self, dtype):
        """
        This test ensures that nf4_linear is "no worse" than BNB by ensuring the
        error compared to a bf16 linear is not more than BNB's implementation.
        """
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            pytest.skip("bitsandbytes is not installed")
+            return
+
        dim = 512
        nf4_linear = FrozenNF4Linear(dim, dim, device="cuda", dtype=dtype)
        orig_weight = nf4_linear.weight.get_original_weight().clone().detach()
-        bnb_nf4_linear = _build_bnb_linear(input_weight=orig_weight)
+
+        param = bnb.nn.Params4bit(orig_weight, requires_grad=False, quant_type="nf4")
+        bnb_nf4_linear = bnb.nn.LinearNF4(
+            orig_weight.size(0), orig_weight.size(1), bias=False
+        )
+        bnb_nf4_linear.weight = param
+        bnb_nf4_linear.cuda()
+
        bf16_linear = torch.nn.Linear(dim, dim, device="cuda", dtype=dtype)

        inp = torch.randn(2, 512, dtype=dtype, device="cuda")
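
A note on the guard introduced in these hunks: pytest ships a built-in helper, pytest.importorskip, which collapses the try/except/skip pattern into a single line and returns the imported module. A minimal sketch of the equivalent runtime guard; the test body below is illustrative, not part of this PR:

import pytest
import torch


@pytest.mark.skipif(not torch.cuda.is_available(), reason="Need CUDA available")
def test_with_bnb():
    # Skips the test at runtime if bitsandbytes cannot be imported,
    # mirroring the inlined try/except above; otherwise returns the module.
    bnb = pytest.importorskip("bitsandbytes")
    assert hasattr(bnb.nn, "LinearNF4")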
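For context on the reconstruction check: a bias-free linear computes y = x @ W.T, so feeding an identity matrix through the quantized layer returns the dequantized weight transposed, which is what the bnb_reconstruction.T comparison above relies on. A standalone sketch of that trick, assuming a CUDA machine with bitsandbytes installed; build_bnb_nf4 is a hypothetical helper mirroring the construction inlined in the tests:

import torch
import bitsandbytes as bnb


def build_bnb_nf4(weight: torch.Tensor) -> bnb.nn.LinearNF4:
    # Same construction as inlined in the tests above.
    param = bnb.nn.Params4bit(weight, requires_grad=False, quant_type="nf4")
    linear = bnb.nn.LinearNF4(weight.size(0), weight.size(1), bias=False)
    linear.weight = param
    linear.cuda()  # Params4bit quantizes to NF4 when moved to CUDA
    return linear


dim = 512
weight = torch.randn(dim, dim, dtype=torch.bfloat16)
nf4_linear = build_bnb_nf4(weight)

# y = x @ W.T, so an identity input yields the dequantized weight, transposed.
eye = torch.eye(dim, dtype=torch.bfloat16, device="cuda")
reconstructed = nf4_linear(eye).T

# NF4 is 4-bit, so expect small but nonzero error (the tests allow ~1e-2).
print((reconstructed - weight.cuda()).abs().max())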