File tree (expand / collapse) — bionemo-recipes/recipes/llama3_native_te — 2 files changed: +9 −1 lines changed
Original file line number | Diff line number | Diff line change
34  34   from torch.distributed.checkpoint.state_dict_saver import async_save as dcp_async_save
35  35   from torch.distributed.checkpoint.state_dict_saver import save as dcp_save
36  36   from torch.distributed.checkpoint.stateful import Stateful
    37+  from torch.distributed.tensor import DTensor
37  38   from torchdata.stateful_dataloader import StatefulDataLoader
    39+  from transformer_engine.pytorch.tensor.float8_tensor import Float8Tensor
38  40
39  41   from distributed_config import DistributedConfig
40  42
@@ -338,6 +340,13 @@ def save_checkpoint_fsdp2(
338 340      checkpoint_path = ckpt_path / f"step_{step}"
339 341      checkpoint_path.mkdir(parents=True, exist_ok=True)
340 342
    343+     model_params = (p.to_local() if isinstance(p, DTensor) else p for p in model.parameters())
    344+     if async_save and any((isinstance(p, Float8Tensor) for p in model_params)):
    345+         logger.warning(
    346+             "Async checkpointing is not supported for FP8 models, falling back to synchronous checkpointing."
    347+         )
    348+         async_save = False
    349+
341 350      if dataloader is not None:
342 351          save_dataloader(
343 352              dataloader=dataloader,
Original file line number | Diff line number | Diff line change
@@ -531,7 +531,6 @@ def test_checkpoint_save_and_load_single_process_fsdp2_cp_fp8_quantized(recipe_p
531 531      )
532 532

533 533
534-     @pytest.mark.xfail()
535 534  def test_checkpoint_save_and_load_single_process_fsdp2_cp_fp8_quantized_async(recipe_path, tmp_path):
536 535      """Test checkpoint save/resume for FSDP2+CP with FP8 quantized model init and async save.
You can’t perform that action at this time.
0 commit comments