Skip to content

Commit 7411320

Browse files
committed
test distributed checkpointing with different recipes
Signed-off-by: Peter St. John <pstjohn@nvidia.com>
1 parent 0da20a0 commit 7411320

File tree

2 files changed

+59
-7
lines changed

2 files changed

+59
-7
lines changed

bionemo-recipes/recipes/llama3_native_te/tests/conftest.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
import pytest
2121
import torch
22+
from transformer_engine.pytorch import fp8 as te_fp8
2223

2324

2425
sys.path.append(Path(__file__).parent.parent.as_posix())
@@ -61,6 +62,56 @@ def pytest_collection_modifyitems(items):
6162
items[:] = stats_tests + other_tests
6263

6364

65+
# ---------------------------------------------------------------------------
# FP8 recipe parametrization
# ---------------------------------------------------------------------------

# Maps each Transformer Engine recipe class name to the TE helper that reports
# whether the current hardware supports that recipe.  Each helper returns a
# ``(supported: bool, reason: str)`` tuple.  The Hydra override string is
# derived from the class name below, so the test id, the override target, and
# the support check can never drift out of sync.
_FP8_RECIPE_SUPPORT_CHECKS = {
    "DelayedScaling": te_fp8.check_fp8_support,
    "Float8CurrentScaling": te_fp8.check_fp8_support,
    "Float8BlockScaling": te_fp8.check_fp8_block_scaling_support,
    "MXFP8BlockScaling": te_fp8.check_mxfp8_support,
}

# Each entry: (recipe_class_name, hydra_overrides, check_fn) — the structure
# consumed by _parametrize_fp8_recipes().
_FP8_RECIPE_CONFIGS = [
    (
        name,
        [f"fp8_config.fp8_recipe=transformer_engine.common.recipe.{name}"],
        check_fn,
    )
    for name, check_fn in _FP8_RECIPE_SUPPORT_CHECKS.items()
]
92+
93+
94+
def _parametrize_fp8_recipes():
    """Build the ``pytest.param`` list for every FP8 recipe in ``_FP8_RECIPE_CONFIGS``.

    Each recipe's support-check callable is invoked once at collection time;
    recipes the current hardware cannot run are marked ``xfail`` with the
    reason reported by Transformer Engine.
    """

    def _as_param(entry):
        # entry is (recipe_class_name, hydra_overrides, check_fn).
        recipe_name, overrides, probe = entry
        supported, why = probe()
        return pytest.param(
            overrides,
            id=recipe_name,
            marks=pytest.mark.xfail(condition=not supported, reason=why),
        )

    return [_as_param(entry) for entry in _FP8_RECIPE_CONFIGS]
107+
108+
109+
@pytest.fixture(params=_parametrize_fp8_recipes())
def fp_recipe(request):
    """Parametrized fixture providing FP8 recipe Hydra overrides for each supported TE recipe.

    Yields one list of Hydra override strings per Transformer Engine recipe;
    recipes unsupported on the current hardware are xfail-marked by
    ``_parametrize_fp8_recipes``.
    """
    return request.param
113+
114+
64115
@pytest.fixture(scope="session", autouse=True)
65116
def device_mesh():
66117
"""Create a re-usable torch process group for testing.

bionemo-recipes/recipes/llama3_native_te/tests/test_distributed_checkpointing.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -492,46 +492,46 @@ def test_checkpoint_pruning_with_files(tmp_path):
492492
]
493493

494494

def test_checkpoint_save_and_load_single_process_ddp_fp8_quantized(recipe_path, tmp_path, fp_recipe):
    """Test checkpoint save/resume for DDP with FP8 quantized model init."""
    # Combine the base FP8-quantized overrides with the parametrized recipe
    # overrides supplied by the fp_recipe fixture.
    overrides = list(_FP8_QUANTIZED_OVERRIDES) + list(fp_recipe)
    _run_single_process_checkpoint_test(
        recipe_path,
        tmp_path,
        main_ddp,
        ckpt_subdir_name="train_ddp",
        config_name="L0_sanity_cp",
        extra_overrides=overrides,
        is_ddp=True,
    )
506506

507507

def test_checkpoint_save_and_load_single_process_fsdp2_fp8_quantized(recipe_path, tmp_path, fp_recipe):
    """Test checkpoint save/resume for FSDP2 with FP8 quantized model init."""
    # Base FP8-quantized overrides plus the per-recipe overrides from the
    # parametrized fp_recipe fixture.
    overrides = list(_FP8_QUANTIZED_OVERRIDES) + list(fp_recipe)
    _run_single_process_checkpoint_test(
        recipe_path,
        tmp_path,
        main_fsdp2,
        ckpt_subdir_name="train_fsdp2",
        config_name="L0_sanity_cp",
        extra_overrides=overrides,
        is_ddp=False,
    )
519519

520520

def test_checkpoint_save_and_load_single_process_fsdp2_cp_fp8_quantized(recipe_path, tmp_path, fp_recipe):
    """Test checkpoint save/resume for FSDP2 with context parallelism and FP8 quantized model init."""
    # Base FP8-quantized overrides plus the per-recipe overrides from the
    # parametrized fp_recipe fixture.
    overrides = list(_FP8_QUANTIZED_OVERRIDES) + list(fp_recipe)
    _run_single_process_checkpoint_test(
        recipe_path,
        tmp_path,
        main_fsdp2_cp,
        ckpt_subdir_name="train_fsdp2",
        config_name="L0_sanity_cp",
        extra_overrides=overrides,
        is_ddp=False,
    )
532532

533533

534-
def test_checkpoint_save_and_load_single_process_fsdp2_cp_fp8_quantized_async(recipe_path, tmp_path):
534+
def test_checkpoint_save_and_load_single_process_fsdp2_cp_fp8_quantized_async(recipe_path, tmp_path, fp_recipe):
535535
"""Test checkpoint save/resume for FSDP2+CP with FP8 quantized model init and async save.
536536
537537
This reproduces the corys_config scenario where async_save=true (the default)
@@ -545,6 +545,7 @@ def test_checkpoint_save_and_load_single_process_fsdp2_cp_fp8_quantized_async(re
545545
config_name="L0_sanity_cp",
546546
extra_overrides=[
547547
*_FP8_QUANTIZED_OVERRIDES,
548+
*fp_recipe,
548549
"checkpoint.async_save=true",
549550
],
550551
is_ddp=False,

0 commit comments

Comments
 (0)