Skip to content

Commit 3fc8545

Browse files
jomitchellnvJonathan MitchellJonathan Mitchell
authored
Adds context parallelism to FSDP2 (#1358)
### Description Creates a new training script called `train_fsdp2_cp.py` where we add CP to FSDP2 #### Usage You can run this script in the same way that you execute `train_ddp_cp.py` ```python torchrun --nproc_per_node=8 train_fsdp2_cp.py cp_size=<CP_SIZE> ``` For equivalence see <img width="5056" height="2656" alt="W B Chart 12_1_2025, 2_56_52 PM" src="https://github.com/user-attachments/assets/ef1513e0-0d3e-4fb9-a8b0-7bed8a47e86c" /> ### Type of changes <!-- Mark the relevant option with an [x] --> - [ ] Bug fix (non-breaking change which fixes an issue) - [ ] New feature (non-breaking change which adds functionality) - [ ] Refactor - [ ] Documentation update - [ ] Other (please describe): ### CI Pipeline Configuration Configure CI behavior by applying the relevant labels. By default, only basic unit tests are run. - [ciflow:skip](https://github.com/NVIDIA/bionemo-framework/blob/main/docs/docs/main/contributing/contributing.md#ciflow:skip) - Skip all CI tests for this PR - [ciflow:notebooks](https://github.com/NVIDIA/bionemo-framework/blob/main/docs/docs/main/contributing/contributing.md#ciflow:notebooks) - Run Jupyter notebooks execution tests for bionemo2 - [ciflow:slow](https://github.com/NVIDIA/bionemo-framework/blob/main/docs/docs/main/contributing/contributing.md#ciflow:slow) - Run slow single GPU integration tests marked as @pytest.mark.slow for bionemo2 - [ciflow:all](https://github.com/NVIDIA/bionemo-framework/blob/main/docs/docs/main/contributing/contributing.md#ciflow:all) - Run all tests (unit tests, slow tests, and notebooks) for bionemo2. This label can be used to enforce running tests for all bionemo2. - [ciflow:all-recipes](https://github.com/NVIDIA/bionemo-framework/blob/main/docs/docs/main/contributing/contributing.md#ciflow:all-recipes) - Run tests for all recipes (under bionemo-recipes). This label can be used to enforce running tests for all recipes. 
Unit tests marked as `@pytest.mark.multi_gpu` or `@pytest.mark.distributed` are not run in the PR pipeline. For more details, see [CONTRIBUTING](CONTRIBUTING.md) > [!NOTE] > By default, only basic unit tests are run. Add appropriate labels to enable an additional test coverage. #### Authorizing CI Runs We use [copy-pr-bot](https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/#automation) to manage authorization of CI runs on NVIDIA's compute resources. - If a pull request is opened by a trusted user and contains only trusted changes, the pull request's code will automatically be copied to a pull-request/ prefixed branch in the source repository (e.g. pull-request/123) - If a pull request is opened by an untrusted user or contains untrusted changes, an NVIDIA org member must leave an `/ok to test` comment on the pull request to trigger CI. This will need to be done for each new commit. ### Pre-submit Checklist <!--- Ensure all items are completed before submitting --> - [ ] I have tested these changes locally - [ ] I have updated the documentation accordingly - [ ] I have added/updated tests as needed - [X] All existing tests pass successfully --------- Signed-off-by: Jonathan Mitchell <jomitchell@ipp1-1334.ipp1a1.colossus.nvidia.com> Signed-off-by: Jonathan Mitchell <jomitchell@nvidia.com> Signed-off-by: Jonathan Mitchell <jomitchell@ipp1-1428.ipp1a1.colossus.nvidia.com> Co-authored-by: Jonathan Mitchell <jomitchell@ipp1-1334.ipp1a1.colossus.nvidia.com> Co-authored-by: Jonathan Mitchell <jomitchell@ipp1-1428.ipp1a1.colossus.nvidia.com>
1 parent d3d0734 commit 3fc8545

File tree

2 files changed

+256
-0
lines changed

2 files changed

+256
-0
lines changed

bionemo-recipes/recipes/esm2_native_te/tests/test_train_two_gpu.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,3 +145,21 @@ def test_multi_gpu_train_te_ddp_cp(tmp_path, recipe_path):
145145
],
146146
recipe_path,
147147
)
148+
149+
150+
@requires_multi_gpu
@requires_datacenter_hardware
def test_multi_gpu_train_te_fsdp2_cp(tmp_path, recipe_path):
    """Smoke-test FSDP2 + context parallelism: 4 training steps on 2 GPUs with cp_size=2."""
    # Run 'torchrun train_fsdp2_cp.py' as a subprocess; the L0_sanity_cp config
    # keeps the run small enough for a CI sanity check.
    run_train_cmd(
        [
            "torchrun",
            "--nproc_per_node=2",
            "train_fsdp2_cp.py",
            "--config-name",
            "L0_sanity_cp",
            "num_train_steps=4",
            "cp_size=2",
        ],
        recipe_path,
    )
Lines changed: 238 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,238 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: LicenseRef-Apache2
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
import logging
17+
from contextlib import nullcontext
18+
from pathlib import Path
19+
20+
import hydra
21+
import torch
22+
import transformer_engine.pytorch
23+
from omegaconf import DictConfig, OmegaConf
24+
from torch.distributed.device_mesh import init_device_mesh
25+
from torch.distributed.fsdp import fully_shard
26+
from torch.optim import AdamW
27+
from transformer_engine.common.recipe import Format
28+
from transformers import AutoConfig, AutoModelForMaskedLM
29+
30+
# This import seems to be needed with meta device init and AutoModel.from_config
31+
from transformers.models.esm.modeling_esm import EsmForMaskedLM # noqa: F401
32+
33+
from checkpoint import load_checkpoint_fsdp2, save_checkpoint_fsdp2, save_final_model_fsdp2, should_save_checkpoint
34+
from dataset import create_cp_dataloader
35+
from distributed_config import DistributedConfig
36+
from perf_logger import PerfLogger
37+
from scheduler import get_linear_schedule_with_warmup
38+
39+
40+
# Module-level logger; INFO level so per-rank training progress is visible by default.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
42+
43+
44+
@hydra.main(config_path="hydra_config", config_name="L0_sanity_cp", version_base="1.2")
def main(args: DictConfig) -> float | None:  # noqa: C901
    """Train ESM-2 with TE layers using fsdp2.

    Args:
        args: Hydra config (defaults to ``hydra_config/L0_sanity_cp``) controlling
            cp_size, fp8 settings, dataset, checkpointing, and training length.

    Returns:
        float: The loss value for the final batch.
    """
    # Initialize the distributed configuration, including creating the distributed process group.
    dist_config = DistributedConfig()
    logger.info("Initializing distributed training: %s", dist_config)
    device = torch.device(f"cuda:{dist_config.local_rank}")
    torch.distributed.init_process_group(backend="nccl", device_id=device)
    torch.cuda.set_device(dist_config.local_rank)

    # Validate that world_size is divisible by cp_size — the 2D (dp, cp) mesh below
    # requires dp_size * cp_size == world_size exactly.
    if dist_config.world_size % args.cp_size != 0:
        raise ValueError(
            f"world_size ({dist_config.world_size}) must be divisible by cp_size ({args.cp_size}). "
            f"Set cp_size to a divisor of world_size."
        )

    # Calculate DP size (number of data parallel replicas).
    dp_size = dist_config.world_size // args.cp_size

    # Create a device mesh for DP and CP.
    # The mesh is organized as [DP_dimension, CP_dimension] where:
    # - dp dimension: number of data parallel replicas (world_size // cp_size)
    # - cp dimension: context parallel size
    # Total ranks = cp_size * dp_size = world_size
    device_mesh = init_device_mesh(
        "cuda",
        mesh_shape=(dp_size, args.cp_size),
        mesh_dim_names=("dp", "cp"),
    )

    # NOTE(review): this branch flattens the (dp, cp) mesh only when
    # dp_size * cp_size <= 1, i.e. world_size == 1 — but the original comment
    # ("flattened group must have at least 2 ranks to enable Context Parallelism")
    # and the usual FSDP2+CP pattern suggest flattening when world_size > 1.
    # The condition may be inverted relative to the intent; confirm which mesh
    # fully_shard is meant to receive in the multi-rank case.
    if dp_size * args.cp_size <= 1:
        cp_dp_mesh = device_mesh["dp", "cp"]._flatten(mesh_dim_name="dp_shard_cp")
    else:
        cp_dp_mesh = device_mesh

    logger.info(
        f"Creating device mesh: world_size={dist_config.world_size}, dp_size={dp_size}, cp_size={args.cp_size}"
    )

    # Process group and local rank along the context-parallel dimension; used for
    # TE's set_context_parallel_group and for sharding sequences in the dataloader.
    cp_group = device_mesh["cp"].get_group()
    cp_rank = device_mesh.get_local_rank("cp")

    # Create an FP8 recipe -- this is only used if FP8 is enabled in the config.
    fp8_recipe = hydra.utils.get_class(args.fp8_config.fp8_recipe)(
        fp8_format=Format[args.fp8_config.fp8_format], **args.fp8_config.fp8_recipe_kwargs
    )

    # Create an empty ESM-2 model with a masked language model head, e.g. "nvidia/esm2_t6_8M_UR50D".
    config = AutoConfig.from_pretrained(
        args.model_tag, trust_remote_code=True, token_dropout=False, dtype=torch.bfloat16
    )
    # If we're using sequence packing with TE layers, we need to pass the `attn_input_format` argument.
    if args.use_sequence_packing:
        config.attn_input_format = "thd"

    # Optionally use transformer engine to initialize only fp8 versions of weights by setting
    # `fp8_config.fp8_model_init_kwargs.enabled` to `True`, as opposed to using the default where both bfloat16 and fp8
    # versions of weights are kept. Meta-device init defers weight allocation until to_empty() below.
    with (
        torch.device("meta") if args.use_meta_device else nullcontext(),
        transformer_engine.pytorch.fp8_model_init(recipe=fp8_recipe, **args.fp8_config.fp8_model_init_kwargs),
    ):
        model = AutoModelForMaskedLM.from_config(config, trust_remote_code=True)

    logger.info("Initialized Model:\n%s", model)

    # We call the transformer stack "layers" in our TE models, but it's called "layer" in the original ESM-2 models.
    transformer_stack = model.esm.encoder.layers if hasattr(model.esm.encoder, "layers") else model.esm.encoder.layer
    # Shard each transformer layer individually (per-layer FSDP units), then the root model.
    # fully_shard receives cp_dp_mesh (see the mesh-selection branch above).
    for layer in transformer_stack:
        fully_shard(layer, mesh=cp_dp_mesh)
        # Set CP group for layer if CP is enabled, so TE attention splits the
        # sequence across cp ranks; a dedicated CUDA stream is used for CP comms.
        if args.cp_size > 1:
            logger.debug(f"Rank {dist_config.rank}: Setting CP group for layer {layer}")
            layer.set_context_parallel_group(
                cp_group, torch.distributed.get_process_group_ranks(cp_group), torch.cuda.Stream()
            )
    fully_shard(model, mesh=cp_dp_mesh)

    # Create optimizer. Convert OmegaConf to regular dict to avoid serialization issues (BIONEMO-2873).
    optimizer = AdamW(model.parameters(), **OmegaConf.to_container(args.adamw_kwargs, resolve=True))  # type: ignore
    scheduler = get_linear_schedule_with_warmup(optimizer, **args.lr_scheduler_kwargs)

    # Materialize meta-device weights on the GPU and re-run each module's
    # parameter initialization (meta init leaves weights uninitialized).
    if args.use_meta_device:
        model.to_empty(device=device)
        for module in model.modules():
            if hasattr(module, "reset_parameters"):
                module.reset_parameters()

    # Context Parallelism requires THD Sequence Packing.
    assert args.use_sequence_packing, "Context Parallelism requires THD Sequence Packing."

    # CP-aware dataloader: each cp rank receives its slice of every packed sequence.
    train_dataloader, dataset_or_sampler = create_cp_dataloader(
        dist_config,
        cp_world_size=torch.distributed.get_world_size(group=cp_group),
        cp_group=cp_group,
        cp_rank=cp_rank,
        **args.dataset,
    )

    if args.use_torch_compile:
        # If we're using torch.compile, we need to do this before loading the checkpoint to ensure key consistency.
        model = torch.compile(model)

    # If we're resuming from a checkpoint, load it and set the start step. Otherwise, start from step 0.
    ckpt_path = Path(args.checkpoint.ckpt_dir) / "train_fsdp2" if args.checkpoint.ckpt_dir else None
    if args.checkpoint.resume_from_checkpoint and ckpt_path:
        model, optimizer, scheduler, train_dataloader, start_step, epoch = load_checkpoint_fsdp2(
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            ckpt_path=ckpt_path,
            dist_config=dist_config,
            dataloader=train_dataloader,
        )
    else:
        start_step = 0
        epoch = 0

    perf_logger = PerfLogger(dist_config, args)

    # Training loop: outer while counts optimizer steps across epochs; the inner
    # for-loop re-enters the dataloader after each exhaustion (epoch boundary).
    step = start_step
    while step < args.num_train_steps:
        for batch in train_dataloader:
            batch = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in batch.items()}  # noqa: PLW2901

            # Forward pass with mixed precision (fp8 only if enabled in config).
            with transformer_engine.pytorch.fp8_autocast(enabled=args.fp8_config.enabled, fp8_recipe=fp8_recipe):
                outputs = model(**batch)

            # Backward pass.
            loss = outputs.loss
            loss.backward()

            # Compute and clip gradient norms.
            total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0).item()

            # Step optimizer.
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

            perf_logger.log_step(
                step=step,
                batch=batch,
                outputs=outputs,
                grad_norm=total_norm,
                lr=optimizer.param_groups[0]["lr"],
            )

            # Periodic distributed checkpoint; the dataloader state is saved only
            # when the stateful-dataloader option is enabled.
            if ckpt_path and should_save_checkpoint(step, args.checkpoint.save_every_n_steps):
                save_checkpoint_fsdp2(
                    model=model,
                    optimizer=optimizer,
                    scheduler=scheduler,
                    ckpt_path=ckpt_path,
                    step=step,
                    epoch=epoch,
                    dist_config=dist_config,
                    dataloader=train_dataloader if args.dataset.use_stateful_dataloader else None,
                )

            step += 1
            if step >= args.num_train_steps:
                break

        # Dataloader exhausted, incrementing epoch so shuffling reseeds per epoch.
        epoch += 1
        dataset_or_sampler.set_epoch(epoch)

    # Save final model to a .safetensors file.
    if args.checkpoint.save_final_model and ckpt_path:
        save_final_model_fsdp2(
            model=model,
            save_directory=ckpt_path / "final_model",
            dist_config=dist_config,
        )

    # Clean up distributed training
    perf_logger.finish()
    torch.distributed.destroy_process_group()

    return perf_logger.min_loss
235+
236+
237+
# Script entry point; hydra parses CLI overrides (e.g. cp_size=2) before calling main().
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)