Skip to content

Commit f5cbdd6

Browse files
committed
[BIONEMO-2473] Added tests for Evo2 LoRA fine-tuning
Signed-off-by: Bruno Alvisio <balvisio@nvidia.com>
1 parent c4f2038 commit f5cbdd6

File tree

6 files changed

+199
-42
lines changed

6 files changed

+199
-42
lines changed

sub-packages/bionemo-evo2/src/bionemo/evo2/run/peft.py

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -13,20 +13,6 @@
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
1515

16-
17-
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
18-
#
19-
# Licensed under the Apache License, Version 2.0 (the "License");
20-
# you may not use this file except in compliance with the License.
21-
# You may obtain a copy of the License at
22-
#
23-
# http://www.apache.org/licenses/LICENSE-2.0
24-
#
25-
# Unless required by applicable law or agreed to in writing, software
26-
# distributed under the License is distributed on an "AS IS" BASIS,
27-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
28-
# See the License for the specific language governing permissions and
29-
# limitations under the License.
3016
from copy import deepcopy
3117
from typing import List, Optional
3218

sub-packages/bionemo-evo2/src/bionemo/evo2/run/train.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@
3838
)
3939
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
4040
from nemo.lightning.pytorch import callbacks as nl_callbacks
41-
from nemo.lightning.pytorch.callbacks import ModelTransform
4241
from nemo.lightning.pytorch.callbacks.flops_callback import FLOPsMeasurementCallback
4342
from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback
4443
from nemo.lightning.pytorch.optim import CosineAnnealingScheduler
@@ -492,7 +491,7 @@ def parse_args(args: Optional[List[str]] = None) -> argparse.Namespace:
492491
help="Disable saving the last checkpoint.",
493492
)
494493
parser.add_argument("--lora-finetune", action="store_true", help="Use LoRA fine-tuning", default=False)
495-
parser.add_argument("--lora-checkpoint-path", type=Path, default=None, help="LoRA checkpoint path")
494+
parser.add_argument("--lora-checkpoint-path", type=str, default=None, help="LoRA checkpoint path")
496495
parser.add_argument(
497496
"--no-calculate-per-token-loss",
498497
action="store_true",
@@ -646,7 +645,7 @@ def train(args: argparse.Namespace) -> nl.Trainer:
646645
]
647646

648647
if args.lora_finetune:
649-
callbacks.append(ModelTransform())
648+
callbacks.append(lora_transform)
650649
if args.enable_preemption:
651650
callbacks.append(nl_callbacks.PreemptionCallback())
652651
if args.debug_ddp_parity_freq > 0:
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: LicenseRef-Apache2
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-FileCopyrightText: Copyright (c) 2024 Arc Institute. All rights reserved.
3+
# SPDX-FileCopyrightText: Copyright (c) 2024 Michael Poli. All rights reserved.
4+
# SPDX-FileCopyrightText: Copyright (c) 2024 Stanford University. All rights reserved
5+
# SPDX-License-Identifier: LicenseRef-Apache2
6+
#
7+
# Licensed under the Apache License, Version 2.0 (the "License");
8+
# you may not use this file except in compliance with the License.
9+
# You may obtain a copy of the License at
10+
#
11+
# http://www.apache.org/licenses/LICENSE-2.0
12+
#
13+
# Unless required by applicable law or agreed to in writing, software
14+
# distributed under the License is distributed on an "AS IS" BASIS,
15+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16+
# See the License for the specific language governing permissions and
17+
# limitations under the License.
18+
19+
20+
def small_training_cmd(path, max_steps, val_check, devices: int = 1, additional_args: str = ""):
    """Build the ``train_evo2`` shell command for a small mock-data training run.

    Args:
        path: Result directory passed to ``--result-dir``.
        max_steps: Number of training steps (``--max-steps``).
        val_check: Validation check interval (``--val-check-interval``).
        devices: Number of devices to train on (``--devices``).
        additional_args: Extra CLI flags appended verbatim at the end of the command.

    Returns:
        The complete command string, ready to be executed in a subshell.
    """
    # NOTE(review): assumes the `train_evo2` entry point is on PATH — confirm in CI env.
    # Fixed: `--limit-val-batches 1` was previously passed twice; keep a single occurrence.
    cmd = (
        f"train_evo2 --mock-data --result-dir {path} --devices {devices} "
        "--model-size 1b_nv --num-layers 4 --hybrid-override-pattern SDH* "
        "--no-activation-checkpointing --add-bias-output --create-tensorboard-logger --create-tflops-callback "
        f"--max-steps {max_steps} --warmup-steps 1 --val-check-interval {val_check} --limit-val-batches 1 "
        f"--seq-length 16 --hidden-dropout 0.1 --attention-dropout 0.1 {additional_args}"
    )
    return cmd
30+
31+
32+
def small_training_finetune_cmd(
    path,
    max_steps,
    val_check,
    prev_ckpt,
    devices: int = 1,
    create_tflops_callback: bool = True,
    additional_args: str = "",
):
    """Build the ``train_evo2`` command for finetuning from a prior checkpoint.

    Args:
        path: Result directory passed to ``--result-dir``.
        max_steps: Number of training steps (``--max-steps``).
        val_check: Validation check interval (``--val-check-interval``).
        prev_ckpt: Checkpoint directory to resume/finetune from (``--ckpt-dir``).
        devices: Number of devices to train on (``--devices``).
        create_tflops_callback: Whether to append ``--create-tflops-callback``.
            LoRA finetuning runs disable it in the tests.
        additional_args: Extra CLI flags inserted verbatim before ``--ckpt-dir``.

    Returns:
        The complete command string, ready to be executed in a subshell.
    """
    # The tflops flag is optional here (unlike the pretrain command) so LoRA runs can omit it.
    tflops_flag = "--create-tflops-callback" if create_tflops_callback else ""
    # Fixed: `--limit-val-batches 1` was previously passed twice; keep a single occurrence.
    cmd = (
        f"train_evo2 --mock-data --result-dir {path} --devices {devices} "
        "--model-size 1b_nv --num-layers 4 --hybrid-override-pattern SDH* "
        "--no-activation-checkpointing --add-bias-output --create-tensorboard-logger "
        f"--max-steps {max_steps} --warmup-steps 1 --val-check-interval {val_check} --limit-val-batches 1 "
        f"--seq-length 16 --hidden-dropout 0.1 --attention-dropout 0.1 {additional_args} --ckpt-dir {prev_ckpt} "
        f"{tflops_flag}"
    )
    return cmd
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-FileCopyrightText: Copyright (c) 2024 Arc Institute. All rights reserved.
3+
# SPDX-FileCopyrightText: Copyright (c) 2024 Michael Poli. All rights reserved.
4+
# SPDX-FileCopyrightText: Copyright (c) 2024 Stanford University. All rights reserved
5+
# SPDX-License-Identifier: LicenseRef-Apache2
6+
#
7+
# Licensed under the Apache License, Version 2.0 (the "License");
8+
# you may not use this file except in compliance with the License.
9+
# You may obtain a copy of the License at
10+
#
11+
# http://www.apache.org/licenses/LICENSE-2.0
12+
#
13+
# Unless required by applicable law or agreed to in writing, software
14+
# distributed under the License is distributed on an "AS IS" BASIS,
15+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16+
# See the License for the specific language governing permissions and
17+
# limitations under the License.
18+
import pytest
19+
20+
from bionemo.evo2.run.common import small_training_cmd, small_training_finetune_cmd
21+
from bionemo.testing.subprocess_utils import run_command_in_subprocess
22+
23+
24+
@pytest.mark.timeout(512)  # Optional: fail if the test takes too long.
@pytest.mark.slow
@pytest.mark.parametrize("with_peft", [True, False])
def test_train_evo2_finetune_runs_lora(tmp_path, with_peft: bool):
    """End-to-end pretrain -> finetune (optionally LoRA) smoke test for `train_evo2`.

    This test runs the `train_evo2` command with mock data in a temporary directory.
    It uses the temporary directory provided by pytest as the working directory.
    The command is run in a subshell, and we assert that it returns an exit code of 0.

    Flow:
        1. Pretrain from scratch; verify checkpoint and TensorBoard artifacts.
        2. Finetune from the pretrain checkpoint (with `--lora-finetune` when
           ``with_peft`` is True); verify weights were restored and artifacts exist.
        3. With LoRA only: resume a second finetune from the saved LoRA checkpoint
           via `--lora-checkpoint-path` and verify artifacts again.
    """
    num_steps = 2
    # Note: The command assumes that `train_evo2` is in your PATH.
    command = small_training_cmd(tmp_path / "pretrain", max_steps=num_steps, val_check=num_steps)
    stdout_pretrain: str = run_command_in_subprocess(command=command, path=str(tmp_path))
    # Pretraining starts from scratch, so no restore message should appear in the logs.
    assert "Restoring model weights from RestoreConfig(path='" not in stdout_pretrain

    # Directory layout produced by the run (NOTE(review): layout inferred from the
    # assertions below — `evo2/checkpoints` and `evo2/dev` under the result dir).
    log_dir = tmp_path / "pretrain" / "evo2"
    checkpoints_dir = log_dir / "checkpoints"
    tensorboard_dir = log_dir / "dev"

    # Check if logs dir exists
    assert log_dir.exists(), "Logs folder should exist."
    # Check if checkpoints dir exists
    assert checkpoints_dir.exists(), "Checkpoints folder does not exist."

    # "Last" checkpoint folders embed the final step count, e.g. "...2.0-last".
    expected_checkpoint_suffix = f"{num_steps}.0-last"
    # Check if any subfolder ends with the expected suffix
    matching_subfolders = [
        p for p in checkpoints_dir.iterdir() if p.is_dir() and (expected_checkpoint_suffix in p.name)
    ]

    assert matching_subfolders, (
        f"No checkpoint subfolder ending with '{expected_checkpoint_suffix}' found in {checkpoints_dir}."
    )

    # Check if directory with tensorboard logs exists
    assert tensorboard_dir.exists(), "TensorBoard logs folder does not exist."
    # Recursively search for files with tensorboard logger
    event_files = list(tensorboard_dir.rglob("events.out.tfevents*"))
    assert event_files, f"No TensorBoard event files found under {tensorboard_dir}"
    assert len(matching_subfolders) == 1, "Only one checkpoint subfolder should be found."
    # Use distinct result dirs per mode so the LoRA and plain finetune runs never collide.
    if with_peft:
        result_dir = tmp_path / "lora_finetune"
        additional_args = "--lora-finetune"
    else:
        result_dir = tmp_path / "finetune"
        additional_args = ""

    # The tflops callback is disabled for the LoRA run (create_tflops_callback=not with_peft).
    command_finetune = small_training_finetune_cmd(
        result_dir,
        max_steps=num_steps,
        val_check=num_steps,
        prev_ckpt=matching_subfolders[0],
        create_tflops_callback=not with_peft,
        additional_args=additional_args,
    )
    stdout_finetune: str = run_command_in_subprocess(command=command_finetune, path=str(tmp_path))
    # Finetuning must restore the pretrained weights; the restore message proves it happened.
    assert "Restoring model weights from RestoreConfig(path='" in stdout_finetune

    log_dir_ft = result_dir / "evo2"
    checkpoints_dir_ft = log_dir_ft / "checkpoints"
    tensorboard_dir_ft = log_dir_ft / "dev"

    # Check if logs dir exists
    assert log_dir_ft.exists(), "Logs folder should exist."
    # Check if checkpoints dir exists
    assert checkpoints_dir_ft.exists(), "Checkpoints folder does not exist."

    expected_checkpoint_suffix = f"{num_steps}.0-last"
    # Check if any subfolder ends with the expected suffix
    matching_subfolders_finetune = [
        p for p in checkpoints_dir_ft.iterdir() if p.is_dir() and (expected_checkpoint_suffix in p.name)
    ]

    assert matching_subfolders_finetune, (
        f"No checkpoint subfolder ending with '{expected_checkpoint_suffix}' found in {checkpoints_dir_ft}."
    )

    # Check if directory with tensorboard logs exists
    assert tensorboard_dir_ft.exists(), "TensorBoard logs folder does not exist."
    # Recursively search for files with tensorboard logger
    event_files = list(tensorboard_dir_ft.rglob("events.out.tfevents*"))
    assert event_files, f"No TensorBoard event files found under {tensorboard_dir_ft}"

    assert len(matching_subfolders_finetune) == 1, "Only one checkpoint subfolder should be found."

    # With LoRA, test resuming from a saved LoRA checkpoint
    if with_peft:
        result_dir = tmp_path / "lora_finetune_resume"

        # Resume from LoRA checkpoint: base weights come from the pretrain checkpoint
        # (prev_ckpt), adapter weights from the finetune run's checkpoint.
        command_resume_finetune = small_training_finetune_cmd(
            result_dir,
            max_steps=num_steps,
            val_check=num_steps,
            prev_ckpt=matching_subfolders[0],
            create_tflops_callback=False,
            additional_args=f"--lora-finetune --lora-checkpoint-path {matching_subfolders_finetune[0]}",
        )
        stdout_finetune: str = run_command_in_subprocess(command=command_resume_finetune, path=str(tmp_path))

        log_dir_ft = result_dir / "evo2"
        checkpoints_dir_ft = log_dir_ft / "checkpoints"
        tensorboard_dir_ft = log_dir_ft / "dev"

        # Check if logs dir exists
        assert log_dir_ft.exists(), "Logs folder should exist."
        # Check if checkpoints dir exists
        assert checkpoints_dir_ft.exists(), "Checkpoints folder does not exist."

sub-packages/bionemo-evo2/tests/bionemo/evo2/run/test_train.py

Lines changed: 2 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
from nemo import lightning as nl
2626
from transformer_engine.pytorch.fp8 import check_fp8_support
2727

28+
from bionemo.evo2.run.common import small_training_cmd, small_training_finetune_cmd
2829
from bionemo.evo2.run.train import parse_args, train
2930
from bionemo.testing.lightning import extract_global_steps_from_log
3031
from bionemo.testing.megatron_parallel_state_utils import distributed_model_parallel_state
@@ -35,9 +36,7 @@
3536

3637

3738
def run_train_with_std_redirect(args: argparse.Namespace) -> Tuple[str, nl.Trainer]:
38-
"""
39-
Run a function with output capture.
40-
"""
39+
"""Run a function with output capture."""
4140
stdout_buf, stderr_buf = io.StringIO(), io.StringIO()
4241
with redirect_stdout(stdout_buf), redirect_stderr(stderr_buf):
4342
with distributed_model_parallel_state():
@@ -50,28 +49,6 @@ def run_train_with_std_redirect(args: argparse.Namespace) -> Tuple[str, nl.Train
5049
return train_stdout, trainer
5150

5251

53-
def small_training_cmd(path, max_steps, val_check, devices: int = 1, additional_args: str = ""):
54-
cmd = (
55-
f"train_evo2 --mock-data --result-dir {path} --devices {devices} "
56-
"--model-size 1b_nv --num-layers 4 --hybrid-override-pattern SDH* --limit-val-batches 1 "
57-
"--no-activation-checkpointing --add-bias-output --create-tensorboard-logger --create-tflops-callback "
58-
f"--max-steps {max_steps} --warmup-steps 1 --val-check-interval {val_check} --limit-val-batches 1 "
59-
f"--seq-length 16 --hidden-dropout 0.1 --attention-dropout 0.1 {additional_args}"
60-
)
61-
return cmd
62-
63-
64-
def small_training_finetune_cmd(path, max_steps, val_check, prev_ckpt, devices: int = 1, additional_args: str = ""):
65-
cmd = (
66-
f"train_evo2 --mock-data --result-dir {path} --devices {devices} "
67-
"--model-size 1b_nv --num-layers 4 --hybrid-override-pattern SDH* --limit-val-batches 1 "
68-
"--no-activation-checkpointing --add-bias-output --create-tensorboard-logger --create-tflops-callback "
69-
f"--max-steps {max_steps} --warmup-steps 1 --val-check-interval {val_check} --limit-val-batches 1 "
70-
f"--seq-length 16 --hidden-dropout 0.1 --attention-dropout 0.1 {additional_args} --ckpt-dir {prev_ckpt}"
71-
)
72-
return cmd
73-
74-
7552
def small_training_mamba_cmd(path, max_steps, val_check, devices: int = 1, additional_args: str = ""):
7653
cmd = (
7754
f"train_evo2 --mock-data --result-dir {path} --devices {devices} "

0 commit comments

Comments
 (0)