Commit fe8d7e6

unskip evo2 tests (#1058)
### Description

- This PR addresses issue #1013.
- In NVIDIA-NeMo/NeMo#14515, the NeMo code was updated to reduce memory consumption.
- This PR updates the NeMo submodule to 7ccb0d4.
- This PR adjusts the memory thresholds used to skip tests in `sub-packages/bionemo-evo2/tests/bionemo/evo2/test_evo2.py`.
- This PR adds some tools for reporting torch memory usage.

<!-- Provide a detailed description of the changes in this PR -->

### Type of changes

<!-- Mark the relevant option with an [x] -->

- [x] Bug fix (non-breaking change which fixes an issue)
- [ ] New feature (non-breaking change which adds functionality)
- [ ] Refactor
- [ ] Documentation update
- [ ] Other (please describe):

### CI Pipeline Configuration

Configure CI behavior by applying the relevant labels:

- [SKIP_CI](https://github.com/NVIDIA/bionemo-framework/blob/main/docs/docs/user-guide/contributing/contributing.md#skip_ci) - Skip all continuous integration tests
- [INCLUDE_NOTEBOOKS_TESTS](https://github.com/NVIDIA/bionemo-framework/blob/main/docs/docs/user-guide/contributing/contributing.md#include_notebooks_tests) - Execute notebook validation tests in pytest
- [INCLUDE_SLOW_TESTS](https://github.com/NVIDIA/bionemo-framework/blob/main/docs/docs/user-guide/contributing/contributing.md#include_slow_tests) - Execute tests labelled as slow in pytest for extensive testing

> [!NOTE]
> By default, the notebook validation tests are skipped unless explicitly enabled.

#### Authorizing CI Runs

We use [copy-pr-bot](https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/#automation) to manage authorization of CI runs on NVIDIA's compute resources.

- If a pull request is opened by a trusted user and contains only trusted changes, the pull request's code will automatically be copied to a `pull-request/` prefixed branch in the source repository (e.g. `pull-request/123`).
- If a pull request is opened by an untrusted user or contains untrusted changes, an NVIDIA org member must leave an `/ok to test` comment on the pull request to trigger CI. This must be done for each new commit.

### Usage

<!--- How does a user interact with the changed code -->

```python
# TODO: Add code snippet
```

### Pre-submit Checklist

<!--- Ensure all items are completed before submitting -->

- [x] I have tested these changes locally
- [x] I have updated the documentation accordingly
- [x] I have added/updated tests as needed
- [ ] All existing tests pass successfully

### Local test runs

The slow test **test_evo2.py::test_golden_values_top_k_logits_and_cosine_similarity_7b** is broken on **main** and will be marked skip:

1. test_evo2.py::test_golden_values_top_k_logits_and_cosine_similarity_7b is broken on this commit with **NVIDIA H100 80GB HBM3**: [pytests_pr1058_unskip_evo2_tests_sub-packages-bionemo-evo2-tests-bionemo-evo2-test_evo2_20250821T0024_6e2a005d.log](https://github.com/user-attachments/files/21922792/pytests_pr1058_unskip_evo2_tests_sub-packages-bionemo-evo2-tests-bionemo-evo2-test_evo2_20250821T0024_6e2a005d.log)
2. The same test as in (1) is broken on commit *424050d2* in main with *NVIDIA H100 80GB HBM3*: [pytests_pr1058_unskip_evo2_tests_sub-packages-bionemo-evo2-tests-bionemo-evo2-test_evo2__test_golden_values_top_k_logits_and_cosine_similarity_7b_20250821T2114_main_424050d2.log](https://github.com/user-attachments/files/21926382/pytests_pr1058_unskip_evo2_tests_sub-packages-bionemo-evo2-tests-bionemo-evo2-test_evo2__test_golden_values_top_k_logits_and_cosine_similarity_7b_20250821T2114_main_424050d2.log)

The slow test **test_evo2.py::test_generate_speed** is marked skip per [this Slack thread](https://nvidia.slack.com/archives/C074Z808N05/p1755185565520729?thread_ts=1755097791.370249&cid=C074Z808N05).

---------

Signed-off-by: Brian Roland <broland@nvidia.com>
1 parent 1f65287 commit fe8d7e6

File tree

5 files changed (+115 -34 lines changed)


3rdparty/NeMo

Submodule NeMo updated from f4f22a2 to 7ccb0d4

sub-packages/bionemo-evo2/tests/bionemo/evo2/conftest.py

Lines changed: 14 additions & 6 deletions

```diff
@@ -20,22 +20,30 @@
 import pytest
 import torch
 
+from bionemo.testing.torch import get_device_and_memory_allocated
+
 
 def pytest_sessionstart(session):
     """Called at the start of the test session."""
     if torch.cuda.is_available():
         torch.cuda.reset_peak_memory_stats()
-        print(f"Starting test session. Initial GPU memory: {torch.cuda.memory_allocated() / 1024**3:.3f} GB")
+        print(
+            f"""
+            sub-packages/bionemo-evo2/tests/bionemo/evo2: Starting test session
+            {get_device_and_memory_allocated()}
+            """
+        )
 
 
 def pytest_sessionfinish(session, exitstatus):
     """Called at the end of the test session."""
     if torch.cuda.is_available():
-        peak_memory = torch.cuda.max_memory_allocated()
-        final_memory = torch.cuda.memory_allocated()
-        print("\nTest session complete:")
-        print(f"  Peak GPU memory: {peak_memory / 1024**3:.3f} GB")
-        print(f"  Final GPU memory: {final_memory / 1024**3:.3f} GB")
+        print(
+            f"""
+            sub-packages/bionemo-evo2/tests/bionemo/evo2: Test session complete
+            {get_device_and_memory_allocated()}
+            """
+        )
 
 
 @pytest.fixture(autouse=True)
```
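The hook pattern above can be sketched without GPU-specific calls. A minimal analogue follows, where the hypothetical `report()` helper stands in for `bionemo.testing.torch.get_device_and_memory_allocated` so the sketch runs without torch or a GPU:

```python
# Minimal sketch of the conftest.py session-hook pattern shown above.
# `report()` is a hypothetical stand-in for get_device_and_memory_allocated;
# the real helper builds its string from torch.cuda queries.

def report() -> str:
    # Placeholder value; the real helper reports live device statistics.
    return "memory, available on device: 0.000 GB"


def pytest_sessionstart(session):
    """Called by pytest once, before the first test of the session runs."""
    print(f"Starting test session\n{report()}")


def pytest_sessionfinish(session, exitstatus):
    """Called by pytest once, after the last test of the session finishes."""
    print(f"Test session complete\n{report()}")
```

Placing these hooks in a `conftest.py` is enough for pytest to discover and call them; no registration is needed.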

sub-packages/bionemo-evo2/tests/bionemo/evo2/test_evo2.py

Lines changed: 58 additions & 27 deletions

```diff
@@ -48,6 +48,44 @@
 logger.setLevel(logging.DEBUG)  # Capture all levels in the logger itself
 
 
+def determine_memory_requirement_and_skip_if_not_met(ckpt_name: str, flash_decode: bool | None = None) -> int:
+    """Determine the memory requirement for a given checkpoint and flash decode condition.
+
+    Args:
+        ckpt_name: The name of the checkpoint to test.
+        flash_decode: Whether to test with flash decode.
+
+    Returns:
+        The input sequence length cap for the model in the checkpoint, given certain memory requirements.
+        If the memory requirement is not met, the test is skipped.
+    """
+    if "1b" in ckpt_name:
+        model_size = "1b"
+        seq_len_cap = 6000
+        memory_needed_by_test = 17  # max reserved rounded up, for stand-alone test
+    elif "7b" in ckpt_name:
+        model_size = "7b"
+        seq_len_cap = 4000
+        memory_needed_by_test = 32  # max reserved rounded up, for stand-alone test
+    else:
+        raise ValueError(f"{ckpt_name=} is not supported for testing")
+
+    skip_condition_flash = flash_decode is None or flash_decode
+    gb_available = torch.cuda.mem_get_info()[0] / 1024**3
+    skip_condition = gb_available < memory_needed_by_test and skip_condition_flash
+
+    if skip_condition:
+        pytest.skip(
+            ", ".join(
+                [
+                    f"Inference API requires at least {memory_needed_by_test}GB of available memory for {model_size} models",
+                    f"{gb_available=}",
+                ]
+            )
+        )
+    return seq_len_cap
+
+
 def load_weights_sharded_inplace_nemo2_to_mcore(
     model: MegatronModelType,
     distributed_checkpoint_dir: str | Path,
@@ -152,6 +190,7 @@ def test_golden_values_top_k_logits_and_cosine_similarity(seq_len: int):
     assert torch.mean(torch.abs(logit_similarity - torch.ones_like(logit_similarity))) < 0.03
 
 
+@pytest.mark.skip(reason="test fails on main, not due to #1058")
 @pytest.mark.slow
 def test_golden_values_top_k_logits_and_cosine_similarity_7b(seq_len: int = 8_192):
     try:
@@ -181,6 +220,7 @@ def test_golden_values_top_k_logits_and_cosine_similarity_7b(seq_len: int = 8_19
     outputs = model(input_ids=input_ids, position_ids=position_ids, attention_mask=attention_mask)
     gold_standard_no_fp8_tensor = torch.load(gold_standard_no_fp8).to(device=outputs.device, dtype=outputs.dtype)
     is_fp8_supported, compute_capability, device_info = check_fp8_support(device.index)
+
     if is_fp8_supported and compute_capability == "9.0":
         # Most rigorous assertion for output equivalence currently works on devices that are new enough to
         # support FP8.
@@ -364,11 +404,8 @@ def check_matchrate(*, ckpt_name, matchrate, assert_matchrate=True):
 )
 def test_forward(sequences: list[str], ckpt_name: str, expected_matchpercents: list[float]):
     assert len(sequences) > 0
-    gb_available = torch.cuda.mem_get_info()[0] / 1024**3
-    if (gb_available < 38 and "1b" in ckpt_name) or (gb_available < 50 and "7b" in ckpt_name):
-        pytest.skip(
-            f"Inference API requires more than 38GB of memory for 1b models, or 50GB for 7b models. {gb_available=}"
-        )
+    seq_len_cap = determine_memory_requirement_and_skip_if_not_met(ckpt_name)
+
     is_fp8_supported, compute_capability, device_info = check_fp8_support(torch.cuda.current_device())
     skip = "evo2/1b-8k:" in ckpt_name and not is_fp8_supported
     if skip:
@@ -380,7 +417,7 @@ def test_forward(sequences: list[str], ckpt_name: str, expected_matchpercents: l
     )
     matchrates = []
     for seq in sequences:
-        seq = seq[:6000]  # TODO: artificial limit, megatron uses more memory. Vortex can process full sequences
+        seq = seq[:seq_len_cap]  # TODO: artificial limit, megatron uses more memory. Vortex can process full sequences
         with torch.no_grad():
             device = torch.cuda.current_device()
             tokens = torch.tensor([mcore_tokenizer.tokenize(seq)], device=device)
@@ -426,13 +463,11 @@ def test_forward(sequences: list[str], ckpt_name: str, expected_matchpercents: l
 )
 def test_forward_manual(sequences: list[str], ckpt_name: str, expected_matchpercents: list[float], flash_decode: bool):
     assert len(sequences) > 0
+    seq_len_cap = determine_memory_requirement_and_skip_if_not_met(ckpt_name, flash_decode)
+
     is_fp8_supported, compute_capability, device_info = check_fp8_support(torch.cuda.current_device())
     skip = "evo2/1b-8k:" in ckpt_name and not is_fp8_supported
-    gb_available = torch.cuda.mem_get_info()[0] / 1024**3
-    if (gb_available < 38 and flash_decode) or (gb_available < 50 and flash_decode and "7b" in ckpt_name):
-        pytest.skip(
-            f"Inference API requires more than 38GB of memory for 1b models, or 50GB for 7b models. {gb_available=}"
-        )
+
     vortex_style_fp8 = is_fp8_supported and "bf16" not in ckpt_name
     if skip:
         # This checkpoint is sensitive to FP8, so we skip it if it is not supported on the current device.
@@ -479,7 +514,9 @@ def test_forward_manual(sequences: list[str], ckpt_name: str, expected_matchperc
     forward_kwargs = {}
     matchrates = []
     for seq in sequences:
-        seq = seq[:6000]  # TODO: artificial limit, megatron uses more memory. Vortex can process full sequences
+        seq = seq[
+            :seq_len_cap
+        ]  # TODO: artificial limit, megatron uses more memory. Vortex can process full sequences
         with torch.no_grad():
             device = torch.cuda.current_device()
             # tokens = torch.tensor([tokenizer.tokenize(seq)], device=device)
@@ -542,12 +579,9 @@ def test_batch_generate(
     sequences: list[str], ckpt_name: str, model_tokenizer_provider: Callable, expected_matchpercents: list[float]
 ):
     assert len(sequences) > 0
+    determine_memory_requirement_and_skip_if_not_met(ckpt_name)
+
     is_fp8_supported, compute_capability, device_info = check_fp8_support(torch.cuda.current_device())
-    gb_available = torch.cuda.mem_get_info()[0] / 1024**3
-    if (gb_available < 38 and "1b" in ckpt_name) or (gb_available < 50 and "7b" in ckpt_name):
-        pytest.skip(
-            f"Inference API requires more than 38GB of memory for 1b models, or 50GB for 7b models. {gb_available=}"
-        )
     skip = "evo2/1b-8k:" in ckpt_name and not is_fp8_supported
     if skip:
         # This checkpoint is sensitive to FP8, so we skip it if it is not supported on the current device.
@@ -614,11 +648,8 @@ def test_batch_generate_coding_sequences(
     expected_matchpercents: list[float],
 ):
     assert len(coding_sequences) > 0
-    gb_available = torch.cuda.mem_get_info()[0] / 1024**3
-    if (gb_available < 38 and "1b" in ckpt_name) or (gb_available < 50 and "7b" in ckpt_name):
-        pytest.skip(
-            f"Inference API requires more than 38GB of memory for 1b models, or 50GB for 7b models. {gb_available=}"
-        )
+    determine_memory_requirement_and_skip_if_not_met(ckpt_name)
+
     is_fp8_supported, compute_capability, device_info = check_fp8_support(torch.cuda.current_device())
     skip = "evo2/1b-8k:" in ckpt_name and not is_fp8_supported
     if skip:
@@ -706,6 +737,9 @@ def test_batch_generate_coding_sequences(
 )
 
 
+@pytest.mark.skip(
+    reason="skip the test for now, and decide what to do after getting Anton's changes sorted and merged."
+)
 @pytest.mark.slow
 @pytest.mark.parametrize(
     "ckpt_name,model_tokenizer_provider,expected_tokens_sec",
@@ -723,11 +757,8 @@ def test_generate_speed(
     expected_tokens_sec: float,
 ):
     is_fp8_supported, compute_capability, device_info = check_fp8_support(torch.cuda.current_device())
-    gb_available = torch.cuda.mem_get_info()[0] / 1024**3
-    if (gb_available < 38 and "1b" in ckpt_name) or (gb_available < 50 and "7b" in ckpt_name):
-        pytest.skip(
-            f"Inference API requires more than 38GB of memory for 1b models, or 50GB for 7b models. {gb_available=}"
-        )
+    determine_memory_requirement_and_skip_if_not_met(ckpt_name)
+
     skip = "evo2/1b-8k:" in ckpt_name and not is_fp8_supported
     if skip:
         # This checkpoint is sensitive to FP8, so we skip it if it is not supported on the current device.
```

sub-packages/bionemo-testing/src/bionemo/testing/torch.py

Lines changed: 16 additions & 0 deletions

```diff
@@ -61,3 +61,19 @@ def recursive_assert_approx_equal(x, y, atol=1e-4, rtol=1e-4):
             recursive_assert_approx_equal(x[key], y[key], atol=atol, rtol=rtol)
     else:
         assert x == y
+
+
+def get_device_and_memory_allocated() -> str:
+    """Get the current device index, name, and memory usage."""
+    current_device_index = torch.cuda.current_device()
+    props = torch.cuda.get_device_properties(current_device_index)
+    message = f"""
+    current device index: {current_device_index}
+    current device uuid: {props.uuid}
+    current device name: {props.name}
+    memory, total on device: {torch.cuda.mem_get_info()[1] / 1024**3:.3f} GB
+    memory, available on device: {torch.cuda.mem_get_info()[0] / 1024**3:.3f} GB
+    memory allocated for tensors etc: {torch.cuda.memory_allocated() / 1024**3:.3f} GB
+    max memory reserved for tensors etc: {torch.cuda.max_memory_allocated() / 1024**3:.3f} GB
+    """
+    return message
```
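The GB figures in this report are raw byte counts divided by `1024**3`. A torch-free sketch of the same formatting, fed from plain integers so it runs without a GPU (the function names here are illustrative, not the PR's API):

```python
# Sketch of the byte-to-GB formatting used by get_device_and_memory_allocated,
# using plain integers in place of torch.cuda queries.

def gb(num_bytes: int) -> str:
    """Format a byte count as gibibytes, matching the report's :.3f style."""
    return f"{num_bytes / 1024**3:.3f} GB"


def memory_report(total: int, free: int, allocated: int, max_allocated: int) -> str:
    # Mirrors the field labels of the real helper above.
    return (
        f"memory, total on device: {gb(total)}\n"
        f"memory, available on device: {gb(free)}\n"
        f"memory allocated for tensors etc: {gb(allocated)}\n"
        f"max memory reserved for tensors etc: {gb(max_allocated)}\n"
    )
```

In the real helper, `total` and `free` come from `torch.cuda.mem_get_info()`, which reports device-wide figures, while `memory_allocated()` and `max_memory_allocated()` report only this process's tensor allocations.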
Lines changed: 26 additions & 0 deletions

```diff
@@ -0,0 +1,26 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: LicenseRef-Apache2
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from bionemo.testing.torch import get_device_and_memory_allocated
+
+
+def test_get_device_and_memory_allocated():
+    message = get_device_and_memory_allocated()
+    assert message is not None
+    assert "memory, total on device" in message
+    assert "memory, available on device" in message
+    assert "memory allocated for tensors etc" in message
+    assert "max memory reserved for tensors etc" in message
```
