Skip to content
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
da40a1a
br: test_tools
broland-hat Aug 20, 2025
d259e74
br:
broland-hat Aug 20, 2025
5027a51
br: update NeMo to 7ccb0d4 after NeMo PR14515
broland-hat Aug 20, 2025
7330d87
br: get_device_and_memory_allocated() move to torch.py [skip ci]
broland-hat Aug 20, 2025
9537a47
br: adjust memory thresholds [skip ci]
broland-hat Aug 20, 2025
03ef6e6
Merge remote-tracking branch 'origin' into br_bnm2533_fix_evo2_tests_a
broland-hat Aug 20, 2025
6e2a005
br: adjust sequence length cap for fixed memory requirements"
broland-hat Aug 20, 2025
1c7d31d
br: run linter local
broland-hat Aug 21, 2025
8f95f12
Merge branch 'main' into br_bnm2533_fix_evo2_tests_a
broland-hat Aug 21, 2025
948bba3
br: linter on test_evo2.py
broland-hat Aug 21, 2025
21d6b8f
br: linter for conftest.py
broland-hat Aug 21, 2025
bb927fb
br: test module for coverage
broland-hat Aug 21, 2025
1877b6b
br: linter for test module
broland-hat Aug 21, 2025
239ecfe
Merge remote-tracking branch 'origin/main' into br_bnm2533_fix_evo2_t…
broland-hat Aug 21, 2025
7ecedf3
br: enable is_fp8_supported and compute_capability test
broland-hat Aug 21, 2025
2b9ccdb
br: enable fp8 test
broland-hat Aug 21, 2025
28586e5
Merge remote-tracking branch 'origin/main' into br_bnm2533_fix_evo2_t…
broland-hat Aug 22, 2025
628d090
br: skip test_generate_speed
broland-hat Aug 25, 2025
320ed0a
br: skip test_golden_values_top_k_logits_and_cosine_similarity_7b
broland-hat Aug 25, 2025
83f152a
br: linter
broland-hat Aug 25, 2025
1f95151
Merge remote-tracking branch 'origin/main' into br_bnm2533_fix_evo2_t…
broland-hat Aug 25, 2025
9e85a76
Merge remote-tracking branch 'origin/main' into br_bnm2533_fix_evo2_t…
broland-hat Aug 25, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion 3rdparty/NeMo
Submodule NeMo updated from f4f22a to 7ccb0d
20 changes: 14 additions & 6 deletions sub-packages/bionemo-evo2/tests/bionemo/evo2/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,22 +20,30 @@
import pytest
import torch

from bionemo.testing.torch import get_device_and_memory_allocated


def pytest_sessionstart(session):
    """Reset GPU peak-memory stats and log device state at the start of the test session.

    Args:
        session: The pytest session object (unused).
    """
    if torch.cuda.is_available():
        # Reset so the peak reported in pytest_sessionfinish covers only this session.
        torch.cuda.reset_peak_memory_stats()
        # Fixed path typo: directory is tests/bionemo/evo2, not tests/bionemoe/evo2.
        print(
            f"""
            sub-packages/bionemo-evo2/tests/bionemo/evo2: Starting test session
            {get_device_and_memory_allocated()}
            """
        )


def pytest_sessionfinish(session, exitstatus):
    """Log device and memory state at the end of the test session.

    Args:
        session: The pytest session object (unused).
        exitstatus: The exit status of the test run (unused).
    """
    if torch.cuda.is_available():
        # Fixed path typo: directory is tests/bionemo/evo2, not tests/bionemoe/evo2.
        print(
            f"""
            sub-packages/bionemo-evo2/tests/bionemo/evo2: Test session complete
            {get_device_and_memory_allocated()}
            """
        )


@pytest.fixture(autouse=True)
Expand Down
81 changes: 54 additions & 27 deletions sub-packages/bionemo-evo2/tests/bionemo/evo2/test_evo2.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,44 @@
logger.setLevel(logging.DEBUG) # Capture all levels in the logger itself


def determine_memory_requirement_and_skip_if_not_met(ckpt_name: str, flash_decode: bool | None = None) -> int:
"""Determine the memory requirement for a given checkpoint and flash decode condition.
ckpt_name : str
the name of the checkpoint to test
flash_decode: bool | None
whether to test with flash decode
Returns:
The input sequence length cap, for the model sin the checkpoint, given certain memory requirements.
If the memory requirement is not met, the test is skipped.
"""

if "1b" in ckpt_name:
model_size = "1b"
seq_len_cap = 6000
memory_needed_by_test = 17 # max reserved rounded up, for stand-alone test
elif "7b" in ckpt_name:
model_size = "7b"
seq_len_cap = 4000
memory_needed_by_test = 32 # max reserved rounded up, for stand-alone test
else:
raise ValueError(f"{ckpt_name=} is not supported for testing")

skip_condition_flash = flash_decode is None or flash_decode
gb_available = torch.cuda.mem_get_info()[0] / 1024**3
skip_condition = gb_available < memory_needed_by_test and skip_condition_flash

if skip_condition:
pytest.skip(
", ".join(
[
f"Inference API requires at least {memory_needed_by_test}GB of available memory for {model_size} models",
f"{gb_available=}",
]
)
)
return seq_len_cap


def load_weights_sharded_inplace_nemo2_to_mcore(
model: MegatronModelType,
distributed_checkpoint_dir: str | Path,
Expand Down Expand Up @@ -181,6 +219,7 @@ def test_golden_values_top_k_logits_and_cosine_similarity_7b(seq_len: int = 8_19
outputs = model(input_ids=input_ids, position_ids=position_ids, attention_mask=attention_mask)
gold_standard_no_fp8_tensor = torch.load(gold_standard_no_fp8).to(device=outputs.device, dtype=outputs.dtype)
is_fp8_supported, compute_capability, device_info = check_fp8_support(device.index)

if is_fp8_supported and compute_capability == "9.0":
# Most rigurous assertion for output equivalence currently works on devices that are new enough to
# support FP8.
Expand Down Expand Up @@ -364,11 +403,8 @@ def check_matchrate(*, ckpt_name, matchrate, assert_matchrate=True):
)
def test_forward(sequences: list[str], ckpt_name: str, expected_matchpercents: list[float]):
assert len(sequences) > 0
gb_available = torch.cuda.mem_get_info()[0] / 1024**3
if (gb_available < 38 and "1b" in ckpt_name) or (gb_available < 50 and "7b" in ckpt_name):
pytest.skip(
f"Inference API requires more than 38GB of memory for 1b models, or 50GB for 7b models. {gb_available=}"
)
seq_len_cap = determine_memory_requirement_and_skip_if_not_met(ckpt_name)

is_fp8_supported, compute_capability, device_info = check_fp8_support(torch.cuda.current_device())
skip = "evo2/1b-8k:" in ckpt_name and not is_fp8_supported
if skip:
Expand All @@ -380,7 +416,7 @@ def test_forward(sequences: list[str], ckpt_name: str, expected_matchpercents: l
)
matchrates = []
for seq in sequences:
seq = seq[:6000] # TODO: artificial limit, megatron uses more memory. Vortex can process full sequences
seq = seq[:seq_len_cap] # TODO: artificial limit, megatron uses more memory. Vortex can process full sequences
with torch.no_grad():
device = torch.cuda.current_device()
tokens = torch.tensor([mcore_tokenizer.tokenize(seq)], device=device)
Expand Down Expand Up @@ -426,13 +462,11 @@ def test_forward(sequences: list[str], ckpt_name: str, expected_matchpercents: l
)
def test_forward_manual(sequences: list[str], ckpt_name: str, expected_matchpercents: list[float], flash_decode: bool):
assert len(sequences) > 0
seq_len_cap = determine_memory_requirement_and_skip_if_not_met(ckpt_name, flash_decode)

is_fp8_supported, compute_capability, device_info = check_fp8_support(torch.cuda.current_device())
skip = "evo2/1b-8k:" in ckpt_name and not is_fp8_supported
gb_available = torch.cuda.mem_get_info()[0] / 1024**3
if (gb_available < 38 and flash_decode) or (gb_available < 50 and flash_decode and "7b" in ckpt_name):
pytest.skip(
f"Inference API requires more than 38GB of memory for 1b models, or 50GB for 7b models. {gb_available=}"
)

vortex_style_fp8 = is_fp8_supported and "bf16" not in ckpt_name
if skip:
# This checkpoint is sensitive to FP8, so we skip it if it is not supported on the current device.
Expand Down Expand Up @@ -479,7 +513,9 @@ def test_forward_manual(sequences: list[str], ckpt_name: str, expected_matchperc
forward_kwargs = {}
matchrates = []
for seq in sequences:
seq = seq[:6000] # TODO: artificial limit, megatron uses more memory. Vortex can process full sequences
seq = seq[
:seq_len_cap
] # TODO: artificial limit, megatron uses more memory. Vortex can process full sequences
with torch.no_grad():
device = torch.cuda.current_device()
# tokens = torch.tensor([tokenizer.tokenize(seq)], device=device)
Expand Down Expand Up @@ -542,12 +578,9 @@ def test_batch_generate(
sequences: list[str], ckpt_name: str, model_tokenizer_provider: Callable, expected_matchpercents: list[float]
):
assert len(sequences) > 0
determine_memory_requirement_and_skip_if_not_met(ckpt_name)

is_fp8_supported, compute_capability, device_info = check_fp8_support(torch.cuda.current_device())
gb_available = torch.cuda.mem_get_info()[0] / 1024**3
if (gb_available < 38 and "1b" in ckpt_name) or (gb_available < 50 and "7b" in ckpt_name):
pytest.skip(
f"Inference API requires more than 38GB of memory for 1b models, or 50GB for 7b models. {gb_available=}"
)
skip = "evo2/1b-8k:" in ckpt_name and not is_fp8_supported
if skip:
# This checkpoint is sensitive to FP8, so we skip it if it is not supported on the current device.
Expand Down Expand Up @@ -614,11 +647,8 @@ def test_batch_generate_coding_sequences(
expected_matchpercents: list[float],
):
assert len(coding_sequences) > 0
gb_available = torch.cuda.mem_get_info()[0] / 1024**3
if (gb_available < 38 and "1b" in ckpt_name) or (gb_available < 50 and "7b" in ckpt_name):
pytest.skip(
f"Inference API requires more than 38GB of memory for 1b models, or 50GB for 7b models. {gb_available=}"
)
determine_memory_requirement_and_skip_if_not_met(ckpt_name)

is_fp8_supported, compute_capability, device_info = check_fp8_support(torch.cuda.current_device())
skip = "evo2/1b-8k:" in ckpt_name and not is_fp8_supported
if skip:
Expand Down Expand Up @@ -723,11 +753,8 @@ def test_generate_speed(
expected_tokens_sec: float,
):
is_fp8_supported, compute_capability, device_info = check_fp8_support(torch.cuda.current_device())
gb_available = torch.cuda.mem_get_info()[0] / 1024**3
if (gb_available < 38 and "1b" in ckpt_name) or (gb_available < 50 and "7b" in ckpt_name):
pytest.skip(
f"Inference API requires more than 38GB of memory for 1b models, or 50GB for 7b models. {gb_available=}"
)
determine_memory_requirement_and_skip_if_not_met(ckpt_name)

skip = "evo2/1b-8k:" in ckpt_name and not is_fp8_supported
if skip:
# This checkpoint is sensitive to FP8, so we skip it if it is not supported on the current device.
Expand Down
16 changes: 16 additions & 0 deletions sub-packages/bionemo-testing/src/bionemo/testing/torch.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,3 +61,19 @@ def recursive_assert_approx_equal(x, y, atol=1e-4, rtol=1e-4):
recursive_assert_approx_equal(x[key], y[key], atol=atol, rtol=rtol)
else:
assert x == y


def get_device_and_memory_allocated() -> str:
    """Return a multi-line report of the current CUDA device index, name, and memory usage."""
    current_device_index = torch.cuda.current_device()
    props = torch.cuda.get_device_properties(current_device_index)
    # Query free/total once so both memory lines report a consistent snapshot
    # (the original called mem_get_info() twice, which could race with allocations).
    free_bytes, total_bytes = torch.cuda.mem_get_info()
    gib = 1024**3
    # NOTE(review): the last line is labeled "max memory reserved" but reports
    # torch.cuda.max_memory_allocated(); confirm whether max_memory_reserved() was intended.
    message = f"""
    current device index: {current_device_index}
    current device uuid: {props.uuid}
    current device name: {props.name}
    memory, total on device: {total_bytes / gib:.3f} GB
    memory, available on device: {free_bytes / gib:.3f} GB
    memory allocated for tensors etc: {torch.cuda.memory_allocated() / gib:.3f} GB
    max memory reserved for tensors etc: {torch.cuda.max_memory_allocated() / gib:.3f} GB
    """
    return message
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-Apache2
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from bionemo.testing.torch import get_device_and_memory_allocated


def test_get_device_and_memory_allocated():
    """Smoke-test that the memory report contains every expected label line."""
    report = get_device_and_memory_allocated()
    assert report is not None
    expected_labels = (
        "memory, total on device",
        "memory, available on device",
        "memory allocated for tensors etc",
        "max memory reserved for tensors etc",
    )
    for label in expected_labels:
        assert label in report
Loading