Skip to content
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
da40a1a
br: test_tools
broland-hat Aug 20, 2025
d259e74
br:
broland-hat Aug 20, 2025
5027a51
br: update NeMo to 7ccb0d4 after NeMo PR14515
broland-hat Aug 20, 2025
7330d87
br: get_device_and_memory_allocated() move to torch.py [skip ci]
broland-hat Aug 20, 2025
9537a47
br: adjust memory thresholds [skip ci]
broland-hat Aug 20, 2025
03ef6e6
Merge remote-tracking branch 'origin' into br_bnm2533_fix_evo2_tests_a
broland-hat Aug 20, 2025
6e2a005
br: adjust sequence length cap for fixed memory requirements"
broland-hat Aug 20, 2025
1c7d31d
br: run linter local
broland-hat Aug 21, 2025
8f95f12
Merge branch 'main' into br_bnm2533_fix_evo2_tests_a
broland-hat Aug 21, 2025
948bba3
br: linter on test_evo2.py
broland-hat Aug 21, 2025
21d6b8f
br: linter for conftest.py
broland-hat Aug 21, 2025
bb927fb
br: test module for coverage
broland-hat Aug 21, 2025
1877b6b
br: linter for test module
broland-hat Aug 21, 2025
239ecfe
Merge remote-tracking branch 'origin/main' into br_bnm2533_fix_evo2_t…
broland-hat Aug 21, 2025
7ecedf3
br: enable is_fp8_supported and compute_capability test
broland-hat Aug 21, 2025
2b9ccdb
br: enable fp8 test
broland-hat Aug 21, 2025
28586e5
Merge remote-tracking branch 'origin/main' into br_bnm2533_fix_evo2_t…
broland-hat Aug 22, 2025
628d090
br: skip test_generate_speed
broland-hat Aug 25, 2025
320ed0a
br: skip test_golden_values_top_k_logits_and_cosine_similarity_7b
broland-hat Aug 25, 2025
83f152a
br: linter
broland-hat Aug 25, 2025
1f95151
Merge remote-tracking branch 'origin/main' into br_bnm2533_fix_evo2_t…
broland-hat Aug 25, 2025
9e85a76
Merge remote-tracking branch 'origin/main' into br_bnm2533_fix_evo2_t…
broland-hat Aug 25, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion 3rdparty/NeMo
Submodule NeMo updated from f4f22a to 7ccb0d
20 changes: 14 additions & 6 deletions sub-packages/bionemo-evo2/tests/bionemo/evo2/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,22 +20,30 @@
import pytest
import torch

from bionemo.testing.torch import get_device_and_memory_allocated


def pytest_sessionstart(session):
    """Reset GPU peak-memory stats and log device state at the start of the test session.

    Args:
        session: The pytest session object (unused).
    """
    if torch.cuda.is_available():
        # Reset so the peak reported in pytest_sessionfinish covers only this session.
        torch.cuda.reset_peak_memory_stats()
        # Fixed path typo: directory is tests/bionemo/evo2, not tests/bionemoe/evo2.
        print(
            f"""
            sub-packages/bionemo-evo2/tests/bionemo/evo2: Starting test session
            {get_device_and_memory_allocated()}
            """
        )


def pytest_sessionfinish(session, exitstatus):
    """Log device and memory state at the end of the test session.

    Args:
        session: The pytest session object (unused).
        exitstatus: The exit status of the test run (unused).
    """
    if torch.cuda.is_available():
        # Fixed path typo: directory is tests/bionemo/evo2, not tests/bionemoe/evo2.
        print(
            f"""
            sub-packages/bionemo-evo2/tests/bionemo/evo2: Test session complete
            {get_device_and_memory_allocated()}
            """
        )


@pytest.fixture(autouse=True)
Expand Down
81 changes: 54 additions & 27 deletions sub-packages/bionemo-evo2/tests/bionemo/evo2/test_evo2.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,44 @@
logger.setLevel(logging.DEBUG) # Capture all levels in the logger itself


def determine_memory_requirement_and_skip_if_not_met(ckpt_name: str, flash_decode: bool | None = None) -> int:
"""Determine the memory requirement for a given checkpoint and flash decode condition.
ckpt_name : str
the name of the checkpoint to test
flash_decode: bool | None
whether to test with flash decode
Returns:
The input sequence length cap, for the model sin the checkpoint, given certain memory requirements.
If the memory requirement is not met, the test is skipped.
"""

if "1b" in ckpt_name:
model_size = "1b"
seq_len_cap = 6000
memory_needed_by_test = 17 # max reserved rounded up, for stand-alone test
elif "7b" in ckpt_name:
model_size = "7b"
seq_len_cap = 4000
memory_needed_by_test = 32 # max reserved rounded up, for stand-alone test
else:
raise ValueError(f"{ckpt_name=} is not supported for testing")

skip_condition_flash = flash_decode is None or flash_decode
gb_available = torch.cuda.mem_get_info()[0] / 1024**3
skip_condition = gb_available < memory_needed_by_test and skip_condition_flash

if skip_condition:
pytest.skip(
", ".join(
[
f"Inference API requires at least {memory_needed_by_test}GB of available memory for {model_size} models",
f"{gb_available=}",
]
)
)
return seq_len_cap


def load_weights_sharded_inplace_nemo2_to_mcore(
model: MegatronModelType,
distributed_checkpoint_dir: str | Path,
Expand Down Expand Up @@ -181,6 +219,7 @@ def test_golden_values_top_k_logits_and_cosine_similarity_7b(seq_len: int = 8_19
outputs = model(input_ids=input_ids, position_ids=position_ids, attention_mask=attention_mask)
gold_standard_no_fp8_tensor = torch.load(gold_standard_no_fp8).to(device=outputs.device, dtype=outputs.dtype)
is_fp8_supported, compute_capability, device_info = check_fp8_support(device.index)

if is_fp8_supported and compute_capability == "9.0":
# Most rigurous assertion for output equivalence currently works on devices that are new enough to
# support FP8.
Expand Down Expand Up @@ -364,11 +403,8 @@ def check_matchrate(*, ckpt_name, matchrate, assert_matchrate=True):
)
def test_forward(sequences: list[str], ckpt_name: str, expected_matchpercents: list[float]):
assert len(sequences) > 0
gb_available = torch.cuda.mem_get_info()[0] / 1024**3
if (gb_available < 38 and "1b" in ckpt_name) or (gb_available < 50 and "7b" in ckpt_name):
pytest.skip(
f"Inference API requires more than 38GB of memory for 1b models, or 50GB for 7b models. {gb_available=}"
)
seq_len_cap = determine_memory_requirement_and_skip_if_not_met(ckpt_name)

is_fp8_supported, compute_capability, device_info = check_fp8_support(torch.cuda.current_device())
skip = "evo2/1b-8k:" in ckpt_name and not is_fp8_supported
if skip:
Expand All @@ -380,7 +416,7 @@ def test_forward(sequences: list[str], ckpt_name: str, expected_matchpercents: l
)
matchrates = []
for seq in sequences:
seq = seq[:6000] # TODO: artificial limit, megatron uses more memory. Vortex can process full sequences
seq = seq[:seq_len_cap] # TODO: artificial limit, megatron uses more memory. Vortex can process full sequences
with torch.no_grad():
device = torch.cuda.current_device()
tokens = torch.tensor([mcore_tokenizer.tokenize(seq)], device=device)
Expand Down Expand Up @@ -426,13 +462,11 @@ def test_forward(sequences: list[str], ckpt_name: str, expected_matchpercents: l
)
def test_forward_manual(sequences: list[str], ckpt_name: str, expected_matchpercents: list[float], flash_decode: bool):
assert len(sequences) > 0
seq_len_cap = determine_memory_requirement_and_skip_if_not_met(ckpt_name, flash_decode)

is_fp8_supported, compute_capability, device_info = check_fp8_support(torch.cuda.current_device())
skip = "evo2/1b-8k:" in ckpt_name and not is_fp8_supported
gb_available = torch.cuda.mem_get_info()[0] / 1024**3
if (gb_available < 38 and flash_decode) or (gb_available < 50 and flash_decode and "7b" in ckpt_name):
pytest.skip(
f"Inference API requires more than 38GB of memory for 1b models, or 50GB for 7b models. {gb_available=}"
)

vortex_style_fp8 = is_fp8_supported and "bf16" not in ckpt_name
if skip:
# This checkpoint is sensitive to FP8, so we skip it if it is not supported on the current device.
Expand Down Expand Up @@ -479,7 +513,9 @@ def test_forward_manual(sequences: list[str], ckpt_name: str, expected_matchperc
forward_kwargs = {}
matchrates = []
for seq in sequences:
seq = seq[:6000] # TODO: artificial limit, megatron uses more memory. Vortex can process full sequences
seq = seq[
:seq_len_cap
] # TODO: artificial limit, megatron uses more memory. Vortex can process full sequences
with torch.no_grad():
device = torch.cuda.current_device()
# tokens = torch.tensor([tokenizer.tokenize(seq)], device=device)
Expand Down Expand Up @@ -542,12 +578,9 @@ def test_batch_generate(
sequences: list[str], ckpt_name: str, model_tokenizer_provider: Callable, expected_matchpercents: list[float]
):
assert len(sequences) > 0
determine_memory_requirement_and_skip_if_not_met(ckpt_name)

is_fp8_supported, compute_capability, device_info = check_fp8_support(torch.cuda.current_device())
gb_available = torch.cuda.mem_get_info()[0] / 1024**3
if (gb_available < 38 and "1b" in ckpt_name) or (gb_available < 50 and "7b" in ckpt_name):
pytest.skip(
f"Inference API requires more than 38GB of memory for 1b models, or 50GB for 7b models. {gb_available=}"
)
skip = "evo2/1b-8k:" in ckpt_name and not is_fp8_supported
if skip:
# This checkpoint is sensitive to FP8, so we skip it if it is not supported on the current device.
Expand Down Expand Up @@ -614,11 +647,8 @@ def test_batch_generate_coding_sequences(
expected_matchpercents: list[float],
):
assert len(coding_sequences) > 0
gb_available = torch.cuda.mem_get_info()[0] / 1024**3
if (gb_available < 38 and "1b" in ckpt_name) or (gb_available < 50 and "7b" in ckpt_name):
pytest.skip(
f"Inference API requires more than 38GB of memory for 1b models, or 50GB for 7b models. {gb_available=}"
)
determine_memory_requirement_and_skip_if_not_met(ckpt_name)

is_fp8_supported, compute_capability, device_info = check_fp8_support(torch.cuda.current_device())
skip = "evo2/1b-8k:" in ckpt_name and not is_fp8_supported
if skip:
Expand Down Expand Up @@ -723,11 +753,8 @@ def test_generate_speed(
expected_tokens_sec: float,
):
is_fp8_supported, compute_capability, device_info = check_fp8_support(torch.cuda.current_device())
gb_available = torch.cuda.mem_get_info()[0] / 1024**3
if (gb_available < 38 and "1b" in ckpt_name) or (gb_available < 50 and "7b" in ckpt_name):
pytest.skip(
f"Inference API requires more than 38GB of memory for 1b models, or 50GB for 7b models. {gb_available=}"
)
determine_memory_requirement_and_skip_if_not_met(ckpt_name)

skip = "evo2/1b-8k:" in ckpt_name and not is_fp8_supported
if skip:
# This checkpoint is sensitive to FP8, so we skip it if it is not supported on the current device.
Expand Down
16 changes: 16 additions & 0 deletions sub-packages/bionemo-testing/src/bionemo/testing/torch.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,3 +61,19 @@ def recursive_assert_approx_equal(x, y, atol=1e-4, rtol=1e-4):
recursive_assert_approx_equal(x[key], y[key], atol=atol, rtol=rtol)
else:
assert x == y


def get_device_and_memory_allocated() -> str:
    """Return a multi-line report of the current CUDA device index, name, and memory usage."""
    current_device_index = torch.cuda.current_device()
    props = torch.cuda.get_device_properties(current_device_index)
    # Query free/total once so both memory lines report a consistent snapshot
    # (the original called mem_get_info() twice, which could race with allocations).
    free_bytes, total_bytes = torch.cuda.mem_get_info()
    gib = 1024**3
    # NOTE(review): the last line is labeled "max memory reserved" but reports
    # torch.cuda.max_memory_allocated(); confirm whether max_memory_reserved() was intended.
    message = f"""
    current device index: {current_device_index}
    current device uuid: {props.uuid}
    current device name: {props.name}
    memory, total on device: {total_bytes / gib:.3f} GB
    memory, available on device: {free_bytes / gib:.3f} GB
    memory allocated for tensors etc: {torch.cuda.memory_allocated() / gib:.3f} GB
    max memory reserved for tensors etc: {torch.cuda.max_memory_allocated() / gib:.3f} GB
    """
    return message
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-Apache2
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from bionemo.testing.torch import get_device_and_memory_allocated


def test_get_device_and_memory_allocated():
    """Smoke-test that the memory report contains every expected label line."""
    report = get_device_and_memory_allocated()
    assert report is not None
    expected_labels = (
        "memory, total on device",
        "memory, available on device",
        "memory allocated for tensors etc",
        "max memory reserved for tensors etc",
    )
    for label in expected_labels:
        assert label in report
Loading