Merged

Commits (22)
da40a1a  br: test_tools  (broland-hat, Aug 20, 2025)
d259e74  br:  (broland-hat, Aug 20, 2025)
5027a51  br: update NeMo to 7ccb0d4 after NeMo PR14515  (broland-hat, Aug 20, 2025)
7330d87  br: get_devie_and_memory_allocated() move to torch.py [skip ci]"  (broland-hat, Aug 20, 2025)
9537a47  br: adjust memory thresholds [skip ci]  (broland-hat, Aug 20, 2025)
03ef6e6  Merge remote-tracking branch 'origin' into br_bnm2533_fix_evo2_tests_a  (broland-hat, Aug 20, 2025)
6e2a005  br: adjust sequence length cap for fixed memory requirements"  (broland-hat, Aug 20, 2025)
1c7d31d  br: run linter local  (broland-hat, Aug 21, 2025)
8f95f12  Merge branch 'main' into br_bnm2533_fix_evo2_tests_a  (broland-hat, Aug 21, 2025)
948bba3  br: linter on test_evo2.py  (broland-hat, Aug 21, 2025)
21d6b8f  br: linter for conftest.py  (broland-hat, Aug 21, 2025)
bb927fb  br: test module for coverage  (broland-hat, Aug 21, 2025)
1877b6b  br: linter for test module  (broland-hat, Aug 21, 2025)
239ecfe  Merge remote-tracking branch 'origin/main' into br_bnm2533_fix_evo2_t…  (broland-hat, Aug 21, 2025)
7ecedf3  br: enable is_fp8_supported and compute_cabability test  (broland-hat, Aug 21, 2025)
2b9ccdb  br: enable fp8 test  (broland-hat, Aug 21, 2025)
28586e5  Merge remote-tracking branch 'origin/main' into br_bnm2533_fix_evo2_t…  (broland-hat, Aug 22, 2025)
628d090  br: skipp test_generated_speed  (broland-hat, Aug 25, 2025)
320ed0a  br: skip test_golden_values_top_k_logits_and_cosine_similarity_7b  (broland-hat, Aug 25, 2025)
83f152a  br: linter  (broland-hat, Aug 25, 2025)
1f95151  Merge remote-tracking branch 'origin/main' into br_bnm2533_fix_evo2_t…  (broland-hat, Aug 25, 2025)
9e85a76  Merge remote-tracking branch 'origin/main' into br_bnm2533_fix_evo2_t…  (broland-hat, Aug 25, 2025)
2 changes: 1 addition & 1 deletion 3rdparty/NeMo
Submodule NeMo updated from ee0be1 to 7ccb0d
22 changes: 15 additions & 7 deletions sub-packages/bionemo-evo2/tests/bionemo/evo2/conftest.py
@@ -20,23 +20,31 @@
 import pytest
 import torch
 
+from bionemo.testing.torch import get_device_and_memory_allocated
+
 
 def pytest_sessionstart(session):
     """Called at the start of the test session."""
     if torch.cuda.is_available():
         torch.cuda.reset_peak_memory_stats()
-        print(f"Starting test session. Initial GPU memory: {torch.cuda.memory_allocated() / 1024**3:.3f} GB")
+        print(
+            f"""
+            sub-packages/bionemo-evo2/tests/bionemo/evo2: Starting test session
+            {get_device_and_memory_allocated()}
+            """
+        )
 
 
 def pytest_sessionfinish(session, exitstatus):
     """Called at the end of the test session."""
     if torch.cuda.is_available():
-        peak_memory = torch.cuda.max_memory_allocated()
-        final_memory = torch.cuda.memory_allocated()
-        print("\nTest session complete:")
-        print(f"  Peak GPU memory: {peak_memory / 1024**3:.3f} GB")
-        print(f"  Final GPU memory: {final_memory / 1024**3:.3f} GB")
+        print(
+            f"""
+            sub-packages/bionemo-evo2/tests/bionemo/evo2: Test session complete
+            {get_device_and_memory_allocated()}
+            """
+        )
 
 
 @pytest.fixture(autouse=True)
 def cleanup_after_test():
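The body of the autouse cleanup fixture is collapsed in the diff above. For orientation only, a minimal sketch of the usual shape of such a fixture, assuming it frees cached GPU memory between tests so per-test peaks stay comparable (an assumed body, not the PR's actual code):

import gc

import pytest
import torch


@pytest.fixture(autouse=True)
def cleanup_after_test():
    """Hypothetical sketch: run the test, then release GPU memory it cached."""
    yield  # the test runs here
    gc.collect()  # drop Python references so CUDA blocks become reclaimable
    if torch.cuda.is_available():
        torch.cuda.empty_cache()  # return cached blocks to the driver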
15 changes: 10 additions & 5 deletions sub-packages/bionemo-evo2/tests/bionemo/evo2/test_evo2.py
@@ -48,6 +48,11 @@
 logger.setLevel(logging.DEBUG)  # Capture all levels in the logger itself
 
 
+MEM_REQUIREMENT_1B_GB = 18  # add 0.6 GB to max mem reserved, and round up
+MEM_REQUIREMENT_7B_GB = 48
+
+
+
 def load_weights_sharded_inplace_nemo2_to_mcore(
     model: MegatronModelType,
     distributed_checkpoint_dir: str | Path,
@@ -365,7 +370,7 @@ def check_matchrate(*, ckpt_name, matchrate, assert_matchrate=True):
 def test_forward(sequences: list[str], ckpt_name: str, expected_matchpercents: list[float]):
     assert len(sequences) > 0
     gb_available = torch.cuda.mem_get_info()[0] / 1024**3
-    if (gb_available < 38 and "1b" in ckpt_name) or (gb_available < 50 and "7b" in ckpt_name):
+    if (gb_available < MEM_REQUIREMENT_1B_GB and "1b" in ckpt_name) or (gb_available < MEM_REQUIREMENT_7B_GB and "7b" in ckpt_name):
         pytest.skip(
-            f"Inference API requires more than 38GB of memory for 1b models, or 50GB for 7b models. {gb_available=}"
+            f"Inference API requires more than {MEM_REQUIREMENT_1B_GB}GB of memory for 1b models, or {MEM_REQUIREMENT_7B_GB}GB for 7b models. {gb_available=}"
         )
@@ -429,7 +434,7 @@ def test_forward_manual(sequences: list[str], ckpt_name: str, expected_matchperc
     is_fp8_supported, compute_capability, device_info = check_fp8_support(torch.cuda.current_device())
     skip = "evo2/1b-8k:" in ckpt_name and not is_fp8_supported
     gb_available = torch.cuda.mem_get_info()[0] / 1024**3
-    if (gb_available < 38 and flash_decode) or (gb_available < 50 and flash_decode and "7b" in ckpt_name):
+    if (gb_available < MEM_REQUIREMENT_1B_GB and flash_decode) or (gb_available < MEM_REQUIREMENT_7B_GB and flash_decode and "7b" in ckpt_name):
         pytest.skip(
-            f"Inference API requires more than 38GB of memory for 1b models, or 50GB for 7b models. {gb_available=}"
+            f"Inference API requires more than {MEM_REQUIREMENT_1B_GB}GB of memory for 1b models, or {MEM_REQUIREMENT_7B_GB}GB for 7b models. {gb_available=}"
         )
@@ -544,7 +549,7 @@ def test_batch_generate(
     assert len(sequences) > 0
     is_fp8_supported, compute_capability, device_info = check_fp8_support(torch.cuda.current_device())
     gb_available = torch.cuda.mem_get_info()[0] / 1024**3
-    if (gb_available < 38 and "1b" in ckpt_name) or (gb_available < 50 and "7b" in ckpt_name):
+    if (gb_available < MEM_REQUIREMENT_1B_GB and "1b" in ckpt_name) or (gb_available < MEM_REQUIREMENT_7B_GB and "7b" in ckpt_name):
         pytest.skip(
-            f"Inference API requires more than 38GB of memory for 1b models, or 50GB for 7b models. {gb_available=}"
+            f"Inference API requires more than {MEM_REQUIREMENT_1B_GB}GB of memory for 1b models, or {MEM_REQUIREMENT_7B_GB}GB for 7b models. {gb_available=}"
         )
@@ -615,7 +620,7 @@ def test_batch_generate_coding_sequences(
 ):
     assert len(coding_sequences) > 0
     gb_available = torch.cuda.mem_get_info()[0] / 1024**3
-    if (gb_available < 38 and "1b" in ckpt_name) or (gb_available < 50 and "7b" in ckpt_name):
+    if (gb_available < MEM_REQUIREMENT_1B_GB and "1b" in ckpt_name) or (gb_available < MEM_REQUIREMENT_7B_GB and "7b" in ckpt_name):
         pytest.skip(
-            f"Inference API requires more than 38GB of memory for 1b models, or 50GB for 7b models. {gb_available=}"
+            f"Inference API requires more than {MEM_REQUIREMENT_1B_GB}GB of memory for 1b models, or {MEM_REQUIREMENT_7B_GB}GB for 7b models. {gb_available=}"
         )
@@ -724,7 +729,7 @@ def test_generate_speed(
 ):
     is_fp8_supported, compute_capability, device_info = check_fp8_support(torch.cuda.current_device())
     gb_available = torch.cuda.mem_get_info()[0] / 1024**3
-    if (gb_available < 38 and "1b" in ckpt_name) or (gb_available < 50 and "7b" in ckpt_name):
+    if (gb_available < MEM_REQUIREMENT_1B_GB and "1b" in ckpt_name) or (gb_available < MEM_REQUIREMENT_7B_GB and "7b" in ckpt_name):
         pytest.skip(
-            f"Inference API requires more than 38GB of memory for 1b models, or 50GB for 7b models. {gb_available=}"
+            f"Inference API requires more than {MEM_REQUIREMENT_1B_GB}GB of memory for 1b models, or {MEM_REQUIREMENT_7B_GB}GB for 7b models. {gb_available=}"
         )
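The same free-memory gate is now repeated in five tests. A sketch of how it could be factored into a shared helper; skip_if_low_gpu_memory is a hypothetical name, not part of this PR:

import pytest
import torch

MEM_REQUIREMENT_1B_GB = 18
MEM_REQUIREMENT_7B_GB = 48


def skip_if_low_gpu_memory(ckpt_name: str) -> None:
    """Skip the calling test when free GPU memory is below the model's threshold."""
    gb_available = torch.cuda.mem_get_info()[0] / 1024**3  # free bytes -> GB
    required = MEM_REQUIREMENT_7B_GB if "7b" in ckpt_name else MEM_REQUIREMENT_1B_GB
    if gb_available < required:
        pytest.skip(f"Requires at least {required} GB of free GPU memory. {gb_available=:.1f}")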
18 changes: 18 additions & 0 deletions sub-packages/bionemo-testing/src/bionemo/testing/torch.py
@@ -61,3 +61,21 @@ def recursive_assert_approx_equal(x, y, atol=1e-4, rtol=1e-4):
             recursive_assert_approx_equal(x[key], y[key], atol=atol, rtol=rtol)
     else:
         assert x == y
+
+
+def get_device_and_memory_allocated() -> str:
+    """Return the current device index, uuid, name, and memory usage as a formatted string."""
+    current_device_index = torch.cuda.current_device()
+    props = torch.cuda.get_device_properties(current_device_index)
+    message = (
+        f"""
+        current device index: {current_device_index}
+        current device uuid: {props.uuid}
+        current device name: {props.name}
+        memory, total on device: {torch.cuda.mem_get_info()[1] / 1024**3:.3f} GB
+        memory, available on device: {torch.cuda.mem_get_info()[0] / 1024**3:.3f} GB
+        memory allocated for tensors etc: {torch.cuda.memory_allocated() / 1024**3:.3f} GB
+        peak memory allocated for tensors etc: {torch.cuda.max_memory_allocated() / 1024**3:.3f} GB
+        """
+    )
+    return message
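For context, a minimal sketch of exercising the new helper around an allocation; the tensor and its size are illustrative, not from the PR:

import torch

from bionemo.testing.torch import get_device_and_memory_allocated

if torch.cuda.is_available():
    torch.cuda.reset_peak_memory_stats()  # start peak tracking from a clean slate
    x = torch.empty(1024, 1024, device="cuda")  # ~4 MB fp32, purely illustrative
    print(get_device_and_memory_allocated())  # device index/uuid/name plus memory figures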