Skip to content

Commit 7dd1659

Browse files
authored
Bump recipes base container to 25.12 (#1399)
Also requires using TOT megatron-fsdp to avoid the error ``` File "/usr/local/lib/python3.12/dist-packages/megatron_fsdp/utils.py", line 168, in get_mesh_names for child_mesh, root_mesh in _mesh_resources.child_to_root_mapping.items() ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ AttributeError: '_MeshEnv' object has no attribute 'child_to_root_mapping' ``` that was fixed in NVIDIA/Megatron-LM#2575 --------- Signed-off-by: Peter St. John <pstjohn@nvidia.com>
1 parent bcb127b commit 7dd1659

File tree

20 files changed

+37
-32
lines changed

20 files changed

+37
-32
lines changed

.devcontainer/recipes/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# Uncomment to use the latest TE from the NGC registry for debugging changes with latest TE.
22
# FROM gitlab-master.nvidia.com/dl/transformerengine/transformerengine:main-pytorch-py3-base
3-
FROM nvcr.io/nvidia/pytorch:25.11-py3
3+
FROM nvcr.io/nvidia/pytorch:25.12-py3
44

55
# FIXME: Fix for "No such file or directory: /workspace/TransformerEngine"
66
# Remove once bug has been addressed in the nvidia/pytorch container.

.devcontainer/recipes/requirements.txt

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
1-
accelerate @ git+https://github.com/huggingface/accelerate.git # Until huggingface/accelerate#3852 is released.
1+
accelerate
22
datasets
33
deepspeed
44
hydra-core
5-
megatron-fsdp
5+
# TOT megatron-fsdp until NVIDIA/Megatron-LM#2575 is in a release.
6+
megatron-fsdp @ git+https://github.com/NVIDIA/Megatron-LM.git@main#subdirectory=megatron/core/distributed/fsdp/src
67
peft
8+
pytest
79
torch
810
torchao!=0.14.0
911
torchdata

.github/workflows/unit-tests-recipes.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -93,8 +93,8 @@ jobs:
9393
# Currently, AMPLIFY is the only folder that needs a custom base image, since we have to support both TE and
9494
# xformers-based models for golden value testing. The rest of the models use the default pytorch image.
9595
96-
# This uses a squashed version of the pytorch:25.11-py3 image, generated with `docker-squash
97-
# nvcr.io/nvidia/pytorch:25.11-py3 -t svcbionemo023/bionemo-framework:pytorch25.11-py3-squashed --output
96+
# This uses a squashed version of the pytorch:25.12-py3 image, generated with `docker-squash
97+
# nvcr.io/nvidia/pytorch:25.12-py3 -t svcbionemo023/bionemo-framework:pytorch25.12-py3-squashed --output
9898
# type=registry,compression=zstd,force-compression=true,oci-mediatypes=true,compression-level=15` and pushed
9999
# to the dockerhub registry. Our github actions are able to cache image pulls from dockerhub but not nvcr, so
100100
# hopefully this cuts down slightly on CI time at the expense of having a slightly indirect image location.
@@ -107,8 +107,8 @@ jobs:
107107
if . == "bionemo-recipes/models/amplify" then
108108
"svcbionemo023/bionemo-framework:amplify-model-devcontainer-082025"
109109
else
110-
# "nvcr.io/nvidia/pytorch:25.11-py3"
111-
"svcbionemo023/bionemo-framework:pytorch25.11-py3-squashed"
110+
# "nvcr.io/nvidia/pytorch:25.12-py3"
111+
"svcbionemo023/bionemo-framework:pytorch25.12-py3-squashed"
112112
end
113113
)
114114
})

bionemo-recipes/models/esm2/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
FROM nvcr.io/nvidia/pytorch:25.11-py3
1+
FROM nvcr.io/nvidia/pytorch:25.12-py3
22
WORKDIR /workspace/bionemo
33
COPY . .
44
RUN --mount=type=cache,target=/root/.cache/pip \

bionemo-recipes/models/esm2/pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,8 @@ dependencies = [
1212
"datasets",
1313
"hydra-core",
1414
"jinja2",
15-
"megatron-fsdp",
15+
# TOT megatron-fsdp until NVIDIA/Megatron-LM#2575 is in a release.
16+
"megatron-fsdp @ git+https://github.com/NVIDIA/Megatron-LM.git@main#subdirectory=megatron/core/distributed/fsdp/src",
1617
"omegaconf",
1718
"peft",
1819
"pytest",

bionemo-recipes/models/geneformer/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
FROM nvcr.io/nvidia/pytorch:25.11-py3
1+
FROM nvcr.io/nvidia/pytorch:25.12-py3
22
WORKDIR /workspace/bionemo
33
COPY . .
44
RUN --mount=type=cache,target=/root/.cache/pip \

bionemo-recipes/models/geneformer/tests/test_checkpoints_modeling_bert.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
import pytest
1919
import torch
20-
from transformers import AutoModelForMaskedLM
20+
from transformers import AutoModelForMaskedLM, set_seed
2121

2222

2323
def load_geneformer_model(model_name):
@@ -77,7 +77,9 @@ def load_geneformer_model(model_name):
7777
def test_geneformer_checkpoint_loss(model_variant, input_data):
7878
"""Test that the TE model can process input data and produce valid loss outputs."""
7979

80-
model_name, model_info = model_variant
80+
set_seed(42)
81+
82+
model_name, _ = model_variant
8183

8284
# Load the specific Geneformer checkpoint from Hugging Face
8385
model_hf = load_geneformer_model(model_name)
@@ -112,9 +114,9 @@ def test_geneformer_checkpoint_loss(model_variant, input_data):
112114
torch.testing.assert_close(
113115
te_outputs.loss,
114116
hf_outputs.loss,
115-
atol=1e-3,
117+
atol=1e-2,
116118
rtol=1e-3,
117-
msg=f"TE loss ({te_outputs.loss:.4f}) and HF loss ({hf_outputs.loss:.4f}) should be close",
119+
msg=lambda x: f"TE loss ({te_outputs.loss:.4f}) and HF loss ({hf_outputs.loss:.4f}) should be close: {x}",
118120
)
119121

120122
# Clean up

bionemo-recipes/recipes/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ recipes/{recipe_name}/
8585
Your `Dockerfile` should create a complete, reproducible training environment:
8686

8787
```dockerfile
88-
FROM nvcr.io/nvidia/pytorch:25.11-py3
88+
FROM nvcr.io/nvidia/pytorch:25.12-py3
8989

9090
# Install dependencies with caching for faster builds
9191
RUN --mount=type=cache,target=/root/.cache/pip \

bionemo-recipes/recipes/esm2_accelerate_te/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
FROM nvcr.io/nvidia/pytorch:25.11-py3
1+
FROM nvcr.io/nvidia/pytorch:25.12-py3
22

33
RUN --mount=type=cache,target=/root/.cache/pip \
44
--mount=type=bind,source=requirements.txt,target=/requirements.txt \

bionemo-recipes/recipes/esm2_native_te/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# syntax=docker/dockerfile:1.4
2-
FROM nvcr.io/nvidia/pytorch:25.11-py3
2+
FROM nvcr.io/nvidia/pytorch:25.12-py3
33

44
RUN --mount=type=cache,target=/root/.cache/pip \
55
--mount=type=bind,source=requirements.txt,target=/requirements.txt \

0 commit comments

Comments
 (0)