Skip to content

Commit d208023

Browse files
authored
[BIONEMO-3530] Fix weight initialization in ESM2 (#1406)
### Description This PR fixes a bug where, when instantiating a model via `from_pretrained`, the layers that are not part of the pretrained checkpoint were not being initialized. ### Type of changes <!-- Mark the relevant option with an [x] --> - [x] Bug fix (non-breaking change which fixes an issue) - [ ] New feature (non-breaking change which adds functionality) - [ ] Refactor - [ ] Documentation update - [ ] Other (please describe): ### CI Pipeline Configuration Configure CI behavior by applying the relevant labels. By default, only basic unit tests are run. - [ciflow:skip](https://github.com/NVIDIA/bionemo-framework/blob/main/docs/docs/main/contributing/contributing.md#ciflow:skip) - Skip all CI tests for this PR - [ciflow:notebooks](https://github.com/NVIDIA/bionemo-framework/blob/main/docs/docs/main/contributing/contributing.md#ciflow:notebooks) - Run Jupyter notebooks execution tests for bionemo2 - [ciflow:slow](https://github.com/NVIDIA/bionemo-framework/blob/main/docs/docs/main/contributing/contributing.md#ciflow:slow) - Run slow single GPU integration tests marked as @pytest.mark.slow for bionemo2 - [ciflow:all](https://github.com/NVIDIA/bionemo-framework/blob/main/docs/docs/main/contributing/contributing.md#ciflow:all) - Run all tests (unit tests, slow tests, and notebooks) for bionemo2. This label can be used to enforce running tests for all bionemo2. - [ciflow:all-recipes](https://github.com/NVIDIA/bionemo-framework/blob/main/docs/docs/main/contributing/contributing.md#ciflow:all-recipes) - Run tests for all recipes (under bionemo-recipes). This label can be used to enforce running tests for all recipes. Unit tests marked as `@pytest.mark.multi_gpu` or `@pytest.mark.distributed` are not run in the PR pipeline. For more details, see [CONTRIBUTING](CONTRIBUTING.md) > [!NOTE] > By default, only basic unit tests are run. Add appropriate labels to enable additional test coverage. 
#### Authorizing CI Runs We use [copy-pr-bot](https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/#automation) to manage authorization of CI runs on NVIDIA's compute resources. - If a pull request is opened by a trusted user and contains only trusted changes, the pull request's code will automatically be copied to a pull-request/ prefixed branch in the source repository (e.g. pull-request/123) - If a pull request is opened by an untrusted user or contains untrusted changes, an NVIDIA org member must leave an `/ok to test` comment on the pull request to trigger CI. This will need to be done for each new commit. ### Pre-submit Checklist <!--- Ensure all items are completed before submitting --> - [x] I have tested these changes locally - [x] I have updated the documentation accordingly - [x] I have added/updated tests as needed - [x] All existing tests pass successfully Signed-off-by: Bruno Alvisio <balvisio@nvidia.com>
1 parent f318fe2 commit d208023

File tree

5 files changed

+53
-5
lines changed

5 files changed

+53
-5
lines changed

bionemo-recipes/models/esm2/src/esm/modeling_esm_te.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,11 @@ def init_empty_weights(self):
284284
# Meta-device init seems to break weight tying, so we re-tie the weights here.
285285
self.tie_weights()
286286

287+
@classmethod
288+
def get_init_context(cls, is_quantized: bool, _is_ds_init_called: bool):
289+
"""Override the default get_init_context method to allow for fp8 model initialization."""
290+
return []
291+
287292

288293
class NVEsmModel(NVEsmPreTrainedModel):
289294
"""The ESM Encoder-only protein language model.

bionemo-recipes/models/esm2/tests/test_meta_device_init.py

Lines changed: 33 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@
3535
from transformer_engine.pytorch.tensor import QuantizedTensor
3636
from transformers import AutoConfig, set_seed
3737

38-
from esm.modeling_esm_te import NVEsmConfig, NVEsmForMaskedLM
38+
from esm.modeling_esm_te import NVEsmConfig, NVEsmForMaskedLM, NVEsmForTokenClassification
3939

4040

4141
requires_multi_gpu = pytest.mark.skipif(
@@ -44,6 +44,10 @@
4444
)
4545

4646

47+
def msg(x):
48+
return f"Mismatch in module {name}: {x}"
49+
50+
4751
def verify_model_parameters_initialized_correctly(
4852
model: NVEsmForMaskedLM, atol=1e-3, rtol=1e-4, should_be_fp8: bool = False
4953
):
@@ -53,10 +57,6 @@ def verify_model_parameters_initialized_correctly(
5357
assert str(parameter.device).startswith("cuda"), f"Parameter {name} is not on the cuda device"
5458

5559
for name, module in model.named_modules():
56-
57-
def msg(x):
58-
return f"Mismatch in module {name}: {x}"
59-
6060
if isinstance(module, torch.nn.Embedding):
6161
torch.testing.assert_close(module.weight.mean().item(), 0.0, atol=atol, rtol=rtol, msg=msg)
6262
torch.testing.assert_close(
@@ -118,6 +118,22 @@ def msg(x):
118118
torch.testing.assert_close(module.inv_freq, expected_inv_freq, msg=msg)
119119

120120

121+
def verify_pretrained_model_sanity(model: NVEsmForTokenClassification, atol=1e-3, rtol=1e-4):
122+
for name, p in model.named_parameters():
123+
assert p.numel() > 0, f"{name} is empty"
124+
assert torch.isfinite(p).all(), f"{name} has NaN/Inf"
125+
126+
max_abs = p.abs().max().item()
127+
assert max_abs < 1e3, f"{name} extreme values: {max_abs}"
128+
129+
if name == "classifier.weight":
130+
torch.testing.assert_close(p.mean().item(), 0.0, atol=atol, rtol=rtol, msg=msg)
131+
torch.testing.assert_close(p.std().item(), model.config.initializer_range, atol=atol, rtol=rtol, msg=msg)
132+
133+
if name == "classifier.bias":
134+
torch.testing.assert_close(p, torch.zeros_like(p), msg=msg)
135+
136+
121137
def test_cuda_init():
122138
config = NVEsmConfig(**AutoConfig.from_pretrained("facebook/esm2_t6_8M_UR50D").to_dict())
123139

@@ -170,6 +186,18 @@ def test_meta_fp8_init(fp8_recipe):
170186
verify_model_parameters_initialized_correctly(model, should_be_fp8=True)
171187

172188

189+
def test_model_for_token_classification_init():
190+
config = NVEsmConfig(**AutoConfig.from_pretrained("nvidia/esm2_t6_8M_UR50D", trust_remote_code=True).to_dict())
191+
192+
set_seed(42)
193+
model = NVEsmForTokenClassification.from_pretrained(
194+
"nvidia/esm2_t6_8M_UR50D", config=config, dtype=torch.bfloat16, trust_remote_code=True
195+
)
196+
model.to("cuda")
197+
198+
verify_pretrained_model_sanity(model)
199+
200+
173201
@pytest.mark.parametrize("num_gpus", [1, pytest.param(2, marks=requires_multi_gpu)])
174202
def test_meta_device_init_after_fully_shard(num_gpus: int):
175203
cmd = [

bionemo-recipes/recipes/esm2_accelerate_te/example_8m_checkpoint/esm_nv.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,11 @@ def init_empty_weights(self):
284284
# Meta-device init seems to break weight tying, so we re-tie the weights here.
285285
self.tie_weights()
286286

287+
@classmethod
288+
def get_init_context(cls, is_quantized: bool, _is_ds_init_called: bool):
289+
"""Override the default get_init_context method to allow for fp8 model initialization."""
290+
return []
291+
287292

288293
class NVEsmModel(NVEsmPreTrainedModel):
289294
"""The ESM Encoder-only protein language model.

bionemo-recipes/recipes/esm2_native_te/example_8m_checkpoint/esm_nv.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,11 @@ def init_empty_weights(self):
284284
# Meta-device init seems to break weight tying, so we re-tie the weights here.
285285
self.tie_weights()
286286

287+
@classmethod
288+
def get_init_context(cls, is_quantized: bool, _is_ds_init_called: bool):
289+
"""Override the default get_init_context method to allow for fp8 model initialization."""
290+
return []
291+
287292

288293
class NVEsmModel(NVEsmPreTrainedModel):
289294
"""The ESM Encoder-only protein language model.

bionemo-recipes/recipes/esm2_peft_te/example_8m_checkpoint/esm_nv.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,11 @@ def init_empty_weights(self):
284284
# Meta-device init seems to break weight tying, so we re-tie the weights here.
285285
self.tie_weights()
286286

287+
@classmethod
288+
def get_init_context(cls, is_quantized: bool, _is_ds_init_called: bool):
289+
"""Override the default get_init_context method to allow for fp8 model initialization."""
290+
return []
291+
287292

288293
class NVEsmModel(NVEsmPreTrainedModel):
289294
"""The ESM Encoder-only protein language model.

0 commit comments

Comments
 (0)