# BioNemo Recipes

BioNemo Recipes provides an easy path for the biological foundation model training community to scale up transformer-based models efficiently. Rather than offering a batteries-included training framework, we provide **model checkpoints** with TransformerEngine layers and **training recipes** that demonstrate how to achieve maximum throughput with popular open-source frameworks.

## Overview

The biological AI community is actively prototyping model architectures and needs tooling that prioritizes extensibility, interoperability, and ease of use alongside performance. BioNemo Recipes addresses this by offering:

- **Flexible scaling**: Scale from single-GPU prototyping to multi-node training without complex parallelism configuration
- **Framework compatibility**: Works with popular frameworks such as Hugging Face Accelerate, PyTorch Lightning, and vanilla PyTorch
- **Performance optimization**: Leverages TransformerEngine and nvFSDP for state-of-the-art training efficiency
- **Research-friendly code**: Hackable, readable implementations that researchers can easily adapt for their experiments

### Use Cases

- **Foundation Model Developers**: AI researchers and ML engineers developing novel biological foundation models who need to scale up prototypes efficiently
- **Foundation Model Customizers**: Domain scientists looking to fine-tune existing models on proprietary data for drug discovery and biological research

## Repository Structure

This repository contains two types of components:

### Models (`models/`)

Hugging Face-compatible `PreTrainedModel` classes that use TransformerEngine layers internally. These are designed to be:

- **Distributed via the Hugging Face Hub**: Pre-converted checkpoints available at [huggingface.co/nvidia](https://huggingface.co/nvidia)
- **Drop-in replacements**: Compatible with `AutoModel.from_pretrained()` without additional dependencies
- **Performance-optimized**: Leverage TransformerEngine features such as FP8 training and context parallelism

Example models include ESM-2, Geneformer, and AMPLIFY.

### Recipes (`recipes/`)

Self-contained training examples demonstrating best practices for scaling biological foundation models. Each recipe is a complete Docker container with:

- **Framework examples**: Vanilla PyTorch, Hugging Face Accelerate, PyTorch Lightning
- **Feature demonstrations**: FP8 training, nvFSDP, context parallelism, sequence packing
- **Scaling strategies**: Single-GPU to multi-node training patterns
- **Benchmarked performance**: Validated throughput and convergence metrics

Recipes are **not pip-installable packages**; they serve as reference implementations that users can adapt for their own research.
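
One of those features, sequence packing, is easy to sketch in isolation: a greedy first-fit packer that groups variable-length sequences into bins with a fixed token budget, so each batch wastes fewer padding tokens. This is an illustrative simplification, not the recipes' actual implementation:

```python
def pack_sequences(lengths, max_tokens):
    """Greedy first-fit packing: group variable-length sequences into
    bins holding at most max_tokens tokens each. Returns lists of indices."""
    bins, bin_loads = [], []
    # Visit sequences longest-first so large sequences claim bins early.
    for idx in sorted(range(len(lengths)), key=lambda i: -lengths[i]):
        for b, load in enumerate(bin_loads):
            if load + lengths[idx] <= max_tokens:
                bins[b].append(idx)
                bin_loads[b] += lengths[idx]
                break
        else:
            # No existing bin fits; open a new one.
            bins.append([idx])
            bin_loads.append(lengths[idx])
    return bins
```

For example, packing sequences of lengths `[512, 300, 200, 100]` into 512-token bins pairs the 300- and 200-token sequences while the 512-token sequence fills a bin alone.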

## Quick Start

### Using Models

```python
from transformers import AutoModel, AutoTokenizer

# Load a BioNemo model directly from Hugging Face
model = AutoModel.from_pretrained("nvidia/AMPLIFY_120M")
tokenizer = AutoTokenizer.from_pretrained("nvidia/AMPLIFY_120M")
```

### Running Recipes

```bash
# Navigate to a recipe
cd recipes/esm2_native_te_nvfsdp

# Build and run
docker build -t esm2_recipe .
docker run --rm -it --gpus all esm2_recipe python train.py
```

______________________________________________________________________

## Developer Guide

### Setting Up Development Environment

1. **Install pre-commit hooks:**

   ```bash
   pre-commit install
   ```

   Run the hooks manually with:

   ```bash
   pre-commit run --all-files
   ```

2. **Test your changes:**
   Each model and recipe has its own build and test setup following this pattern:

   ```bash
   cd models/my_model  # or recipes/my_recipe
   docker build . -t my_tag
   docker run --rm -it --gpus all my_tag pytest -v .
   ```

### Coding Guidelines

We prioritize **readability and simplicity** over comprehensive feature coverage:

- **KISS over DRY**: Clear, duplicated code is better than complex abstractions
- **Do one thing well**: Each recipe should demonstrate a specific set of features clearly rather than trying to cover everything
- **Self-contained**: Recipes must not depend on unreleased code from other parts of the repository

### Testing Strategy

We use a three-tier testing approach:

#### L0 Tests (Pre-merge)

- **Purpose**: Fast validation that the code works
- **Runtime**: Under 10 minutes, single GPU
- **Frequency**: Run automatically on PRs
- **Scope**: Basic functionality, checkpoint creation/loading

#### L1 Tests (Performance Monitoring)

- **Purpose**: Performance benchmarking and partial convergence validation
- **Runtime**: Up to 4 hours, up to 16 GPUs
- **Frequency**: Nightly or weekly
- **Scope**: Throughput metrics, scaling validation

#### L2 Tests (Release Validation)

- **Purpose**: Full convergence and large-scale validation
- **Runtime**: Multiple days, hundreds of GPUs
- **Frequency**: Monthly or before releases
- **Scope**: Complete model convergence, cross-platform validation
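
One way a tiered scheme like this can be wired up is by tagging each test with its tier and filtering by the CI run's budget. The decorator below is a hypothetical sketch for illustration, not the repository's actual mechanism (which may use pytest markers instead):

```python
TIER_BUDGET = {"l0": 0, "l1": 1, "l2": 2}  # cheaper tiers have lower rank

def tier(name):
    """Decorator that tags a test function with its tier (l0/l1/l2)."""
    def wrap(fn):
        fn.tier = name
        return fn
    return wrap

def select(tests, max_tier):
    """Return only the tests whose tier fits within the run's budget."""
    limit = TIER_BUDGET[max_tier]
    return [t for t in tests if TIER_BUDGET[getattr(t, "tier", "l0")] <= limit]

@tier("l0")
def test_checkpoint_roundtrip(): ...

@tier("l1")
def test_throughput(): ...

@tier("l2")
def test_full_convergence(): ...
```

A nightly run with `select(tests, "l1")` would pick up the L0 and L1 tests while leaving the multi-day L2 convergence test for release validation.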

### Adding New Components

#### Adding a New Model

Models should be pip-installable packages that can export checkpoints to Hugging Face. See the [models README](models/README.md) for detailed guidelines on:

- Package structure and conventions
- Checkpoint export procedures
- Testing requirements
- CI/CD integration

#### Adding a New Recipe

Recipes should be self-contained Docker environments demonstrating specific training patterns. See the [recipes README](recipes/README.md) for guidance on:

- Directory structure and naming
- Hydra configuration management
- Docker best practices
- SLURM integration examples
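
Hydra's configuration management centers on `key=value` command-line overrides addressed by dotted path (e.g. `trainer.lr=3e-4`). The merge behavior can be sketched in plain Python; this is an illustration of the convention only, not Hydra itself:

```python
import ast

def apply_overrides(config, overrides):
    """Merge Hydra-style 'a.b=value' override strings into a nested dict."""
    for item in overrides:
        path, _, raw = item.partition("=")
        keys = path.split(".")
        node = config
        # Walk (creating as needed) down to the parent of the target key.
        for key in keys[:-1]:
            node = node.setdefault(key, {})
        # Best-effort literal parsing (ints, floats, bools); else keep string.
        try:
            value = ast.literal_eval(raw)
        except (ValueError, SyntaxError):
            value = raw
        node[keys[-1]] = value
    return config
```

Running `apply_overrides({"trainer": {"lr": 1e-4}}, ["trainer.lr=3e-4", "model.hidden=768"])` replaces the learning rate and creates the missing `model` subtree, which mirrors how a recipe's defaults can be tweaked from the command line without editing YAML.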

### CI/CD Contract

All components must pass this basic validation:

```bash
docker build -t {component_tag} .
docker run --rm -it --gpus all {component_tag} pytest -v .
```

#### Running CI/CD

Run the CI/CD pipeline locally with:

```bash
./ci/build_and_test.py
```
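
A driver like this typically discovers component directories by their Dockerfile and applies the contract above to each. The sketch below shows that shape under stated assumptions; the actual `ci/build_and_test.py` may differ:

```python
import subprocess
from pathlib import Path

def find_components(repo_root):
    """Discover buildable components: any models/* or recipes/* directory
    that contains a Dockerfile."""
    root = Path(repo_root)
    return sorted(
        p.parent
        for group in ("models", "recipes")
        for p in (root / group).glob("*/Dockerfile")
    )

def build_and_test(component):
    """Apply the CI contract: docker build, then run pytest inside the image."""
    tag = component.name
    subprocess.run(["docker", "build", "-t", tag, "."], cwd=component, check=True)
    subprocess.run(
        ["docker", "run", "--rm", "--gpus", "all", tag, "pytest", "-v", "."],
        check=True,
    )
```

`find_components` returns an empty list when neither directory exists, so the same driver works in partial checkouts.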

### Performance Expectations

We aim to provide the fastest available training implementations for biological foundation models, with documented benchmarks across NVIDIA hardware (A100, H100, H200, B100, B200, etc.).

## Contributing

We welcome contributions that advance the state of biological foundation model training. Please ensure your contributions:

1. Follow our coding guidelines emphasizing clarity
2. Include appropriate tests (L0 minimum; L1/L2 as applicable)
3. Provide clear documentation and examples
4. Maintain compatibility with our supported frameworks

For detailed contribution guidelines, see the individual component READMEs:

- [Models Development Guide](models/README.md)
- [Recipes Development Guide](recipes/README.md)

## License

[Add appropriate license information]

## Support

For technical support and questions:

- Check existing issues before opening a new one
- Review our training recipes for implementation examples
- Consult the TransformerEngine and nvFSDP documentation for the underlying technologies