Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
144 changes: 144 additions & 0 deletions .github/actions/build-container/action.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: Build container

# Inputs supplied by the calling workflow. Composite actions cannot read the
# `secrets` context themselves, so every credential has to arrive as an input.
inputs:
  azure-client-id:
    description: "Azure Client ID"
    required: true
  azure-tenant-id:
    description: "Azure Tenant ID"
    required: true
  azure-subscription-id:
    description: "Azure Subscription ID"
    required: true
  dockerfile-path:
    description: "Path to dockerfile to build"
    required: true
  has-azure-credentials:
    description: "Has Azure credentials"
    required: false
    default: "false"
  PAT:
    description: "GitHub Personal Access Token"
    required: true
  # Action inputs are always strings; `type` is not a supported key for action
  # metadata inputs (it only exists for workflow_call / workflow_dispatch), so
  # it is intentionally not declared here.
  repo-name:
    description: "The name of the repo to build container"
    required: true

# Registry host that the steps interpolate via `${{ env.container-registry }}`.
# NOTE(review): the calling workflow (.github/workflows/cicd-main.yml) declares
# the same `container-registry` value in its own top-level `env`. Confirm that a
# top-level `env` key in action metadata is actually honoured by the runner;
# if it is not, the steps only work because the workflow provides the value.
env:
container-registry: nemoci.azurecr.io

# Composite action: logs in to Azure / ACR, derives registry build-cache
# references from recently merged PRs, then builds and pushes the container
# image with Docker Buildx.
runs:
  using: "composite"
  steps:
    # Check the repository out into a NeMo-Automodel/ subdirectory.
    - name: Checkout
      uses: actions/checkout@v4
      with:
        path: NeMo-Automodel

    - name: Setup python
      uses: actions/setup-python@v5
      with:
        # Quoted so the version is always treated as a string (an unquoted
        # value such as 3.10 would be read as the number 3.1).
        python-version: "3.12"

    # Only runs on `pull-request/<N>` branches. On any other ref the step is
    # skipped, its outputs are empty, and the expressions in "Build and push"
    # fall back to PR number 0.
    - name: Get PR info
      id: get-pr-info
      if: startsWith(github.ref, 'refs/heads/pull-request/')
      uses: nv-gha-runners/get-pr-info@main

    - name: Install Azure CLI
      shell: bash
      run: |
        echo "::group::Install Azure CLI"
        # Install the Azure CLI with the installer script behind aka.ms/InstallAzureCLIDeb
        curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
        echo "::endgroup::"

    # Credentials come from the action inputs: the `secrets` context is not
    # available inside a composite action.
    - name: Azure Login
      uses: azure/login@v2
      with:
        client-id: ${{ inputs.azure-client-id }}
        tenant-id: ${{ inputs.azure-tenant-id }}
        subscription-id: ${{ inputs.azure-subscription-id }}

    - name: Azure ACR Login
      shell: bash
      run: |
        az acr login --name nemoci

    - name: Install GH CLI
      shell: bash
      run: |
        # sudo for consistency with the Azure CLI install step above, which
        # already assumes sudo is available on the runner.
        sudo apt-get update
        sudo apt-get install -y gh

    # Export REPO_LOWER (lowercased repo name) for the image references built
    # in the following steps.
    - name: Normalize repo name to lowercase
      shell: bash
      env:
        REPO: ${{ inputs.repo-name }}
      run: |
        echo "REPO_LOWER=${REPO,,}" >> "$GITHUB_ENV"

    # Emit one registry cache reference per recently merged PR (up to 100,
    # most recently updated first) as the multi-line output LAST_PRS.
    - name: Get last merged PR
      shell: bash
      id: cache_from
      env:
        GH_TOKEN: ${{ github.token }}
        # Values are handed to the script through the environment instead of
        # being expanded into the script text, so an unexpected character in
        # an input cannot be interpreted as shell syntax.
        REPO_NAME: ${{ inputs.repo-name }}
        REGISTRY: ${{ env.container-registry }}
        REPO_LOWER: ${{ env.REPO_LOWER }}
      run: |
        # -f sends the values as plain strings (no type conversion by gh).
        LAST_PRS=$(gh api graphql \
          -f owner="NVIDIA-NeMo" \
          -f name="$REPO_NAME" \
          -f query='
            query($owner: String!, $name: String!) {
              repository(owner: $owner, name: $name) {
                pullRequests(states: MERGED, first: 100, orderBy: {field: UPDATED_AT, direction: DESC}) {
                  nodes {
                    number
                  }
                }
              }
            }' | jq -r '.data.repository.pullRequests.nodes[].number' | while read -r number; do
              echo "type=registry,ref=${REGISTRY}/${REPO_LOWER}:${number}-buildcache,mode=max"
            done)

        # Multi-line step output using the `name<<DELIMITER` syntax.
        echo "LAST_PRS<<EOF" | tee -a "$GITHUB_OUTPUT"
        echo "$LAST_PRS" | tee -a "$GITHUB_OUTPUT"
        echo "EOF" | tee -a "$GITHUB_OUTPUT"

    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v3

    # Build the image and push it under two tags: the PR number (0 when no PR
    # info is available) and the commit SHA. Cache is read from this PR's
    # cache tag, the main cache tag and the merged-PR cache tags collected
    # above; it is written back to this PR's cache tag only.
    - name: Build and push
      uses: docker/build-push-action@v5
      with:
        file: ${{ inputs.dockerfile-path }}
        push: true
        context: .
        build-args: |
          BASE_IMAGE=pytorch
        cache-from: |
          type=registry,ref=${{ env.container-registry }}/${{ env.REPO_LOWER }}:${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || 0 }}-buildcache,mode=max
          type=registry,ref=${{ env.container-registry }}/${{ env.REPO_LOWER }}:main-buildcache,mode=max
          ${{ steps.cache_from.outputs.LAST_PRS }}
        cache-to: |
          type=registry,ref=${{ env.container-registry }}/${{ env.REPO_LOWER }}:${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || 0 }}-buildcache,mode=max
        no-cache: false
        tags: |
          ${{ env.container-registry }}/${{ env.REPO_LOWER }}:${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || 0 }}
          ${{ env.container-registry }}/${{ env.REPO_LOWER }}:${{ github.sha }}
        # The token is taken from the PAT input; `secrets.PAT` cannot be
        # resolved from inside a composite action.
        secrets: |
          GH_TOKEN=${{ inputs.PAT }}
33 changes: 15 additions & 18 deletions .github/actions/test-template/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,18 @@ inputs:
PAT:
description: "GitHub Personal Access Token"
required: true
container-image:
description: "Container image to use for test"
required: true

runs:
using: "composite"
steps:
- name: Checkout repository
uses: actions/checkout@v2
with:
path: NeMo-Automodel

- name: Install Azure CLI
if: ${{ inputs.has-azure-credentials == 'true' }}
shell: bash
Expand All @@ -69,6 +78,11 @@ runs:
tenant-id: ${{ inputs.azure-tenant-id }}
subscription-id: ${{ inputs.azure-subscription-id }}

# Log in to the `nemoci` container registry through the Azure CLI.
# NOTE(review): unlike the "Install Azure CLI" step above, this step has no
# `if: inputs.has-azure-credentials == 'true'` guard — confirm that `az` is
# always installed and authenticated by the time this runs.
- name: Azure ACR Login
shell: bash
run: |
az acr login --name nemoci

- name: Azure Fileshare
if: ${{ inputs.has-azure-credentials == 'true' && inputs.is-unit-test == 'false' }}
shell: bash
Expand Down Expand Up @@ -125,23 +139,6 @@ runs:
ls -al $MNT_PATH/TestData
echo "::endgroup::"

- name: Checkout repository
uses: actions/checkout@v2
with:
path: NeMo-Automodel

- name: Build container
shell: bash
env:
GH_TOKEN: ${{ inputs.PAT }}
run: |
echo "::group::Build test container"
docker system prune -af
docker build -f docker/Dockerfile \
--build-arg BASE_IMAGE=pytorch \
--target automodel_final -t automodel .
echo "::endgroup::"

- name: Start container
shell: bash
run: |
Expand Down Expand Up @@ -170,7 +167,7 @@ runs:
--volume $(pwd)/NeMo-Automodel:/workspace \
--workdir /workspace \
--volume $MNT_PATH/TestData:/home/TestData \
automodel \
${{ inputs.container-image }} \
bash -c "sleep $(( ${{ inputs.timeout }} * 60 + 60 ))"
RUN_TEST_EOF
)
Expand Down
33 changes: 32 additions & 1 deletion .github/workflows/cicd-main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ permissions:
id-token: write
contents: read

env:
container-registry: nemoci.azurecr.io

jobs:
pre-flight:
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.64.2
Expand Down Expand Up @@ -129,6 +132,31 @@ jobs:
run: |
echo "Running CI tests"

# Builds and pushes the CI container by calling the local build-container
# composite action. cicd-unit-tests lists this job in its `needs`, and the test
# jobs pass `<container-registry>/automodel:<github.sha>` as their
# `container-image`.
cicd-container-build:
needs: [pre-flight, cicd-wait-in-queue]
# Run when the needed jobs succeeded, or when pre-flight reports a CI workload
# or a forced full run — but never once the workflow run has been cancelled.
if: |
(
success()
|| needs.pre-flight.outputs.is_ci_workload == 'true'
|| needs.pre-flight.outputs.force_run_all == 'true'
)
&& !cancelled()
runs-on: self-hosted-nemo
environment: nemo-ci
steps:
# The repository must be checked out first so the local action path
# `./.github/actions/build-container` below can be resolved.
- name: Checkout
uses: actions/checkout@v4
- name: main
uses: ./.github/actions/build-container
with:
# Secrets are forwarded as inputs to the composite action.
azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
dockerfile-path: "./docker/Dockerfile"
has-azure-credentials: "true"
PAT: ${{ secrets.PAT }}
# The action lowercases this value before using it in image references.
repo-name: "Automodel"

cicd-unit-tests:
strategy:
fail-fast: false
Expand All @@ -142,7 +170,7 @@ jobs:
runner: self-hosted-nemo
cpu-only: false
timeout: 30
needs: [pre-flight, cicd-wait-in-queue]
needs: [pre-flight, cicd-wait-in-queue, cicd-container-build]
runs-on: ${{ matrix.runner }}
name: ${{ matrix.test-name }}
environment: nemo-ci
Expand All @@ -167,6 +195,7 @@ jobs:
azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
PAT: ${{ secrets.PAT }}
container-image: ${{ env.container-registry }}/automodel:${{ github.sha }}

cicd-e2e-tests:
strategy:
Expand Down Expand Up @@ -231,6 +260,8 @@ jobs:
azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
PAT: ${{ secrets.PAT }}
container-image: ${{ env.container-registry }}/automodel:${{ github.sha }}


Nemo_CICD_Test:
needs:
Expand Down
16 changes: 13 additions & 3 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -123,16 +123,26 @@ RUN if [ "$INSTALL_DEEPEP" = "True" ]; then \

FROM automodel_dep as automodel_final

WORKDIR /opt/Automodel

COPY pyproject.toml uv.lock /opt/Automodel/
COPY nemo_automodel/__init__.py nemo_automodel/package_info.py /opt/Automodel/nemo_automodel/
COPY docker/common/uv-pytorch.toml docker/common/uv-pytorch.lock /opt/Automodel/docker/common/

# Install Automodel
ARG BASE_IMAGE=cuda
ARG AUTOMODEL_INSTALL=all
COPY . /opt/Automodel
RUN cd /opt/Automodel && \
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
--mount=type=cache,target=/var/lib/apt,sharing=locked \
--mount=type=cache,target=/root/.cache/uv \
if [ "$BASE_IMAGE" = "pytorch" ]; then \
sed -i '/\[tool\.uv\]/r /opt/Automodel/docker/common/uv-pytorch.toml' pyproject.toml && \
mv /opt/Automodel/docker/common/uv-pytorch.lock /opt/Automodel/uv.lock; \
fi && \
uv sync --locked --extra $AUTOMODEL_INSTALL --all-groups
uv sync --locked --extra $AUTOMODEL_INSTALL --all-groups && \
uv cache prune

COPY . /opt/Automodel

COPY <<EOF /opt/venv/env.sh
export UV_PROJECT_ENVIRONMENT=/opt/venv
Expand Down
Loading