Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .github/actions/test-template/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,6 @@ runs:
set -e

docker exec -t nemo_container_${{ github.run_id }}_${{ inputs.runner }} bash -c '\
cp -r /opt/Megatron-LM/ /workspace/ && \
bash tests/${{ inputs.test_dir }}/${{ inputs.script }}.sh && \
echo "Finished successfully." || echo "Did not finish."'
) 2>&1 | tee $DIR/err.log
Expand Down
6 changes: 0 additions & 6 deletions .github/workflows/_build_container.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,6 @@ jobs:
BASE_IMAGE=$(cat requirements/manifest.json | jq -r '."ngc-pytorch"')
TRTLLM_REPO=$(cat requirements/manifest.json | jq -r '."vcs-dependencies"."trt-llm".repo')
TRTLLM_TAG=$(cat requirements/manifest.json | jq -r '."vcs-dependencies"."trt-llm".ref')
MLM_REPO=$(cat requirements/manifest.json | jq -r '."vcs-dependencies"."megatron-lm".repo')
MLM_TAG=$(cat requirements/manifest.json | jq -r '."vcs-dependencies"."megatron-lm".ref')
TE_REPO=$(cat requirements/manifest.json | jq -r '."vcs-dependencies".transformer_engine.repo')
TE_TAG=$(cat requirements/manifest.json | jq -r '."vcs-dependencies".transformer_engine.ref')
APEX_REPO=$(cat requirements/manifest.json | jq -r '."vcs-dependencies".apex.repo')
APEX_TAG=$(cat requirements/manifest.json | jq -r '."vcs-dependencies".apex.ref')
EOF
)

Expand Down
56 changes: 0 additions & 56 deletions .github/workflows/_bump_mcore_tag.yml

This file was deleted.

2 changes: 1 addition & 1 deletion .github/workflows/build-docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ jobs:
with:
docs-directory: docs/source
sync-all: true
no-extras: "--no-extra cu12"
no-extras: "--no-extra cu13"

build-docs-summary:
needs: [pre-flight, build-docs]
Expand Down
12 changes: 9 additions & 3 deletions .github/workflows/cicd-main-speech.yml
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,10 @@ jobs:
uses: actions/checkout@v6
with:
path: ${{ github.run_id }}
- name: Checkout action
uses: actions/checkout@v6
- name: main
uses: NVIDIA/NeMo/.github/actions/test-template@main
uses: ./.github/actions/test-template
with:
runner: ${{ runner.name }}
script: ${{ matrix.script }}
Expand Down Expand Up @@ -214,8 +216,10 @@ jobs:
uses: actions/checkout@v6
with:
path: ${{ github.run_id }}
- name: Checkout action
uses: actions/checkout@v6
- name: main
uses: NVIDIA/NeMo/.github/actions/test-template@main
uses: ./.github/actions/test-template
with:
runner: ${{ runner.name }}
script: ${{ matrix.script }}
Expand Down Expand Up @@ -478,8 +482,10 @@ jobs:
uses: actions/checkout@v6
with:
path: ${{ github.run_id }}
- name: Checkout action
uses: actions/checkout@v6
- name: main
uses: NVIDIA/NeMo/.github/actions/test-template@main
uses: ./.github/actions/test-template
with:
runner: ${{ runner.name }}
script: ${{ matrix.script }}
Expand Down
12 changes: 9 additions & 3 deletions .github/workflows/cicd-main-unit-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,10 @@ jobs:
uses: actions/checkout@v6
with:
path: ${{ github.run_id }}
- name: Checkout action
uses: actions/checkout@v6
- name: main
uses: NVIDIA/NeMo/.github/actions/test-template@main
uses: ./.github/actions/test-template
with:
runner: ${{ runner.name }}
script: ${{ matrix.script }}
Expand Down Expand Up @@ -68,8 +70,10 @@ jobs:
uses: actions/checkout@v6
with:
path: ${{ github.run_id }}
- name: Checkout action
uses: actions/checkout@v6
- name: main
uses: NVIDIA/NeMo/.github/actions/test-template@main
uses: ./.github/actions/test-template
with:
runner: ${{ runner.name }}
script: ${{ matrix.script }}
Expand All @@ -94,8 +98,10 @@ jobs:
uses: actions/checkout@v6
with:
path: ${{ github.run_id }}
- name: Checkout action
uses: actions/checkout@v6
- name: main
uses: NVIDIA/NeMo/.github/actions/test-template@main
uses: ./.github/actions/test-template
with:
runner: ${{ runner.name }}
script: ${{ matrix.script }}
Expand Down
5 changes: 4 additions & 1 deletion .github/workflows/cicd-main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -216,8 +216,11 @@ jobs:
with:
path: ${{ github.run_id }}

- name: Checkout action
uses: actions/checkout@v6

- name: main
uses: NVIDIA/NeMo/.github/actions/test-template@main
uses: ./.github/actions/test-template
with:
runner: ${{ runner.name }}
script: L0_Setup_Test_Data_And_Models
Expand Down
62 changes: 0 additions & 62 deletions .github/workflows/mcore-tag-bump-bot.yml

This file was deleted.

2 changes: 1 addition & 1 deletion .github/workflows/release-docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ jobs:
ref: ${{ inputs.github-ref }}
docs-directory: docs/source
sync-all: true
no-extras: "--no-extra cu12"
no-extras: "--no-extra cu13"

publish-docs:
runs-on: ubuntu-latest
Expand Down
6 changes: 0 additions & 6 deletions .github/workflows/update-buildcache.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,12 +43,6 @@ jobs:
BASE_IMAGE=$(cat requirements/manifest.json | jq -r '."ngc-pytorch"')
TRTLLM_REPO=$(cat requirements/manifest.json | jq -r '."vcs-dependencies"."trt-llm".repo')
TRTLLM_TAG=$(cat requirements/manifest.json | jq -r '."vcs-dependencies"."trt-llm".ref')
MLM_REPO=$(cat requirements/manifest.json | jq -r '."vcs-dependencies"."megatron-lm".repo')
MLM_TAG=$(cat requirements/manifest.json | jq -r '."vcs-dependencies"."megatron-lm".ref')
TE_REPO=$(cat requirements/manifest.json | jq -r '."vcs-dependencies".transformer_engine.repo')
TE_TAG=$(cat requirements/manifest.json | jq -r '."vcs-dependencies".transformer_engine.ref')
APEX_REPO=$(cat requirements/manifest.json | jq -r '."vcs-dependencies".apex.repo')
APEX_TAG=$(cat requirements/manifest.json | jq -r '."vcs-dependencies".apex.ref')
EOF
)

Expand Down
52 changes: 17 additions & 35 deletions docker/Dockerfile.ci
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.

ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.07-py3
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:26.03-py3

FROM ${BASE_IMAGE} as base-image
ENV PIP_CONSTRAINT=""
FROM ${BASE_IMAGE}
ENV PIP_CONSTRAINT=""
ENV TRANSFORMERS_OFFLINE=0
ENV HYDRA_FULL_ERROR=1
ENV PYTHONUNBUFFERED=1
Expand All @@ -29,33 +29,8 @@ apt-get install -y bc libsox-fmt-all
apt-get clean
EOF

FROM base-image as te-wheel
WORKDIR /tmp/NeMo
ARG TE_REPO
ARG TE_TAG
RUN --mount=type=bind,source=docker/common/install_dep.sh,target=/tmp/NeMo/install_dep.sh \
--mount=type=bind,source=external/patches,target=/tmp/NeMo/external/patches <<"EOF" bash -ex

bash /tmp/NeMo/install_dep.sh --library te --mode build
ls -al /tmp/Megatron-LM || true
EOF

FROM base-image as mcore-wheel
WORKDIR /tmp/NeMo
ARG MLM_REPO
ARG MLM_TAG
RUN --mount=type=bind,source=docker/common/install_dep.sh,target=/tmp/NeMo/install_dep.sh <<"EOF" bash -ex

bash /tmp/NeMo/install_dep.sh --library mcore --mode build
ls -al /tmp/Megatron-LM || true
EOF

FROM base-image
WORKDIR /tmp/NeMo
ENV INSTALL_DIR="/opt"
RUN \
--mount=type=bind,from=te-wheel,source=/opt/wheels/te,target=/opt/wheels/te \
--mount=type=bind,from=mcore-wheel,source=/opt/wheels/mcore,target=/opt/wheels/mcore \
RUN \
--mount=type=bind,source=requirements,target=/tmp/NeMo/requirements \
--mount=type=bind,source=tools/ctc_segmentation/requirements.txt,target=/tmp/NeMo/tools/ctc_segmentation/requirements.txt \
--mount=type=bind,source=docker/common/install_dep.sh,target=/tmp/NeMo/install_dep.sh \
Expand All @@ -64,17 +39,24 @@ RUN \
--mount=type=bind,source=README.md,target=/tmp/NeMo/README.md \
--mount=type=bind,source=nemo/package_info.py,target=/tmp/NeMo/nemo/package_info.py \
--mount=type=bind,source=nemo/__init__.py,target=/tmp/NeMo/nemo/__init__.py <<"EOF" bash -ex

bash /tmp/NeMo/install_dep.sh --library te --mode install
bash /tmp/NeMo/install_dep.sh --library mcore --mode install

bash /tmp/NeMo/install_dep.sh --library extra --mode install
pip install --no-cache-dir ".[all,cu12]"
rm -rf $NEMO_DIR || true
# NGC PyTorch ships pre-release torch/torchvision/torchaudio with +nv local
# version labels. Pip's default "prefer stable" behavior would replace them
# with stable PyPI builds when resolving transitive torch>=X.Y constraints,
# breaking torchvision (which pins the exact NGC torch build). Pin the NV
# stack for this install so pip keeps the pre-installed builds.
# `|| true` guards against grep exit 1 when no match, which would abort
# the heredoc under bash -e.
pip freeze --all | grep -E '^(torch|torchvision|torchaudio|triton|pytorch-triton)==' > /tmp/nv-pinned.txt || true
echo "=== Pinning NV torch stack ==="
cat /tmp/nv-pinned.txt
echo "==============================="
PIP_CONSTRAINT=/tmp/nv-pinned.txt pip install --no-cache-dir ".[all,cu13]"
EOF


WORKDIR /workspace
ENV PYTHONPATH="${PYTHONPATH}:/workspace/Megatron-LM"
ENV NEMO_HOME="/home/TestData/nemo_home"

ARG IMAGE_LABEL
Expand Down
34 changes: 2 additions & 32 deletions docker/Dockerfile.speech
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:25.12-py3
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:26.03-py3

# build an image that includes only the nemo dependencies, ensures that dependencies
# are included first for optimal caching, and useful for building a development
Expand Down Expand Up @@ -64,31 +64,6 @@ RUN apt-get update && \

WORKDIR /workspace/

ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
ARG MCORE_TAG=338af51452a53982d202e8386db6233adad1ce86
ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
# Install megatron core, this can be removed once 0.3 pip package is released
# We leave it here in case we need to work off of a specific commit in main
RUN git clone https://github.com/NVIDIA/Megatron-LM.git && \
cd Megatron-LM && \
git checkout ${MCORE_TAG} && \
pip install .

# Performance optimizations for distributed optimizer: https://github.com/NVIDIA/apex/pull/1771
RUN git clone https://github.com/NVIDIA/apex.git && \
cd apex && \
git checkout ${APEX_TAG} && \
pip install -v --no-build-isolation --disable-pip-version-check --no-cache-dir \
--config-settings "--build-option=--cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam" ./

# Transformer Engine 1.2.0
RUN git clone https://github.com/NVIDIA/TransformerEngine.git && \
cd TransformerEngine && \
git fetch origin ${TE_TAG} && \
git checkout FETCH_HEAD && \
git submodule init && git submodule update && \
NVTE_FRAMEWORK=pytorch NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip install .

WORKDIR /tmp/

# uninstall stuff from base container
Expand Down Expand Up @@ -147,14 +122,9 @@ RUN /usr/bin/test -n "$NEMO_VERSION" && \
RUN --mount=from=nemo-src,target=/tmp/nemo,rw cd /tmp/nemo && pip install ".[all]"

# Check install
# NB: adjusting LD_LIBRARY_PATH (only here, should not be persistent!) is a temporary hack
# to avoid failure if CUDA is unavailable (`docker build` does not expose GPUs)
# The error is raised in NeMo Core, and the main reason is reinstalled Transformer-Engine;
RUN CHECK_MSG=$(export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${CUDA_HOME}/compat/lib.real && \
python -c "import nemo.collections.asr as nemo_asr" && \
python -c "import nemo.collections.nlp as nemo_nlp" && \
python -c "import nemo.collections.tts as nemo_tts" && \
python -c "import nemo_text_processing.text_normalization as text_normalization"); CHECK_CODE=$?; \
python -c "import nemo.collections.tts as nemo_tts"); CHECK_CODE=$?; \
echo ${CHECK_MSG}; \
if [ ${CHECK_CODE} -ne 0 ]; then \
echo "Import check failed"; \
Expand Down
Loading
Loading