Remove TE git ref

chtruong814 · chtruong814 · commit da8bb0440340 · 2026-06-05T21:20:45.000Z
Signed-off-by: Charlie Truong <chtruong@nvidia.com>
diff --git a/docker/Dockerfile b/docker/Dockerfile
@@ -14,8 +14,34 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# This CI Dockerfile supports CUDA 13 and CUDA 12 from a single BASE_IMAGE
+# build arg. The default is the recommended CUDA 13 image:
+#   nvcr.io/nvidia/cuda-dl-base:26.04-cuda13.2-devel-ubuntu24.04
+# The current recommended CUDA 12 image is:
+#   nvcr.io/nvidia/cuda-dl-base:25.06-cuda12.9-devel-ubuntu24.04
+#
+# The build derives the CUDA flavor (cu12 or cu13) and the CUDA major.minor
+# version from BASE_IMAGE (e.g. "cuda13.2"). The flavor selects the matching uv
+# extra and CUDA Python package include path; major.minor pins cuda-python. If
+# BASE_IMAGE lacks "cuda12"/"cuda13" or a major.minor, the build fails early.
+#
+# Example CUDA 13 H100+ build:
+#   docker buildx build -f docker/Dockerfile \
+#     --build-arg GPU_TARGET=h100plus .
+# Example CUDA 12 A100 build:
+#   docker buildx build -f docker/Dockerfile \
+#     --build-arg BASE_IMAGE=nvcr.io/nvidia/cuda-dl-base:25.06-cuda12.9-devel-ubuntu24.04 \
+#     --build-arg GPU_TARGET=a100 .
+#
+# GPU_TARGET controls compiled Automodel dependency tuning. "h100plus" builds
+# for SM90/SM100/SM120 and includes H100+ features such as DeepEP and
+# flash-attn-4. "a100" builds only SM80 and uses an A100-specific DeepEP patch
+# to avoid unsupported newer-GPU/NVSHMEM build paths. This keeps CI images
+# smaller and avoids compiling kernels for architectures a target image cannot
+# use.
 ARG BASE_IMAGE=nvcr.io/nvidia/cuda-dl-base:26.04-cuda13.2-devel-ubuntu24.04
 FROM ${BASE_IMAGE} AS base-image
+ARG BASE_IMAGE
 ARG UV_VERSION=0.11.14
 
 ENV DEBIAN_FRONTEND=noninteractive
@@ -47,11 +73,62 @@ curl -LsSf "https://astral.sh/uv/${UV_VERSION}/install.sh" | sh
 uv --version
 EOF
 
+RUN <<"EOF" bash -euxo pipefail
+case "${BASE_IMAGE}" in  # map the base image reference to the uv CUDA extra name
+    *cuda12*) cuda_flavor=cu12 ;;
+    *cuda13*) cuda_flavor=cu13 ;;
+    *)
+        echo "Cannot derive CUDA flavor from BASE_IMAGE='${BASE_IMAGE}'. Expected image tag containing 'cuda12' or 'cuda13'." >&2
+        exit 1
+        ;;
+esac
+cuda_major_minor="$(sed -n 's/.*cuda\([0-9][0-9]*\.[0-9][0-9]*\).*/\1/p' <<<"${BASE_IMAGE}")"
+if [[ -z "${cuda_major_minor}" ]]; then
+    echo "Cannot derive CUDA major.minor from BASE_IMAGE='${BASE_IMAGE}'. Expected image tag containing e.g. 'cuda12.9' or 'cuda13.2'." >&2
+    exit 1
+fi
+cat >/usr/local/bin/nemo-cuda-flavor <<SCRIPT  # helper that prints the derived flavor
+#!/usr/bin/env bash
+set -euo pipefail
+echo "${cuda_flavor}"
+SCRIPT
+cat >/usr/local/bin/nemo-install-cuda-python <<SCRIPT  # unquoted: ${...} expands now, \$... at run time
+#!/usr/bin/env bash
+set -euo pipefail
+cuda_major_minor="${cuda_major_minor}"
+cuda_major="\${cuda_major_minor%%.*}"
+cuda_minor="\${cuda_major_minor#*.}"
+cuda_next_minor="\$((10#\${cuda_minor} + 1))"
+uv pip install \
+    "cuda-bindings>=\${cuda_major_minor}.0,<\${cuda_major}.\${cuda_next_minor}" \
+    "cuda-python>=\${cuda_major_minor}.0,<\${cuda_major}.\${cuda_next_minor}"
+SCRIPT
+chmod +x /usr/local/bin/nemo-cuda-flavor /usr/local/bin/nemo-install-cuda-python
+echo "Derived CUDA flavor: ${cuda_flavor}"
+echo "Derived CUDA Python major.minor: ${cuda_major_minor}"
+EOF
+
 WORKDIR /workspace
 COPY pyproject.toml uv.lock /workspace/
 COPY nemo/__init__.py nemo/package_info.py /workspace/nemo/
 RUN <<"EOF" bash -ex
-uv sync --link-mode copy --locked --extra all --extra cu13 --group test
+cuda_flavor="$(nemo-cuda-flavor)"
+uv sync --link-mode copy --locked --extra all --extra "${cuda_flavor}" --group test
+nemo-install-cuda-python
+EOF
+
+RUN <<"EOF" bash -ex
+# Container-only runtime utilities. Keep these out of pyproject.toml so they do
+# not become NeMo package dependencies.
+uv pip install dill orjson
+# Install torchcodec from the PyTorch wheel index matching the derived flavor.
+cuda_flavor="$(nemo-cuda-flavor)"
+case "${cuda_flavor}" in
+    cu12) torchcodec_index=https://download.pytorch.org/whl/cu126 ;;
+    cu13) torchcodec_index=https://download.pytorch.org/whl/cu132 ;;
+    *) echo "Unsupported CUDA flavor '${cuda_flavor}' for torchcodec." >&2; exit 1 ;;
+esac
+uv pip install --index-url "${torchcodec_index}" torchcodec
 EOF
 COPY nemo /workspace/nemo
 
@@ -101,8 +178,9 @@ case "${GPU_TARGET}" in
         ;;
 esac
 
+CUDA_FLAVOR="$(nemo-cuda-flavor)"
 AUTOMODEL_CCCL_INCLUDES="/usr/local/cuda/include/cccl"
-PYTHON_CCCL_INCLUDE="${VIRTUAL_ENV}/lib/python${UV_PYTHON}/site-packages/nvidia/cu13/include/cccl"
+PYTHON_CCCL_INCLUDE="${VIRTUAL_ENV}/lib/python${UV_PYTHON}/site-packages/nvidia/${CUDA_FLAVOR}/include/cccl"
 if [[ -d "${PYTHON_CCCL_INCLUDE}" ]]; then
     AUTOMODEL_CCCL_INCLUDES="${AUTOMODEL_CCCL_INCLUDES}:${PYTHON_CCCL_INCLUDE}"
 fi
@@ -136,14 +214,16 @@ if [[ "${GPU_TARGET}" == "h100plus" ]]; then
 elif [[ "${GPU_TARGET}" == "a100" ]]; then
     automodel_extra=compiled-a100
 fi
+cuda_flavor="$(nemo-cuda-flavor)"
 uv sync \
     --inexact \
     --link-mode copy \
     --locked \
     --extra all \
-    --extra cu13 \
+    --extra "${cuda_flavor}" \
     --extra "${automodel_extra}" \
     --group test
+nemo-install-cuda-python
 
 if [[ "${GPU_TARGET}" == "a100" ]]; then
     git clone "${DEEPEP_REPO}" /opt/automodel-src/DeepEP
@@ -159,12 +239,17 @@ fi
 if [[ "${GPU_TARGET}" == "h100plus" ]]; then
     # flash-attn-4 requires apache-tvm-ffi 0.1.11, while mamba-ssm
     # currently constrains the solved environment to apache-tvm-ffi<=0.1.9.
+    cutlass_packages=(
+        "nvidia-cutlass-dsl==4.5.2"
+        "nvidia-cutlass-dsl-libs-base==4.5.2"
+    )
+    if [[ "$(nemo-cuda-flavor)" == "cu13" ]]; then
+        cutlass_packages+=("nvidia-cutlass-dsl-libs-cu13==4.5.2")
+    fi
     uv pip install \
         --no-deps \
         "apache-tvm-ffi==0.1.11" \
-        "nvidia-cutlass-dsl==4.5.2" \
-        "nvidia-cutlass-dsl-libs-base==4.5.2" \
-        "nvidia-cutlass-dsl-libs-cu13==4.5.2" \
+        "${cutlass_packages[@]}" \
         "quack-kernels==0.5.0" \
         "torch-c-dlpack-ext==0.1.5"
 
@@ -196,6 +281,18 @@ LABEL com.nvidia.build.ref="${NVIDIA_BUILD_REF}"
 ARG RC_DATE=00.00
 ARG TARGETARCH
 
+ARG INSTALL_FFMPEG=false
+RUN <<"EOF" bash -ex
+if [ "${INSTALL_FFMPEG}" = "true" ]; then  # opt-in: --build-arg INSTALL_FFMPEG=true
+    apt-get update
+    apt-get install -y --no-install-recommends ffmpeg
+    apt-get clean
+    rm -rf /var/lib/apt/lists/*
+fi
+EOF
+
+ENV NEMO_HOME="/home/TestData/nemo_home"
+
 # NOTICES.txt file points to where the OSS source code is archived
 RUN echo "This distribution includes open source which is archived at the following URL: https://opensource.nvidia.com/oss/teams/nvidia/nemo/${RC_DATE}:linux-${TARGETARCH}/index.html" > NOTICES.txt && \
     echo "For further inquiries or assistance, contact us at oss-requests@nvidia.com" >> NOTICES.txt
diff --git a/docs/source/asr/results.rst b/docs/source/asr/results.rst
@@ -332,7 +332,7 @@ Fast Conformer RNN-T (Streaming) with Prompt Feature
 
 The RNN-T-only prompt model (``EncDecRNNTBPEModelWithPrompt``) is the cache-aware streaming
 counterpart of the hybrid prompt model — same one-hot language-ID prompt mechanism, no
-auxiliary CTC head. 
+auxiliary CTC head.
 
 **Key Features:**
 
@@ -382,4 +382,3 @@ Code-Switching
    :widths: 50,50
    :header-rows: 1
 
-
diff --git a/pyproject.toml b/pyproject.toml
@@ -24,6 +24,7 @@ readme = "README.md"
 license = {file = "LICENSE"}
 requires-python = ">=3.10"
 dependencies = [
+    "aistore",
     "fsspec>=2024.12.0",
     "huggingface_hub>=0.24",
     "numba ; platform_system == 'Darwin'",
@@ -32,6 +33,7 @@ dependencies = [
     "onnx>=1.7.0",
     "scikit-learn",
     "setuptools>=70.0.0",
+    "smart-open",
     "tensorboard",
     "text-unidecode",
     "torch>=2.6.0",
@@ -87,7 +89,6 @@ core = [
     "nv_one_logger_core>=2.3.1",
     "nv_one_logger_training_telemetry>=2.3.1",
     "nv_one_logger_pytorch_lightning_integration>=2.3.1",
-    "aistore",
 ]
 
 lightning = [
@@ -194,6 +195,7 @@ all = [
     "omegaconf<=2.3",
     "torchmetrics>=0.11.0",
     "transformers",
+    "wandb",
     "webdataset>=0.2.86",
     "nv_one_logger_core>=2.3.1",
     "nv_one_logger_training_telemetry>=2.3.1",
@@ -229,11 +231,13 @@ all = [
 ]
 
 cu12 = [
+    "torch==2.12.0+cu126 ; sys_platform == 'linux'",
     "numba-cuda[cu12] ; platform_system != 'Darwin'",
-    "cuda-python>=12.6.0,<13 ; platform_system != 'Darwin'",
+    "cuda-python>=12,<13 ; platform_system != 'Darwin'",
 ]
 
 cu13 = [
+    "torch==2.12.0+cu132 ; sys_platform == 'linux'",
     "numba-cuda[cu13] ; platform_system != 'Darwin'",
     "cuda-python>=13,<14 ; platform_system != 'Darwin'",
 ]
@@ -429,21 +433,12 @@ conflicts = [
         { extra = "cu12" },
         { extra = "cu13" },
     ],
-    [
-        { extra = "cu12" },
-        { extra = "compiled" },
-    ],
-    [
-        { extra = "cu12" },
-        { extra = "compiled-a100" },
-    ],
     [
         { extra = "compiled" },
         { extra = "compiled-a100" },
     ],
 ]
 override-dependencies = [
-    "torch==2.12.0+cu132 ; sys_platform == 'linux'",
     "mlflow>=3.9.0rc0",
     "cryptography>=46.0.5",
     "wandb>=0.27.1",
@@ -466,15 +461,15 @@ no-build-isolation-package = [
 ]
 
 # --- uv configuration ---
+# Keep Torch wheel indexes explicit per CUDA extra. The pinned Automodel git
+# dependency also carries Torch source metadata; see the static metadata below.
 [tool.uv.sources]
-# Match nemo_automodel's torch index sources so uv doesn't see conflicting
-# indexes when resolving the speechlm2 extra (which pulls nemo_automodel
-# from git as a source dependency — uv treats these as workspace members).
 nemo_automodel = { git = "https://github.com/NVIDIA-NeMo/Automodel.git", rev = "9eccbb6102a260efd7cbdffa890fc57b94f94528" }
 deep_ep = { git = "https://github.com/deepseek-ai/DeepEP.git", tag = "v1.2.1" }
 torch = [
   { index = "pytorch-cpu", marker = "sys_platform != 'linux' and sys_platform != 'darwin'" },
-  { index = "pytorch-cu132", marker = "sys_platform == 'linux'" },
+  { index = "pytorch-cu126", extra = "cu12", marker = "sys_platform == 'linux'" },
+  { index = "pytorch-cu132", extra = "cu13", marker = "sys_platform == 'linux'" },
   { index = "pypi", marker = "sys_platform == 'darwin'" },
 ]
 
@@ -488,11 +483,39 @@ name = "pytorch-cpu"
 url = "https://download.pytorch.org/whl/cpu"
 explicit = true
 
+[[tool.uv.index]]
+name = "pytorch-cu126"
+url = "https://download.pytorch.org/whl/cu126"
+explicit = true
+
 [[tool.uv.index]]
 name = "pytorch-cu132"
 url = "https://download.pytorch.org/whl/cu132"
 explicit = true
 
+[[tool.uv.dependency-metadata]]
+name = "nemo-automodel"
+version = "0.4.0+9eccbb61"
+requires-python = ">=3.10"
+# The pinned Automodel git revision carries its own Torch source table. Keep
+# its core dependency metadata static here so this repo controls the CUDA wheel index.
+requires-dist = [
+    "datasets>=4.0.0",
+    "megatron-fsdp>=0.2.3",
+    "mistral-common[audio,hf-hub,image,sentencepiece]",
+    "opencv-python-headless==4.10.0.84",
+    "pybind11",
+    "pyyaml",
+    "tiktoken",
+    "torch>=2.6.0",
+    "torchdata",
+    "transformers==5.5.0",
+    "wandb",
+    "torchao",
+    "mlflow",
+    "flashoptim>=0.1.3",
+]
+
 [dependency-groups]
 test = [
     "black>=26.3.1",
diff --git a/tests/collections/common/test_lhotse_multimodal_ais_get_batch.py b/tests/collections/common/test_lhotse_multimodal_ais_get_batch.py
@@ -20,6 +20,7 @@
 
 import tarfile
 from pathlib import Path
+from unittest.mock import patch
 
 import lhotse
 import pytest
@@ -367,20 +368,23 @@ class _FakeTokenizer:
 
 @pytest.mark.unit
 def test_salm_dataset_batch_loader_enabled(monkeypatch):
-    pytest.importorskip("aistore")  # AISBatchLoader requires the aistore client.
     monkeypatch.setenv("USE_AIS_GET_BATCH", "true")
     from nemo.collections.speechlm2.data.salm_dataset import SALMDataset
 
-    ds = SALMDataset(tokenizer=_FakeTokenizer())
-    assert isinstance(ds.load_audio, AudioSamples)
-    assert ds.load_audio.use_batch_loader is True
+    with patch("nemo.collections.speechlm2.data.salm_dataset.AudioSamples") as audio_samples:
+        ds = SALMDataset(tokenizer=_FakeTokenizer())
+
+    audio_samples.assert_called_once_with(fault_tolerant=True, use_batch_loader=True, mono_downmix=True)
+    assert ds.load_audio is audio_samples.return_value
 
 
 @pytest.mark.unit
 def test_salm_dataset_batch_loader_disabled(monkeypatch):
     monkeypatch.delenv("USE_AIS_GET_BATCH", raising=False)
     from nemo.collections.speechlm2.data.salm_dataset import SALMDataset
 
-    ds = SALMDataset(tokenizer=_FakeTokenizer())
-    assert isinstance(ds.load_audio, AudioSamples)
-    assert ds.load_audio.use_batch_loader is False
+    with patch("nemo.collections.speechlm2.data.salm_dataset.AudioSamples") as audio_samples:
+        ds = SALMDataset(tokenizer=_FakeTokenizer())
+
+    audio_samples.assert_called_once_with(fault_tolerant=True, use_batch_loader=False, mono_downmix=True)
+    assert ds.load_audio is audio_samples.return_value
diff --git a/uv.lock b/uv.lock