Skip to content

Commit da8bb04

Browse files
committed
Remove TE git ref
Signed-off-by: Charlie Truong <chtruong@nvidia.com>
2 parents b2cb7fc + b7e0998 commit da8bb04

5 files changed

Lines changed: 2164 additions & 760 deletions

File tree

docker/Dockerfile

Lines changed: 103 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,34 @@
1414
# See the License for the specific language governing permissions and
1515
# limitations under the License.
1616

17+
# This CI Dockerfile supports CUDA 13 and CUDA 12 from a single BASE_IMAGE
18+
# build arg. The default is the recommended CUDA 13 image:
19+
# nvcr.io/nvidia/cuda-dl-base:26.04-cuda13.2-devel-ubuntu24.04
20+
# The current recommended CUDA 12 image is:
21+
# nvcr.io/nvidia/cuda-dl-base:25.06-cuda12.9-devel-ubuntu24.04
22+
#
23+
# The build derives CUDA_FLAVOR internally from BASE_IMAGE by matching
24+
# "cuda13" or "cuda12" in the image tag. That flavor selects the matching uv
25+
# extra (cu13 or cu12) and CUDA Python package include path. If BASE_IMAGE does
26+
# not contain either token, the build fails early.
27+
#
28+
# Example CUDA 13 H100+ build:
29+
# docker buildx build -f docker/Dockerfile \
30+
# --build-arg GPU_TARGET=h100plus .
31+
# Example CUDA 12 A100 build:
32+
# docker buildx build -f docker/Dockerfile \
33+
# --build-arg BASE_IMAGE=nvcr.io/nvidia/cuda-dl-base:25.06-cuda12.9-devel-ubuntu24.04 \
34+
# --build-arg GPU_TARGET=a100 .
35+
#
36+
# GPU_TARGET controls compiled Automodel dependency tuning. "h100plus" builds
37+
# for SM90/SM100/SM120 and includes H100+ features such as DeepEP and
38+
# flash-attn-4. "a100" builds only SM80 and uses an A100-specific DeepEP patch
39+
# to avoid unsupported newer-GPU/NVSHMEM build paths. This keeps CI images
40+
# smaller and avoids compiling kernels for architectures a target image cannot
41+
# use.
1742
ARG BASE_IMAGE=nvcr.io/nvidia/cuda-dl-base:26.04-cuda13.2-devel-ubuntu24.04
FROM ${BASE_IMAGE} AS base-image

# An ARG declared before FROM is only visible to FROM lines. Redeclare it
# (without a default) so RUN steps in this stage can read the image name.
ARG BASE_IMAGE
# Version of the uv installer script downloaded from astral.sh.
ARG UV_VERSION=0.11.14

ENV DEBIAN_FRONTEND=noninteractive
@@ -47,11 +73,62 @@ curl -LsSf "https://astral.sh/uv/${UV_VERSION}/install.sh" | sh
4773
uv --version
4874
EOF
4975

76+
# Derive the CUDA flavor and CUDA major.minor from BASE_IMAGE once, then bake
# both into two helper commands that later build steps call:
#   nemo-cuda-flavor           prints "cu12" or "cu13"
#   nemo-install-cuda-python   installs cuda-bindings and cuda-python
#                              constrained to the image's CUDA major.minor
RUN <<"EOF" bash -euxo pipefail
# cuda12 is tested before cuda13, so an image name containing both tokens
# resolves to cu12.
if [[ "${BASE_IMAGE}" == *cuda12* ]]; then
    cuda_flavor=cu12
elif [[ "${BASE_IMAGE}" == *cuda13* ]]; then
    cuda_flavor=cu13
else
    echo "Cannot derive CUDA flavor from BASE_IMAGE='${BASE_IMAGE}'. Expected image tag containing 'cuda12' or 'cuda13'."
    exit 1
fi

# Extract "<major>.<minor>" from the "cuda<major>.<minor>" token of the image
# name; sed prints nothing when the token is absent.
cuda_major_minor="$(sed -n 's/.*cuda\([0-9][0-9]*\.[0-9][0-9]*\).*/\1/p' <<<"${BASE_IMAGE}")"
[[ -n "${cuda_major_minor}" ]] || {
    echo "Cannot derive CUDA major.minor from BASE_IMAGE='${BASE_IMAGE}'. Expected image tag containing e.g. 'cuda12.9' or 'cuda13.2'."
    exit 1
}

# The inner heredocs are unquoted: ${cuda_flavor} and ${cuda_major_minor} are
# expanded now and frozen into the helpers, while \$-escaped references are
# left for the helper itself to evaluate when it runs.
cat >/usr/local/bin/nemo-cuda-flavor <<SCRIPT
#!/usr/bin/env bash
set -euo pipefail
echo "${cuda_flavor}"
SCRIPT

cat >/usr/local/bin/nemo-install-cuda-python <<SCRIPT
#!/usr/bin/env bash
set -euo pipefail
cuda_major_minor="${cuda_major_minor}"
cuda_major="\${cuda_major_minor%%.*}"
cuda_minor="\${cuda_major_minor#*.}"
cuda_next_minor="\$((cuda_minor + 1))"
uv pip install \
    "cuda-bindings>=\${cuda_major_minor}.0,<\${cuda_major}.\${cuda_next_minor}" \
    "cuda-python>=\${cuda_major_minor}.0,<\${cuda_major}.\${cuda_next_minor}"
SCRIPT

for helper in /usr/local/bin/nemo-cuda-flavor /usr/local/bin/nemo-install-cuda-python; do
    chmod +x "${helper}"
done

echo "Derived CUDA flavor: ${cuda_flavor}"
echo "Derived CUDA Python major.minor: ${cuda_major_minor}"
EOF
110+
50111
WORKDIR /workspace
# Only the dependency manifests and package metadata are copied here; the full
# nemo/ tree is copied in a later step.
COPY pyproject.toml uv.lock /workspace/
COPY nemo/__init__.py nemo/package_info.py /workspace/nemo/
RUN <<"EOF" bash -ex
# Pick the uv extra (cu12 or cu13) that matches the base image instead of
# hard-coding cu13.
cuda_flavor="$(nemo-cuda-flavor)"
uv sync --link-mode copy --locked --extra all --extra "${cuda_flavor}" --group test
# Install cuda-bindings and cuda-python for the image's CUDA major.minor.
nemo-install-cuda-python
EOF
119+
120+
RUN <<"EOF" bash -ex
# Container-only runtime utilities. Keep these out of pyproject.toml so they do
# not become NeMo package dependencies.
uv pip install \
    dill \
    orjson

# torchcodec is installed from the PyTorch wheel index that matches the CUDA
# flavor. Capture the flavor in a variable first: a failing command
# substitution in an assignment aborts the script under `set -e`, whereas one
# used directly as the `case` word does not.
cuda_flavor="$(nemo-cuda-flavor)"
case "${cuda_flavor}" in
    cu12) torchcodec_index=https://download.pytorch.org/whl/cu126 ;;
    cu13) torchcodec_index=https://download.pytorch.org/whl/cu132 ;;
    *)
        # Fail loudly rather than passing an empty --index-url to uv.
        echo "Unsupported CUDA flavor '${cuda_flavor}'. Expected 'cu12' or 'cu13'." >&2
        exit 1
        ;;
esac
uv pip install --index-url "${torchcodec_index}" torchcodec
EOF
56133
COPY nemo /workspace/nemo
57134

@@ -101,8 +178,9 @@ case "${GPU_TARGET}" in
101178
;;
102179
esac
103180

181+
CUDA_FLAVOR="$(nemo-cuda-flavor)"
104182
AUTOMODEL_CCCL_INCLUDES="/usr/local/cuda/include/cccl"
105-
PYTHON_CCCL_INCLUDE="${VIRTUAL_ENV}/lib/python${UV_PYTHON}/site-packages/nvidia/cu13/include/cccl"
183+
PYTHON_CCCL_INCLUDE="${VIRTUAL_ENV}/lib/python${UV_PYTHON}/site-packages/nvidia/${CUDA_FLAVOR}/include/cccl"
106184
if [[ -d "${PYTHON_CCCL_INCLUDE}" ]]; then
107185
AUTOMODEL_CCCL_INCLUDES="${AUTOMODEL_CCCL_INCLUDES}:${PYTHON_CCCL_INCLUDE}"
108186
fi
@@ -136,14 +214,16 @@ if [[ "${GPU_TARGET}" == "h100plus" ]]; then
136214
elif [[ "${GPU_TARGET}" == "a100" ]]; then
137215
automodel_extra=compiled-a100
138216
fi
217+
cuda_flavor="$(nemo-cuda-flavor)"
139218
uv sync \
140219
--inexact \
141220
--link-mode copy \
142221
--locked \
143222
--extra all \
144-
--extra cu13 \
223+
--extra "${cuda_flavor}" \
145224
--extra "${automodel_extra}" \
146225
--group test
226+
nemo-install-cuda-python
147227

148228
if [[ "${GPU_TARGET}" == "a100" ]]; then
149229
git clone "${DEEPEP_REPO}" /opt/automodel-src/DeepEP
@@ -159,12 +239,17 @@ fi
159239
if [[ "${GPU_TARGET}" == "h100plus" ]]; then
160240
# flash-attn-4 requires apache-tvm-ffi 0.1.11, while mamba-ssm
161241
# currently constrains the solved environment to apache-tvm-ffi<=0.1.9.
242+
cutlass_packages=(
243+
"nvidia-cutlass-dsl==4.5.2"
244+
"nvidia-cutlass-dsl-libs-base==4.5.2"
245+
)
246+
if [[ "$(nemo-cuda-flavor)" == "cu13" ]]; then
247+
cutlass_packages+=("nvidia-cutlass-dsl-libs-cu13==4.5.2")
248+
fi
162249
uv pip install \
163250
--no-deps \
164251
"apache-tvm-ffi==0.1.11" \
165-
"nvidia-cutlass-dsl==4.5.2" \
166-
"nvidia-cutlass-dsl-libs-base==4.5.2" \
167-
"nvidia-cutlass-dsl-libs-cu13==4.5.2" \
252+
"${cutlass_packages[@]}" \
168253
"quack-kernels==0.5.0" \
169254
"torch-c-dlpack-ext==0.1.5"
170255

@@ -196,6 +281,18 @@ LABEL com.nvidia.build.ref="${NVIDIA_BUILD_REF}"
196281
ARG RC_DATE=00.00
197282
ARG TARGETARCH
198283

284+
# Optional ffmpeg install, off by default. Enable with:
#   --build-arg INSTALL_FFMPEG=true
ARG INSTALL_FFMPEG=false
RUN <<"EOF" bash -ex
if [ "${INSTALL_FFMPEG}" = "true" ]; then
    apt-get update
    # --no-install-recommends keeps optional recommended packages out of the
    # image.
    apt-get install -y --no-install-recommends ffmpeg
    # Drop the apt caches in the same layer that created them.
    apt-get clean
    rm -rf /var/lib/apt/lists/*
fi
EOF
293+
294+
ENV NEMO_HOME="/home/TestData/nemo_home"
295+
199296
# NOTICES.txt file points to where the OSS source code is archived
200297
RUN printf '%s\n' \
      "This distribution includes open source which is archived at the following URL: https://opensource.nvidia.com/oss/teams/nvidia/nemo/${RC_DATE}:linux-${TARGETARCH}/index.html" \
      "For further inquiries or assistance, contact us at oss-requests@nvidia.com" \
      > NOTICES.txt

docs/source/asr/results.rst

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -332,7 +332,7 @@ Fast Conformer RNN-T (Streaming) with Prompt Feature
332332

333333
The RNN-T-only prompt model (``EncDecRNNTBPEModelWithPrompt``) is the cache-aware streaming
334334
counterpart of the hybrid prompt model — same one-hot language-ID prompt mechanism, no
335-
auxiliary CTC head.
335+
auxiliary CTC head.
336336

337337
**Key Features:**
338338

@@ -382,4 +382,3 @@ Code-Switching
382382
:widths: 50,50
383383
:header-rows: 1
384384

385-

pyproject.toml

Lines changed: 38 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ readme = "README.md"
2424
license = {file = "LICENSE"}
2525
requires-python = ">=3.10"
2626
dependencies = [
27+
"aistore",
2728
"fsspec>=2024.12.0",
2829
"huggingface_hub>=0.24",
2930
"numba ; platform_system == 'Darwin'",
@@ -32,6 +33,7 @@ dependencies = [
3233
"onnx>=1.7.0",
3334
"scikit-learn",
3435
"setuptools>=70.0.0",
36+
"smart-open",
3537
"tensorboard",
3638
"text-unidecode",
3739
"torch>=2.6.0",
@@ -87,7 +89,6 @@ core = [
8789
"nv_one_logger_core>=2.3.1",
8890
"nv_one_logger_training_telemetry>=2.3.1",
8991
"nv_one_logger_pytorch_lightning_integration>=2.3.1",
90-
"aistore",
9192
]
9293

9394
lightning = [
@@ -194,6 +195,7 @@ all = [
194195
"omegaconf<=2.3",
195196
"torchmetrics>=0.11.0",
196197
"transformers",
198+
"wandb",
197199
"webdataset>=0.2.86",
198200
"nv_one_logger_core>=2.3.1",
199201
"nv_one_logger_training_telemetry>=2.3.1",
@@ -229,11 +231,13 @@ all = [
229231
]
230232

231233
cu12 = [
234+
"torch==2.12.0+cu126 ; sys_platform == 'linux'",
232235
"numba-cuda[cu12] ; platform_system != 'Darwin'",
233-
"cuda-python>=12.6.0,<13 ; platform_system != 'Darwin'",
236+
"cuda-python>=12,<13 ; platform_system != 'Darwin'",
234237
]
235238

236239
cu13 = [
240+
"torch==2.12.0+cu132 ; sys_platform == 'linux'",
237241
"numba-cuda[cu13] ; platform_system != 'Darwin'",
238242
"cuda-python>=13,<14 ; platform_system != 'Darwin'",
239243
]
@@ -429,21 +433,12 @@ conflicts = [
429433
{ extra = "cu12" },
430434
{ extra = "cu13" },
431435
],
432-
[
433-
{ extra = "cu12" },
434-
{ extra = "compiled" },
435-
],
436-
[
437-
{ extra = "cu12" },
438-
{ extra = "compiled-a100" },
439-
],
440436
[
441437
{ extra = "compiled" },
442438
{ extra = "compiled-a100" },
443439
],
444440
]
445441
override-dependencies = [
446-
"torch==2.12.0+cu132 ; sys_platform == 'linux'",
447442
"mlflow>=3.9.0rc0",
448443
"cryptography>=46.0.5",
449444
"wandb>=0.27.1",
@@ -466,15 +461,15 @@ no-build-isolation-package = [
466461
]
467462

468463
# --- uv configuration ---
464+
# Keep Torch wheel indexes explicit per CUDA extra. The pinned Automodel git
465+
# dependency also carries Torch source metadata; see the static metadata below.
469466
[tool.uv.sources]
470-
# Match nemo_automodel's torch index sources so uv doesn't see conflicting
471-
# indexes when resolving the speechlm2 extra (which pulls nemo_automodel
472-
# from git as a source dependency — uv treats these as workspace members).
473467
nemo_automodel = { git = "https://github.com/NVIDIA-NeMo/Automodel.git", rev = "9eccbb6102a260efd7cbdffa890fc57b94f94528" }
474468
deep_ep = { git = "https://github.com/deepseek-ai/DeepEP.git", tag = "v1.2.1" }
475469
torch = [
476470
{ index = "pytorch-cpu", marker = "sys_platform != 'linux' and sys_platform != 'darwin'" },
477-
{ index = "pytorch-cu132", marker = "sys_platform == 'linux'" },
471+
{ index = "pytorch-cu126", extra = "cu12", marker = "sys_platform == 'linux'" },
472+
{ index = "pytorch-cu132", extra = "cu13", marker = "sys_platform == 'linux'" },
478473
{ index = "pypi", marker = "sys_platform == 'darwin'" },
479474
]
480475

@@ -488,11 +483,39 @@ name = "pytorch-cpu"
488483
url = "https://download.pytorch.org/whl/cpu"
489484
explicit = true
490485

486+
[[tool.uv.index]]
487+
name = "pytorch-cu126"
488+
url = "https://download.pytorch.org/whl/cu126"
489+
explicit = true
490+
491491
[[tool.uv.index]]
492492
name = "pytorch-cu132"
493493
url = "https://download.pytorch.org/whl/cu132"
494494
explicit = true
495495

496+
[[tool.uv.dependency-metadata]]
497+
name = "nemo-automodel"
498+
version = "0.4.0+9eccbb61"
499+
requires-python = ">=3.10"
500+
# The pinned Automodel git revision carries its own Torch source table. Keep
501+
# its core dependency metadata static here so this repo controls the CUDA wheel index.
502+
requires-dist = [
503+
"datasets>=4.0.0",
504+
"megatron-fsdp>=0.2.3",
505+
"mistral-common[audio,hf-hub,image,sentencepiece]",
506+
"opencv-python-headless==4.10.0.84",
507+
"pybind11",
508+
"pyyaml",
509+
"tiktoken",
510+
"torch>=2.6.0",
511+
"torchdata",
512+
"transformers==5.5.0",
513+
"wandb",
514+
"torchao",
515+
"mlflow",
516+
"flashoptim>=0.1.3",
517+
]
518+
496519
[dependency-groups]
497520
test = [
498521
"black>=26.3.1",

tests/collections/common/test_lhotse_multimodal_ais_get_batch.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121
import tarfile
2222
from pathlib import Path
23+
from unittest.mock import patch
2324

2425
import lhotse
2526
import pytest
@@ -367,20 +368,23 @@ class _FakeTokenizer:
367368

368369
@pytest.mark.unit
def test_salm_dataset_batch_loader_enabled(monkeypatch):
    """SALMDataset builds its audio loader with ``use_batch_loader=True`` when USE_AIS_GET_BATCH is "true"."""
    monkeypatch.setenv("USE_AIS_GET_BATCH", "true")
    from nemo.collections.speechlm2.data.salm_dataset import SALMDataset

    # AudioSamples is replaced in salm_dataset's namespace, so the test checks
    # only the constructor arguments and never builds a real loader.
    with patch("nemo.collections.speechlm2.data.salm_dataset.AudioSamples") as audio_samples:
        ds = SALMDataset(tokenizer=_FakeTokenizer())

    audio_samples.assert_called_once_with(fault_tolerant=True, use_batch_loader=True, mono_downmix=True)
    assert ds.load_audio is audio_samples.return_value
377379

378380

379381
@pytest.mark.unit
def test_salm_dataset_batch_loader_disabled(monkeypatch):
    """SALMDataset builds its audio loader with ``use_batch_loader=False`` when USE_AIS_GET_BATCH is unset."""
    # raising=False: the variable may already be absent from the environment.
    monkeypatch.delenv("USE_AIS_GET_BATCH", raising=False)
    from nemo.collections.speechlm2.data.salm_dataset import SALMDataset

    # AudioSamples is replaced in salm_dataset's namespace, so the test checks
    # only the constructor arguments and never builds a real loader.
    with patch("nemo.collections.speechlm2.data.salm_dataset.AudioSamples") as audio_samples:
        ds = SALMDataset(tokenizer=_FakeTokenizer())

    audio_samples.assert_called_once_with(fault_tolerant=True, use_batch_loader=False, mono_downmix=True)
    assert ds.load_audio is audio_samples.return_value

0 commit comments

Comments
 (0)