-
Notifications
You must be signed in to change notification settings - Fork 662
[CI] Refactor CI build on GitHub #2723
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
bc22b05
a758953
4c86fb0
b56daa2
a59b6ab
32fafa4
cd7bf8a
7e636cc
f033c8b
df606be
edbc265
2e66d4f
b373a86
8a48eb7
13abe8f
f0714ef
47fa3f1
5e26ea2
68a3c70
8cd53db
d250403
27218da
3949f12
da09ded
b39e385
42c7836
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,72 +22,65 @@ jobs: | |
| - name: 'Dependencies' | ||
| run: | | ||
| apt-get update | ||
| apt-get install -y git python3.9 pip cudnn9-cuda-12 | ||
| apt-get install -y git python3.9 pip cudnn9-cuda-12 ccache | ||
| pip install cmake==3.21.0 pybind11[global] ninja | ||
| - name: 'Checkout' | ||
| uses: actions/checkout@v3 | ||
| with: | ||
| submodules: recursive | ||
| - name: ccache | ||
| uses: mozilla-actions/sccache-action@7d986dd989559c6ecdb630a3fd2557667be217ad | ||
| - uses: actions/cache@v4 | ||
| with: | ||
| path: /root/.ccache | ||
| key: ccache-${{ runner.os }}-core-${{ github.ref }}-${{ github.sha }} | ||
| restore-keys: | | ||
| ccache-${{ runner.os }}-core-${{ github.ref }}- | ||
| ccache-${{ runner.os }}-core- | ||
| - name: 'Build' | ||
| run: NVTE_USE_CCACHE=1 NVTE_CCACHE_BIN=sccache pip install --no-build-isolation . -v | ||
| run: NVTE_USE_CCACHE=1 pip install --no-build-isolation . -v | ||
| env: | ||
| NVTE_FRAMEWORK: none | ||
| MAX_JOBS: 1 | ||
| SCCACHE_GHA_ENABLED: "true" | ||
| CCACHE_DIR: /root/.ccache | ||
| CCACHE_MAXSIZE: 5G | ||
| - name: 'Sanity check' | ||
| run: python3 -c "import transformer_engine" | ||
| working-directory: / | ||
| pytorch: | ||
| name: 'PyTorch' | ||
| runs-on: ubuntu-latest | ||
| container: | ||
| image: ghcr.io/ptrendx/te_gha_pytorch:latest | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Need to change to use GH packages |
||
| options: --user root | ||
| steps: | ||
| - name: Move /var/lib/docker/ | ||
| shell: bash -euxo pipefail {0} | ||
| run: sudo mv /var/lib/docker/ "${GITHUB_WORKSPACE}/docker" | ||
|
|
||
| - name: Maximize build space | ||
| uses: easimon/maximize-build-space@c28619d8999a147d5e09c1199f84ff6af6ad5794 | ||
| with: | ||
| root-reserve-mb: 5120 | ||
| temp-reserve-mb: 32 | ||
| swap-size-mb: 10240 | ||
| remove-dotnet: 'true' | ||
| remove-android: 'true' | ||
| remove-haskell: 'true' | ||
| remove-codeql: 'true' | ||
| build-mount-path: '/var/lib/docker/' | ||
|
|
||
| - name: Restore /var/lib/docker/ | ||
| shell: bash -euxo pipefail {0} | ||
| run: sudo sh -c "mv ${GITHUB_WORKSPACE}/docker/* /var/lib/docker" | ||
|
|
||
| - name: 'Checkout' | ||
| uses: actions/checkout@v3 | ||
| with: | ||
| submodules: recursive | ||
|
|
||
| - name: Start named container | ||
| run: | | ||
| docker run -v $(pwd):$(pwd) -w $(pwd) --name builder -d nvcr.io/nvidia/cuda:12.8.0-devel-ubuntu22.04 sleep infinity | ||
|
|
||
| - name: 'Dependencies' | ||
| run: | | ||
| docker exec builder bash -c '\ | ||
| apt-get update && \ | ||
| apt-get install -y git python3.9 pip cudnn9-cuda-12 && \ | ||
| pip install cmake torch ninja pydantic importlib-metadata>=1.0 packaging pybind11 numpy einops onnxscript && \ | ||
| apt-get clean \ | ||
| ' | ||
|
|
||
| - name: "Disk space check after dependencies" | ||
| run: df -lh | ||
| - uses: actions/cache@v4 | ||
| with: | ||
| path: /root/.ccache | ||
| key: ccache-${{ runner.os }}-pytorch-${{ github.ref }}-${{ github.sha }} | ||
| restore-keys: | | ||
| ccache-${{ runner.os }}-pytorch-${{ github.ref }}- | ||
| ccache-${{ runner.os }}-pytorch- | ||
| - name: "Disk space check after dependencies and ccache" | ||
| run: df -lh | ||
| - name: 'Build' | ||
| run: docker exec builder bash -c 'pip install --no-build-isolation . -v --no-deps' | ||
| run: | | ||
| export CUDA_PATH=$(python3 -c "import nvidia; print(list(nvidia.__path__)[0] + '/cu13')") | ||
| export CUDNN_PATH=$(python3 -c "import nvidia; print(list(nvidia.__path__)[0] + '/cudnn')") | ||
| export PATH=$CUDA_PATH/bin:$PATH | ||
| export NVTE_BUILD_USE_NVIDIA_WHEELS=1 | ||
| NVTE_USE_CCACHE=1 pip install --no-build-isolation . -v | ||
| env: | ||
| NVTE_FRAMEWORK: pytorch | ||
| MAX_JOBS: 1 | ||
| MAX_JOBS: 2 | ||
| CCACHE_DIR: /root/.ccache | ||
| CCACHE_MAXSIZE: 5G | ||
| - name: 'Sanity check' | ||
| run: docker exec builder bash -c 'python3 tests/pytorch/test_sanity_import.py' | ||
| run: python3 tests/pytorch/test_sanity_import.py | ||
| jax: | ||
| name: 'JAX' | ||
| runs-on: ubuntu-latest | ||
|
|
@@ -96,65 +89,65 @@ jobs: | |
| options: --user root | ||
| steps: | ||
| - name: 'Dependencies' | ||
| run: pip install cmake==3.21.0 pybind11[global] | ||
| run: | | ||
| pip install cmake==3.21.0 pybind11[global] | ||
| apt-get update && apt-get install -y ccache | ||
| - name: 'Checkout' | ||
| uses: actions/checkout@v3 | ||
| with: | ||
| submodules: recursive | ||
| - name: ccache | ||
| uses: mozilla-actions/sccache-action@7d986dd989559c6ecdb630a3fd2557667be217ad | ||
| - uses: actions/cache@v4 | ||
| with: | ||
| path: /root/.ccache | ||
| key: ccache-${{ runner.os }}-jax-${{ github.ref }}-${{ github.sha }} | ||
| restore-keys: | | ||
| ccache-${{ runner.os }}-jax-${{ github.ref }}- | ||
| ccache-${{ runner.os }}-jax- | ||
| - name: 'Build' | ||
| run: | | ||
| NVTE_CCACHE_BIN=sccache NVTE_USE_CCACHE=1 pip install --no-build-isolation . -v | ||
| NVTE_USE_CCACHE=1 pip install --no-build-isolation . -v | ||
| env: | ||
| NVTE_FRAMEWORK: jax | ||
| MAX_JOBS: 1 | ||
| SCCACHE_GHA_ENABLED: "true" | ||
| CCACHE_DIR: /root/.ccache | ||
| CCACHE_MAXSIZE: 5G | ||
| - name: 'Sanity check' | ||
| run: python3 tests/jax/test_sanity_import.py | ||
| all: | ||
| name: 'All' | ||
| runs-on: ubuntu-latest | ||
| container: | ||
| image: ghcr.io/ptrendx/te_gha_all:latest | ||
| options: --user root | ||
| steps: | ||
| - name: Move /var/lib/docker/ | ||
| shell: bash -euxo pipefail {0} | ||
| run: sudo mv /var/lib/docker/ "${GITHUB_WORKSPACE}/docker" | ||
|
|
||
| - name: Maximize build space | ||
| uses: easimon/maximize-build-space@c28619d8999a147d5e09c1199f84ff6af6ad5794 | ||
| with: | ||
| root-reserve-mb: 5120 | ||
| temp-reserve-mb: 32 | ||
| swap-size-mb: 10240 | ||
| remove-dotnet: 'true' | ||
| remove-android: 'true' | ||
| remove-haskell: 'true' | ||
| remove-codeql: 'true' | ||
| build-mount-path: '/var/lib/docker/' | ||
|
|
||
| - name: Restore /var/lib/docker/ | ||
| shell: bash -euxo pipefail {0} | ||
| run: sudo sh -c "mv ${GITHUB_WORKSPACE}/docker/* /var/lib/docker" | ||
|
|
||
| - name: 'Checkout' | ||
| uses: actions/checkout@v3 | ||
| with: | ||
| submodules: recursive | ||
|
|
||
| - name: Start named container | ||
| run: | | ||
| docker run -v $(pwd):$(pwd) -w $(pwd) --name builder -d ghcr.io/nvidia/jax:jax sleep infinity | ||
|
|
||
| - name: 'Dependencies' | ||
| run: | | ||
| docker exec builder bash -c '\ | ||
| pip install cmake==3.21.0 pybind11[global] einops onnxscript && \ | ||
| pip install torch --no-cache-dir --index-url https://download.pytorch.org/whl/cu130 | ||
| ' | ||
| - name: "Disk space check after dependencies" | ||
| run: df -lh | ||
| - uses: actions/cache@v4 | ||
| with: | ||
| path: /root/.ccache | ||
| key: ccache-${{ runner.os }}-all-${{ github.ref }}-${{ github.sha }} | ||
| restore-keys: | | ||
| ccache-${{ runner.os }}-all-${{ github.ref }}- | ||
| ccache-${{ runner.os }}-all- | ||
| - name: "Disk space check after dependencies and ccache" | ||
| run: df -lh | ||
| - name: 'Build' | ||
| run: docker exec builder bash -c 'pip install --no-cache-dir --no-build-isolation . -v --no-deps' | ||
| run: | | ||
| export CUDA_PATH=$(python3 -c "import nvidia; print(list(nvidia.__path__)[0] + '/cu13')") | ||
| export CUDNN_PATH=$(python3 -c "import nvidia; print(list(nvidia.__path__)[0] + '/cudnn')") | ||
| export PATH=$CUDA_PATH/bin:$PATH | ||
| export NVTE_BUILD_USE_NVIDIA_WHEELS=1 | ||
| NVTE_USE_CCACHE=1 pip install --no-build-isolation . -v | ||
| env: | ||
| NVTE_FRAMEWORK: all | ||
| MAX_JOBS: 1 | ||
| - name: 'Sanity check' | ||
| run: docker exec builder bash -c 'python3 tests/pytorch/test_sanity_import.py && python3 tests/jax/test_sanity_import.py' | ||
| MAX_JOBS: 2 | ||
| CCACHE_DIR: /root/.ccache | ||
| CCACHE_MAXSIZE: 5G | ||
| - name: 'Sanity check (pytorch)' | ||
| run: python3 tests/pytorch/test_sanity_import.py | ||
| - name: 'Sanity check (jax)' | ||
| run: python3 tests/jax/test_sanity_import.py | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,31 @@ | ||
| FROM ubuntu:24.04 | ||
|
|
||
| # Container dependencies | ||
| RUN apt-get update && apt-get install -y git python3 python3-pip ccache | ||
|
|
||
| ENV PIP_BREAK_SYSTEM_PACKAGES=1 | ||
|
|
||
| # Python build dependencies | ||
| RUN pip install cmake ninja pybind11 numpy packaging | ||
|
|
||
| # PyTorch (CUDA 13.0) | ||
| RUN pip install torch --index-url https://download.pytorch.org/whl/cu130 | ||
|
|
||
| # JAX with CUDA 13 support | ||
| RUN pip install "jax[cuda13]" | ||
|
|
||
| # NVIDIA CUDA toolkit wheels | ||
| RUN pip install \ | ||
| "nvidia-cuda-nvcc<13.1" \ | ||
| "nvidia-cuda-cccl<13.1" \ | ||
| "nvidia-cuda-crt<13.1" \ | ||
| "nvidia-nvvm<13.1" \ | ||
| "nvidia-cuda-profiler-api<13.1" \ | ||
| "nvidia-nvml-dev<13.1" | ||
|
|
||
| # Create symlinks for CUDA libraries | ||
| RUN CUDA_PATH=$(python3 -c "import nvidia; print(list(nvidia.__path__)[0] + '/cu13')") && \ | ||
| ln -s $CUDA_PATH/lib/libcudart.so.13 $CUDA_PATH/lib/libcudart.so && \ | ||
| ln -s $CUDA_PATH/lib/libcublas.so.13 $CUDA_PATH/lib/libcublas.so && \ | ||
| ln -s $CUDA_PATH/../nccl/lib/libnccl.so.2 $CUDA_PATH/../nccl/lib/libnccl.so && \ | ||
| ln -s $CUDA_PATH/lib $CUDA_PATH/lib64 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,28 @@ | ||
| FROM ubuntu:24.04 | ||
|
|
||
| # Container dependencies | ||
| RUN apt-get update && apt-get install -y git python3 python3-pip ccache | ||
|
|
||
| ENV PIP_BREAK_SYSTEM_PACKAGES=1 | ||
|
|
||
| # Python build dependencies | ||
| RUN pip install cmake ninja pybind11 numpy packaging | ||
|
|
||
| # PyTorch (CUDA 13.0) | ||
| RUN pip install torch --index-url https://download.pytorch.org/whl/cu130 | ||
|
|
||
| # NVIDIA CUDA toolkit wheels | ||
| RUN pip install \ | ||
| "nvidia-cuda-nvcc<13.1" \ | ||
| "nvidia-cuda-cccl<13.1" \ | ||
| "nvidia-cuda-crt<13.1" \ | ||
| "nvidia-nvvm<13.1" \ | ||
| "nvidia-cuda-profiler-api<13.1" \ | ||
| "nvidia-nvml-dev<13.1" | ||
|
|
||
| # Create symlinks for CUDA libraries | ||
| RUN CUDA_PATH=$(python3 -c "import nvidia; print(list(nvidia.__path__)[0] + '/cu13')") && \ | ||
| ln -s $CUDA_PATH/lib/libcudart.so.13 $CUDA_PATH/lib/libcudart.so && \ | ||
| ln -s $CUDA_PATH/lib/libcublas.so.13 $CUDA_PATH/lib/libcublas.so && \ | ||
| ln -s $CUDA_PATH/../nccl/lib/libnccl.so.2 $CUDA_PATH/../nccl/lib/libnccl.so && \ | ||
| ln -s $CUDA_PATH/lib $CUDA_PATH/lib64 | ||
|
Comment on lines
+22
to
+28
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Missing `libcudnn.so` symlink. The symlink block creates unversioned aliases for `libcudart.so`, `libcublas.so` and `libnccl.so`, but not for cuDNN. Consider adding a similar symlink for cudnn, e.g.: CUDNN_LIB=$(python3 -c "import nvidia; print(list(nvidia.__path__)[0] + '/cudnn/lib')") && \
ln -s $CUDNN_LIB/libcudnn.so.9 $CUDNN_LIB/libcudnn.so
The same applies to the other Dockerfile in this change, which contains the same symlink block. |
||
| Original file line number | Diff line number | Diff line change | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -251,6 +251,33 @@ def get_cuda_include_dirs() -> Tuple[str, str]: | |||||||||
| ] | ||||||||||
|
|
||||||||||
|
|
||||||||||
| @functools.lru_cache(maxsize=None) | ||||||||||
| def get_cuda_library_dirs() -> Tuple[str, str]: | ||||||||||
| """Returns the CUDA library directory.""" | ||||||||||
|
|
||||||||||
| force_wheels = bool(int(os.getenv("NVTE_BUILD_USE_NVIDIA_WHEELS", "0"))) | ||||||||||
| # If cuda is installed via toolkit, all libraries | ||||||||||
| # are bundled inside the top level cuda directory. | ||||||||||
| if not force_wheels and cuda_toolkit_include_path() is not None: | ||||||||||
| return [] | ||||||||||
|
|
||||||||||
| # Use pip wheels to include all libraries. | ||||||||||
| try: | ||||||||||
| import nvidia | ||||||||||
| except ModuleNotFoundError as e: | ||||||||||
| raise RuntimeError("CUDA not found.") | ||||||||||
|
|
||||||||||
| if nvidia.__file__ is not None: | ||||||||||
| cuda_root = Path(nvidia.__file__).parent | ||||||||||
| else: | ||||||||||
| cuda_root = Path(nvidia.__path__[0]) # namespace | ||||||||||
| return [ | ||||||||||
| subdir / "lib" | ||||||||||
| for subdir in cuda_root.iterdir() | ||||||||||
| if subdir.is_dir() and (subdir / "lib").is_dir() | ||||||||||
| ] | ||||||||||
|
|
||||||||||
|
|
||||||||||
| @functools.lru_cache(maxsize=None) | ||||||||||
| def cuda_archs() -> str: | ||||||||||
| archs = os.getenv("NVTE_CUDA_ARCHS") | ||||||||||
|
|
@@ -292,6 +319,13 @@ def cuda_version() -> Tuple[int, ...]: | |||||||||
| version_str = get_version("nvidia-cuda-runtime-cu12") | ||||||||||
| version_tuple = tuple(int(part) for part in version_str.split(".") if part.isdigit()) | ||||||||||
| return version_tuple | ||||||||||
| except: | ||||||||||
| pass | ||||||||||
|
Comment on lines
+322
to
+323
Contributor
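There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Bare `except:` clause. The bare `except:` followed by `pass` silently swallows every exception raised while looking up `nvidia-cuda-runtime-cu12`, not only the case where the package is missing.
Suggested change: catch `importlib.metadata.PackageNotFoundError` instead of using a bare `except:`, matching the handler used for the `nvidia-cuda-runtime` lookup that follows.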
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Agree with this suggestion |
||||||||||
|
|
||||||||||
| try: | ||||||||||
| version_str = get_version("nvidia-cuda-runtime") | ||||||||||
| version_tuple = tuple(int(part) for part in version_str.split(".") if part.isdigit()) | ||||||||||
| return version_tuple | ||||||||||
| except importlib.metadata.PackageNotFoundError: | ||||||||||
| raise RuntimeError("Could neither find NVCC executable nor CUDA runtime Python package.") | ||||||||||
|
|
||||||||||
|
|
||||||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we bump this up?