diff --git a/.ci/scripts/build_ucc.sh b/.ci/scripts/build_ucc.sh index c38ece71cb..d6f52d3a1f 100755 --- a/.ci/scripts/build_ucc.sh +++ b/.ci/scripts/build_ucc.sh @@ -7,28 +7,8 @@ export UCC_ENABLE_GTEST=${UCC_ENABLE_GTEST:-yes} export UCC_ENABLE_NVLS=${UCC_ENABLE_NVLS:-no} export UCC_BUILD_TLS=${UCC_BUILD_TLS:-cuda,nccl,self,sharp,shm,ucp,mlx5} -# In containers, calculate based on memory limits to avoid OOM -# Determine number of parallel build jobs based on available system memory if running inside a container/Kubernetes -if [ -f /.dockerenv ] || [ -f /run/.containerenv ] || [ -n "${KUBERNETES_SERVICE_HOST}" ]; then - # Prefer cgroupv1 path, fall back to cgroupv2 or static default if not found - if [ -f /sys/fs/cgroup/memory/memory.limit_in_bytes ]; then - limit=$(cat /sys/fs/cgroup/memory/memory.limit_in_bytes) - elif [ -f /sys/fs/cgroup/memory.max ]; then - limit=$(cat /sys/fs/cgroup/memory.max) - # If cgroupv2 limit is "max", meaning unlimited, set to 4GB to avoid OOM - [ "$limit" = "max" ] && limit=$((4 * 1024 * 1024 * 1024)) - else - # Default to 4GB if no limit is found - limit=$((4 * 1024 * 1024 * 1024)) - fi - - # Use 1 build process per GB of memory, clamp in [1,16] - nproc=$((limit / (1024 * 1024 * 1024))) - [ "$nproc" -gt 16 ] && nproc=16 - [ "$nproc" -lt 1 ] && nproc=1 -else - nproc=$(nproc --all) -fi +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd -P)" +. "${SCRIPT_DIR}/common.sh" echo "INFO: Build UCC" UCC_SRC_DIR="${SRC_DIR}/ucc" @@ -56,7 +36,15 @@ fi echo "INFO: Configure flags: ${CONFIGURE_FLAGS}" eval "${UCC_SRC_DIR}/configure ${CONFIGURE_FLAGS}" -make "-j${nproc}" install + +# Skip libtool relinking during install: the relink produces identical RUNPATH +# and adds ~5s of pure overhead per build. +if ! grep -q 'need_relink=yes' libtool; then + echo "WARN: libtool relinking patch had no effect (need_relink=yes not found)" +fi +sed -i 's/need_relink=yes/need_relink=no/g' libtool + +make "-j${NPROC}" install echo "${UCC_INSTALL_DIR}/lib" > /etc/ld.so.conf.d/ucc.conf ldconfig ldconfig -p | grep -i libucc diff --git a/.ci/scripts/build_ucx.sh b/.ci/scripts/build_ucx.sh index ab0ff86d60..93f90c13fc 100755 --- a/.ci/scripts/build_ucx.sh +++ b/.ci/scripts/build_ucx.sh @@ -1,13 +1,16 @@ #!/bin/bash -eEx set -o pipefail +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd -P)" +. "${SCRIPT_DIR}/common.sh" + echo "INFO: Build UCX" cd "${SRC_DIR}/ucx" "${SRC_DIR}/ucx/autogen.sh" mkdir -p "${SRC_DIR}/ucx/build-${UCX_BUILD_TYPE}" cd "${SRC_DIR}/ucx/build-${UCX_BUILD_TYPE}" "${SRC_DIR}/ucx/contrib/configure-release-mt" --with-cuda="${CUDA_HOME}" --prefix="${UCX_INSTALL_DIR}" -make -j install +make "-j${NPROC}" install echo "${UCX_INSTALL_DIR}/lib" > /etc/ld.so.conf.d/ucx.conf ldconfig ldconfig -p | grep -i ucx diff --git a/.ci/scripts/common.sh b/.ci/scripts/common.sh index 8b84ab7d60..ab6350c289 100755 --- a/.ci/scripts/common.sh +++ b/.ci/scripts/common.sh @@ -1,22 +1,19 @@ -# In containers, calculate based on memory limits to avoid OOM -# Determine number of parallel build jobs based on available system memory if running inside a container/Kubernetes +# Determine number of parallel build jobs based on available system memory. +# In containers/Kubernetes, use cgroup memory limits to avoid OOM. if [ -f /.dockerenv ] || [ -f /run/.containerenv ] || [ -n "${KUBERNETES_SERVICE_HOST}" ]; then - # Prefer cgroupv1 path, fall back to cgroupv2 or static default if not found if [ -f /sys/fs/cgroup/memory/memory.limit_in_bytes ]; then limit=$(cat /sys/fs/cgroup/memory/memory.limit_in_bytes) elif [ -f /sys/fs/cgroup/memory.max ]; then limit=$(cat /sys/fs/cgroup/memory.max) - # If cgroupv2 limit is "max", meaning unlimited, set to 4GB to avoid OOM [ "$limit" = "max" ] && limit=$((4 * 1024 * 1024 * 1024)) else - # Default to 4GB if no limit is found limit=$((4 * 1024 * 1024 * 1024)) fi - - # Use 1 build process per GB of memory, clamp in [1,16] - nproc=$((limit / (1024 * 1024 * 1024))) - [ "$nproc" -gt 16 ] && nproc=16 - [ "$nproc" -lt 1 ] && nproc=1 - export NPROC=$nproc + NPROC=$((limit / (1024 * 1024 * 1024))) + [ "$NPROC" -gt 16 ] && NPROC=16 + [ "$NPROC" -lt 1 ] && NPROC=1 +else + NPROC=$(nproc --all) fi +export NPROC diff --git a/.github/actions/restore-artifacts/action.yml b/.github/actions/restore-artifacts/action.yml new file mode 100644 index 0000000000..c2a2d1c4c5 --- /dev/null +++ b/.github/actions/restore-artifacts/action.yml @@ -0,0 +1,28 @@ +name: Restore artifact permissions +description: > + Download-artifact strips execute bits. This action restores them for the + standard UCC/UCX/OMPI install trees used across CI jobs. +inputs: + ucx: + description: Restore UCX install permissions + default: 'true' + ucc: + description: Restore UCC install permissions + default: 'true' + ompi: + description: Restore OMPI install permissions + default: 'false' +runs: + using: composite + steps: + - shell: bash + run: | + if [ "${{ inputs.ucx }}" = "true" ]; then + chmod -R +x /tmp/ucx/install/bin /tmp/ucx/install/lib 2>/dev/null || true + fi + if [ "${{ inputs.ucc }}" = "true" ]; then + chmod -R +x /tmp/ucc/install/bin /tmp/ucc/install/lib 2>/dev/null || true + fi + if [ "${{ inputs.ompi }}" = "true" ]; then + chmod -R +x /tmp/ompi/install/bin /tmp/ompi/install/lib 2>/dev/null || true + fi diff --git a/.github/workflows/asan-test.yaml b/.github/workflows/asan-test.yaml index d3c7059e40..080b339937 100644 --- a/.github/workflows/asan-test.yaml +++ b/.github/workflows/asan-test.yaml @@ -1,4 +1,4 @@ -name: ASAN Tests +name: ASAN on: [push, pull_request] @@ -12,32 +12,80 @@ env: CLANG_VER: 17 jobs: - gtest-asan: - runs-on: ubuntu-22.04 + build: + runs-on: ubuntu-24.04 steps: - name: Install dependencies run: | sudo apt-get update sudo apt-get install -y --no-install-recommends wget gpg - # Setup LLVM repository sudo mkdir -p /etc/apt/keyrings wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo gpg --dearmor -o /etc/apt/keyrings/llvm.gpg - echo "deb [signed-by=/etc/apt/keyrings/llvm.gpg] http://apt.llvm.org/jammy/ llvm-toolchain-jammy-${CLANG_VER} main" | sudo tee /etc/apt/sources.list.d/llvm.list + echo "deb [signed-by=/etc/apt/keyrings/llvm.gpg] http://apt.llvm.org/noble/ llvm-toolchain-noble-${CLANG_VER} main" | sudo tee /etc/apt/sources.list.d/llvm.list sudo apt-get update sudo apt-get install -y --no-install-recommends clang-${CLANG_VER} clang++-${CLANG_VER} libclang-rt-${CLANG_VER}-dev + - name: Cache UCX + id: cache-ucx + uses: actions/cache@v4 + with: + path: /tmp/ucx/install + key: ucx-asan-${{ env.OPEN_UCX_BRANCH }}-${{ hashFiles('.github/workflows/asan-test.yaml') }} - name: Get UCX - run: git clone ${OPEN_UCX_LINK} -b ${OPEN_UCX_BRANCH} /tmp/ucx + if: steps.cache-ucx.outputs.cache-hit != 'true' + run: | + rm -rf /tmp/ucx + git clone --depth 1 ${OPEN_UCX_LINK} -b ${OPEN_UCX_BRANCH} /tmp/ucx - name: Build UCX + if: steps.cache-ucx.outputs.cache-hit != 'true' run: | cd /tmp/ucx && ./autogen.sh - CC=clang-${CLANG_VER} CXX=clang++-${CLANG_VER} ./contrib/configure-release --without-java --without-go --disable-numa --prefix $PWD/install + CC=clang-${CLANG_VER} CXX=clang++-${CLANG_VER} ./contrib/configure-release --without-java --without-go --disable-numa --prefix /tmp/ucx/install make -j install - - uses: actions/checkout@v4 - - name: Run gtest ASAN + - uses: actions/checkout@v6 + - name: Build UCC (ASAN) + env: + LD_LIBRARY_PATH: /tmp/ucx/install/lib run: | - export ASAN_OPTIONS=fast_unwind_on_malloc=0:detect_leaks=1:print_suppressions=0 - export LSAN_OPTIONS=report_objects=1 ./autogen.sh CFLAGS="-fsanitize=address" CC=clang-${CLANG_VER} CXX=clang++-${CLANG_VER} ./configure --prefix=/tmp/ucc/install --with-ucx=/tmp/ucx/install --enable-gtest make -j install - ./test/gtest/gtest + cp test/gtest/gtest /tmp/ucc/install/bin/gtest + - name: Bundle clang runtime for test shards + run: | + CLANG_RT_DIR=$(dirname $(clang-${CLANG_VER} -print-file-name=libclang_rt.asan-x86_64.so)) + cp -a ${CLANG_RT_DIR} /tmp/clang-rt + - name: Upload build artifacts + uses: actions/upload-artifact@v7 + with: + name: ucc-asan-build + path: | + /tmp/ucx/install + /tmp/ucc/install + /tmp/clang-rt + retention-days: 1 + + gtest-asan: + runs-on: ubuntu-24.04 + needs: build + strategy: + fail-fast: false + matrix: + shard: [0, 1, 2, 3] + name: ASAN (shard ${{ matrix.shard }}) + steps: + - uses: actions/checkout@v6 + - name: Download build artifacts + uses: actions/download-artifact@v7 + with: + name: ucc-asan-build + path: /tmp + - name: Restore artifact permissions + uses: ./.github/actions/restore-artifacts + - name: Run UCC gtest (ASAN, shard ${{ matrix.shard }}) + env: + GTEST_TOTAL_SHARDS: 4 + GTEST_SHARD_INDEX: ${{ matrix.shard }} + ASAN_OPTIONS: fast_unwind_on_malloc=1:detect_leaks=1:print_suppressions=0 + LSAN_OPTIONS: report_objects=1 + LD_LIBRARY_PATH: /tmp/ucc/install/lib:/tmp/ucx/install/lib:/tmp/clang-rt + run: /tmp/ucc/install/bin/gtest diff --git a/.github/workflows/blossom-ci.yml b/.github/workflows/blossom-ci.yml index 0037f8bc5a..6befbf3a06 100644 --- a/.github/workflows/blossom-ci.yml +++ b/.github/workflows/blossom-ci.yml @@ -43,7 +43,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout code - uses: actions/checkout@v2 + uses: actions/checkout@v6 with: repository: ${{ fromJson(needs.Authorization.outputs.args).repo }} ref: ${{ fromJson(needs.Authorization.outputs.args).ref }} diff --git a/.github/workflows/clang-tidy-nvidia.yaml b/.github/workflows/clang-tidy-nvidia.yaml index 495e146f35..0839c87072 100644 --- a/.github/workflows/clang-tidy-nvidia.yaml +++ b/.github/workflows/clang-tidy-nvidia.yaml @@ -1,23 +1,24 @@ -name: Linter-NVIDIA +name: Lint (CUDA) on: [push, pull_request] concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + group: lint-cuda-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true env: OPEN_UCX_LINK: https://github.com/openucx/ucx OPEN_UCX_BRANCH: master - HPCX_LINK: https://content.mellanox.com/hpc/hpc-x/v2.22.1rc4/hpcx-v2.22.1-gcc-doca_ofed-ubuntu22.04-cuda12-x86_64.tbz + HPCX_LINK: https://content.mellanox.com/hpc/hpc-x/v2.25.1_cuda13/hpcx-v2.25.1-gcc-doca_ofed-ubuntu24.04-cuda13-x86_64.tbz CLANG_VER: 17 - MLNX_OFED_VER: 24.10-2.1.8.0 - CUDA_VER: 12-8 + MLNX_OFED_VER: 24.10-4.1.4.0 + CUDA_VER: 13-1 LIBRARY_PATH: /tmp/ucx/install/lib LD_LIBRARY_PATH: /tmp/ucx/install/lib jobs: clang-tidy: - runs-on: ubuntu-22.04 + runs-on: ubuntu-24.04 + name: clang-tidy (CUDA) steps: - name: Install dependencies run: | @@ -26,35 +27,60 @@ jobs: # Setup LLVM repository sudo mkdir -p /etc/apt/keyrings wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo gpg --dearmor -o /etc/apt/keyrings/llvm.gpg - echo "deb [signed-by=/etc/apt/keyrings/llvm.gpg] http://apt.llvm.org/jammy/ llvm-toolchain-jammy-${CLANG_VER} main" | sudo tee /etc/apt/sources.list.d/llvm.list + echo "deb [signed-by=/etc/apt/keyrings/llvm.gpg] http://apt.llvm.org/noble/ llvm-toolchain-noble-${CLANG_VER} main" | sudo tee /etc/apt/sources.list.d/llvm.list sudo apt-get update sudo apt-get install -y --no-install-recommends clang-tidy-${CLANG_VER} bear clang-${CLANG_VER} clang++-${CLANG_VER} + - name: Cache MLNX_OFED tarball + id: cache-ofed + uses: actions/cache@v4 + with: + path: MLNX_OFED_LINUX-${{ env.MLNX_OFED_VER }}-ubuntu24.04-x86_64.tgz + key: mlnx-ofed-${{ env.MLNX_OFED_VER }}-ubuntu24.04 + - name: Download MLNX_OFED + if: steps.cache-ofed.outputs.cache-hit != 'true' + run: wget --no-verbose http://content.mellanox.com/ofed/MLNX_OFED-${MLNX_OFED_VER}/MLNX_OFED_LINUX-${MLNX_OFED_VER}-ubuntu24.04-x86_64.tgz - name: Install extra rdma dependencies run: | - wget --no-verbose http://content.mellanox.com/ofed/MLNX_OFED-${MLNX_OFED_VER}/MLNX_OFED_LINUX-${MLNX_OFED_VER}-ubuntu22.04-x86_64.tgz - sudo tar -xvzf MLNX_OFED_LINUX-${MLNX_OFED_VER}-ubuntu22.04-x86_64.tgz - sudo chmod -R a+rwx MLNX_OFED_LINUX-${MLNX_OFED_VER}-ubuntu22.04-x86_64 - sudo MLNX_OFED_LINUX-${MLNX_OFED_VER}-ubuntu22.04-x86_64/mlnxofedinstall --skip-unsupported-devices-check --user-space-only --without-fw-update --force --basic -vvv + sudo tar -xzf MLNX_OFED_LINUX-${MLNX_OFED_VER}-ubuntu24.04-x86_64.tgz + sudo chmod -R a+rwx MLNX_OFED_LINUX-${MLNX_OFED_VER}-ubuntu24.04-x86_64 + sudo MLNX_OFED_LINUX-${MLNX_OFED_VER}-ubuntu24.04-x86_64/mlnxofedinstall --skip-unsupported-devices-check --user-space-only --without-fw-update --force --basic -v - name: Install extra cuda dependencies run: | - wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb sudo dpkg -i cuda-keyring_1.1-1_all.deb sudo apt-get update sudo apt-get install -y --no-install-recommends cuda-cudart-dev-${CUDA_VER} cuda-nvcc-${CUDA_VER} cuda-nvml-dev-${CUDA_VER} + - name: Cache UCX + id: cache-ucx + uses: actions/cache@v4 + with: + path: /tmp/ucx/install + key: ucx-nvidia-${{ env.OPEN_UCX_BRANCH }}-${{ hashFiles('.github/workflows/clang-tidy-nvidia.yaml') }} - name: Get UCX - run: git clone ${OPEN_UCX_LINK} -b ${OPEN_UCX_BRANCH} /tmp/ucx + if: steps.cache-ucx.outputs.cache-hit != 'true' + run: | + rm -rf /tmp/ucx + git clone --depth 1 ${OPEN_UCX_LINK} -b ${OPEN_UCX_BRANCH} /tmp/ucx - name: Build UCX + if: steps.cache-ucx.outputs.cache-hit != 'true' run: | cd /tmp/ucx && ./autogen.sh CC=clang-${CLANG_VER} CXX=clang++-${CLANG_VER} ./contrib/configure-release --without-java --without-go --disable-numa --prefix $PWD/install make -j install + - name: Cache HPCX + id: cache-hpcx + uses: actions/cache@v4 + with: + path: /tmp/hpcx + key: hpcx-v2.25.1-${{ hashFiles('.github/workflows/clang-tidy-nvidia.yaml') }} - name: Download HPCX + if: steps.cache-hpcx.outputs.cache-hit != 'true' run: | cd /tmp wget --no-verbose ${HPCX_LINK} - tar xjf hpcx-v2.22.1-gcc-doca_ofed-ubuntu22.04-cuda12-x86_64.tbz - mv hpcx-v2.22.1-gcc-doca_ofed-ubuntu22.04-cuda12-x86_64 hpcx - - uses: actions/checkout@v4 + tar xjf hpcx-v2.25.1-gcc-doca_ofed-ubuntu24.04-cuda13-x86_64.tbz + mv hpcx-v2.25.1-gcc-doca_ofed-ubuntu24.04-cuda13-x86_64 hpcx + - uses: actions/checkout@v6 - name: Build UCC run: | ./autogen.sh diff --git a/.github/workflows/clang-tidy-rocm.yaml b/.github/workflows/clang-tidy-rocm.yaml index 6be0d4074b..74c96ef16f 100644 --- a/.github/workflows/clang-tidy-rocm.yaml +++ b/.github/workflows/clang-tidy-rocm.yaml @@ -1,9 +1,9 @@ -name: Linter-ROCM +name: Lint (ROCm) on: [push, pull_request] concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + group: lint-rocm-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true env: @@ -16,6 +16,7 @@ env: jobs: clang-tidy: runs-on: ubuntu-22.04 + name: clang-tidy (ROCm) steps: - name: Install dependencies run: | @@ -40,14 +41,24 @@ jobs: bear \ rocm-hip-sdk sudo ln -sf /opt/rocm-${ROCM_VER} /opt/rocm + - name: Cache UCX + id: cache-ucx + uses: actions/cache@v4 + with: + path: /tmp/ucx/install + key: ucx-rocm-${{ env.OPEN_UCX_BRANCH }}-${{ env.ROCM_VER }}-${{ hashFiles('.github/workflows/clang-tidy-rocm.yaml') }} - name: Get UCX - run: git clone ${OPEN_UCX_LINK} -b ${OPEN_UCX_BRANCH} /tmp/ucx + if: steps.cache-ucx.outputs.cache-hit != 'true' + run: | + rm -rf /tmp/ucx + git clone --depth 1 ${OPEN_UCX_LINK} -b ${OPEN_UCX_BRANCH} /tmp/ucx - name: Build UCX + if: steps.cache-ucx.outputs.cache-hit != 'true' run: | cd /tmp/ucx && ./autogen.sh CC=gcc CXX=g++ ./contrib/configure-release --without-java --without-go --disable-numa --prefix $PWD/install --with-rocm=/opt/rocm make -j install - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Build UCC run: | ./autogen.sh diff --git a/.github/workflows/clang-tidy.yaml b/.github/workflows/clang-tidy.yaml index da6bb9cabe..8b5af028b9 100644 --- a/.github/workflows/clang-tidy.yaml +++ b/.github/workflows/clang-tidy.yaml @@ -1,9 +1,9 @@ -name: Linter +name: Lint (CPU) on: [push, pull_request] concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + group: lint-cpu-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true env: @@ -12,7 +12,8 @@ env: CLANG_VER: 17 jobs: clang-tidy: - runs-on: ubuntu-22.04 + runs-on: ubuntu-24.04 + name: clang-tidy steps: - name: Install dependencies run: | @@ -21,17 +22,27 @@ jobs: # Setup LLVM repository sudo mkdir -p /etc/apt/keyrings wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo gpg --dearmor -o /etc/apt/keyrings/llvm.gpg - echo "deb [signed-by=/etc/apt/keyrings/llvm.gpg] http://apt.llvm.org/jammy/ llvm-toolchain-jammy-${CLANG_VER} main" | sudo tee /etc/apt/sources.list.d/llvm.list + echo "deb [signed-by=/etc/apt/keyrings/llvm.gpg] http://apt.llvm.org/noble/ llvm-toolchain-noble-${CLANG_VER} main" | sudo tee /etc/apt/sources.list.d/llvm.list sudo apt-get update sudo apt-get install -y --no-install-recommends clang-tidy-${CLANG_VER} bear + - name: Cache UCX + id: cache-ucx + uses: actions/cache@v4 + with: + path: /tmp/ucx/install + key: ucx-tidy-${{ env.OPEN_UCX_BRANCH }}-${{ hashFiles('.github/workflows/clang-tidy.yaml') }} - name: Get UCX - run: git clone ${OPEN_UCX_LINK} -b ${OPEN_UCX_BRANCH} /tmp/ucx + if: steps.cache-ucx.outputs.cache-hit != 'true' + run: | + rm -rf /tmp/ucx + git clone --depth 1 ${OPEN_UCX_LINK} -b ${OPEN_UCX_BRANCH} /tmp/ucx - name: Build UCX + if: steps.cache-ucx.outputs.cache-hit != 'true' run: | cd /tmp/ucx && ./autogen.sh CC=clang-${CLANG_VER} CXX=clang++-${CLANG_VER} ./contrib/configure-release --without-java --without-go --disable-numa --prefix $PWD/install make -j install - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Build UCC run: | ./autogen.sh diff --git a/.github/workflows/codestyle.yaml b/.github/workflows/codestyle.yaml index 5d51b6ee9e..1d500d5f75 100644 --- a/.github/workflows/codestyle.yaml +++ b/.github/workflows/codestyle.yaml @@ -1,10 +1,9 @@ -name: Codestyle +name: Lint (codestyle) on: [pull_request] -# Cancel in-progress runs for the same PR concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + group: lint-codestyle-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true env: @@ -13,8 +12,8 @@ env: jobs: check-codestyle: - runs-on: ubuntu-22.04 - name: Check code style + runs-on: ubuntu-24.04 + name: codestyle defaults: run: shell: bash @@ -27,7 +26,7 @@ jobs: chmod +x ./git-clang-format sudo mv ./git-clang-format /usr/bin/git-clang-format - name: Checking out repository - uses: actions/checkout@v4 + uses: actions/checkout@v6 with: ref: ${{ github.event.pull_request.head.sha }} fetch-depth: 0 diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 0d57adedb7..bc8a0720c5 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -14,14 +14,14 @@ jobs: run: | sudo apt-get update sudo apt-get install -y --no-install-recommends doxygen doxygen-latex cm-super texlive-fonts-recommended - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Build UCC docs run: | ./autogen.sh ./configure --with-docs-only make docs - name: Upload docs - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v7 with: name: docs path: ${{ github.workspace }}/docs/doxygen-doc/ucc.pdf diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index c26d0d0971..6791c3f0e9 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -1,4 +1,4 @@ -name: OpenMPI tests +name: Build & Test on: [push, pull_request] @@ -14,43 +14,109 @@ env: IMB_LINK: https://github.com/intel/mpi-benchmarks.git IMB_COLLS: allgather,allgatherv,allreduce,alltoall,alltoallv,barrier,bcast,gather,gatherv,reduce,reduce_scatter,reduce_scatter_block,scatter,scatterv jobs: - tests: + build: runs-on: ubuntu-latest steps: - - name: Install dependencies - run: | - sudo apt-get update - sudo apt-get install -y --no-install-recommends doxygen doxygen-latex + - name: Cache UCX + id: cache-ucx + uses: actions/cache@v4 + with: + path: /tmp/ucx/install + key: ucx-release-${{ env.OPEN_UCX_BRANCH }}-${{ hashFiles('.github/workflows/main.yaml') }} - name: Get UCX - run: git clone ${OPEN_UCX_LINK} -b ${OPEN_UCX_BRANCH} /tmp/ucx + if: steps.cache-ucx.outputs.cache-hit != 'true' + run: | + rm -rf /tmp/ucx + git clone --depth 1 ${OPEN_UCX_LINK} -b ${OPEN_UCX_BRANCH} /tmp/ucx - name: Build UCX + if: steps.cache-ucx.outputs.cache-hit != 'true' run: | cd /tmp/ucx && ./autogen.sh - ./contrib/configure-release --without-java --without-go --disable-numa --prefix $PWD/install + ./contrib/configure-release --without-java --without-go --disable-numa --prefix /tmp/ucx/install make -j install - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Build UCC run: | ./autogen.sh ./configure --prefix=/tmp/ucc/install --enable-gtest --with-ucx=/tmp/ucx/install make -j`nproc` install - make gtest + cp test/gtest/gtest /tmp/ucc/install/bin/gtest + - name: Upload build artifacts + uses: actions/upload-artifact@v7 + with: + name: ucc-build + path: | + /tmp/ucx/install + /tmp/ucc/install + retention-days: 1 + + gtest: + runs-on: ubuntu-latest + needs: build + strategy: + fail-fast: false + matrix: + shard: [0, 1] + name: test (shard ${{ matrix.shard }}) + steps: + - uses: actions/checkout@v6 + - name: Download build artifacts + uses: actions/download-artifact@v7 + with: + name: ucc-build + path: /tmp + - name: Restore artifact permissions + uses: ./.github/actions/restore-artifacts + - name: Run UCC gtest (shard ${{ matrix.shard }}) + env: + GTEST_TOTAL_SHARDS: 2 + GTEST_SHARD_INDEX: ${{ matrix.shard }} + LD_LIBRARY_PATH: /tmp/ucc/install/lib:/tmp/ucx/install/lib + run: /tmp/ucc/install/bin/gtest - name: Run ucc_info + if: matrix.shard == 0 run: | + export LD_LIBRARY_PATH=/tmp/ucc/install/lib:/tmp/ucx/install/lib /tmp/ucc/install/bin/ucc_info -vc - name: Run CMake tests + if: matrix.shard == 0 run: | set -e + export LD_LIBRARY_PATH=/tmp/ucc/install/lib:/tmp/ucx/install/lib cmake -S test/cmake -B /tmp/cmake-ucc -DCMAKE_PREFIX_PATH=/tmp/ucc/install cd /tmp/cmake-ucc cmake --build . ./test_ucc + + ompi-build: + runs-on: ubuntu-latest + name: OMPI build + needs: build + steps: + - name: Download build artifacts + uses: actions/download-artifact@v7 + with: + name: ucc-build + path: /tmp + - name: Restore permissions + run: | + chmod -R +x /tmp/ucx/install/bin /tmp/ucx/install/lib || true + chmod -R +x /tmp/ucc/install/bin /tmp/ucc/install/lib || true + - name: Cache OMPI + id: cache-ompi + uses: actions/cache@v4 + with: + path: /tmp/ompi/install + key: ompi-${{ env.OPEN_MPI_BRANCH }}-${{ hashFiles('.github/workflows/main.yaml', 'configure.ac') }} - name: Get OMPI + if: steps.cache-ompi.outputs.cache-hit != 'true' run: | - git clone ${OPEN_MPI_LINK} -b ${OPEN_MPI_BRANCH} /tmp/ompi - cd /tmp/ompi - git submodule update --init --recursive + rm -rf /tmp/ompi + git clone --depth 1 ${OPEN_MPI_LINK} -b ${OPEN_MPI_BRANCH} /tmp/ompi + cd /tmp/ompi + git submodule update --init --recursive --depth 1 - name: Build OMPI + if: steps.cache-ompi.outputs.cache-hit != 'true' run: > cd /tmp/ompi @@ -65,75 +131,120 @@ jobs: --with-ucc=/tmp/ucc/install make -j install + - name: Upload OMPI build artifacts + uses: actions/upload-artifact@v7 + with: + name: ompi-build + path: | + /tmp/ucx/install + /tmp/ucc/install + /tmp/ompi/install + retention-days: 1 + + ompi-perftest: + runs-on: ubuntu-latest + name: OMPI perftest + needs: ompi-build + steps: + - uses: actions/checkout@v6 + - name: Download OMPI build artifacts + uses: actions/download-artifact@v7 + with: + name: ompi-build + path: /tmp + - name: Restore artifact permissions + uses: ./.github/actions/restore-artifacts + with: + ompi: 'true' - name: Build ucc_perftest (with OMPI) run: | set -e + ./autogen.sh CC=/tmp/ompi/install/bin/mpicc CXX=/tmp/ompi/install/bin/mpicxx \ - ./configure --prefix=/tmp/ucc/install --enable-gtest --with-ucx=/tmp/ucx/install --with-mpi=/tmp/ompi/install - make -C tools/perf -j`nproc` - make -C tools/perf install + ./configure --prefix=/tmp/ucc/install --with-ucx=/tmp/ucx/install --with-mpi=/tmp/ompi/install + make -j`nproc` install - name: Run ucc_perftest run: | set -e test -x /tmp/ucc/install/bin/ucc_perftest export LD_LIBRARY_PATH=/tmp/ucc/install/lib:/tmp/ucx/install/lib:/tmp/ompi/install/lib:$LD_LIBRARY_PATH + MPI_ARGS=( + -np 4 -H localhost:4 + --bind-to none + --mca pml ucx + --mca pml_ucx_tls any + --mca pml_ucx_devices any + --mca coll_ucc_enable 0 + -x LD_LIBRARY_PATH + -x UCC_LOG_LEVEL=info + -x UCC_TLS=ucp + -x UCC_CONFIG_FILE= + ) + PERFTEST_ARGS=(-m host -d float32 -b 1024 -e 1024 -n 10 -w 2) COLLS=( allgather allgatherv allreduce alltoall alltoallv barrier bcast gather gatherv reduce reduce_scatter reduce_scatterv scatterv ) - for c in "${COLLS[@]}"; do - echo "Running ucc_perftest -c ${c}" - /tmp/ompi/install/bin/mpirun \ - -np 4 -H localhost:4 \ - --bind-to none \ - --mca pml ucx \ - --mca pml_ucx_tls any \ - --mca pml_ucx_devices any \ - --mca coll_ucc_enable 0 \ - -x LD_LIBRARY_PATH \ - -x UCC_LOG_LEVEL=info \ - -x UCC_TLS=ucp \ - -x UCC_CONFIG_FILE= \ - /tmp/ucc/install/bin/ucc_perftest \ - -c "${c}" -m host -d float32 -b 1024 -e 1024 -n 10 -w 2 - done INPLACE_COLLS=( allgather allgatherv allreduce gather gatherv reduce reduce_scatter reduce_scatterv ) + for c in "${COLLS[@]}"; do + echo "Running ucc_perftest -c ${c}" + /tmp/ompi/install/bin/mpirun "${MPI_ARGS[@]}" \ + /tmp/ucc/install/bin/ucc_perftest -c "${c}" "${PERFTEST_ARGS[@]}" + done for c in "${INPLACE_COLLS[@]}"; do echo "Running ucc_perftest (inplace) -c ${c}" - /tmp/ompi/install/bin/mpirun \ - -np 4 -H localhost:4 \ - --bind-to none \ - --mca pml ucx \ - --mca pml_ucx_tls any \ - --mca pml_ucx_devices any \ - --mca coll_ucc_enable 0 \ - -x LD_LIBRARY_PATH \ - -x UCC_LOG_LEVEL=info \ - -x UCC_TLS=ucp \ - -x UCC_CONFIG_FILE= \ - /tmp/ucc/install/bin/ucc_perftest \ - -c "${c}" -m host -d float32 -b 1024 -e 1024 -n 10 -w 2 -i + /tmp/ompi/install/bin/mpirun "${MPI_ARGS[@]}" \ + /tmp/ucc/install/bin/ucc_perftest -c "${c}" "${PERFTEST_ARGS[@]}" -i done + + ompi-imb: + runs-on: ubuntu-latest + name: OMPI IMB + needs: ompi-build + steps: + - uses: actions/checkout@v6 + - name: Download OMPI build artifacts + uses: actions/download-artifact@v7 + with: + name: ompi-build + path: /tmp + - name: Restore artifact permissions + uses: ./.github/actions/restore-artifacts + with: + ompi: 'true' + - name: Cache IMB + id: cache-imb + uses: actions/cache@v4 + with: + path: /tmp/imb + key: imb-${{ hashFiles('.github/workflows/main.yaml') }} - name: Get IMB - run: git clone ${IMB_LINK} /tmp/imb + if: steps.cache-imb.outputs.cache-hit != 'true' + run: | + rm -rf /tmp/imb + git clone --depth 1 ${IMB_LINK} /tmp/imb - name: Build IMB + if: steps.cache-imb.outputs.cache-hit != 'true' run: | - cd /tmp/imb - make CC=/tmp/ompi/install/bin/mpicc CXX=/tmp/ompi/install/bin/mpicxx CPPFLAGS="-DCHECK=1" -j IMB-MPI1 + cd /tmp/imb + make CC=/tmp/ompi/install/bin/mpicc CXX=/tmp/ompi/install/bin/mpicxx CPPFLAGS="-DCHECK=1" -j IMB-MPI1 - name: Run IMB-DCHECK + env: + LD_LIBRARY_PATH: /tmp/ucc/install/lib:/tmp/ucx/install/lib:/tmp/ompi/install/lib run: > - /tmp/ompi/install/bin/mpirun - -np 8 - -H localhost:8 - --bind-to none - --mca pml ucx - --mca pml_ucx_tls any - --mca pml_ucx_devices any - --mca coll_ucc_priority 100 - --mca coll_ucc_enable 1 - /tmp/imb/IMB-MPI1 ${IMB_COLLS} -iter 10 -iter_policy off + /tmp/ompi/install/bin/mpirun + -np 8 + -H localhost:8 + --bind-to none + --mca pml ucx + --mca pml_ucx_tls any + --mca pml_ucx_devices any + --mca coll_ucc_priority 100 + --mca coll_ucc_enable 1 + -x LD_LIBRARY_PATH + /tmp/imb/IMB-MPI1 ${IMB_COLLS} -iter 10 -iter_policy off