openucx · ikryukov · Mar 10, 2026 · Sergei-Lebedev · Mar 13, 2026 · Sergei-Lebedev
diff --git a/.ci/scripts/build_ucc.sh b/.ci/scripts/build_ucc.sh
@@ -7,28 +7,8 @@ export UCC_ENABLE_GTEST=${UCC_ENABLE_GTEST:-yes}
 export UCC_ENABLE_NVLS=${UCC_ENABLE_NVLS:-no}
 export UCC_BUILD_TLS=${UCC_BUILD_TLS:-cuda,nccl,self,sharp,shm,ucp,mlx5}
 
-# In containers, calculate based on memory limits to avoid OOM
-# Determine number of parallel build jobs based on available system memory if running inside a container/Kubernetes
-if [ -f /.dockerenv ] || [ -f /run/.containerenv ] || [ -n "${KUBERNETES_SERVICE_HOST}" ]; then
-    # Prefer cgroupv1 path, fall back to cgroupv2 or static default if not found
-    if [ -f /sys/fs/cgroup/memory/memory.limit_in_bytes ]; then
-        limit=$(cat /sys/fs/cgroup/memory/memory.limit_in_bytes)
-    elif [ -f /sys/fs/cgroup/memory.max ]; then
-        limit=$(cat /sys/fs/cgroup/memory.max)
-        # If cgroupv2 limit is "max", meaning unlimited, set to 4GB to avoid OOM
-        [ "$limit" = "max" ] && limit=$((4 * 1024 * 1024 * 1024))
-    else
-        # Default to 4GB if no limit is found
-        limit=$((4 * 1024 * 1024 * 1024))
-    fi
-
-    # Use 1 build process per GB of memory, clamp in [1,16]
-    nproc=$((limit / (1024 * 1024 * 1024)))
-    [ "$nproc" -gt 16 ] && nproc=16
-    [ "$nproc" -lt 1 ] && nproc=1
-else
-    nproc=$(nproc --all)
-fi
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd -P)"
+. "${SCRIPT_DIR}/common.sh"
 
 echo "INFO: Build UCC"
 UCC_SRC_DIR="${SRC_DIR}/ucc"
@@ -56,7 +36,15 @@ fi
 
 echo "INFO: Configure flags: ${CONFIGURE_FLAGS}"
 eval "${UCC_SRC_DIR}/configure ${CONFIGURE_FLAGS}"
-make "-j${nproc}" install
+
+# Skip libtool relinking during install: the relink produces identical RUNPATH
+# and adds ~5s of pure overhead per build. Best effort: warn, never abort.
+if grep -q 'need_relink=yes' libtool; then
+    sed -i 's/need_relink=yes/need_relink=no/g' libtool
+else
+    echo "WARN: libtool relinking patch had no effect (need_relink=yes not found)" >&2
+fi
+make "-j${NPROC}" install
 echo "${UCC_INSTALL_DIR}/lib" > /etc/ld.so.conf.d/ucc.conf
 ldconfig
 ldconfig -p | grep -i libucc
diff --git a/.ci/scripts/build_ucx.sh b/.ci/scripts/build_ucx.sh
@@ -1,13 +1,16 @@
 #!/bin/bash -eEx
 set -o pipefail
 
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd -P)"
+. "${SCRIPT_DIR}/common.sh"
+
 echo "INFO: Build UCX"
 cd "${SRC_DIR}/ucx"
 "${SRC_DIR}/ucx/autogen.sh"
 mkdir -p "${SRC_DIR}/ucx/build-${UCX_BUILD_TYPE}"
 cd "${SRC_DIR}/ucx/build-${UCX_BUILD_TYPE}"
 "${SRC_DIR}/ucx/contrib/configure-release-mt" --with-cuda="${CUDA_HOME}" --prefix="${UCX_INSTALL_DIR}"
-make -j install
+make "-j${NPROC}" install
 echo "${UCX_INSTALL_DIR}/lib" > /etc/ld.so.conf.d/ucx.conf
 ldconfig
 ldconfig -p | grep -i ucx
diff --git a/.ci/scripts/common.sh b/.ci/scripts/common.sh
@@ -1,22 +1,19 @@
-# In containers, calculate based on memory limits to avoid OOM
-# Determine number of parallel build jobs based on available system memory if running inside a container/Kubernetes
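+# Export NPROC, the parallel job count the CI build scripts pass to "make -j".
+# In containers/Kubernetes: 1 job per GiB of cgroup memory limit, clamped to [1,16] (4 GiB assumed if cgroup v2 reports "max" or no limit file exists); otherwise "nproc --all".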
 if [ -f /.dockerenv ] || [ -f /run/.containerenv ] || [ -n "${KUBERNETES_SERVICE_HOST}" ]; then
-    # Prefer cgroupv1 path, fall back to cgroupv2 or static default if not found
     if [ -f /sys/fs/cgroup/memory/memory.limit_in_bytes ]; then
         limit=$(cat /sys/fs/cgroup/memory/memory.limit_in_bytes)
     elif [ -f /sys/fs/cgroup/memory.max ]; then
         limit=$(cat /sys/fs/cgroup/memory.max)
-        # If cgroupv2 limit is "max", meaning unlimited, set to 4GB to avoid OOM
         [ "$limit" = "max" ] && limit=$((4 * 1024 * 1024 * 1024))
     else
-        # Default to 4GB if no limit is found
         limit=$((4 * 1024 * 1024 * 1024))
     fi
-
-    # Use 1 build process per GB of memory, clamp in [1,16]
-    nproc=$((limit / (1024 * 1024 * 1024)))
-    [ "$nproc" -gt 16 ] && nproc=16
-    [ "$nproc" -lt 1 ] && nproc=1
-    export NPROC=$nproc
+    NPROC=$((limit / (1024 * 1024 * 1024)))
+    [ "$NPROC" -gt 16 ] && NPROC=16
+    [ "$NPROC" -lt 1 ] && NPROC=1
+else
+    NPROC=$(nproc --all)
 fi
+export NPROC
 
diff --git a/.github/actions/restore-artifacts/action.yml b/.github/actions/restore-artifacts/action.yml
@@ -0,0 +1,28 @@
+name: Restore artifact permissions
+description: >
+  Artifacts do not preserve file permissions, so downloaded files lose their execute bits.
+  This action restores them for the standard UCC/UCX/OMPI install trees used across CI jobs.
+inputs:
+  ucx:
+    description: Restore UCX install permissions
+    default: 'true'
+  ucc:
+    description: Restore UCC install permissions
+    default: 'true'
+  ompi:
+    description: Restore OMPI install permissions
+    default: 'false'
+runs:
+  using: composite
+  steps:
+  - shell: bash
+    run: |
+      if [ "${{ inputs.ucx }}" = "true" ]; then
+        chmod -R +x /tmp/ucx/install/bin /tmp/ucx/install/lib 2>/dev/null || true
+      fi
+      if [ "${{ inputs.ucc }}" = "true" ]; then
+        chmod -R +x /tmp/ucc/install/bin /tmp/ucc/install/lib 2>/dev/null || true
+      fi
+      if [ "${{ inputs.ompi }}" = "true" ]; then
+        chmod -R +x /tmp/ompi/install/bin /tmp/ompi/install/lib 2>/dev/null || true
+      fi
diff --git a/.github/workflows/asan-test.yaml b/.github/workflows/asan-test.yaml
@@ -1,4 +1,4 @@
-name: ASAN Tests
+name: ASAN
 
 on: [push, pull_request]
 
@@ -12,32 +12,80 @@ env:
   CLANG_VER: 17
 
 jobs:
-  gtest-asan:
-    runs-on: ubuntu-22.04
+  build:
+    runs-on: ubuntu-24.04
     steps:
     - name: Install dependencies
       run: |
         sudo apt-get update
         sudo apt-get install -y --no-install-recommends wget gpg
-        # Setup LLVM repository
         sudo mkdir -p /etc/apt/keyrings
         wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo gpg --dearmor -o /etc/apt/keyrings/llvm.gpg
-        echo "deb [signed-by=/etc/apt/keyrings/llvm.gpg] http://apt.llvm.org/jammy/ llvm-toolchain-jammy-${CLANG_VER} main" | sudo tee /etc/apt/sources.list.d/llvm.list
+        echo "deb [signed-by=/etc/apt/keyrings/llvm.gpg] http://apt.llvm.org/noble/ llvm-toolchain-noble-${CLANG_VER} main" | sudo tee /etc/apt/sources.list.d/llvm.list
         sudo apt-get update
         sudo apt-get install -y --no-install-recommends clang-${CLANG_VER} clang++-${CLANG_VER} libclang-rt-${CLANG_VER}-dev
+    - uses: actions/checkout@v6  # must run before the cache step: hashFiles() returns '' until the workspace is checked out
+    - name: Cache UCX
+      id: cache-ucx
+      uses: actions/cache@v4
+      with:
+        path: /tmp/ucx/install
+        key: ucx-asan-${{ env.OPEN_UCX_BRANCH }}-${{ hashFiles('.github/workflows/asan-test.yaml') }}
     - name: Get UCX
-      run: git clone ${OPEN_UCX_LINK} -b ${OPEN_UCX_BRANCH} /tmp/ucx
+      if: steps.cache-ucx.outputs.cache-hit != 'true'
+      run: |
+        rm -rf /tmp/ucx
+        git clone --depth 1 ${OPEN_UCX_LINK} -b ${OPEN_UCX_BRANCH} /tmp/ucx
     - name: Build UCX
+      if: steps.cache-ucx.outputs.cache-hit != 'true'
       run: |
         cd /tmp/ucx && ./autogen.sh
-        CC=clang-${CLANG_VER} CXX=clang++-${CLANG_VER} ./contrib/configure-release --without-java --without-go --disable-numa --prefix $PWD/install
+        CC=clang-${CLANG_VER} CXX=clang++-${CLANG_VER} ./contrib/configure-release --without-java --without-go --disable-numa --prefix /tmp/ucx/install
         make -j install
-    - uses: actions/checkout@v4
-    - name: Run gtest ASAN
+    - name: Build UCC (ASAN)
+      env:
+        LD_LIBRARY_PATH: /tmp/ucx/install/lib
       run: |
-        export ASAN_OPTIONS=fast_unwind_on_malloc=0:detect_leaks=1:print_suppressions=0
-        export LSAN_OPTIONS=report_objects=1
         ./autogen.sh
         CFLAGS="-fsanitize=address" CC=clang-${CLANG_VER} CXX=clang++-${CLANG_VER} ./configure --prefix=/tmp/ucc/install --with-ucx=/tmp/ucx/install --enable-gtest
         make -j install
-        ./test/gtest/gtest
+        cp test/gtest/gtest /tmp/ucc/install/bin/gtest
+    - name: Bundle clang runtime for test shards
+      run: |
+        CLANG_RT_DIR=$(dirname $(clang-${CLANG_VER} -print-file-name=libclang_rt.asan-x86_64.so))
+        cp -a ${CLANG_RT_DIR} /tmp/clang-rt
+    - name: Upload build artifacts
+      uses: actions/upload-artifact@v7
+      with:
+        name: ucc-asan-build
+        path: |
+          /tmp/ucx/install
+          /tmp/ucc/install
+          /tmp/clang-rt
+        retention-days: 1
+
+  gtest-asan:
+    runs-on: ubuntu-24.04
+    needs: build
+    strategy:
+      fail-fast: false
+      matrix:
+        shard: [0, 1, 2, 3]
+    name: ASAN (shard ${{ matrix.shard }})
+    steps:
+    - uses: actions/checkout@v6
+    - name: Download build artifacts
+      uses: actions/download-artifact@v7
+      with:
+        name: ucc-asan-build
+        path: /tmp
+    - name: Restore artifact permissions
+      uses: ./.github/actions/restore-artifacts
+    - name: Run UCC gtest (ASAN, shard ${{ matrix.shard }})
+      env:
+        GTEST_TOTAL_SHARDS: 4
+        GTEST_SHARD_INDEX: ${{ matrix.shard }}
+        ASAN_OPTIONS: fast_unwind_on_malloc=1:detect_leaks=1:print_suppressions=0
+        LSAN_OPTIONS: report_objects=1
+        LD_LIBRARY_PATH: /tmp/ucc/install/lib:/tmp/ucx/install/lib:/tmp/clang-rt
+      run: /tmp/ucc/install/bin/gtest
diff --git a/.github/workflows/blossom-ci.yml b/.github/workflows/blossom-ci.yml
@@ -43,7 +43,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout code
-        uses: actions/checkout@v2
+        uses: actions/checkout@v6
         with:
           repository: ${{ fromJson(needs.Authorization.outputs.args).repo }}
           ref: ${{ fromJson(needs.Authorization.outputs.args).ref }}

diff --git a/.github/workflows/clang-tidy-nvidia.yaml b/.github/workflows/clang-tidy-nvidia.yaml
@@ -1,23 +1,24 @@
-name: Linter-NVIDIA
+name: Lint (CUDA)
 
 on: [push, pull_request]
 
 concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  group: lint-cuda-${{ github.event.pull_request.number || github.ref }}
   cancel-in-progress: true
 
 env:
   OPEN_UCX_LINK: https://github.com/openucx/ucx
   OPEN_UCX_BRANCH: master
-  HPCX_LINK: https://content.mellanox.com/hpc/hpc-x/v2.22.1rc4/hpcx-v2.22.1-gcc-doca_ofed-ubuntu22.04-cuda12-x86_64.tbz
+  HPCX_LINK: https://content.mellanox.com/hpc/hpc-x/v2.25.1_cuda13/hpcx-v2.25.1-gcc-doca_ofed-ubuntu24.04-cuda13-x86_64.tbz
   CLANG_VER: 17
-  MLNX_OFED_VER: 24.10-2.1.8.0
-  CUDA_VER: 12-8
+  MLNX_OFED_VER: 24.10-4.1.4.0
+  CUDA_VER: 13-1
   LIBRARY_PATH: /tmp/ucx/install/lib
   LD_LIBRARY_PATH: /tmp/ucx/install/lib
 jobs:
   clang-tidy:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-24.04
+    name: clang-tidy (CUDA)
     steps:
     - name: Install dependencies
       run: |
@@ -26,35 +27,60 @@ jobs:
         # Setup LLVM repository
         sudo mkdir -p /etc/apt/keyrings
         wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo gpg --dearmor -o /etc/apt/keyrings/llvm.gpg
-        echo "deb [signed-by=/etc/apt/keyrings/llvm.gpg] http://apt.llvm.org/jammy/ llvm-toolchain-jammy-${CLANG_VER} main" | sudo tee /etc/apt/sources.list.d/llvm.list
+        echo "deb [signed-by=/etc/apt/keyrings/llvm.gpg] http://apt.llvm.org/noble/ llvm-toolchain-noble-${CLANG_VER} main" | sudo tee /etc/apt/sources.list.d/llvm.list
         sudo apt-get update
         sudo apt-get install -y --no-install-recommends clang-tidy-${CLANG_VER} bear clang-${CLANG_VER} clang++-${CLANG_VER}
+    - uses: actions/checkout@v6  # must run before the cache steps: hashFiles() returns '' until the workspace is checked out, and a later checkout would wipe the workspace-relative OFED tarball before it can be cached
+    - name: Cache MLNX_OFED tarball
+      id: cache-ofed
+      uses: actions/cache@v4
+      with:
+        path: MLNX_OFED_LINUX-${{ env.MLNX_OFED_VER }}-ubuntu24.04-x86_64.tgz
+        key: mlnx-ofed-${{ env.MLNX_OFED_VER }}-ubuntu24.04
+    - name: Download MLNX_OFED
+      if: steps.cache-ofed.outputs.cache-hit != 'true'
+      run: wget --no-verbose http://content.mellanox.com/ofed/MLNX_OFED-${MLNX_OFED_VER}/MLNX_OFED_LINUX-${MLNX_OFED_VER}-ubuntu24.04-x86_64.tgz
     - name: Install extra rdma dependencies
       run: |
-        wget --no-verbose http://content.mellanox.com/ofed/MLNX_OFED-${MLNX_OFED_VER}/MLNX_OFED_LINUX-${MLNX_OFED_VER}-ubuntu22.04-x86_64.tgz
-        sudo tar -xvzf MLNX_OFED_LINUX-${MLNX_OFED_VER}-ubuntu22.04-x86_64.tgz
-        sudo chmod -R a+rwx MLNX_OFED_LINUX-${MLNX_OFED_VER}-ubuntu22.04-x86_64
-        sudo MLNX_OFED_LINUX-${MLNX_OFED_VER}-ubuntu22.04-x86_64/mlnxofedinstall --skip-unsupported-devices-check --user-space-only --without-fw-update --force --basic -vvv
+        sudo tar -xzf MLNX_OFED_LINUX-${MLNX_OFED_VER}-ubuntu24.04-x86_64.tgz
+        sudo chmod -R a+rwx MLNX_OFED_LINUX-${MLNX_OFED_VER}-ubuntu24.04-x86_64
+        sudo MLNX_OFED_LINUX-${MLNX_OFED_VER}-ubuntu24.04-x86_64/mlnxofedinstall --skip-unsupported-devices-check --user-space-only --without-fw-update --force --basic -v
     - name: Install extra cuda dependencies
       run: |
-        wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
+        wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb
         sudo dpkg -i cuda-keyring_1.1-1_all.deb
         sudo apt-get update
         sudo apt-get install -y --no-install-recommends cuda-cudart-dev-${CUDA_VER} cuda-nvcc-${CUDA_VER} cuda-nvml-dev-${CUDA_VER}
+    - name: Cache UCX
+      id: cache-ucx
+      uses: actions/cache@v4
+      with:
+        path: /tmp/ucx/install
+        key: ucx-nvidia-${{ env.OPEN_UCX_BRANCH }}-${{ hashFiles('.github/workflows/clang-tidy-nvidia.yaml') }}
     - name: Get UCX
-      run: git clone ${OPEN_UCX_LINK} -b ${OPEN_UCX_BRANCH} /tmp/ucx
+      if: steps.cache-ucx.outputs.cache-hit != 'true'
+      run: |
+        rm -rf /tmp/ucx
+        git clone --depth 1 ${OPEN_UCX_LINK} -b ${OPEN_UCX_BRANCH} /tmp/ucx
     - name: Build UCX
+      if: steps.cache-ucx.outputs.cache-hit != 'true'
       run: |
         cd /tmp/ucx && ./autogen.sh
         CC=clang-${CLANG_VER} CXX=clang++-${CLANG_VER} ./contrib/configure-release --without-java --without-go --disable-numa --prefix $PWD/install
         make -j install
+    - name: Cache HPCX
+      id: cache-hpcx
+      uses: actions/cache@v4
+      with:
+        path: /tmp/hpcx
+        key: hpcx-v2.25.1-${{ hashFiles('.github/workflows/clang-tidy-nvidia.yaml') }}
     - name: Download HPCX
+      if: steps.cache-hpcx.outputs.cache-hit != 'true'
       run: |
         cd /tmp
         wget --no-verbose ${HPCX_LINK}
-        tar xjf hpcx-v2.22.1-gcc-doca_ofed-ubuntu22.04-cuda12-x86_64.tbz
-        mv hpcx-v2.22.1-gcc-doca_ofed-ubuntu22.04-cuda12-x86_64 hpcx
-    - uses: actions/checkout@v4
+        tar xjf hpcx-v2.25.1-gcc-doca_ofed-ubuntu24.04-cuda13-x86_64.tbz
+        mv hpcx-v2.25.1-gcc-doca_ofed-ubuntu24.04-cuda13-x86_64 hpcx
     - name: Build UCC
       run: |
         ./autogen.sh

diff --git a/.github/workflows/clang-tidy-rocm.yaml b/.github/workflows/clang-tidy-rocm.yaml
@@ -1,9 +1,9 @@
-name: Linter-ROCM
+name: Lint (ROCm)
 
 on: [push, pull_request]
 
 concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  group: lint-rocm-${{ github.event.pull_request.number || github.ref }}
   cancel-in-progress: true
 
 env:
@@ -16,6 +16,7 @@ env:
 jobs:
   clang-tidy:
     runs-on: ubuntu-22.04
+    name: clang-tidy (ROCm)
     steps:
     - name: Install dependencies
       run: |
@@ -40,14 +41,24 @@ jobs:
           bear \
           rocm-hip-sdk
         sudo ln -sf /opt/rocm-${ROCM_VER} /opt/rocm
+    - uses: actions/checkout@v6  # must run before the cache step: hashFiles() returns '' until the workspace is checked out
+    - name: Cache UCX
+      id: cache-ucx
+      uses: actions/cache@v4
+      with:
+        path: /tmp/ucx/install
+        key: ucx-rocm-${{ env.OPEN_UCX_BRANCH }}-${{ env.ROCM_VER }}-${{ hashFiles('.github/workflows/clang-tidy-rocm.yaml') }}
     - name: Get UCX
-      run: git clone ${OPEN_UCX_LINK} -b ${OPEN_UCX_BRANCH} /tmp/ucx
+      if: steps.cache-ucx.outputs.cache-hit != 'true'
+      run: |
+        rm -rf /tmp/ucx
+        git clone --depth 1 ${OPEN_UCX_LINK} -b ${OPEN_UCX_BRANCH} /tmp/ucx
     - name: Build UCX
+      if: steps.cache-ucx.outputs.cache-hit != 'true'
       run: |
         cd /tmp/ucx && ./autogen.sh
         CC=gcc CXX=g++ ./contrib/configure-release --without-java --without-go --disable-numa --prefix $PWD/install --with-rocm=/opt/rocm
         make -j install
-    - uses: actions/checkout@v4
     - name: Build UCC
       run: |
         ./autogen.sh