diff --git a/.ci/scripts/build_ucc.sh b/.ci/scripts/build_ucc.sh
index c38ece71cb..d6f52d3a1f 100755
--- a/.ci/scripts/build_ucc.sh
+++ b/.ci/scripts/build_ucc.sh
@@ -7,28 +7,8 @@ export UCC_ENABLE_GTEST=${UCC_ENABLE_GTEST:-yes}
 export UCC_ENABLE_NVLS=${UCC_ENABLE_NVLS:-no}
 export UCC_BUILD_TLS=${UCC_BUILD_TLS:-cuda,nccl,self,sharp,shm,ucp,mlx5}
 
-# In containers, calculate based on memory limits to avoid OOM
-# Determine number of parallel build jobs based on available system memory if running inside a container/Kubernetes
-if [ -f /.dockerenv ] || [ -f /run/.containerenv ] || [ -n "${KUBERNETES_SERVICE_HOST}" ]; then
-    # Prefer cgroupv1 path, fall back to cgroupv2 or static default if not found
-    if [ -f /sys/fs/cgroup/memory/memory.limit_in_bytes ]; then
-        limit=$(cat /sys/fs/cgroup/memory/memory.limit_in_bytes)
-    elif [ -f /sys/fs/cgroup/memory.max ]; then
-        limit=$(cat /sys/fs/cgroup/memory.max)
-        # If cgroupv2 limit is "max", meaning unlimited, set to 4GB to avoid OOM
-        [ "$limit" = "max" ] && limit=$((4 * 1024 * 1024 * 1024))
-    else
-        # Default to 4GB if no limit is found
-        limit=$((4 * 1024 * 1024 * 1024))
-    fi
-
-    # Use 1 build process per GB of memory, clamp in [1,16]
-    nproc=$((limit / (1024 * 1024 * 1024)))
-    [ "$nproc" -gt 16 ] && nproc=16
-    [ "$nproc" -lt 1 ] && nproc=1
-else
-    nproc=$(nproc --all)
-fi
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd -P)"
+. "${SCRIPT_DIR}/common.sh"
 
 echo "INFO: Build UCC"
 UCC_SRC_DIR="${SRC_DIR}/ucc"
@@ -56,7 +36,15 @@ fi
 
 echo "INFO: Configure flags: ${CONFIGURE_FLAGS}"
 eval "${UCC_SRC_DIR}/configure ${CONFIGURE_FLAGS}"
-make "-j${nproc}" install
+
+# Skip libtool relinking during install: the relink produces identical RUNPATH
+# and adds ~5s of pure overhead per build.
+if ! grep -q 'need_relink=yes' libtool; then
+    echo "WARN: libtool relinking patch had no effect (need_relink=yes not found)"
+fi
+sed -i 's/need_relink=yes/need_relink=no/g' libtool
+
+make "-j${NPROC}" install
 echo "${UCC_INSTALL_DIR}/lib" > /etc/ld.so.conf.d/ucc.conf
 ldconfig
 ldconfig -p | grep -i libucc
diff --git a/.ci/scripts/build_ucx.sh b/.ci/scripts/build_ucx.sh
index ab0ff86d60..93f90c13fc 100755
--- a/.ci/scripts/build_ucx.sh
+++ b/.ci/scripts/build_ucx.sh
@@ -1,13 +1,16 @@
 #!/bin/bash -eEx
 set -o pipefail
 
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd -P)"
+. "${SCRIPT_DIR}/common.sh"
+
 echo "INFO: Build UCX"
 cd "${SRC_DIR}/ucx"
 "${SRC_DIR}/ucx/autogen.sh"
 mkdir -p "${SRC_DIR}/ucx/build-${UCX_BUILD_TYPE}"
 cd "${SRC_DIR}/ucx/build-${UCX_BUILD_TYPE}"
 "${SRC_DIR}/ucx/contrib/configure-release-mt" --with-cuda="${CUDA_HOME}" --prefix="${UCX_INSTALL_DIR}"
-make -j install
+make "-j${NPROC}" install
 echo "${UCX_INSTALL_DIR}/lib" > /etc/ld.so.conf.d/ucx.conf
 ldconfig
 ldconfig -p | grep -i ucx
diff --git a/.ci/scripts/common.sh b/.ci/scripts/common.sh
index 8b84ab7d60..ab6350c289 100755
--- a/.ci/scripts/common.sh
+++ b/.ci/scripts/common.sh
@@ -1,22 +1,19 @@
-# In containers, calculate based on memory limits to avoid OOM
-# Determine number of parallel build jobs based on available system memory if running inside a container/Kubernetes
+# Determine number of parallel build jobs based on available system memory.
+# In containers/Kubernetes, use cgroup memory limits to avoid OOM.
 if [ -f /.dockerenv ] || [ -f /run/.containerenv ] || [ -n "${KUBERNETES_SERVICE_HOST}" ]; then
-    # Prefer cgroupv1 path, fall back to cgroupv2 or static default if not found
     if [ -f /sys/fs/cgroup/memory/memory.limit_in_bytes ]; then
         limit=$(cat /sys/fs/cgroup/memory/memory.limit_in_bytes)
     elif [ -f /sys/fs/cgroup/memory.max ]; then
         limit=$(cat /sys/fs/cgroup/memory.max)
-        # If cgroupv2 limit is "max", meaning unlimited, set to 4GB to avoid OOM
         [ "$limit" = "max" ] && limit=$((4 * 1024 * 1024 * 1024))
     else
-        # Default to 4GB if no limit is found
         limit=$((4 * 1024 * 1024 * 1024))
     fi
-
-    # Use 1 build process per GB of memory, clamp in [1,16]
-    nproc=$((limit / (1024 * 1024 * 1024)))
-    [ "$nproc" -gt 16 ] && nproc=16
-    [ "$nproc" -lt 1 ] && nproc=1
-    export NPROC=$nproc
+    NPROC=$((limit / (1024 * 1024 * 1024)))
+    [ "$NPROC" -gt 16 ] && NPROC=16
+    [ "$NPROC" -lt 1 ] && NPROC=1
+else
+    NPROC=$(nproc --all)
 fi
+export NPROC
 
diff --git a/.github/actions/restore-artifacts/action.yml b/.github/actions/restore-artifacts/action.yml
new file mode 100644
index 0000000000..c2a2d1c4c5
--- /dev/null
+++ b/.github/actions/restore-artifacts/action.yml
@@ -0,0 +1,28 @@
+name: Restore artifact permissions
+description: >
+  Download-artifact strips execute bits.  This action restores them for the
+  standard UCC/UCX/OMPI install trees used across CI jobs.
+inputs:
+  ucx:
+    description: Restore UCX install permissions
+    default: 'true'
+  ucc:
+    description: Restore UCC install permissions
+    default: 'true'
+  ompi:
+    description: Restore OMPI install permissions
+    default: 'false'
+runs:
+  using: composite
+  steps:
+  - shell: bash
+    run: |
+      if [ "${{ inputs.ucx }}" = "true" ]; then
+        chmod -R +x /tmp/ucx/install/bin /tmp/ucx/install/lib 2>/dev/null || true
+      fi
+      if [ "${{ inputs.ucc }}" = "true" ]; then
+        chmod -R +x /tmp/ucc/install/bin /tmp/ucc/install/lib 2>/dev/null || true
+      fi
+      if [ "${{ inputs.ompi }}" = "true" ]; then
+        chmod -R +x /tmp/ompi/install/bin /tmp/ompi/install/lib 2>/dev/null || true
+      fi
diff --git a/.github/workflows/asan-test.yaml b/.github/workflows/asan-test.yaml
index d3c7059e40..080b339937 100644
--- a/.github/workflows/asan-test.yaml
+++ b/.github/workflows/asan-test.yaml
@@ -1,4 +1,4 @@
-name: ASAN Tests
+name: ASAN
 
 on: [push, pull_request]
 
@@ -12,32 +12,80 @@ env:
   CLANG_VER: 17
 
 jobs:
-  gtest-asan:
-    runs-on: ubuntu-22.04
+  build:
+    runs-on: ubuntu-24.04
     steps:
     - name: Install dependencies
       run: |
         sudo apt-get update
         sudo apt-get install -y --no-install-recommends wget gpg
-        # Setup LLVM repository
         sudo mkdir -p /etc/apt/keyrings
         wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo gpg --dearmor -o /etc/apt/keyrings/llvm.gpg
-        echo "deb [signed-by=/etc/apt/keyrings/llvm.gpg] http://apt.llvm.org/jammy/ llvm-toolchain-jammy-${CLANG_VER} main" | sudo tee /etc/apt/sources.list.d/llvm.list
+        echo "deb [signed-by=/etc/apt/keyrings/llvm.gpg] http://apt.llvm.org/noble/ llvm-toolchain-noble-${CLANG_VER} main" | sudo tee /etc/apt/sources.list.d/llvm.list
         sudo apt-get update
         sudo apt-get install -y --no-install-recommends clang-${CLANG_VER} clang++-${CLANG_VER} libclang-rt-${CLANG_VER}-dev
+    - uses: actions/checkout@v6  # must precede the cache step: hashFiles() yields '' on an empty workspace
+    - name: Cache UCX
+      id: cache-ucx
+      uses: actions/cache@v4
+      with:
+        path: /tmp/ucx/install
+        key: ucx-asan-${{ env.OPEN_UCX_BRANCH }}-${{ hashFiles('.github/workflows/asan-test.yaml') }}
     - name: Get UCX
-      run: git clone ${OPEN_UCX_LINK} -b ${OPEN_UCX_BRANCH} /tmp/ucx
+      if: steps.cache-ucx.outputs.cache-hit != 'true'
+      run: |
+        rm -rf /tmp/ucx
+        git clone --depth 1 ${OPEN_UCX_LINK} -b ${OPEN_UCX_BRANCH} /tmp/ucx
     - name: Build UCX
+      if: steps.cache-ucx.outputs.cache-hit != 'true'
       run: |
         cd /tmp/ucx && ./autogen.sh
-        CC=clang-${CLANG_VER} CXX=clang++-${CLANG_VER} ./contrib/configure-release --without-java --without-go --disable-numa --prefix $PWD/install
+        CC=clang-${CLANG_VER} CXX=clang++-${CLANG_VER} ./contrib/configure-release --without-java --without-go --disable-numa --prefix /tmp/ucx/install
         make -j install
-    - uses: actions/checkout@v4
-    - name: Run gtest ASAN
+    - name: Build UCC (ASAN)
+      env:
+        LD_LIBRARY_PATH: /tmp/ucx/install/lib
       run: |
-        export ASAN_OPTIONS=fast_unwind_on_malloc=0:detect_leaks=1:print_suppressions=0
-        export LSAN_OPTIONS=report_objects=1
         ./autogen.sh
         CFLAGS="-fsanitize=address" CC=clang-${CLANG_VER} CXX=clang++-${CLANG_VER} ./configure --prefix=/tmp/ucc/install --with-ucx=/tmp/ucx/install --enable-gtest
         make -j install
-        ./test/gtest/gtest
+        cp test/gtest/gtest /tmp/ucc/install/bin/gtest
+    - name: Bundle clang runtime for test shards
+      run: |
+        CLANG_RT_DIR="$(dirname "$(clang-${CLANG_VER} -print-file-name=libclang_rt.asan-x86_64.so)")"  # quoted: no word splitting/globbing of the path
+        cp -a "${CLANG_RT_DIR}" /tmp/clang-rt
+    - name: Upload build artifacts
+      uses: actions/upload-artifact@v7
+      with:
+        name: ucc-asan-build
+        path: |
+          /tmp/ucx/install
+          /tmp/ucc/install
+          /tmp/clang-rt
+        retention-days: 1
+
+  gtest-asan:
+    runs-on: ubuntu-24.04
+    needs: build
+    strategy:
+      fail-fast: false
+      matrix:
+        shard: [0, 1, 2, 3]
+    name: ASAN (shard ${{ matrix.shard }})
+    steps:
+    - uses: actions/checkout@v6
+    - name: Download build artifacts
+      uses: actions/download-artifact@v7
+      with:
+        name: ucc-asan-build
+        path: /tmp
+    - name: Restore artifact permissions
+      uses: ./.github/actions/restore-artifacts
+    - name: Run UCC gtest (ASAN, shard ${{ matrix.shard }})
+      env:
+        GTEST_TOTAL_SHARDS: 4
+        GTEST_SHARD_INDEX: ${{ matrix.shard }}
+        ASAN_OPTIONS: fast_unwind_on_malloc=1:detect_leaks=1:print_suppressions=0
+        LSAN_OPTIONS: report_objects=1
+        LD_LIBRARY_PATH: /tmp/ucc/install/lib:/tmp/ucx/install/lib:/tmp/clang-rt
+      run: /tmp/ucc/install/bin/gtest
diff --git a/.github/workflows/blossom-ci.yml b/.github/workflows/blossom-ci.yml
index 0037f8bc5a..6befbf3a06 100644
--- a/.github/workflows/blossom-ci.yml
+++ b/.github/workflows/blossom-ci.yml
@@ -43,7 +43,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout code
-        uses: actions/checkout@v2
+        uses: actions/checkout@v6
         with:
           repository: ${{ fromJson(needs.Authorization.outputs.args).repo }}
           ref: ${{ fromJson(needs.Authorization.outputs.args).ref }}
diff --git a/.github/workflows/clang-tidy-nvidia.yaml b/.github/workflows/clang-tidy-nvidia.yaml
index 495e146f35..0839c87072 100644
--- a/.github/workflows/clang-tidy-nvidia.yaml
+++ b/.github/workflows/clang-tidy-nvidia.yaml
@@ -1,23 +1,24 @@
-name: Linter-NVIDIA
+name: Lint (CUDA)
 
 on: [push, pull_request]
 
 concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  group: lint-cuda-${{ github.event.pull_request.number || github.ref }}
   cancel-in-progress: true
 
 env:
   OPEN_UCX_LINK: https://github.com/openucx/ucx
   OPEN_UCX_BRANCH: master
-  HPCX_LINK: https://content.mellanox.com/hpc/hpc-x/v2.22.1rc4/hpcx-v2.22.1-gcc-doca_ofed-ubuntu22.04-cuda12-x86_64.tbz
+  HPCX_LINK: https://content.mellanox.com/hpc/hpc-x/v2.25.1_cuda13/hpcx-v2.25.1-gcc-doca_ofed-ubuntu24.04-cuda13-x86_64.tbz
   CLANG_VER: 17
-  MLNX_OFED_VER: 24.10-2.1.8.0
-  CUDA_VER: 12-8
+  MLNX_OFED_VER: 24.10-4.1.4.0
+  CUDA_VER: 13-1
   LIBRARY_PATH: /tmp/ucx/install/lib
   LD_LIBRARY_PATH: /tmp/ucx/install/lib
 jobs:
   clang-tidy:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-24.04
+    name: clang-tidy (CUDA)
     steps:
     - name: Install dependencies
       run: |
@@ -26,35 +27,60 @@ jobs:
         # Setup LLVM repository
         sudo mkdir -p /etc/apt/keyrings
         wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo gpg --dearmor -o /etc/apt/keyrings/llvm.gpg
-        echo "deb [signed-by=/etc/apt/keyrings/llvm.gpg] http://apt.llvm.org/jammy/ llvm-toolchain-jammy-${CLANG_VER} main" | sudo tee /etc/apt/sources.list.d/llvm.list
+        echo "deb [signed-by=/etc/apt/keyrings/llvm.gpg] http://apt.llvm.org/noble/ llvm-toolchain-noble-${CLANG_VER} main" | sudo tee /etc/apt/sources.list.d/llvm.list
         sudo apt-get update
         sudo apt-get install -y --no-install-recommends clang-tidy-${CLANG_VER} bear clang-${CLANG_VER} clang++-${CLANG_VER}
+    - name: Cache MLNX_OFED tarball
+      id: cache-ofed
+      uses: actions/cache@v4
+      with:
+        path: /tmp/MLNX_OFED_LINUX-${{ env.MLNX_OFED_VER }}-ubuntu24.04-x86_64.tgz  # outside the workspace: the later checkout deletes workspace contents before the post-job cache save
+        key: mlnx-ofed-${{ env.MLNX_OFED_VER }}-ubuntu24.04
+    - name: Download MLNX_OFED
+      if: steps.cache-ofed.outputs.cache-hit != 'true'
+      run: wget --no-verbose -P /tmp https://content.mellanox.com/ofed/MLNX_OFED-${MLNX_OFED_VER}/MLNX_OFED_LINUX-${MLNX_OFED_VER}-ubuntu24.04-x86_64.tgz
     - name: Install extra rdma dependencies
       run: |
-        wget --no-verbose http://content.mellanox.com/ofed/MLNX_OFED-${MLNX_OFED_VER}/MLNX_OFED_LINUX-${MLNX_OFED_VER}-ubuntu22.04-x86_64.tgz
-        sudo tar -xvzf MLNX_OFED_LINUX-${MLNX_OFED_VER}-ubuntu22.04-x86_64.tgz
-        sudo chmod -R a+rwx MLNX_OFED_LINUX-${MLNX_OFED_VER}-ubuntu22.04-x86_64
-        sudo MLNX_OFED_LINUX-${MLNX_OFED_VER}-ubuntu22.04-x86_64/mlnxofedinstall --skip-unsupported-devices-check --user-space-only --without-fw-update --force --basic -vvv
+        sudo tar -xzf /tmp/MLNX_OFED_LINUX-${MLNX_OFED_VER}-ubuntu24.04-x86_64.tgz -C /tmp
+        sudo chmod -R a+rwx /tmp/MLNX_OFED_LINUX-${MLNX_OFED_VER}-ubuntu24.04-x86_64
+        sudo /tmp/MLNX_OFED_LINUX-${MLNX_OFED_VER}-ubuntu24.04-x86_64/mlnxofedinstall --skip-unsupported-devices-check --user-space-only --without-fw-update --force --basic -v
     - name: Install extra cuda dependencies
       run: |
-        wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
+        wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb
         sudo dpkg -i cuda-keyring_1.1-1_all.deb
         sudo apt-get update
         sudo apt-get install -y --no-install-recommends cuda-cudart-dev-${CUDA_VER} cuda-nvcc-${CUDA_VER} cuda-nvml-dev-${CUDA_VER}
+    - uses: actions/checkout@v6  # must precede the cache steps: hashFiles() yields '' on an empty workspace
+    - name: Cache UCX
+      id: cache-ucx
+      uses: actions/cache@v4
+      with:
+        path: /tmp/ucx/install
+        key: ucx-nvidia-${{ env.OPEN_UCX_BRANCH }}-${{ hashFiles('.github/workflows/clang-tidy-nvidia.yaml') }}
     - name: Get UCX
-      run: git clone ${OPEN_UCX_LINK} -b ${OPEN_UCX_BRANCH} /tmp/ucx
+      if: steps.cache-ucx.outputs.cache-hit != 'true'
+      run: |
+        rm -rf /tmp/ucx
+        git clone --depth 1 ${OPEN_UCX_LINK} -b ${OPEN_UCX_BRANCH} /tmp/ucx
     - name: Build UCX
+      if: steps.cache-ucx.outputs.cache-hit != 'true'
       run: |
         cd /tmp/ucx && ./autogen.sh
         CC=clang-${CLANG_VER} CXX=clang++-${CLANG_VER} ./contrib/configure-release --without-java --without-go --disable-numa --prefix $PWD/install
         make -j install
+    - name: Cache HPCX
+      id: cache-hpcx
+      uses: actions/cache@v4
+      with:
+        path: /tmp/hpcx
+        key: hpcx-v2.25.1-${{ hashFiles('.github/workflows/clang-tidy-nvidia.yaml') }}
     - name: Download HPCX
+      if: steps.cache-hpcx.outputs.cache-hit != 'true'
       run: |
         cd /tmp
         wget --no-verbose ${HPCX_LINK}
-        tar xjf hpcx-v2.22.1-gcc-doca_ofed-ubuntu22.04-cuda12-x86_64.tbz
-        mv hpcx-v2.22.1-gcc-doca_ofed-ubuntu22.04-cuda12-x86_64 hpcx
-    - uses: actions/checkout@v4
+        tar xjf hpcx-v2.25.1-gcc-doca_ofed-ubuntu24.04-cuda13-x86_64.tbz
+        mv hpcx-v2.25.1-gcc-doca_ofed-ubuntu24.04-cuda13-x86_64 hpcx
     - name: Build UCC
       run: |
         ./autogen.sh
diff --git a/.github/workflows/clang-tidy-rocm.yaml b/.github/workflows/clang-tidy-rocm.yaml
index 6be0d4074b..74c96ef16f 100644
--- a/.github/workflows/clang-tidy-rocm.yaml
+++ b/.github/workflows/clang-tidy-rocm.yaml
@@ -1,9 +1,9 @@
-name: Linter-ROCM
+name: Lint (ROCm)
 
 on: [push, pull_request]
 
 concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  group: lint-rocm-${{ github.event.pull_request.number || github.ref }}
   cancel-in-progress: true
 
 env:
@@ -16,6 +16,7 @@ env:
 jobs:
   clang-tidy:
     runs-on: ubuntu-22.04
+    name: clang-tidy (ROCm)
     steps:
     - name: Install dependencies
       run: |
@@ -40,14 +41,24 @@ jobs:
           bear \
           rocm-hip-sdk
         sudo ln -sf /opt/rocm-${ROCM_VER} /opt/rocm
+    - uses: actions/checkout@v6  # must precede the cache step: hashFiles() yields '' on an empty workspace
+    - name: Cache UCX
+      id: cache-ucx
+      uses: actions/cache@v4
+      with:
+        path: /tmp/ucx/install
+        key: ucx-rocm-${{ env.OPEN_UCX_BRANCH }}-${{ env.ROCM_VER }}-${{ hashFiles('.github/workflows/clang-tidy-rocm.yaml') }}
     - name: Get UCX
-      run: git clone ${OPEN_UCX_LINK} -b ${OPEN_UCX_BRANCH} /tmp/ucx
+      if: steps.cache-ucx.outputs.cache-hit != 'true'
+      run: |
+        rm -rf /tmp/ucx
+        git clone --depth 1 ${OPEN_UCX_LINK} -b ${OPEN_UCX_BRANCH} /tmp/ucx
     - name: Build UCX
+      if: steps.cache-ucx.outputs.cache-hit != 'true'
       run: |
         cd /tmp/ucx && ./autogen.sh
         CC=gcc CXX=g++ ./contrib/configure-release --without-java --without-go --disable-numa --prefix $PWD/install --with-rocm=/opt/rocm
         make -j install
-    - uses: actions/checkout@v4
     - name: Build UCC
       run: |
         ./autogen.sh
diff --git a/.github/workflows/clang-tidy.yaml b/.github/workflows/clang-tidy.yaml
index da6bb9cabe..8b5af028b9 100644
--- a/.github/workflows/clang-tidy.yaml
+++ b/.github/workflows/clang-tidy.yaml
@@ -1,9 +1,9 @@
-name: Linter
+name: Lint (CPU)
 
 on: [push, pull_request]
 
 concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  group: lint-cpu-${{ github.event.pull_request.number || github.ref }}
   cancel-in-progress: true
 
 env:
@@ -12,7 +12,8 @@ env:
   CLANG_VER: 17
 jobs:
   clang-tidy:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-24.04
+    name: clang-tidy
     steps:
     - name: Install dependencies
       run: |
@@ -21,17 +22,27 @@ jobs:
         # Setup LLVM repository
         sudo mkdir -p /etc/apt/keyrings
         wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo gpg --dearmor -o /etc/apt/keyrings/llvm.gpg
-        echo "deb [signed-by=/etc/apt/keyrings/llvm.gpg] http://apt.llvm.org/jammy/ llvm-toolchain-jammy-${CLANG_VER} main" | sudo tee /etc/apt/sources.list.d/llvm.list
+        echo "deb [signed-by=/etc/apt/keyrings/llvm.gpg] http://apt.llvm.org/noble/ llvm-toolchain-noble-${CLANG_VER} main" | sudo tee /etc/apt/sources.list.d/llvm.list
         sudo apt-get update
         sudo apt-get install -y --no-install-recommends clang-tidy-${CLANG_VER} bear
+    - uses: actions/checkout@v6  # must precede the cache step: hashFiles() yields '' on an empty workspace
+    - name: Cache UCX
+      id: cache-ucx
+      uses: actions/cache@v4
+      with:
+        path: /tmp/ucx/install
+        key: ucx-tidy-${{ env.OPEN_UCX_BRANCH }}-${{ hashFiles('.github/workflows/clang-tidy.yaml') }}
     - name: Get UCX
-      run: git clone ${OPEN_UCX_LINK} -b ${OPEN_UCX_BRANCH} /tmp/ucx
+      if: steps.cache-ucx.outputs.cache-hit != 'true'
+      run: |
+        rm -rf /tmp/ucx
+        git clone --depth 1 ${OPEN_UCX_LINK} -b ${OPEN_UCX_BRANCH} /tmp/ucx
     - name: Build UCX
+      if: steps.cache-ucx.outputs.cache-hit != 'true'
       run: |
         cd /tmp/ucx && ./autogen.sh
         CC=clang-${CLANG_VER} CXX=clang++-${CLANG_VER} ./contrib/configure-release --without-java --without-go --disable-numa --prefix $PWD/install
         make -j install
-    - uses: actions/checkout@v4
     - name: Build UCC
       run: |
         ./autogen.sh
diff --git a/.github/workflows/codestyle.yaml b/.github/workflows/codestyle.yaml
index 5d51b6ee9e..1d500d5f75 100644
--- a/.github/workflows/codestyle.yaml
+++ b/.github/workflows/codestyle.yaml
@@ -1,10 +1,9 @@
-name: Codestyle
+name: Lint (codestyle)
 
 on: [pull_request]
 
-# Cancel in-progress runs for the same PR
 concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  group: lint-codestyle-${{ github.event.pull_request.number || github.ref }}
   cancel-in-progress: true
 
 env:
@@ -13,8 +12,8 @@ env:
 
 jobs:
   check-codestyle:
-    runs-on: ubuntu-22.04
-    name: Check code style
+    runs-on: ubuntu-24.04
+    name: codestyle
     defaults:
       run:
         shell: bash
@@ -27,7 +26,7 @@ jobs:
         chmod +x ./git-clang-format
         sudo mv ./git-clang-format /usr/bin/git-clang-format
     - name: Checking out repository
-      uses: actions/checkout@v4
+      uses: actions/checkout@v6
       with:
         ref: ${{ github.event.pull_request.head.sha }}
         fetch-depth: 0
diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml
index 0d57adedb7..bc8a0720c5 100644
--- a/.github/workflows/docs.yaml
+++ b/.github/workflows/docs.yaml
@@ -14,14 +14,14 @@ jobs:
       run: |
         sudo apt-get update
         sudo apt-get install -y --no-install-recommends doxygen doxygen-latex cm-super texlive-fonts-recommended
-    - uses: actions/checkout@v4
+    - uses: actions/checkout@v6
     - name: Build UCC docs
       run: |
         ./autogen.sh
         ./configure --with-docs-only
         make docs
     - name: Upload docs
-      uses: actions/upload-artifact@v4
+      uses: actions/upload-artifact@v7
       with:
         name: docs
         path: ${{ github.workspace }}/docs/doxygen-doc/ucc.pdf
diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
index c26d0d0971..6791c3f0e9 100644
--- a/.github/workflows/main.yaml
+++ b/.github/workflows/main.yaml
@@ -1,4 +1,4 @@
-name: OpenMPI tests
+name: Build & Test
 
 on: [push, pull_request]
 
@@ -14,43 +14,109 @@ env:
   IMB_LINK: https://github.com/intel/mpi-benchmarks.git
   IMB_COLLS: allgather,allgatherv,allreduce,alltoall,alltoallv,barrier,bcast,gather,gatherv,reduce,reduce_scatter,reduce_scatter_block,scatter,scatterv
 jobs:
-  tests:
+  build:
     runs-on: ubuntu-latest
     steps:
-    - name: Install dependencies
-      run: |
-        sudo apt-get update
-        sudo apt-get install -y --no-install-recommends doxygen doxygen-latex
+    - uses: actions/checkout@v6  # must precede the cache step: hashFiles() yields '' on an empty workspace
+    - name: Cache UCX
+      id: cache-ucx
+      uses: actions/cache@v4
+      with:
+        path: /tmp/ucx/install
+        key: ucx-release-${{ env.OPEN_UCX_BRANCH }}-${{ hashFiles('.github/workflows/main.yaml') }}
     - name: Get UCX
-      run: git clone ${OPEN_UCX_LINK} -b ${OPEN_UCX_BRANCH} /tmp/ucx
+      if: steps.cache-ucx.outputs.cache-hit != 'true'
+      run: |
+        rm -rf /tmp/ucx
+        git clone --depth 1 ${OPEN_UCX_LINK} -b ${OPEN_UCX_BRANCH} /tmp/ucx
     - name: Build UCX
+      if: steps.cache-ucx.outputs.cache-hit != 'true'
       run: |
         cd /tmp/ucx && ./autogen.sh
-        ./contrib/configure-release --without-java --without-go --disable-numa --prefix $PWD/install
+        ./contrib/configure-release --without-java --without-go --disable-numa --prefix /tmp/ucx/install
         make -j install
-    - uses: actions/checkout@v4
     - name: Build UCC
       run: |
         ./autogen.sh
         ./configure --prefix=/tmp/ucc/install --enable-gtest --with-ucx=/tmp/ucx/install
         make -j`nproc` install
-        make gtest
+        cp test/gtest/gtest /tmp/ucc/install/bin/gtest
+    - name: Upload build artifacts
+      uses: actions/upload-artifact@v7
+      with:
+        name: ucc-build
+        path: |
+          /tmp/ucx/install
+          /tmp/ucc/install
+        retention-days: 1
+
+  gtest:
+    runs-on: ubuntu-latest
+    needs: build
+    strategy:
+      fail-fast: false
+      matrix:
+        shard: [0, 1]
+    name: test (shard ${{ matrix.shard }})
+    steps:
+    - uses: actions/checkout@v6
+    - name: Download build artifacts
+      uses: actions/download-artifact@v7
+      with:
+        name: ucc-build
+        path: /tmp
+    - name: Restore artifact permissions
+      uses: ./.github/actions/restore-artifacts
+    - name: Run UCC gtest (shard ${{ matrix.shard }})
+      env:
+        GTEST_TOTAL_SHARDS: 2
+        GTEST_SHARD_INDEX: ${{ matrix.shard }}
+        LD_LIBRARY_PATH: /tmp/ucc/install/lib:/tmp/ucx/install/lib
+      run: /tmp/ucc/install/bin/gtest
     - name: Run ucc_info
+      if: matrix.shard == 0
       run: |
+        export LD_LIBRARY_PATH=/tmp/ucc/install/lib:/tmp/ucx/install/lib
         /tmp/ucc/install/bin/ucc_info -vc
     - name: Run CMake tests
+      if: matrix.shard == 0
       run: |
         set -e
+        export LD_LIBRARY_PATH=/tmp/ucc/install/lib:/tmp/ucx/install/lib
         cmake -S test/cmake -B /tmp/cmake-ucc -DCMAKE_PREFIX_PATH=/tmp/ucc/install
         cd /tmp/cmake-ucc
         cmake --build .
         ./test_ucc
+
+  ompi-build:
+    runs-on: ubuntu-latest
+    name: OMPI build
+    needs: build
+    steps:
+    # Checkout is required here: the OMPI cache key uses hashFiles(), which
+    - uses: actions/checkout@v6  # returns '' when the workspace has no files
+    - name: Download build artifacts
+      uses: actions/download-artifact@v7
+      with:
+        name: ucc-build
+        path: /tmp
+    - name: Restore artifact permissions
+      uses: ./.github/actions/restore-artifacts
+    - name: Cache OMPI
+      id: cache-ompi
+      uses: actions/cache@v4
+      with:
+        path: /tmp/ompi/install
+        key: ompi-${{ env.OPEN_MPI_BRANCH }}-${{ hashFiles('.github/workflows/main.yaml', 'configure.ac') }}
     - name: Get OMPI
+      if: steps.cache-ompi.outputs.cache-hit != 'true'
       run: |
-         git clone ${OPEN_MPI_LINK} -b ${OPEN_MPI_BRANCH} /tmp/ompi
-         cd /tmp/ompi
-         git submodule update --init --recursive
+        rm -rf /tmp/ompi
+        git clone --depth 1 ${OPEN_MPI_LINK} -b ${OPEN_MPI_BRANCH} /tmp/ompi
+        cd /tmp/ompi
+        git submodule update --init --recursive --depth 1
     - name: Build OMPI
+      if: steps.cache-ompi.outputs.cache-hit != 'true'
       run: >
         cd /tmp/ompi
 
@@ -65,75 +131,120 @@ jobs:
         --with-ucc=/tmp/ucc/install
 
         make -j install
+    - name: Upload OMPI build artifacts
+      uses: actions/upload-artifact@v7
+      with:
+        name: ompi-build
+        path: |
+          /tmp/ucx/install
+          /tmp/ucc/install
+          /tmp/ompi/install
+        retention-days: 1
+
+  ompi-perftest:
+    runs-on: ubuntu-latest
+    name: OMPI perftest
+    needs: ompi-build
+    steps:
+    - uses: actions/checkout@v6
+    - name: Download OMPI build artifacts
+      uses: actions/download-artifact@v7
+      with:
+        name: ompi-build
+        path: /tmp
+    - name: Restore artifact permissions
+      uses: ./.github/actions/restore-artifacts
+      with:
+        ompi: 'true'
     - name: Build ucc_perftest (with OMPI)
       run: |
         set -e
+        ./autogen.sh
         CC=/tmp/ompi/install/bin/mpicc CXX=/tmp/ompi/install/bin/mpicxx \
-          ./configure --prefix=/tmp/ucc/install --enable-gtest --with-ucx=/tmp/ucx/install --with-mpi=/tmp/ompi/install
-        make -C tools/perf -j`nproc`
-        make -C tools/perf install
+          ./configure --prefix=/tmp/ucc/install --with-ucx=/tmp/ucx/install --with-mpi=/tmp/ompi/install
+        make -j`nproc` install
     - name: Run ucc_perftest
       run: |
         set -e
         test -x /tmp/ucc/install/bin/ucc_perftest
         export LD_LIBRARY_PATH=/tmp/ucc/install/lib:/tmp/ucx/install/lib:/tmp/ompi/install/lib:$LD_LIBRARY_PATH
+        MPI_ARGS=(
+          -np 4 -H localhost:4
+          --bind-to none
+          --mca pml ucx
+          --mca pml_ucx_tls any
+          --mca pml_ucx_devices any
+          --mca coll_ucc_enable 0
+          -x LD_LIBRARY_PATH
+          -x UCC_LOG_LEVEL=info
+          -x UCC_TLS=ucp
+          -x UCC_CONFIG_FILE=
+        )
+        PERFTEST_ARGS=(-m host -d float32 -b 1024 -e 1024 -n 10 -w 2)
         COLLS=(
           allgather allgatherv allreduce alltoall alltoallv
           barrier bcast gather gatherv
           reduce reduce_scatter reduce_scatterv scatterv
         )
-        for c in "${COLLS[@]}"; do
-          echo "Running ucc_perftest -c ${c}"
-          /tmp/ompi/install/bin/mpirun \
-            -np 4 -H localhost:4 \
-            --bind-to none \
-            --mca pml ucx \
-            --mca pml_ucx_tls any \
-            --mca pml_ucx_devices any \
-            --mca coll_ucc_enable 0 \
-            -x LD_LIBRARY_PATH \
-            -x UCC_LOG_LEVEL=info \
-            -x UCC_TLS=ucp \
-            -x UCC_CONFIG_FILE= \
-            /tmp/ucc/install/bin/ucc_perftest \
-              -c "${c}" -m host -d float32 -b 1024 -e 1024 -n 10 -w 2
-        done
         INPLACE_COLLS=(
           allgather allgatherv allreduce
           gather gatherv
           reduce reduce_scatter reduce_scatterv
         )
+        for c in "${COLLS[@]}"; do
+          echo "Running ucc_perftest -c ${c}"
+          /tmp/ompi/install/bin/mpirun "${MPI_ARGS[@]}" \
+            /tmp/ucc/install/bin/ucc_perftest -c "${c}" "${PERFTEST_ARGS[@]}"
+        done
         for c in "${INPLACE_COLLS[@]}"; do
           echo "Running ucc_perftest (inplace) -c ${c}"
-          /tmp/ompi/install/bin/mpirun \
-            -np 4 -H localhost:4 \
-            --bind-to none \
-            --mca pml ucx \
-            --mca pml_ucx_tls any \
-            --mca pml_ucx_devices any \
-            --mca coll_ucc_enable 0 \
-            -x LD_LIBRARY_PATH \
-            -x UCC_LOG_LEVEL=info \
-            -x UCC_TLS=ucp \
-            -x UCC_CONFIG_FILE= \
-            /tmp/ucc/install/bin/ucc_perftest \
-              -c "${c}" -m host -d float32 -b 1024 -e 1024 -n 10 -w 2 -i
+          /tmp/ompi/install/bin/mpirun "${MPI_ARGS[@]}" \
+            /tmp/ucc/install/bin/ucc_perftest -c "${c}" "${PERFTEST_ARGS[@]}" -i
         done
+
+  ompi-imb:
+    runs-on: ubuntu-latest
+    name: OMPI IMB
+    needs: ompi-build
+    steps:
+    - uses: actions/checkout@v6
+    - name: Download OMPI build artifacts
+      uses: actions/download-artifact@v7
+      with:
+        name: ompi-build
+        path: /tmp
+    - name: Restore artifact permissions
+      uses: ./.github/actions/restore-artifacts
+      with:
+        ompi: 'true'
+    - name: Cache IMB
+      id: cache-imb
+      uses: actions/cache@v4
+      with:
+        path: /tmp/imb
+        key: imb-${{ hashFiles('.github/workflows/main.yaml') }}
     - name: Get IMB
-      run: git clone ${IMB_LINK} /tmp/imb
+      if: steps.cache-imb.outputs.cache-hit != 'true'
+      run: |
+        rm -rf /tmp/imb
+        git clone --depth 1 ${IMB_LINK} /tmp/imb
     - name: Build IMB
+      if: steps.cache-imb.outputs.cache-hit != 'true'
       run: |
-         cd /tmp/imb
-         make CC=/tmp/ompi/install/bin/mpicc CXX=/tmp/ompi/install/bin/mpicxx CPPFLAGS="-DCHECK=1" -j IMB-MPI1
+        cd /tmp/imb
+        make CC=/tmp/ompi/install/bin/mpicc CXX=/tmp/ompi/install/bin/mpicxx CPPFLAGS="-DCHECK=1" -j IMB-MPI1
     - name: Run IMB-DCHECK
+      env:
+        LD_LIBRARY_PATH: /tmp/ucc/install/lib:/tmp/ucx/install/lib:/tmp/ompi/install/lib
       run: >
-         /tmp/ompi/install/bin/mpirun
-         -np 8
-         -H localhost:8
-         --bind-to none
-         --mca pml ucx
-         --mca pml_ucx_tls any
-         --mca pml_ucx_devices any
-         --mca coll_ucc_priority 100
-         --mca coll_ucc_enable 1
-         /tmp/imb/IMB-MPI1 ${IMB_COLLS} -iter 10 -iter_policy off
+        /tmp/ompi/install/bin/mpirun
+        -np 8
+        -H localhost:8
+        --bind-to none
+        --mca pml ucx
+        --mca pml_ucx_tls any
+        --mca pml_ucx_devices any
+        --mca coll_ucc_priority 100
+        --mca coll_ucc_enable 1
+        -x LD_LIBRARY_PATH
+        /tmp/imb/IMB-MPI1 ${IMB_COLLS} -iter 10 -iter_policy off