Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 11 additions & 23 deletions .ci/scripts/build_ucc.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,28 +7,8 @@ export UCC_ENABLE_GTEST=${UCC_ENABLE_GTEST:-yes}
export UCC_ENABLE_NVLS=${UCC_ENABLE_NVLS:-no}
export UCC_BUILD_TLS=${UCC_BUILD_TLS:-cuda,nccl,self,sharp,shm,ucp,mlx5}

# In containers, calculate based on memory limits to avoid OOM
# Determine number of parallel build jobs based on available system memory if running inside a container/Kubernetes
if [ -f /.dockerenv ] || [ -f /run/.containerenv ] || [ -n "${KUBERNETES_SERVICE_HOST}" ]; then
# Prefer cgroupv1 path, fall back to cgroupv2 or static default if not found
if [ -f /sys/fs/cgroup/memory/memory.limit_in_bytes ]; then
limit=$(cat /sys/fs/cgroup/memory/memory.limit_in_bytes)
elif [ -f /sys/fs/cgroup/memory.max ]; then
limit=$(cat /sys/fs/cgroup/memory.max)
# If cgroupv2 limit is "max", meaning unlimited, set to 4GB to avoid OOM
[ "$limit" = "max" ] && limit=$((4 * 1024 * 1024 * 1024))
else
# Default to 4GB if no limit is found
limit=$((4 * 1024 * 1024 * 1024))
fi

# Use 1 build process per GB of memory, clamp in [1,16]
nproc=$((limit / (1024 * 1024 * 1024)))
[ "$nproc" -gt 16 ] && nproc=16
[ "$nproc" -lt 1 ] && nproc=1
else
nproc=$(nproc --all)
fi
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd -P)"
. "${SCRIPT_DIR}/common.sh"

echo "INFO: Build UCC"
UCC_SRC_DIR="${SRC_DIR}/ucc"
Expand Down Expand Up @@ -56,7 +36,15 @@ fi

echo "INFO: Configure flags: ${CONFIGURE_FLAGS}"
eval "${UCC_SRC_DIR}/configure ${CONFIGURE_FLAGS}"
make "-j${nproc}" install

# Skip libtool relinking during install: the relink produces identical RUNPATH
# and adds ~5s of pure overhead per build.
if ! grep -q 'need_relink=yes' libtool; then
echo "WARN: libtool relinking patch had no effect (need_relink=yes not found)"
fi
sed -i 's/need_relink=yes/need_relink=no/g' libtool

make "-j${NPROC}" install
echo "${UCC_INSTALL_DIR}/lib" > /etc/ld.so.conf.d/ucc.conf
ldconfig
ldconfig -p | grep -i libucc
5 changes: 4 additions & 1 deletion .ci/scripts/build_ucx.sh
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
#!/bin/bash -eEx
set -o pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd -P)"
. "${SCRIPT_DIR}/common.sh"

echo "INFO: Build UCX"
cd "${SRC_DIR}/ucx"
"${SRC_DIR}/ucx/autogen.sh"
mkdir -p "${SRC_DIR}/ucx/build-${UCX_BUILD_TYPE}"
cd "${SRC_DIR}/ucx/build-${UCX_BUILD_TYPE}"
"${SRC_DIR}/ucx/contrib/configure-release-mt" --with-cuda="${CUDA_HOME}" --prefix="${UCX_INSTALL_DIR}"
make -j install
make "-j${NPROC}" install
echo "${UCX_INSTALL_DIR}/lib" > /etc/ld.so.conf.d/ucx.conf
ldconfig
ldconfig -p | grep -i ucx
19 changes: 8 additions & 11 deletions .ci/scripts/common.sh
Original file line number Diff line number Diff line change
@@ -1,22 +1,19 @@
# In containers, calculate based on memory limits to avoid OOM
# Determine number of parallel build jobs based on available system memory if running inside a container/Kubernetes
# Determine number of parallel build jobs based on available system memory.
# In containers/Kubernetes, use cgroup memory limits to avoid OOM.
if [ -f /.dockerenv ] || [ -f /run/.containerenv ] || [ -n "${KUBERNETES_SERVICE_HOST}" ]; then
# Prefer cgroupv1 path, fall back to cgroupv2 or static default if not found
if [ -f /sys/fs/cgroup/memory/memory.limit_in_bytes ]; then
limit=$(cat /sys/fs/cgroup/memory/memory.limit_in_bytes)
elif [ -f /sys/fs/cgroup/memory.max ]; then
limit=$(cat /sys/fs/cgroup/memory.max)
# If cgroupv2 limit is "max", meaning unlimited, set to 4GB to avoid OOM
[ "$limit" = "max" ] && limit=$((4 * 1024 * 1024 * 1024))
else
# Default to 4GB if no limit is found
limit=$((4 * 1024 * 1024 * 1024))
fi

# Use 1 build process per GB of memory, clamp in [1,16]
nproc=$((limit / (1024 * 1024 * 1024)))
[ "$nproc" -gt 16 ] && nproc=16
[ "$nproc" -lt 1 ] && nproc=1
export NPROC=$nproc
NPROC=$((limit / (1024 * 1024 * 1024)))
[ "$NPROC" -gt 16 ] && NPROC=16
[ "$NPROC" -lt 1 ] && NPROC=1
else
NPROC=$(nproc --all)
fi
export NPROC

28 changes: 28 additions & 0 deletions .github/actions/restore-artifacts/action.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
name: Restore artifact permissions
description: >
Download-artifact strips execute bits. This action restores them for the
standard UCC/UCX/OMPI install trees used across CI jobs.
inputs:
ucx:
description: Restore UCX install permissions
default: 'true'
ucc:
description: Restore UCC install permissions
default: 'true'
ompi:
description: Restore OMPI install permissions
default: 'false'
runs:
using: composite
steps:
- shell: bash
run: |
if [ "${{ inputs.ucx }}" = "true" ]; then
chmod -R +x /tmp/ucx/install/bin /tmp/ucx/install/lib 2>/dev/null || true
fi
if [ "${{ inputs.ucc }}" = "true" ]; then
chmod -R +x /tmp/ucc/install/bin /tmp/ucc/install/lib 2>/dev/null || true
fi
if [ "${{ inputs.ompi }}" = "true" ]; then
chmod -R +x /tmp/ompi/install/bin /tmp/ompi/install/lib 2>/dev/null || true
fi
72 changes: 60 additions & 12 deletions .github/workflows/asan-test.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: ASAN Tests
name: ASAN

on: [push, pull_request]

Expand All @@ -12,32 +12,80 @@ env:
CLANG_VER: 17

jobs:
gtest-asan:
runs-on: ubuntu-22.04
build:
runs-on: ubuntu-24.04
steps:
- name: Install dependencies
run: |
sudo apt-get update
sudo apt-get install -y --no-install-recommends wget gpg
# Setup LLVM repository
sudo mkdir -p /etc/apt/keyrings
wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo gpg --dearmor -o /etc/apt/keyrings/llvm.gpg
echo "deb [signed-by=/etc/apt/keyrings/llvm.gpg] http://apt.llvm.org/jammy/ llvm-toolchain-jammy-${CLANG_VER} main" | sudo tee /etc/apt/sources.list.d/llvm.list
echo "deb [signed-by=/etc/apt/keyrings/llvm.gpg] http://apt.llvm.org/noble/ llvm-toolchain-noble-${CLANG_VER} main" | sudo tee /etc/apt/sources.list.d/llvm.list
sudo apt-get update
sudo apt-get install -y --no-install-recommends clang-${CLANG_VER} clang++-${CLANG_VER} libclang-rt-${CLANG_VER}-dev
- name: Cache UCX
id: cache-ucx
uses: actions/cache@v4
with:
path: /tmp/ucx/install
key: ucx-asan-${{ env.OPEN_UCX_BRANCH }}-${{ hashFiles('.github/workflows/asan-test.yaml') }}
- name: Get UCX
run: git clone ${OPEN_UCX_LINK} -b ${OPEN_UCX_BRANCH} /tmp/ucx
if: steps.cache-ucx.outputs.cache-hit != 'true'
run: |
rm -rf /tmp/ucx
git clone --depth 1 ${OPEN_UCX_LINK} -b ${OPEN_UCX_BRANCH} /tmp/ucx
- name: Build UCX
if: steps.cache-ucx.outputs.cache-hit != 'true'
run: |
cd /tmp/ucx && ./autogen.sh
CC=clang-${CLANG_VER} CXX=clang++-${CLANG_VER} ./contrib/configure-release --without-java --without-go --disable-numa --prefix $PWD/install
CC=clang-${CLANG_VER} CXX=clang++-${CLANG_VER} ./contrib/configure-release --without-java --without-go --disable-numa --prefix /tmp/ucx/install
make -j install
- uses: actions/checkout@v4
- name: Run gtest ASAN
- uses: actions/checkout@v6
- name: Build UCC (ASAN)
env:
LD_LIBRARY_PATH: /tmp/ucx/install/lib
run: |
export ASAN_OPTIONS=fast_unwind_on_malloc=0:detect_leaks=1:print_suppressions=0
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

these options were added by @MamziB to get better backtrace report from asan

export LSAN_OPTIONS=report_objects=1
./autogen.sh
CFLAGS="-fsanitize=address" CC=clang-${CLANG_VER} CXX=clang++-${CLANG_VER} ./configure --prefix=/tmp/ucc/install --with-ucx=/tmp/ucx/install --enable-gtest
make -j install
./test/gtest/gtest
cp test/gtest/gtest /tmp/ucc/install/bin/gtest
- name: Bundle clang runtime for test shards
run: |
CLANG_RT_DIR=$(dirname $(clang-${CLANG_VER} -print-file-name=libclang_rt.asan-x86_64.so))
cp -a ${CLANG_RT_DIR} /tmp/clang-rt
- name: Upload build artifacts
uses: actions/upload-artifact@v7
with:
name: ucc-asan-build
path: |
/tmp/ucx/install
/tmp/ucc/install
/tmp/clang-rt
retention-days: 1

gtest-asan:
runs-on: ubuntu-24.04
needs: build
strategy:
fail-fast: false
matrix:
shard: [0, 1, 2, 3]
name: ASAN (shard ${{ matrix.shard }})
steps:
- uses: actions/checkout@v6
- name: Download build artifacts
uses: actions/download-artifact@v7
with:
name: ucc-asan-build
path: /tmp
- name: Restore artifact permissions
uses: ./.github/actions/restore-artifacts
- name: Run UCC gtest (ASAN, shard ${{ matrix.shard }})
env:
GTEST_TOTAL_SHARDS: 4
GTEST_SHARD_INDEX: ${{ matrix.shard }}
ASAN_OPTIONS: fast_unwind_on_malloc=1:detect_leaks=1:print_suppressions=0
LSAN_OPTIONS: report_objects=1
LD_LIBRARY_PATH: /tmp/ucc/install/lib:/tmp/ucx/install/lib:/tmp/clang-rt
run: /tmp/ucc/install/bin/gtest
2 changes: 1 addition & 1 deletion .github/workflows/blossom-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v2
uses: actions/checkout@v6
with:
repository: ${{ fromJson(needs.Authorization.outputs.args).repo }}
ref: ${{ fromJson(needs.Authorization.outputs.args).ref }}
Expand Down
58 changes: 42 additions & 16 deletions .github/workflows/clang-tidy-nvidia.yaml
Original file line number Diff line number Diff line change
@@ -1,23 +1,24 @@
name: Linter-NVIDIA
name: Lint (CUDA)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

previous name was correct, because we were checking not only CUDA builds but also Nvidia networking


on: [push, pull_request]

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
group: lint-cuda-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true

env:
OPEN_UCX_LINK: https://github.com/openucx/ucx
OPEN_UCX_BRANCH: master
HPCX_LINK: https://content.mellanox.com/hpc/hpc-x/v2.22.1rc4/hpcx-v2.22.1-gcc-doca_ofed-ubuntu22.04-cuda12-x86_64.tbz
HPCX_LINK: https://content.mellanox.com/hpc/hpc-x/v2.25.1_cuda13/hpcx-v2.25.1-gcc-doca_ofed-ubuntu24.04-cuda13-x86_64.tbz
CLANG_VER: 17
MLNX_OFED_VER: 24.10-2.1.8.0
CUDA_VER: 12-8
MLNX_OFED_VER: 24.10-4.1.4.0
CUDA_VER: 13-1
LIBRARY_PATH: /tmp/ucx/install/lib
LD_LIBRARY_PATH: /tmp/ucx/install/lib
jobs:
clang-tidy:
runs-on: ubuntu-22.04
runs-on: ubuntu-24.04
name: clang-tidy (CUDA)
steps:
- name: Install dependencies
run: |
Expand All @@ -26,35 +27,60 @@ jobs:
# Setup LLVM repository
sudo mkdir -p /etc/apt/keyrings
wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo gpg --dearmor -o /etc/apt/keyrings/llvm.gpg
echo "deb [signed-by=/etc/apt/keyrings/llvm.gpg] http://apt.llvm.org/jammy/ llvm-toolchain-jammy-${CLANG_VER} main" | sudo tee /etc/apt/sources.list.d/llvm.list
echo "deb [signed-by=/etc/apt/keyrings/llvm.gpg] http://apt.llvm.org/noble/ llvm-toolchain-noble-${CLANG_VER} main" | sudo tee /etc/apt/sources.list.d/llvm.list
sudo apt-get update
sudo apt-get install -y --no-install-recommends clang-tidy-${CLANG_VER} bear clang-${CLANG_VER} clang++-${CLANG_VER}
- name: Cache MLNX_OFED tarball
id: cache-ofed
uses: actions/cache@v4
with:
path: MLNX_OFED_LINUX-${{ env.MLNX_OFED_VER }}-ubuntu24.04-x86_64.tgz
key: mlnx-ofed-${{ env.MLNX_OFED_VER }}-ubuntu24.04
- name: Download MLNX_OFED
if: steps.cache-ofed.outputs.cache-hit != 'true'
run: wget --no-verbose http://content.mellanox.com/ofed/MLNX_OFED-${MLNX_OFED_VER}/MLNX_OFED_LINUX-${MLNX_OFED_VER}-ubuntu24.04-x86_64.tgz
- name: Install extra rdma dependencies
run: |
wget --no-verbose http://content.mellanox.com/ofed/MLNX_OFED-${MLNX_OFED_VER}/MLNX_OFED_LINUX-${MLNX_OFED_VER}-ubuntu22.04-x86_64.tgz
sudo tar -xvzf MLNX_OFED_LINUX-${MLNX_OFED_VER}-ubuntu22.04-x86_64.tgz
sudo chmod -R a+rwx MLNX_OFED_LINUX-${MLNX_OFED_VER}-ubuntu22.04-x86_64
sudo MLNX_OFED_LINUX-${MLNX_OFED_VER}-ubuntu22.04-x86_64/mlnxofedinstall --skip-unsupported-devices-check --user-space-only --without-fw-update --force --basic -vvv
sudo tar -xzf MLNX_OFED_LINUX-${MLNX_OFED_VER}-ubuntu24.04-x86_64.tgz
sudo chmod -R a+rwx MLNX_OFED_LINUX-${MLNX_OFED_VER}-ubuntu24.04-x86_64
sudo MLNX_OFED_LINUX-${MLNX_OFED_VER}-ubuntu24.04-x86_64/mlnxofedinstall --skip-unsupported-devices-check --user-space-only --without-fw-update --force --basic -v
- name: Install extra cuda dependencies
run: |
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update
sudo apt-get install -y --no-install-recommends cuda-cudart-dev-${CUDA_VER} cuda-nvcc-${CUDA_VER} cuda-nvml-dev-${CUDA_VER}
- name: Cache UCX
id: cache-ucx
uses: actions/cache@v4
with:
path: /tmp/ucx/install
key: ucx-nvidia-${{ env.OPEN_UCX_BRANCH }}-${{ hashFiles('.github/workflows/clang-tidy-nvidia.yaml') }}
- name: Get UCX
run: git clone ${OPEN_UCX_LINK} -b ${OPEN_UCX_BRANCH} /tmp/ucx
if: steps.cache-ucx.outputs.cache-hit != 'true'
run: |
rm -rf /tmp/ucx
git clone --depth 1 ${OPEN_UCX_LINK} -b ${OPEN_UCX_BRANCH} /tmp/ucx
- name: Build UCX
if: steps.cache-ucx.outputs.cache-hit != 'true'
run: |
cd /tmp/ucx && ./autogen.sh
CC=clang-${CLANG_VER} CXX=clang++-${CLANG_VER} ./contrib/configure-release --without-java --without-go --disable-numa --prefix $PWD/install
make -j install
- name: Cache HPCX
id: cache-hpcx
uses: actions/cache@v4
with:
path: /tmp/hpcx
key: hpcx-v2.25.1-${{ hashFiles('.github/workflows/clang-tidy-nvidia.yaml') }}
- name: Download HPCX
if: steps.cache-hpcx.outputs.cache-hit != 'true'
run: |
cd /tmp
wget --no-verbose ${HPCX_LINK}
tar xjf hpcx-v2.22.1-gcc-doca_ofed-ubuntu22.04-cuda12-x86_64.tbz
mv hpcx-v2.22.1-gcc-doca_ofed-ubuntu22.04-cuda12-x86_64 hpcx
- uses: actions/checkout@v4
tar xjf hpcx-v2.25.1-gcc-doca_ofed-ubuntu24.04-cuda13-x86_64.tbz
mv hpcx-v2.25.1-gcc-doca_ofed-ubuntu24.04-cuda13-x86_64 hpcx
- uses: actions/checkout@v6
- name: Build UCC
run: |
./autogen.sh
Expand Down
19 changes: 15 additions & 4 deletions .github/workflows/clang-tidy-rocm.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
name: Linter-ROCM
name: Lint (ROCm)

on: [push, pull_request]

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
group: lint-rocm-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true

env:
Expand All @@ -16,6 +16,7 @@ env:
jobs:
clang-tidy:
runs-on: ubuntu-22.04
name: clang-tidy (ROCm)
steps:
- name: Install dependencies
run: |
Expand All @@ -40,14 +41,24 @@ jobs:
bear \
rocm-hip-sdk
sudo ln -sf /opt/rocm-${ROCM_VER} /opt/rocm
- name: Cache UCX
id: cache-ucx
uses: actions/cache@v4
with:
path: /tmp/ucx/install
key: ucx-rocm-${{ env.OPEN_UCX_BRANCH }}-${{ env.ROCM_VER }}-${{ hashFiles('.github/workflows/clang-tidy-rocm.yaml') }}
- name: Get UCX
run: git clone ${OPEN_UCX_LINK} -b ${OPEN_UCX_BRANCH} /tmp/ucx
if: steps.cache-ucx.outputs.cache-hit != 'true'
run: |
rm -rf /tmp/ucx
git clone --depth 1 ${OPEN_UCX_LINK} -b ${OPEN_UCX_BRANCH} /tmp/ucx
- name: Build UCX
if: steps.cache-ucx.outputs.cache-hit != 'true'
run: |
cd /tmp/ucx && ./autogen.sh
CC=gcc CXX=g++ ./contrib/configure-release --without-java --without-go --disable-numa --prefix $PWD/install --with-rocm=/opt/rocm
make -j install
- uses: actions/checkout@v4
- uses: actions/checkout@v6
- name: Build UCC
run: |
./autogen.sh
Expand Down
Loading
Loading