Use the sccache-dist build cluster in CI #479

Draft: wants to merge 26 commits into base branch-25.06.

Commits (26):
631d3f5  Add scripts to configure sccache-dist and use the build cluster in CI (trxcllnt, Apr 12, 2025)
b87e1e7  Merge branch 'branch-25.06' of github.com:rapidsai/devcontainers into… (trxcllnt, Apr 14, 2025)
a72407d  Merge branch 'branch-25.06' into fea/use-sccache-build-cluster (trxcllnt, Apr 16, 2025)
3932245  remove -DCMAKE_POLICY_VERSION_MINIMUM=3.5 (trxcllnt, Apr 16, 2025)
1de075e  temporarily use my cuspatial fork (trxcllnt, Apr 16, 2025)
36ad779  remove cub and thrust namespace mangling flags (trxcllnt, Apr 17, 2025)
8b60e23  cpu8 -> cpu4 (trxcllnt, Apr 17, 2025)
e07b0dd  test shared-workflows fea/devcontainers-job-timeout changes (trxcllnt, Apr 17, 2025)
5c138aa  rename SCCACHE_DIST_SCHEDULER_URL to SCCACHE_DIST_URL (trxcllnt, Apr 17, 2025)
ace1e27  use cpu8 again, clone with --depth 1 (trxcllnt, Apr 17, 2025)
5978e0d  cleanup (trxcllnt, Apr 21, 2025)
7f8b878  Merge branch 'branch-25.06' of github.com:rapidsai/devcontainers into… (trxcllnt, Apr 22, 2025)
fef88ec  Merge branch 'branch-25.06' of github.com:rapidsai/devcontainers into… (trxcllnt, Apr 22, 2025)
cc2a032  remove -l from ninja args and bump feature versions (trxcllnt, Apr 22, 2025)
622a5af  Merge branch 'branch-25.06' of github.com:rapidsai/devcontainers into… (trxcllnt, Apr 23, 2025)
31436cf  switch back to [email protected] (trxcllnt, Apr 30, 2025)
b126fd1  revert cuspatial chnage in manifest.yaml (trxcllnt, May 1, 2025)
3cf4f74  pass -q option to `git branch --remotes -d` (trxcllnt, May 1, 2025)
c86444a  print messages before doing stuff (trxcllnt, May 2, 2025)
0ab02c4  Merge branch 'branch-25.06' of github.com:rapidsai/devcontainers into… (trxcllnt, May 2, 2025)
4f767f4  ensure the gh cli user is refreshed if the build cluster is enabled b… (trxcllnt, May 2, 2025)
7933911  cleanup (trxcllnt, May 2, 2025)
9285ac5  log output (trxcllnt, May 3, 2025)
471307c  be safer when constructing command args (trxcllnt, May 3, 2025)
92d25ad  don't background rapids-generate-scripts (trxcllnt, May 3, 2025)
a34964f  fix typo (trxcllnt, May 3, 2025)
4 changes: 3 additions & 1 deletion .devcontainer/cuda11.8-conda/devcontainer.json
@@ -11,7 +11,9 @@
"runArgs": [
"--rm",
"--name",
"${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.06-cuda11.8-conda"
"${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.06-cuda11.8-conda",
"--ulimit",
"nofile=1048576:1048576"
],
"hostRequirements": {"gpu": "optional"},
"features": {
4 changes: 3 additions & 1 deletion .devcontainer/cuda11.8-pip/devcontainer.json
@@ -11,7 +11,9 @@
"runArgs": [
"--rm",
"--name",
"${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.06-cuda11.8-pip"
"${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.06-cuda11.8-pip",
"--ulimit",
"nofile=1048576:1048576"
],
"hostRequirements": {"gpu": "optional"},
"features": {
4 changes: 3 additions & 1 deletion .devcontainer/cuda12.8-conda/devcontainer.json
@@ -11,7 +11,9 @@
"runArgs": [
"--rm",
"--name",
"${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.06-cuda12.8-conda"
"${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.06-cuda12.8-conda",
"--ulimit",
"nofile=1048576:1048576"
],
"hostRequirements": {"gpu": "optional"},
"features": {
4 changes: 3 additions & 1 deletion .devcontainer/cuda12.8-pip/devcontainer.json
@@ -11,7 +11,9 @@
"runArgs": [
"--rm",
"--name",
"${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.06-cuda12.8-pip"
"${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.06-cuda12.8-pip",
"--ulimit",
"nofile=1048576:1048576"
],
"hostRequirements": {"gpu": "optional"},
"features": {
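The four devcontainer.json changes above all raise the container's open-file limit; very wide ninja invocations can otherwise fail with "Too many open files" (see the ninja note in the workflow below). A minimal sketch of the equivalent manual launch; the image tag here is illustrative, not the project's:

# --ulimit nofile takes soft:hard limits
docker run --rm --ulimit nofile=1048576:1048576 ubuntu:24.04 bash -c 'ulimit -Sn && ulimit -Hn'
# expected output: 1048576 on both lines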
16 changes: 15 additions & 1 deletion .devcontainer/rapids.Dockerfile
@@ -36,6 +36,8 @@ ENV DEFAULT_CONDA_ENV=rapids

FROM ${PYTHON_PACKAGE_MANAGER}-base

ARG TARGETARCH

ARG CUDA
ENV CUDAARCHS="RAPIDS"
ENV CUDA_VERSION="${CUDA_VERSION:-${CUDA}}"
@@ -49,9 +51,21 @@ ENV PYTHONDONTWRITEBYTECODE="1"

ENV SCCACHE_REGION="us-east-2"
ENV SCCACHE_BUCKET="rapids-sccache-devs"
ENV SCCACHE_IDLE_TIMEOUT=900
ENV SCCACHE_DIST_CONNECT_TIMEOUT=30
ENV SCCACHE_DIST_REQUEST_TIMEOUT=1800
ENV SCCACHE_DIST_URL="https://${TARGETARCH}.linux.sccache.gha-runners.nvidia.com"
ENV SCCACHE_IDLE_TIMEOUT=1800
ENV AWS_ROLE_ARN="arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs"

ENV HISTFILE="/home/coder/.cache/._bash_history"

ENV LIBCUDF_KERNEL_CACHE_PATH="/home/coder/cudf/cpp/build/${PYTHON_PACKAGE_MANAGER}/cuda-${CUDA_VERSION}/latest/jitify_cache"

# Prevent the sccache server from shutting down
ENV SCCACHE_IDLE_TIMEOUT=0
ENV SCCACHE_SERVER_LOG="sccache=info"
ENV SCCACHE_S3_KEY_PREFIX=rapids-test-sccache-dist

# Build as much in parallel as possible
ENV INFER_NUM_DEVICE_ARCHITECTURES=1
ENV MAX_DEVICE_OBJ_TO_COMPILE_IN_PARALLEL=20
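Together these variables point the sccache client at a per-architecture scheduler (SCCACHE_DIST_URL interpolates TARGETARCH, so amd64 and arm64 builds hit different endpoints), keep the local server alive (SCCACHE_IDLE_TIMEOUT=0 overrides the earlier 1800), and namespace this experiment's cache objects (SCCACHE_S3_KEY_PREFIX). A hedged way to confirm the client picked them up inside the container; these flags exist in recent sccache releases, but output formats may differ:

sccache --stop-server 2>/dev/null || true;  # restart so the server re-reads the environment
sccache --start-server;
sccache --dist-status;       # prints scheduler status as JSON when dist is configured
sccache --show-adv-stats;    # shows the cache location and distributed-compile counters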
2 changes: 2 additions & 0 deletions .github/actions/build-and-test-feature/action.yml
@@ -8,6 +8,7 @@ inputs:
aws_role_arn: {type: string, default: '', required: false}
rw_sccache_bucket: {type: string, default: '', required: false}
rw_sccache_region: {type: string, default: '', required: false}
sccache_dist_scheduler_url: {type: string, default: '', required: false}

runs:
using: composite
@@ -32,3 +33,4 @@ runs:
aws_role_arn: "${{ inputs.aws_role_arn }}"
rw_sccache_bucket: "${{ inputs.rw_sccache_bucket }}"
rw_sccache_region: "${{ inputs.rw_sccache_region }}"
sccache_dist_scheduler_url: "${{ inputs.sccache_dist_scheduler_url }}"
60 changes: 45 additions & 15 deletions .github/workflows/build-all-rapids-repos.yml
@@ -37,25 +37,55 @@ jobs:
with:
arch: '["amd64", "arm64"]'
cuda: '["12.8"]'
node_type: cpu32
node_type: cpu8
extra-repo-deploy-key: CUMLPRIMS_SSH_PRIVATE_DEPLOY_KEY
rapids-aux-secret-1: GIST_REPO_READ_ORG_GITHUB_TOKEN
timeout-minutes: 720
# Prevent the sccache server from shutting down
# 1. Prevent the sccache server from shutting down
# 2. Infinitely retry transient errors
# 3. Never fallback to locally compiling
env: |
SCCACHE_IDLE_TIMEOUT=0
SCCACHE_DIST_MAX_RETRIES=inf
SCCACHE_DIST_FALLBACK_TO_LOCAL_COMPILE=false
build_command: |
SCCACHE_NO_DAEMON=1 sccache --stop-server
sccache -z;
# Install the latest sccache client
devcontainer-utils-install-sccache --repo trxcllnt/sccache;

# Configure the sccache client to talk to the build cluster
devcontainer-utils-init-sccache-dist \
--enable-sccache-dist - <<< " \
--auth-type 'token' \
--auth-token '$RAPIDS_AUX_SECRET_1' \
";

# Verify sccache cache location
sccache --show-adv-stats;
clone-all -j$(nproc) -v -q --clone-upstream --single-branch --shallow-submodules;
build-all \
-Wno-dev \
-j$(nproc --ignore=1) \
-DBUILD_SHARED_LIBS=ON \
-DBUILD_TESTS=ON \
-DBUILD_BENCHMARKS=ON \
-DBUILD_PRIMS_BENCH=ON \
-DRAFT_COMPILE_LIBRARY=ON \
-DBUILD_CUGRAPH_MG_TESTS=ON \
;

# Clone all the repos
clone-all -j$(nproc) -v -q --clone-upstream --depth 1 --single-branch --shallow-submodules --no-update-env;

# Create the python env without ninja.
# ninja -j10000 fails with `ninja: FATAL: pipe: Too many open files`.
# This appears to have been fixed 13 years ago (https://github.com/ninja-build/ninja/issues/233),
# so that fix needs to be integrated into the kitware pip ninja builds.
rapids-post-start-command --exclude <(echo ninja);

set -x;

# Configure all the C++ libs
configure-all \
-j$(ulimit -Hn) \
-Wno-dev \
-DBUILD_TESTS=ON \
-DBUILD_BENCHMARKS=ON \
-DBUILD_PRIMS_BENCH=ON \
-DBUILD_SHARED_LIBS=ON \
-DRAFT_COMPILE_LIBRARY=ON \
-DBUILD_CUGRAPH_MG_TESTS=ON ;

# Build all the libs
build-all -j$(ulimit -Hn);

# Print cache and dist stats
sccache --show-adv-stats;
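Because SCCACHE_DIST_FALLBACK_TO_LOCAL_COMPILE=false, any compile the cluster cannot take fails the job instead of silently building locally, which is what makes cluster regressions visible in CI. A hedged local smoke test of the same flow, assuming a reachable scheduler and an auth token in $TOKEN; the helper names are the ones used above:

devcontainer-utils-install-sccache --repo trxcllnt/sccache;
devcontainer-utils-init-sccache-dist \
  --enable-sccache-dist - <<< " \
  --auth-type 'token' \
  --auth-token '$TOKEN' \
";
echo 'int main() { return 0; }' > /tmp/smoke.cpp;
SCCACHE_DIST_FALLBACK_TO_LOCAL_COMPILE=false sccache g++ -c /tmp/smoke.cpp -o /tmp/smoke.o;
sccache --show-adv-stats;   # non-zero distributed counters confirm the compile ran remotely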
1 change: 1 addition & 0 deletions .github/workflows/build-and-test-feature.yml
@@ -40,3 +40,4 @@ jobs:
aws_role_arn: "${{ secrets.GIST_REPO_READ_ORG_GITHUB_TOKEN && 'arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs' || '' }}"
rw_sccache_bucket: "${{ secrets.GIST_REPO_READ_ORG_GITHUB_TOKEN && 'rapids-sccache-devs' || '' }}"
rw_sccache_region: "${{ vars.AWS_REGION }}"
sccache_dist_scheduler_url: "sccache.gha-runners.nvidia.com"
2 changes: 1 addition & 1 deletion features/src/openmpi/devcontainer-feature.json
@@ -1,7 +1,7 @@
{
"name": "OpenMPI",
"id": "openmpi",
"version": "25.6.2",
"version": "25.6.3",
"description": "A feature to install OpenMPI with optional CUDA and UCX support",
"options": {
"version": {
2 changes: 1 addition & 1 deletion features/src/openmpi/install.sh
@@ -82,7 +82,7 @@ build_and_install_openmpi() {
local -a cuda_args=();
if test "${ENABLE_CUDA:-}" = 1; then
cuda_args+=(--with-cuda="${CUDA_HOME:-/usr/local/cuda}");
cuda_args+=(--with-cuda-libdir="${CUDA_HOME:-/usr/local/cuda}/lib64/stubs}");
cuda_args+=(--with-cuda-libdir="${CUDA_HOME:-/usr/local/cuda}/lib64/stubs");
fi

IFS=" " read -r -a openmpi_dev_deps <<< "$(install_openmpi_deps)";
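The one-character fix above removes a stray closing brace that leaked into the configure flag: the old expansion produced a path ending in "}", so configure probed a directory that does not exist. A quick sketch of the difference:

CUDA_HOME=/usr/local/cuda;
echo "${CUDA_HOME:-/usr/local/cuda}/lib64/stubs}";  # old: /usr/local/cuda/lib64/stubs}
echo "${CUDA_HOME:-/usr/local/cuda}/lib64/stubs";   # new: /usr/local/cuda/lib64/stubs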
11 changes: 7 additions & 4 deletions features/src/rapids-build-utils/install.sh
@@ -96,10 +96,13 @@ done

# Install bash_completion script
if command -v devcontainer-utils-generate-bash-completion >/dev/null 2>&1; then
devcontainer-utils-generate-bash-completion \
--out-file /etc/bash_completion.d/rapids-build-utils-completions \
${commands[@]/#/--command rapids-} \
;
read -ra commands <<< "${commands[*]/#/--command rapids-}";
if test "${#commands[@]}" -gt 0; then
devcontainer-utils-generate-bash-completion \
--out-file /etc/bash_completion.d/rapids-build-utils-completions \
"${commands[@]}" \
;
fi
fi

find /opt/rapids-build-utils \
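The new guard avoids invoking the completion generator with zero --command flags when `commands` is empty, and likely also sidesteps older bash treating an empty-array expansion as unbound under `set -u` (an assumption; the PR does not state the motivation). A minimal sketch of the guarded pattern:

commands=();  # nothing discovered
read -ra commands <<< "${commands[*]/#/--command rapids-}";  # prefix each element, then re-split
if test "${#commands[@]}" -gt 0; then
  echo "would run: devcontainer-utils-generate-bash-completion ${commands[*]}";
fi
# prints nothing: the generator call is skipped entirely when the array is empty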
@@ -17,12 +17,15 @@ generate_completions() {
. devcontainer-utils-debug-output 'rapids_build_utils_debug' 'generate-scripts';

readarray -t commands < <(find "${TMP_SCRIPT_DIR}"/ -maxdepth 1 -type f -exec basename {} \;);

devcontainer-utils-generate-bash-completion \
--out-file "$(realpath -m "${COMPLETION_FILE}")" \
--template "$(realpath -m "${COMPLETION_TMPL}")" \
${commands[@]/#/--command } \
;
read -ra commands <<< "${commands[*]/#/--command }";

if test "${#commands[@]}" -gt 0; then
devcontainer-utils-generate-bash-completion \
--out-file "$(realpath -m "${COMPLETION_FILE}")" \
--template "$(realpath -m "${COMPLETION_TMPL}")" \
"${commands[@]}" \
;
fi
fi
}

@@ -63,7 +66,7 @@ generate_script() {
}

generate_all_script_impl() {
local bin="${SCRIPT}-all";
local bin="${PREFIX:-${SCRIPT}}-${SUFFIX:-all}";
if test -n "${bin:+x}" && ! test -f "${TMP_SCRIPT_DIR}/${bin}"; then
(
cat - \
@@ -378,8 +381,18 @@ generate_scripts() {
NAME="${cloned_repos[0]:-${repo_names[0]:-}}" \
NAMES="${repo_names[*]@Q}" \
SCRIPT="${script}" \
PREFIX="${script}" \
generate_all_script ;
done;
done
for kind in "cpp" "python"; do
# Generate a script to run a type of build for all repos
NAME="${cloned_repos[0]:-${repo_names[0]:-}}" \
NAMES="${repo_names[*]@Q}" \
SCRIPT="${kind}.build" \
PREFIX="build" \
SUFFIX="all-${kind}" \
generate_all_script ;
done
fi
}

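With PREFIX=build and SUFFIX=all-<kind>, generate_all_script_impl now derives the script name as "${PREFIX}-${SUFFIX}", so each devcontainer gains build-all-cpp and build-all-python aggregates next to the existing build-all. A hedged usage sketch; the names follow from the variables above, and the -j flag is illustrative:

build-all-cpp -j"$(nproc)";  # run only the C++ builds across every cloned repo
build-all-python;            # run only the Python builds across every cloned repo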
@@ -4,10 +4,6 @@
# rapids-get-num-archs-jobs-and-load [OPTION]...
#
# Compute an appropriate total number of jobs, load, and CUDA archs to build in parallel.
# This routine scales the input `-j` with respect to the `-a` and `-m` values, taking into account the
# amount of available system memory (free mem + swap), in order to balance the job and arch parallelism.
#
# note: This wouldn't be necessary if `nvcc` interacted with the POSIX jobserver.
#
# Boolean options:
# -h,--help Print this text.
@@ -18,16 +14,6 @@
# -j,--parallel <num> Run <num> parallel compilation jobs.
# --max-archs <num> Build at most <num> CUDA archs in parallel.
# (default: 3)
# --max-total-system-memory <num> An upper-bound on the amount of total system memory (in GiB) to use during
# C++ and CUDA device compilations.
# Smaller values yield fewer parallel C++ and CUDA device compilations.
# (default: all available memory)
# --max-device-obj-memory-usage <num> An upper-bound on the amount of memory each CUDA device object compilation
# is expected to take. This is used to estimate the number of parallel device
# object compilations that can be launched without hitting the system memory
# limit.
# Higher values yield fewer parallel CUDA device object compilations.
# (default: 1)

# shellcheck disable=SC1091
. rapids-generate-docstring;
@@ -41,33 +27,22 @@ get_num_archs_jobs_and_load() {
# shellcheck disable=SC1091
. devcontainer-utils-debug-output 'rapids_build_utils_debug' 'get-num-archs-jobs-and-load';

# The return value of nproc is (who knew!) constrained by the
# values of OMP_NUM_THREADS and/or OMP_THREAD_LIMIT
# Since we want the physical number of processors here, pass --all
local -r n_cpus="$(nproc --all)";
# nproc --all returns 2x the number of threads in Ubuntu 24.04+,
# so instead we count the number of processors in /proc/cpuinfo
local -r n_cpus="$(grep -cP 'processor\s+:' /proc/cpuinfo)";

if test ${#j[@]} -gt 0 && ! test -n "${j:+x}"; then
j="${n_cpus}";
fi

parallel="${j:-${JOBS:-${PARALLEL_LEVEL:-1}}}";
max_archs="${max_archs:-${MAX_DEVICE_OBJ_TO_COMPILE_IN_PARALLEL:-${arch:-}}}";
max_device_obj_memory_usage="${max_device_obj_memory_usage:-${MAX_DEVICE_OBJ_MEMORY_USAGE:-1Gi}}";

local num_re="^[0-9]+$";

# Assume un-suffixed inputs means gibibytes
if [[ "${max_device_obj_memory_usage}" =~ ${num_re} ]]; then
max_device_obj_memory_usage="${max_device_obj_memory_usage}Gi";
fi

max_device_obj_memory_usage="$(numfmt --from=auto "${max_device_obj_memory_usage}")";

local n_arch="${archs:-1}";

# currently: 70-real;75-real;80-real;86-real;90
# see: https://github.com/rapidsai/rapids-cmake/blob/branch-24.04/rapids-cmake/cuda/set_architectures.cmake#L54
local n_arch_rapids=5;
# currently: 70-real;75-real;80-real;86-real;90-real;100-real;120
# see: https://github.com/rapidsai/rapids-cmake/blob/branch-25.04/rapids-cmake/cuda/set_architectures.cmake#L59
local n_arch_rapids=7;

if ! test -n "${archs:+x}" && test -n "${INFER_NUM_DEVICE_ARCHITECTURES:+x}"; then
archs="$(rapids-select-cmake-define CMAKE_CUDA_ARCHITECTURES "${OPTS[@]}" || echo)";
@@ -101,31 +76,8 @@
n_arch=$((n_arch > max_archs ? max_archs : n_arch));
fi

local mem_for_device_objs="$((n_arch * max_device_obj_memory_usage))";
local mem_total="${max_total_system_memory:-${MAX_TOTAL_SYSTEM_MEMORY:-}}";

if ! test -n "${mem_total:+x}"; then
local -r free_mem="$(free --bytes | grep -E '^Mem:' | tr -s '[:space:]' | cut -d' ' -f7 || echo '0')";
local -r freeswap="$(free --bytes | grep -E '^Swap:' | tr -s '[:space:]' | cut -d' ' -f4 || echo '0')";
mem_total="$((free_mem + freeswap))";
# Assume un-suffixed inputs means gibibytes
elif [[ "${mem_total}" =~ ${num_re} ]]; then
mem_total="${mem_total}Gi";
fi
mem_total="$(numfmt --from=auto "${mem_total}")";

local n_load=$((parallel > n_cpus ? n_cpus : parallel));
# shellcheck disable=SC2155
local n_jobs="$(
echo "
scale=0
max_cpu=(${n_load} / ${n_arch} / 2 * 3)
max_mem=(${mem_total} / ${mem_for_device_objs})
if(max_cpu < max_mem) max_cpu else max_mem
" | bc
)"
n_jobs=$((n_jobs < 1 ? 1 : n_jobs));
n_jobs=$((n_arch > 1 ? n_jobs : n_load));
local n_jobs="$((parallel < 1 ? 1 : parallel))";

echo "declare n_arch=${n_arch}";
echo "declare n_jobs=${n_jobs}";
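n_jobs is now simply the requested parallelism floored at one: with distributed compilation, remote workers rather than local memory bound the job count, so the old free-memory heuristic (and its bc arithmetic) is gone. Since the function emits `declare` statements, a consumer is presumably expected to eval its output; a sketch of that assumed calling convention, inferred from the echoes above:

eval "$(rapids-get-num-archs-jobs-and-load -j "$(nproc)" --max-archs 3)";
echo "building ${n_arch} arch(s) with ${n_jobs} parallel jobs";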
@@ -44,7 +44,6 @@ cpp_lib_dirs() {

local j=0;
for ((j=0; j < ${!cpp_length:-0}; j+=1)); do
# local cpp_name="${repo}_cpp_${j}_name";
local cpp_sub_dir="${repo}_cpp_${j}_sub_dir";
echo ~/"${!repo_path:-}/${!cpp_sub_dir:-}";
done
@@ -1,11 +1,12 @@
#!/usr/bin/env bash

if ! test -n "${SKIP_RAPIDS_BUILD_UTILS_POST_START_COMMAND:+x}"; then
rapids-generate-scripts;
rapids-update-build-dir-links -j;
rapids-make-vscode-workspace --update;
rapids-merge-compile-commands-json > ~/compile_commands.json;
rapids-generate-scripts;
rapids-update-build-dir-links -j &
rapids-make-vscode-workspace --update &
rapids-merge-compile-commands-json > ~/compile_commands.json &
if test -n "${PYTHON_PACKAGE_MANAGER:+x}"; then
rapids-make-"${PYTHON_PACKAGE_MANAGER}"-env "$@" || true;
fi
wait
fi
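Backgrounding the three generators and joining with a final `wait` runs them concurrently; `rapids-generate-scripts` stays in the foreground, presumably because the later steps consume the scripts it writes. The shape in miniature, with hypothetical task names:

produce_inputs;          # must complete before anything that reads its output
consume_a & consume_b &  # independent steps run concurrently
wait;                    # join all background jobs before the post-start hook returns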