
Commit 631d3f5

Add scripts to configure sccache-dist and use the build cluster in CI

1 parent ae6a295

32 files changed: +974 −115 lines

.devcontainer/cuda11.8-conda/devcontainer.json (+3 −1)

@@ -11,7 +11,9 @@
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.06-cuda11.8-conda"
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.06-cuda11.8-conda",
+    "--ulimit",
+    "nofile=1048576:1048576"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {

.devcontainer/cuda11.8-pip/devcontainer.json (+3 −1)

@@ -11,7 +11,9 @@
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.06-cuda11.8-pip"
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.06-cuda11.8-pip",
+    "--ulimit",
+    "nofile=1048576:1048576"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {

.devcontainer/cuda12.8-conda/devcontainer.json (+3 −1)

@@ -11,7 +11,9 @@
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.06-cuda12.8-conda"
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.06-cuda12.8-conda",
+    "--ulimit",
+    "nofile=1048576:1048576"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {

.devcontainer/cuda12.8-pip/devcontainer.json (+3 −1)

@@ -11,7 +11,9 @@
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.06-cuda12.8-pip"
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.06-cuda12.8-pip",
+    "--ulimit",
+    "nofile=1048576:1048576"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {

.devcontainer/rapids.Dockerfile (+15 −1)

@@ -36,6 +36,8 @@ ENV DEFAULT_CONDA_ENV=rapids

 FROM ${PYTHON_PACKAGE_MANAGER}-base

+ARG TARGETARCH
+
 ARG CUDA
 ENV CUDAARCHS="RAPIDS"
 ENV CUDA_VERSION="${CUDA_VERSION:-${CUDA}}"

@@ -49,9 +51,21 @@ ENV PYTHONDONTWRITEBYTECODE="1"

 ENV SCCACHE_REGION="us-east-2"
 ENV SCCACHE_BUCKET="rapids-sccache-devs"
-ENV SCCACHE_IDLE_TIMEOUT=900
+ENV SCCACHE_DIST_CONNECT_TIMEOUT=30
+ENV SCCACHE_DIST_REQUEST_TIMEOUT=1800
+ENV SCCACHE_DIST_SCHEDULER_URL="https://${TARGETARCH}.linux.sccache.gha-runners.nvidia.com"
+ENV SCCACHE_IDLE_TIMEOUT=1800
 ENV AWS_ROLE_ARN="arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs"

 ENV HISTFILE="/home/coder/.cache/._bash_history"

 ENV LIBCUDF_KERNEL_CACHE_PATH="/home/coder/cudf/cpp/build/${PYTHON_PACKAGE_MANAGER}/cuda-${CUDA_VERSION}/latest/jitify_cache"
+
+# Prevent the sccache server from shutting down
+ENV SCCACHE_IDLE_TIMEOUT=0
+ENV SCCACHE_SERVER_LOG="sccache=info"
+ENV SCCACHE_S3_KEY_PREFIX=rapids-test-sccache-dist
+
+# Build as much in parallel as possible
+ENV INFER_NUM_DEVICE_ARCHITECTURES=1
+ENV MAX_DEVICE_OBJ_TO_COMPILE_IN_PARALLEL=20
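
With SCCACHE_DIST_SCHEDULER_URL and the timeouts baked into the image, the sccache client in the devcontainer should pick up the build cluster automatically. A minimal sketch of how one might verify that from a shell inside the container (assumes the installed sccache build supports --dist-status and --show-adv-stats):

    sccache --stop-server 2>/dev/null || true
    sccache --start-server      # reads the SCCACHE_DIST_* variables from the environment
    sccache --dist-status       # should report the scheduler configured above
    sccache --show-adv-stats    # remote-compile counters appear here after a build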

.github/actions/build-and-test-feature/action.yml (+2)

@@ -8,6 +8,7 @@ inputs:
   aws_role_arn: {type: string, defaut: '', required: false}
   rw_sccache_bucket: {type: string, defaut: '', required: false}
   rw_sccache_region: {type: string, defaut: '', required: false}
+  sccache_dist_scheduler_url: {type: string, defaut: '', required: false}

 runs:
   using: composite

@@ -32,3 +33,4 @@ runs:
       aws_role_arn: "${{ inputs.aws_role_arn }}"
       rw_sccache_bucket: "${{ inputs.rw_sccache_bucket }}"
       rw_sccache_region: "${{ inputs.rw_sccache_region }}"
+      sccache_dist_scheduler_url: "${{ inputs.sccache_dist_scheduler_url }}"

.github/workflows/build-all-rapids-repos.yml (+59 −15)

@@ -35,24 +35,68 @@ jobs:
       contents: read
       pull-requests: read
     with:
-      arch: '["amd64"]'
+      arch: '["amd64", "arm64"]'
       cuda: '["12.8"]'
-      node_type: cpu32
+      node_type: cpu8
       extra-repo-deploy-key: CUMLPRIMS_SSH_PRIVATE_DEPLOY_KEY
+      rapids-aux-secret-1: GIST_REPO_READ_ORG_GITHUB_TOKEN
       build_command: |
+        # Repopulate the cache
+        export SCCACHE_RECACHE=1
+
+        # Prevent the sccache server from shutting down
         export SCCACHE_IDLE_TIMEOUT=0
-        SCCACHE_NO_DAEMON=1 sccache --stop-server
-        sccache -z;
+
+        # Infinitely retry transient errors
+        export SCCACHE_DIST_MAX_RETRIES=inf
+
+        # Never fallback to locally compiling
+        export SCCACHE_DIST_FALLBACK_TO_LOCAL_COMPILE=false
+
+        # Disable CUB/Thrust arch-dependent namespaces.
+        # These prevent reusing ptx and cubins from multi-arch
+        # compilations in single-arch subset compilations.
+        for VAR in CFLAGS CXXFLAGS CUDAFLAGS; do
+          export "$VAR=${!VAR:+${!VAR} }-DCUB_DISABLE_NAMESPACE_MAGIC -DCUB_IGNORE_NAMESPACE_MAGIC_ERROR"
+          export "$VAR=${!VAR:+${!VAR} }-DTHRUST_DISABLE_ABI_NAMESPACE -DTHRUST_IGNORE_ABI_NAMESPACE_ERROR"
+        done
+
+        # Install the latest sccache client
+        devcontainer-utils-install-sccache --repo trxcllnt/sccache;
+
+        # Configure the sccache client to talk to the build cluster
+        devcontainer-utils-init-sccache-dist \
+          --enable-sccache-dist \
+          - <<< "--auth-token '$RAPIDS_AUX_SECRET_1' \
+                 --auth-type 'token' \
+                ";
+
+        # Verify sccache cache location
         sccache --show-adv-stats;
-        clone-all -j$(nproc) -v -q --clone-upstream --single-branch --shallow-submodules;
-        build-all \
-          -v \
-          -j$(nproc --ignore=1) \
-          -DBUILD_SHARED_LIBS=ON \
-          -DBUILD_TESTS=ON \
-          -DBUILD_BENCHMARKS=ON \
-          -DBUILD_PRIMS_BENCH=ON \
-          -DRAFT_COMPILE_LIBRARY=ON \
-          -DBUILD_CUGRAPH_MG_TESTS=ON \
-        ;
+
+        # Clone all the repos
+        clone-all -j$(nproc) -v -q --clone-upstream --single-branch --shallow-submodules --no-update-env;
+
+        # Create the python env without ninja.
+        # ninja -j10000 fails with `ninja: FATAL: pipe: Too many open files`.
+        # This appears to have been fixed 13 years ago (https://github.com/ninja-build/ninja/issues/233),
+        # so that fix needs to be integrated into the kitware pip ninja builds.
+        rapids-post-start-command --exclude <(echo ninja);
+
+        # Configure all the C++ libs
+        configure-all \
+          -j100000 \
+          -Wno-dev \
+          -DBUILD_TESTS=ON \
+          -DBUILD_BENCHMARKS=ON \
+          -DBUILD_PRIMS_BENCH=ON \
+          -DBUILD_SHARED_LIBS=ON \
+          -DRAFT_COMPILE_LIBRARY=ON \
+          -DBUILD_CUGRAPH_MG_TESTS=ON \
+          -DCMAKE_POLICY_VERSION_MINIMUM=3.5;
+
+        # Build all the libs
+        build-all -j100000;
+
+        # Print cache and dist stats
         sccache --show-adv-stats;
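
The for-loop in build_command appends the CUB/Thrust defines to CFLAGS, CXXFLAGS, and CUDAFLAGS whether or not those variables are already set, using bash indirect expansion. A standalone sketch of the idiom (variable values here are only illustrative):

    unset CFLAGS; CUDAFLAGS="-O3"
    for VAR in CFLAGS CUDAFLAGS; do
      # ${!VAR} reads the variable named by $VAR; the :+ form adds a separating
      # space only when the variable already has a value
      export "$VAR=${!VAR:+${!VAR} }-DCUB_DISABLE_NAMESPACE_MAGIC"
    done
    echo "$CFLAGS"     # -DCUB_DISABLE_NAMESPACE_MAGIC
    echo "$CUDAFLAGS"  # -O3 -DCUB_DISABLE_NAMESPACE_MAGIC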

.github/workflows/build-and-test-feature.yml (+1)

@@ -40,3 +40,4 @@ jobs:
       aws_role_arn: "${{ secrets.GIST_REPO_READ_ORG_GITHUB_TOKEN && 'arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs' || '' }}"
       rw_sccache_bucket: "${{ secrets.GIST_REPO_READ_ORG_GITHUB_TOKEN && 'rapids-sccache-devs' || '' }}"
       rw_sccache_region: "${{ vars.AWS_REGION }}"
+      sccache_dist_scheduler_url: "sccache.gha-runners.nvidia.com"

features/src/rapids-build-utils/devcontainer-feature.json (+1 −1)

@@ -1,7 +1,7 @@
 {
   "name": "NVIDIA RAPIDS devcontainer build utilities",
   "id": "rapids-build-utils",
-  "version": "25.6.1",
+  "version": "25.6.2",
   "description": "A feature to install the RAPIDS devcontainer build utilities",
   "containerEnv": {
     "BASH_ENV": "/etc/bash.bash_env"

features/src/rapids-build-utils/opt/rapids-build-utils/bin/generate-scripts.sh (+12 −2)

@@ -63,7 +63,7 @@ generate_script() {
 }

 generate_all_script_impl() {
-  local bin="${SCRIPT}-all";
+  local bin="${PREFIX:-${SCRIPT}}-${SUFFIX:-all}";
   if test -n "${bin:+x}" && ! test -f "${TMP_SCRIPT_DIR}/${bin}"; then
     (
       cat - \

@@ -378,8 +378,18 @@ generate_scripts() {
       NAME="${cloned_repos[0]:-${repo_names[0]:-}}" \
       NAMES="${repo_names[*]@Q}" \
       SCRIPT="${script}" \
+      PREFIX="${script}" \
       generate_all_script ;
-    done;
+    done
+    for kind in "cpp" "python"; do
+      # Generate a script to run a type of build for all repos
+      NAME="${cloned_repos[0]:-${repo_names[0]:-}}" \
+      NAMES="${repo_names[*]@Q}" \
+      SCRIPT="${kind}.build" \
+      PREFIX="build" \
+      SUFFIX="all-${kind}" \
+      generate_all_script ;
+    done
   fi
 }

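
The new PREFIX/SUFFIX parameters only change how the generated wrapper is named: the template is still chosen by SCRIPT, but the output file is now "${PREFIX:-${SCRIPT}}-${SUFFIX:-all}". A hypothetical trace of that naming logic, using the values passed in the loops above:

    name_for() { local SCRIPT="$1" PREFIX="$2" SUFFIX="$3"; echo "${PREFIX:-${SCRIPT}}-${SUFFIX:-all}"; }
    name_for clone        ""    ""           # -> clone-all         (existing behavior, unchanged)
    name_for cpp.build    build all-cpp      # -> build-all-cpp     (new per-kind wrapper)
    name_for python.build build all-python   # -> build-all-python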

features/src/rapids-build-utils/opt/rapids-build-utils/bin/get-num-archs-jobs-and-load.sh (+7 −55)

@@ -4,10 +4,6 @@
 # rapids-get-num-archs-jobs-and-load [OPTION]...
 #
 # Compute an appropriate total number of jobs, load, and CUDA archs to build in parallel.
-# This routine scales the input `-j` with respect to the `-a` and `-m` values, taking into account the
-# amount of available system memory (free mem + swap), in order to balance the job and arch parallelism.
-#
-# note: This wouldn't be necessary if `nvcc` interacted with the POSIX jobserver.
 #
 # Boolean options:
 #  -h,--help  Print this text.

@@ -18,16 +14,6 @@
 #  -j,--parallel <num>  Run <num> parallel compilation jobs.
 #  --max-archs <num>    Build at most <num> CUDA archs in parallel.
 #                       (default: 3)
-#  --max-total-system-memory <num>      An upper-bound on the amount of total system memory (in GiB) to use during
-#                                       C++ and CUDA device compilations.
-#                                       Smaller values yield fewer parallel C++ and CUDA device compilations.
-#                                       (default: all available memory)
-#  --max-device-obj-memory-usage <num>  An upper-bound on the amount of memory each CUDA device object compilation
-#                                       is expected to take. This is used to estimate the number of parallel device
-#                                       object compilations that can be launched without hitting the system memory
-#                                       limit.
-#                                       Higher values yield fewer parallel CUDA device object compilations.
-#                                       (default: 1)

 # shellcheck disable=SC1091
 . rapids-generate-docstring;

@@ -41,33 +27,22 @@ get_num_archs_jobs_and_load() {
   # shellcheck disable=SC1091
   . devcontainer-utils-debug-output 'rapids_build_utils_debug' 'get-num-archs-jobs-and-load';

-  # The return value of nproc is (who knew!) constrained by the
-  # values of OMP_NUM_THREADS and/or OMP_THREAD_LIMIT
-  # Since we want the physical number of processors here, pass --all
-  local -r n_cpus="$(nproc --all)";
+  # nproc --all returns 2x the number of threads in Ubuntu24.04+,
+  # so instead we cound the number of processors in /proc/cpuinfo
+  local -r n_cpus="$(grep -cP 'processor\s+:' /proc/cpuinfo)";

   if test ${#j[@]} -gt 0 && ! test -n "${j:+x}"; then
     j="${n_cpus}";
   fi

   parallel="${j:-${JOBS:-${PARALLEL_LEVEL:-1}}}";
   max_archs="${max_archs:-${MAX_DEVICE_OBJ_TO_COMPILE_IN_PARALLEL:-${arch:-}}}";
-  max_device_obj_memory_usage="${max_device_obj_memory_usage:-${MAX_DEVICE_OBJ_MEMORY_USAGE:-1Gi}}";
-
-  local num_re="^[0-9]+$";
-
-  # Assume un-suffixed inputs means gibibytes
-  if [[ "${max_device_obj_memory_usage}" =~ ${num_re} ]]; then
-    max_device_obj_memory_usage="${max_device_obj_memory_usage}Gi";
-  fi
-
-  max_device_obj_memory_usage="$(numfmt --from=auto "${max_device_obj_memory_usage}")";

   local n_arch="${archs:-1}";

-  # currently: 70-real;75-real;80-real;86-real;90
-  # see: https://github.com/rapidsai/rapids-cmake/blob/branch-24.04/rapids-cmake/cuda/set_architectures.cmake#L54
-  local n_arch_rapids=5;
+  # currently: 70-real;75-real;80-real;86-real;90-real;100-real;120
+  # see: https://github.com/rapidsai/rapids-cmake/blob/branch-25.04/rapids-cmake/cuda/set_architectures.cmake#L59
+  local n_arch_rapids=7;

   if ! test -n "${archs:+x}" && test -n "${INFER_NUM_DEVICE_ARCHITECTURES:+x}"; then
     archs="$(rapids-select-cmake-define CMAKE_CUDA_ARCHITECTURES "${OPTS[@]}" || echo)";

@@ -101,31 +76,8 @@
     n_arch=$((n_arch > max_archs ? max_archs : n_arch));
   fi

-  local mem_for_device_objs="$((n_arch * max_device_obj_memory_usage))";
-  local mem_total="${max_total_system_memory:-${MAX_TOTAL_SYSTEM_MEMORY:-}}";
-
-  if ! test -n "${mem_total:+x}"; then
-    local -r free_mem="$(free --bytes | grep -E '^Mem:' | tr -s '[:space:]' | cut -d' ' -f7 || echo '0')";
-    local -r freeswap="$(free --bytes | grep -E '^Swap:' | tr -s '[:space:]' | cut -d' ' -f4 || echo '0')";
-    mem_total="$((free_mem + freeswap))";
-  # Assume un-suffixed inputs means gibibytes
-  elif [[ "${mem_total}" =~ ${num_re} ]]; then
-    mem_total="${mem_total}Gi";
-  fi
-  mem_total="$(numfmt --from=auto "${mem_total}")";
-
   local n_load=$((parallel > n_cpus ? n_cpus : parallel));
-  # shellcheck disable=SC2155
-  local n_jobs="$(
-    echo "
-      scale=0
-      max_cpu=(${n_load} / ${n_arch} / 2 * 3)
-      max_mem=(${mem_total} / ${mem_for_device_objs})
-      if(max_cpu < max_mem) max_cpu else max_mem
-    " | bc
-  )"
-  n_jobs=$((n_jobs < 1 ? 1 : n_jobs));
-  n_jobs=$((n_arch > 1 ? n_jobs : n_load));
+  local n_jobs="$((parallel < 1 ? 1 : parallel))";

   echo "declare n_arch=${n_arch}";
   echo "declare n_jobs=${n_jobs}";
New file (+31)

@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+
+# Usage:
+#  build-all-cpp [OPTION]...
+#
+# Runs build-<repo>-cpp for each repo in ${NAMES}.
+#
+# Forwards relevant arguments to each underlying script.
+#
+# @_include_options /usr/bin/build-${NAME}-cpp -h | tail -n+2;
+
+# shellcheck disable=SC1091
+. rapids-generate-docstring;
+
+_build_all_cpp() {
+  local -;
+  set -euo pipefail;
+
+  eval "$(_parse_args --take '-h,--help' "$@" <&0)";
+
+  # shellcheck disable=SC1091
+  . devcontainer-utils-debug-output 'rapids_build_utils_debug' 'build-all build-all-cpp';
+
+  for name in ${NAMES}; do
+    if command -v build-${name}-cpp >/dev/null 2>&1; then
+      build-${name}-cpp "${OPTS[@]}";
+    fi
+  done
+}
+
+_build_all_cpp "$@" <&0;

New file (+31)

@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+
+# Usage:
+#  build-all-python [OPTION]...
+#
+# Runs build-<repo>-python for each repo in ${NAMES}.
+#
+# Forwards relevant arguments to each underlying script.
+#
+# @_include_options /usr/bin/build-${NAME}-python -h | tail -n+2;
+
+# shellcheck disable=SC1091
+. rapids-generate-docstring;
+
+_build_all_python() {
+  local -;
+  set -euo pipefail;
+
+  eval "$(_parse_args --take '-h,--help' "$@" <&0)";
+
+  # shellcheck disable=SC1091
+  . devcontainer-utils-debug-output 'rapids_build_utils_debug' 'build-all build-all-python';
+
+  for name in ${NAMES}; do
+    if command -v build-${name}-python >/dev/null 2>&1; then
+      build-${name}-python "${OPTS[@]}";
+    fi
+  done
+}
+
+_build_all_python "$@" <&0;

features/src/rapids-build-utils/opt/rapids-build-utils/bin/tmpl/cpp.configure.tmpl.sh (+1)

@@ -57,6 +57,7 @@ configure_${CPP_LIB}_cpp() {

   time (
     export ${CPP_ENV} PATH="$PATH";
+    SCCACHE_NO_DIST_COMPILE=1 \
     CUDAFLAGS="${CUDAFLAGS:+$CUDAFLAGS }-t=${n_arch}" \
     cmake "${cmake_args[@]}";
     { set +x; } 2>/dev/null; echo -n "lib${CPP_LIB} configure time:";
