
Commit 631d3f5

Add scripts to configure sccache-dist and use the build cluster in CI

1 parent ae6a295

32 files changed: +974 −115 lines

.devcontainer/cuda11.8-conda/devcontainer.json (+3 −1)

@@ -11,7 +11,9 @@
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.06-cuda11.8-conda"
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.06-cuda11.8-conda",
+    "--ulimit",
+    "nofile=1048576:1048576"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {

.devcontainer/cuda11.8-pip/devcontainer.json (+3 −1)

@@ -11,7 +11,9 @@
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.06-cuda11.8-pip"
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.06-cuda11.8-pip",
+    "--ulimit",
+    "nofile=1048576:1048576"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {

.devcontainer/cuda12.8-conda/devcontainer.json (+3 −1)

@@ -11,7 +11,9 @@
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.06-cuda12.8-conda"
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.06-cuda12.8-conda",
+    "--ulimit",
+    "nofile=1048576:1048576"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {

.devcontainer/cuda12.8-pip/devcontainer.json (+3 −1)

@@ -11,7 +11,9 @@
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.06-cuda12.8-pip"
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.06-cuda12.8-pip",
+    "--ulimit",
+    "nofile=1048576:1048576"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {

.devcontainer/rapids.Dockerfile (+15 −1)

@@ -36,6 +36,8 @@ ENV DEFAULT_CONDA_ENV=rapids

 FROM ${PYTHON_PACKAGE_MANAGER}-base

+ARG TARGETARCH
+
 ARG CUDA
 ENV CUDAARCHS="RAPIDS"
 ENV CUDA_VERSION="${CUDA_VERSION:-${CUDA}}"

@@ -49,9 +51,21 @@ ENV PYTHONDONTWRITEBYTECODE="1"

 ENV SCCACHE_REGION="us-east-2"
 ENV SCCACHE_BUCKET="rapids-sccache-devs"
-ENV SCCACHE_IDLE_TIMEOUT=900
+ENV SCCACHE_DIST_CONNECT_TIMEOUT=30
+ENV SCCACHE_DIST_REQUEST_TIMEOUT=1800
+ENV SCCACHE_DIST_SCHEDULER_URL="https://${TARGETARCH}.linux.sccache.gha-runners.nvidia.com"
+ENV SCCACHE_IDLE_TIMEOUT=1800
 ENV AWS_ROLE_ARN="arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs"

 ENV HISTFILE="/home/coder/.cache/._bash_history"

 ENV LIBCUDF_KERNEL_CACHE_PATH="/home/coder/cudf/cpp/build/${PYTHON_PACKAGE_MANAGER}/cuda-${CUDA_VERSION}/latest/jitify_cache"
+
+# Prevent the sccache server from shutting down
+ENV SCCACHE_IDLE_TIMEOUT=0
+ENV SCCACHE_SERVER_LOG="sccache=info"
+ENV SCCACHE_S3_KEY_PREFIX=rapids-test-sccache-dist
+
+# Build as much in parallel as possible
+ENV INFER_NUM_DEVICE_ARCHITECTURES=1
+ENV MAX_DEVICE_OBJ_TO_COMPILE_IN_PARALLEL=20
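
With SCCACHE_DIST_SCHEDULER_URL and the timeouts baked into the image, the sccache client in the devcontainer should pick up the build cluster automatically. A minimal sketch of how one might verify that from a shell inside the container (assumes the installed sccache build supports --dist-status and --show-adv-stats):

    sccache --stop-server 2>/dev/null || true
    sccache --start-server      # reads the SCCACHE_DIST_* variables from the environment
    sccache --dist-status       # should report the scheduler configured above
    sccache --show-adv-stats    # remote-compile counters appear here after a build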

.github/actions/build-and-test-feature/action.yml (+2)

@@ -8,6 +8,7 @@ inputs:
   aws_role_arn: {type: string, defaut: '', required: false}
   rw_sccache_bucket: {type: string, defaut: '', required: false}
   rw_sccache_region: {type: string, defaut: '', required: false}
+  sccache_dist_scheduler_url: {type: string, defaut: '', required: false}

 runs:
   using: composite

@@ -32,3 +33,4 @@ runs:
       aws_role_arn: "${{ inputs.aws_role_arn }}"
       rw_sccache_bucket: "${{ inputs.rw_sccache_bucket }}"
       rw_sccache_region: "${{ inputs.rw_sccache_region }}"
+      sccache_dist_scheduler_url: "${{ inputs.sccache_dist_scheduler_url }}"

.github/workflows/build-all-rapids-repos.yml (+59 −15)

@@ -35,24 +35,68 @@ jobs:
       contents: read
       pull-requests: read
     with:
-      arch: '["amd64"]'
+      arch: '["amd64", "arm64"]'
       cuda: '["12.8"]'
-      node_type: cpu32
+      node_type: cpu8
       extra-repo-deploy-key: CUMLPRIMS_SSH_PRIVATE_DEPLOY_KEY
+      rapids-aux-secret-1: GIST_REPO_READ_ORG_GITHUB_TOKEN
       build_command: |
+        # Repopulate the cache
+        export SCCACHE_RECACHE=1
+
+        # Prevent the sccache server from shutting down
         export SCCACHE_IDLE_TIMEOUT=0
-        SCCACHE_NO_DAEMON=1 sccache --stop-server
-        sccache -z;
+
+        # Infinitely retry transient errors
+        export SCCACHE_DIST_MAX_RETRIES=inf
+
+        # Never fallback to locally compiling
+        export SCCACHE_DIST_FALLBACK_TO_LOCAL_COMPILE=false
+
+        # Disable CUB/Thrust arch-dependent namespaces.
+        # These prevent reusing ptx and cubins from multi-arch
+        # compilations in single-arch subset compilations.
+        for VAR in CFLAGS CXXFLAGS CUDAFLAGS; do
+          export "$VAR=${!VAR:+${!VAR} }-DCUB_DISABLE_NAMESPACE_MAGIC -DCUB_IGNORE_NAMESPACE_MAGIC_ERROR"
+          export "$VAR=${!VAR:+${!VAR} }-DTHRUST_DISABLE_ABI_NAMESPACE -DTHRUST_IGNORE_ABI_NAMESPACE_ERROR"
+        done
+
+        # Install the latest sccache client
+        devcontainer-utils-install-sccache --repo trxcllnt/sccache;
+
+        # Configure the sccache client to talk to the build cluster
+        devcontainer-utils-init-sccache-dist \
+          --enable-sccache-dist \
+          - <<< "--auth-token '$RAPIDS_AUX_SECRET_1' \
+                 --auth-type 'token' \
+                ";
+
+        # Verify sccache cache location
         sccache --show-adv-stats;
-        clone-all -j$(nproc) -v -q --clone-upstream --single-branch --shallow-submodules;
-        build-all \
-          -v \
-          -j$(nproc --ignore=1) \
-          -DBUILD_SHARED_LIBS=ON \
-          -DBUILD_TESTS=ON \
-          -DBUILD_BENCHMARKS=ON \
-          -DBUILD_PRIMS_BENCH=ON \
-          -DRAFT_COMPILE_LIBRARY=ON \
-          -DBUILD_CUGRAPH_MG_TESTS=ON \
-        ;
+
+        # Clone all the repos
+        clone-all -j$(nproc) -v -q --clone-upstream --single-branch --shallow-submodules --no-update-env;
+
+        # Create the python env without ninja.
+        # ninja -j10000 fails with `ninja: FATAL: pipe: Too many open files`.
+        # This appears to have been fixed 13 years ago (https://github.com/ninja-build/ninja/issues/233),
+        # so that fix needs to be integrated into the kitware pip ninja builds.
+        rapids-post-start-command --exclude <(echo ninja);
+
+        # Configure all the C++ libs
+        configure-all \
+          -j100000 \
+          -Wno-dev \
+          -DBUILD_TESTS=ON \
+          -DBUILD_BENCHMARKS=ON \
+          -DBUILD_PRIMS_BENCH=ON \
+          -DBUILD_SHARED_LIBS=ON \
+          -DRAFT_COMPILE_LIBRARY=ON \
+          -DBUILD_CUGRAPH_MG_TESTS=ON \
+          -DCMAKE_POLICY_VERSION_MINIMUM=3.5;
+
+        # Build all the libs
+        build-all -j100000;
+
+        # Print cache and dist stats
         sccache --show-adv-stats;
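
The for-loop in build_command appends the CUB/Thrust defines to CFLAGS, CXXFLAGS, and CUDAFLAGS whether or not those variables are already set, using bash indirect expansion. A standalone sketch of the idiom (variable values here are only illustrative):

    unset CFLAGS; CUDAFLAGS="-O3"
    for VAR in CFLAGS CUDAFLAGS; do
      # ${!VAR} reads the variable named by $VAR; the :+ form adds a separating
      # space only when the variable already has a value
      export "$VAR=${!VAR:+${!VAR} }-DCUB_DISABLE_NAMESPACE_MAGIC"
    done
    echo "$CFLAGS"     # -DCUB_DISABLE_NAMESPACE_MAGIC
    echo "$CUDAFLAGS"  # -O3 -DCUB_DISABLE_NAMESPACE_MAGIC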

.github/workflows/build-and-test-feature.yml (+1)

@@ -40,3 +40,4 @@ jobs:
       aws_role_arn: "${{ secrets.GIST_REPO_READ_ORG_GITHUB_TOKEN && 'arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs' || '' }}"
       rw_sccache_bucket: "${{ secrets.GIST_REPO_READ_ORG_GITHUB_TOKEN && 'rapids-sccache-devs' || '' }}"
       rw_sccache_region: "${{ vars.AWS_REGION }}"
+      sccache_dist_scheduler_url: "sccache.gha-runners.nvidia.com"

features/src/rapids-build-utils/devcontainer-feature.json (+1 −1)

@@ -1,7 +1,7 @@
 {
   "name": "NVIDIA RAPIDS devcontainer build utilities",
   "id": "rapids-build-utils",
-  "version": "25.6.1",
+  "version": "25.6.2",
   "description": "A feature to install the RAPIDS devcontainer build utilities",
   "containerEnv": {
     "BASH_ENV": "/etc/bash.bash_env"

features/src/rapids-build-utils/opt/rapids-build-utils/bin/generate-scripts.sh (+12 −2)

@@ -63,7 +63,7 @@ generate_script() {
 }

 generate_all_script_impl() {
-  local bin="${SCRIPT}-all";
+  local bin="${PREFIX:-${SCRIPT}}-${SUFFIX:-all}";
   if test -n "${bin:+x}" && ! test -f "${TMP_SCRIPT_DIR}/${bin}"; then
     (
       cat - \

@@ -378,8 +378,18 @@ generate_scripts() {
       NAME="${cloned_repos[0]:-${repo_names[0]:-}}" \
       NAMES="${repo_names[*]@Q}" \
       SCRIPT="${script}" \
+      PREFIX="${script}" \
       generate_all_script ;
-    done;
+    done
+    for kind in "cpp" "python"; do
+      # Generate a script to run a type of build for all repos
+      NAME="${cloned_repos[0]:-${repo_names[0]:-}}" \
+      NAMES="${repo_names[*]@Q}" \
+      SCRIPT="${kind}.build" \
+      PREFIX="build" \
+      SUFFIX="all-${kind}" \
+      generate_all_script ;
+    done
   fi
 }

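
The new PREFIX/SUFFIX parameters only change how the generated wrapper is named: the template is still chosen by SCRIPT, but the output file is now "${PREFIX:-${SCRIPT}}-${SUFFIX:-all}". A hypothetical trace of that naming logic, using the values passed in the loops above:

    name_for() { local SCRIPT="$1" PREFIX="$2" SUFFIX="$3"; echo "${PREFIX:-${SCRIPT}}-${SUFFIX:-all}"; }
    name_for clone        ""    ""           # -> clone-all         (existing behavior, unchanged)
    name_for cpp.build    build all-cpp      # -> build-all-cpp     (new per-kind wrapper)
    name_for python.build build all-python   # -> build-all-python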

features/src/rapids-build-utils/opt/rapids-build-utils/bin/get-num-archs-jobs-and-load.sh (+7 −55)

@@ -4,10 +4,6 @@
 # rapids-get-num-archs-jobs-and-load [OPTION]...
 #
 # Compute an appropriate total number of jobs, load, and CUDA archs to build in parallel.
-# This routine scales the input `-j` with respect to the `-a` and `-m` values, taking into account the
-# amount of available system memory (free mem + swap), in order to balance the job and arch parallelism.
-#
-# note: This wouldn't be necessary if `nvcc` interacted with the POSIX jobserver.
 #
 # Boolean options:
 #  -h,--help  Print this text.

@@ -18,16 +14,6 @@
 #  -j,--parallel <num>  Run <num> parallel compilation jobs.
 #  --max-archs <num>    Build at most <num> CUDA archs in parallel.
 #                       (default: 3)
-#  --max-total-system-memory <num>      An upper-bound on the amount of total system memory (in GiB) to use during
-#                                       C++ and CUDA device compilations.
-#                                       Smaller values yield fewer parallel C++ and CUDA device compilations.
-#                                       (default: all available memory)
-#  --max-device-obj-memory-usage <num>  An upper-bound on the amount of memory each CUDA device object compilation
-#                                       is expected to take. This is used to estimate the number of parallel device
-#                                       object compilations that can be launched without hitting the system memory
-#                                       limit.
-#                                       Higher values yield fewer parallel CUDA device object compilations.
-#                                       (default: 1)

 # shellcheck disable=SC1091
 . rapids-generate-docstring;

@@ -41,33 +27,22 @@ get_num_archs_jobs_and_load() {
   # shellcheck disable=SC1091
   . devcontainer-utils-debug-output 'rapids_build_utils_debug' 'get-num-archs-jobs-and-load';

-  # The return value of nproc is (who knew!) constrained by the
-  # values of OMP_NUM_THREADS and/or OMP_THREAD_LIMIT
-  # Since we want the physical number of processors here, pass --all
-  local -r n_cpus="$(nproc --all)";
+  # nproc --all returns 2x the number of threads in Ubuntu24.04+,
+  # so instead we cound the number of processors in /proc/cpuinfo
+  local -r n_cpus="$(grep -cP 'processor\s+:' /proc/cpuinfo)";

   if test ${#j[@]} -gt 0 && ! test -n "${j:+x}"; then
     j="${n_cpus}";
   fi

   parallel="${j:-${JOBS:-${PARALLEL_LEVEL:-1}}}";
   max_archs="${max_archs:-${MAX_DEVICE_OBJ_TO_COMPILE_IN_PARALLEL:-${arch:-}}}";
-  max_device_obj_memory_usage="${max_device_obj_memory_usage:-${MAX_DEVICE_OBJ_MEMORY_USAGE:-1Gi}}";
-
-  local num_re="^[0-9]+$";
-
-  # Assume un-suffixed inputs means gibibytes
-  if [[ "${max_device_obj_memory_usage}" =~ ${num_re} ]]; then
-    max_device_obj_memory_usage="${max_device_obj_memory_usage}Gi";
-  fi
-
-  max_device_obj_memory_usage="$(numfmt --from=auto "${max_device_obj_memory_usage}")";

   local n_arch="${archs:-1}";

-  # currently: 70-real;75-real;80-real;86-real;90
-  # see: https://github.com/rapidsai/rapids-cmake/blob/branch-24.04/rapids-cmake/cuda/set_architectures.cmake#L54
-  local n_arch_rapids=5;
+  # currently: 70-real;75-real;80-real;86-real;90-real;100-real;120
+  # see: https://github.com/rapidsai/rapids-cmake/blob/branch-25.04/rapids-cmake/cuda/set_architectures.cmake#L59
+  local n_arch_rapids=7;

   if ! test -n "${archs:+x}" && test -n "${INFER_NUM_DEVICE_ARCHITECTURES:+x}"; then
     archs="$(rapids-select-cmake-define CMAKE_CUDA_ARCHITECTURES "${OPTS[@]}" || echo)";

@@ -101,31 +76,8 @@
     n_arch=$((n_arch > max_archs ? max_archs : n_arch));
   fi

-  local mem_for_device_objs="$((n_arch * max_device_obj_memory_usage))";
-  local mem_total="${max_total_system_memory:-${MAX_TOTAL_SYSTEM_MEMORY:-}}";
-
-  if ! test -n "${mem_total:+x}"; then
-    local -r free_mem="$(free --bytes | grep -E '^Mem:' | tr -s '[:space:]' | cut -d' ' -f7 || echo '0')";
-    local -r freeswap="$(free --bytes | grep -E '^Swap:' | tr -s '[:space:]' | cut -d' ' -f4 || echo '0')";
-    mem_total="$((free_mem + freeswap))";
-  # Assume un-suffixed inputs means gibibytes
-  elif [[ "${mem_total}" =~ ${num_re} ]]; then
-    mem_total="${mem_total}Gi";
-  fi
-  mem_total="$(numfmt --from=auto "${mem_total}")";
-
   local n_load=$((parallel > n_cpus ? n_cpus : parallel));
-  # shellcheck disable=SC2155
-  local n_jobs="$(
-    echo "
-      scale=0
-      max_cpu=(${n_load} / ${n_arch} / 2 * 3)
-      max_mem=(${mem_total} / ${mem_for_device_objs})
-      if(max_cpu < max_mem) max_cpu else max_mem
-    " | bc
-  )"
-  n_jobs=$((n_jobs < 1 ? 1 : n_jobs));
-  n_jobs=$((n_arch > 1 ? n_jobs : n_load));
+  local n_jobs="$((parallel < 1 ? 1 : parallel))";

   echo "declare n_arch=${n_arch}";
   echo "declare n_jobs=${n_jobs}";
New file (+31)

@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+
+# Usage:
+#  build-all-cpp [OPTION]...
+#
+# Runs build-<repo>-cpp for each repo in ${NAMES}.
+#
+# Forwards relevant arguments to each underlying script.
+#
+# @_include_options /usr/bin/build-${NAME}-cpp -h | tail -n+2;
+
+# shellcheck disable=SC1091
+. rapids-generate-docstring;
+
+_build_all_cpp() {
+  local -;
+  set -euo pipefail;
+
+  eval "$(_parse_args --take '-h,--help' "$@" <&0)";
+
+  # shellcheck disable=SC1091
+  . devcontainer-utils-debug-output 'rapids_build_utils_debug' 'build-all build-all-cpp';
+
+  for name in ${NAMES}; do
+    if command -v build-${name}-cpp >/dev/null 2>&1; then
+      build-${name}-cpp "${OPTS[@]}";
+    fi
+  done
+}
+
+_build_all_cpp "$@" <&0;

New file (+31)

@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+
+# Usage:
+#  build-all-python [OPTION]...
+#
+# Runs build-<repo>-python for each repo in ${NAMES}.
+#
+# Forwards relevant arguments to each underlying script.
+#
+# @_include_options /usr/bin/build-${NAME}-python -h | tail -n+2;
+
+# shellcheck disable=SC1091
+. rapids-generate-docstring;
+
+_build_all_python() {
+  local -;
+  set -euo pipefail;
+
+  eval "$(_parse_args --take '-h,--help' "$@" <&0)";
+
+  # shellcheck disable=SC1091
+  . devcontainer-utils-debug-output 'rapids_build_utils_debug' 'build-all build-all-python';
+
+  for name in ${NAMES}; do
+    if command -v build-${name}-python >/dev/null 2>&1; then
+      build-${name}-python "${OPTS[@]}";
+    fi
+  done
+}
+
+_build_all_python "$@" <&0;

features/src/rapids-build-utils/opt/rapids-build-utils/bin/tmpl/cpp.configure.tmpl.sh (+1)

@@ -57,6 +57,7 @@ configure_${CPP_LIB}_cpp() {

   time (
     export ${CPP_ENV} PATH="$PATH";
+    SCCACHE_NO_DIST_COMPILE=1 \
     CUDAFLAGS="${CUDAFLAGS:+$CUDAFLAGS }-t=${n_arch}" \
     cmake "${cmake_args[@]}";
     { set +x; } 2>/dev/null; echo -n "lib${CPP_LIB} configure time:";
