Use the sccache-dist build cluster in CI #479

Draft: wants to merge 26 commits into base branch-25.06.

Commits (26):
631d3f5  Add scripts to configure sccache-dist and use the build cluster in CI (trxcllnt, Apr 12, 2025)
b87e1e7  Merge branch 'branch-25.06' of github.com:rapidsai/devcontainers into… (trxcllnt, Apr 14, 2025)
a72407d  Merge branch 'branch-25.06' into fea/use-sccache-build-cluster (trxcllnt, Apr 16, 2025)
3932245  remove -DCMAKE_POLICY_VERSION_MINIMUM=3.5 (trxcllnt, Apr 16, 2025)
1de075e  temporarily use my cuspatial fork (trxcllnt, Apr 16, 2025)
36ad779  remove cub and thrust namespace mangling flags (trxcllnt, Apr 17, 2025)
8b60e23  cpu8 -> cpu4 (trxcllnt, Apr 17, 2025)
e07b0dd  test shared-workflows fea/devcontainers-job-timeout changes (trxcllnt, Apr 17, 2025)
5c138aa  rename SCCACHE_DIST_SCHEDULER_URL to SCCACHE_DIST_URL (trxcllnt, Apr 17, 2025)
ace1e27  use cpu8 again, clone with --depth 1 (trxcllnt, Apr 17, 2025)
5978e0d  cleanup (trxcllnt, Apr 21, 2025)
7f8b878  Merge branch 'branch-25.06' of github.com:rapidsai/devcontainers into… (trxcllnt, Apr 22, 2025)
fef88ec  Merge branch 'branch-25.06' of github.com:rapidsai/devcontainers into… (trxcllnt, Apr 22, 2025)
cc2a032  remove -l from ninja args and bump feature versions (trxcllnt, Apr 22, 2025)
622a5af  Merge branch 'branch-25.06' of github.com:rapidsai/devcontainers into… (trxcllnt, Apr 23, 2025)
31436cf  switch back to [email protected] (trxcllnt, Apr 30, 2025)
b126fd1  revert cuspatial chnage in manifest.yaml (trxcllnt, May 1, 2025)
3cf4f74  pass -q option to `git branch --remotes -d` (trxcllnt, May 1, 2025)
c86444a  print messages before doing stuff (trxcllnt, May 2, 2025)
0ab02c4  Merge branch 'branch-25.06' of github.com:rapidsai/devcontainers into… (trxcllnt, May 2, 2025)
4f767f4  ensure the gh cli user is refreshed if the build cluster is enabled b… (trxcllnt, May 2, 2025)
7933911  cleanup (trxcllnt, May 2, 2025)
9285ac5  log output (trxcllnt, May 3, 2025)
471307c  be safer when constructing command args (trxcllnt, May 3, 2025)
92d25ad  don't background rapids-generate-scripts (trxcllnt, May 3, 2025)
a34964f  fix typo (trxcllnt, May 3, 2025)
4 changes: 3 additions & 1 deletion .devcontainer/cuda11.8-conda/devcontainer.json
@@ -11,7 +11,9 @@
"runArgs": [
"--rm",
"--name",
"${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.06-cuda11.8-conda"
"${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.06-cuda11.8-conda",
"--ulimit",
"nofile=1048576:1048576"
],
"hostRequirements": {"gpu": "optional"},
"features": {
4 changes: 3 additions & 1 deletion .devcontainer/cuda11.8-pip/devcontainer.json
@@ -11,7 +11,9 @@
"runArgs": [
"--rm",
"--name",
"${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.06-cuda11.8-pip"
"${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.06-cuda11.8-pip",
"--ulimit",
"nofile=1048576:1048576"
],
"hostRequirements": {"gpu": "optional"},
"features": {
4 changes: 3 additions & 1 deletion .devcontainer/cuda12.8-conda/devcontainer.json
@@ -11,7 +11,9 @@
"runArgs": [
"--rm",
"--name",
"${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.06-cuda12.8-conda"
"${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.06-cuda12.8-conda",
"--ulimit",
"nofile=1048576:1048576"
],
"hostRequirements": {"gpu": "optional"},
"features": {
4 changes: 3 additions & 1 deletion .devcontainer/cuda12.8-pip/devcontainer.json
@@ -11,7 +11,9 @@
"runArgs": [
"--rm",
"--name",
"${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.06-cuda12.8-pip"
"${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.06-cuda12.8-pip",
"--ulimit",
"nofile=1048576:1048576"
],
"hostRequirements": {"gpu": "optional"},
"features": {
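The four devcontainer.json changes above all raise the container's open-file limit; very wide ninja invocations can otherwise fail with "Too many open files" (see the ninja note in the workflow below). A minimal sketch of the equivalent manual launch; the image tag here is illustrative, not the project's:

# --ulimit nofile takes soft:hard limits
docker run --rm --ulimit nofile=1048576:1048576 ubuntu:24.04 bash -c 'ulimit -Sn && ulimit -Hn'
# expected output: 1048576 on both lines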
16 changes: 15 additions & 1 deletion .devcontainer/rapids.Dockerfile
@@ -36,6 +36,8 @@ ENV DEFAULT_CONDA_ENV=rapids

FROM ${PYTHON_PACKAGE_MANAGER}-base

ARG TARGETARCH

ARG CUDA
ENV CUDAARCHS="RAPIDS"
ENV CUDA_VERSION="${CUDA_VERSION:-${CUDA}}"
@@ -49,9 +51,21 @@ ENV PYTHONDONTWRITEBYTECODE="1"

ENV SCCACHE_REGION="us-east-2"
ENV SCCACHE_BUCKET="rapids-sccache-devs"
ENV SCCACHE_IDLE_TIMEOUT=900
ENV SCCACHE_DIST_CONNECT_TIMEOUT=30
ENV SCCACHE_DIST_REQUEST_TIMEOUT=1800
ENV SCCACHE_DIST_URL="https://${TARGETARCH}.linux.sccache.gha-runners.nvidia.com"
ENV SCCACHE_IDLE_TIMEOUT=1800
ENV AWS_ROLE_ARN="arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs"

ENV HISTFILE="/home/coder/.cache/._bash_history"

ENV LIBCUDF_KERNEL_CACHE_PATH="/home/coder/cudf/cpp/build/${PYTHON_PACKAGE_MANAGER}/cuda-${CUDA_VERSION}/latest/jitify_cache"

# Prevent the sccache server from shutting down
ENV SCCACHE_IDLE_TIMEOUT=0
ENV SCCACHE_SERVER_LOG="sccache=info"
ENV SCCACHE_S3_KEY_PREFIX=rapids-test-sccache-dist

# Build as much in parallel as possible
ENV INFER_NUM_DEVICE_ARCHITECTURES=1
ENV MAX_DEVICE_OBJ_TO_COMPILE_IN_PARALLEL=20
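Together these variables point the sccache client at a per-architecture scheduler (SCCACHE_DIST_URL interpolates TARGETARCH, so amd64 and arm64 builds hit different endpoints), keep the local server alive (SCCACHE_IDLE_TIMEOUT=0 overrides the earlier 1800), and namespace this experiment's cache objects (SCCACHE_S3_KEY_PREFIX). A hedged way to confirm the client picked them up inside the container; these flags exist in recent sccache releases, but output formats may differ:

sccache --stop-server 2>/dev/null || true;  # restart so the server re-reads the environment
sccache --start-server;
sccache --dist-status;       # prints scheduler status as JSON when dist is configured
sccache --show-adv-stats;    # shows the cache location and distributed-compile counters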
2 changes: 2 additions & 0 deletions .github/actions/build-and-test-feature/action.yml
@@ -8,6 +8,7 @@ inputs:
aws_role_arn: {type: string, default: '', required: false}
rw_sccache_bucket: {type: string, default: '', required: false}
rw_sccache_region: {type: string, default: '', required: false}
sccache_dist_scheduler_url: {type: string, default: '', required: false}

runs:
using: composite
@@ -32,3 +33,4 @@ runs:
aws_role_arn: "${{ inputs.aws_role_arn }}"
rw_sccache_bucket: "${{ inputs.rw_sccache_bucket }}"
rw_sccache_region: "${{ inputs.rw_sccache_region }}"
sccache_dist_scheduler_url: "${{ inputs.sccache_dist_scheduler_url }}"
60 changes: 45 additions & 15 deletions .github/workflows/build-all-rapids-repos.yml
@@ -37,25 +37,55 @@ jobs:
with:
arch: '["amd64", "arm64"]'
cuda: '["12.8"]'
node_type: cpu32
node_type: cpu8
extra-repo-deploy-key: CUMLPRIMS_SSH_PRIVATE_DEPLOY_KEY
rapids-aux-secret-1: GIST_REPO_READ_ORG_GITHUB_TOKEN
timeout-minutes: 720
# Prevent the sccache server from shutting down
# 1. Prevent the sccache server from shutting down
# 2. Infinitely retry transient errors
# 3. Never fallback to locally compiling
env: |
SCCACHE_IDLE_TIMEOUT=0
SCCACHE_DIST_MAX_RETRIES=inf
SCCACHE_DIST_FALLBACK_TO_LOCAL_COMPILE=false
build_command: |
SCCACHE_NO_DAEMON=1 sccache --stop-server
sccache -z;
# Install the latest sccache client
devcontainer-utils-install-sccache --repo trxcllnt/sccache;

# Configure the sccache client to talk to the build cluster
devcontainer-utils-init-sccache-dist \
--enable-sccache-dist - <<< " \
--auth-type 'token' \
--auth-token '$RAPIDS_AUX_SECRET_1' \
";

# Verify sccache cache location
sccache --show-adv-stats;
clone-all -j$(nproc) -v -q --clone-upstream --single-branch --shallow-submodules;
build-all \
-Wno-dev \
-j$(nproc --ignore=1) \
-DBUILD_SHARED_LIBS=ON \
-DBUILD_TESTS=ON \
-DBUILD_BENCHMARKS=ON \
-DBUILD_PRIMS_BENCH=ON \
-DRAFT_COMPILE_LIBRARY=ON \
-DBUILD_CUGRAPH_MG_TESTS=ON \
;

# Clone all the repos
clone-all -j$(nproc) -v -q --clone-upstream --depth 1 --single-branch --shallow-submodules --no-update-env;

# Create the python env without ninja.
# ninja -j10000 fails with `ninja: FATAL: pipe: Too many open files`.
# This appears to have been fixed 13 years ago (https://github.com/ninja-build/ninja/issues/233),
# so that fix needs to be integrated into the kitware pip ninja builds.
rapids-post-start-command --exclude <(echo ninja);

set -x;

# Configure all the C++ libs
configure-all \
-j$(ulimit -Hn) \
-Wno-dev \
-DBUILD_TESTS=ON \
-DBUILD_BENCHMARKS=ON \
-DBUILD_PRIMS_BENCH=ON \
-DBUILD_SHARED_LIBS=ON \
-DRAFT_COMPILE_LIBRARY=ON \
-DBUILD_CUGRAPH_MG_TESTS=ON ;

# Build all the libs
build-all -j$(ulimit -Hn);

# Print cache and dist stats
sccache --show-adv-stats;
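Because SCCACHE_DIST_FALLBACK_TO_LOCAL_COMPILE=false, any compile the cluster cannot take fails the job instead of silently building locally, which is what makes cluster regressions visible in CI. A hedged local smoke test of the same flow, assuming a reachable scheduler and an auth token in $TOKEN; the helper names are the ones used above:

devcontainer-utils-install-sccache --repo trxcllnt/sccache;
devcontainer-utils-init-sccache-dist \
  --enable-sccache-dist - <<< " \
  --auth-type 'token' \
  --auth-token '$TOKEN' \
";
echo 'int main() { return 0; }' > /tmp/smoke.cpp;
SCCACHE_DIST_FALLBACK_TO_LOCAL_COMPILE=false sccache g++ -c /tmp/smoke.cpp -o /tmp/smoke.o;
sccache --show-adv-stats;   # non-zero distributed counters confirm the compile ran remotely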
1 change: 1 addition & 0 deletions .github/workflows/build-and-test-feature.yml
@@ -40,3 +40,4 @@ jobs:
aws_role_arn: "${{ secrets.GIST_REPO_READ_ORG_GITHUB_TOKEN && 'arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs' || '' }}"
rw_sccache_bucket: "${{ secrets.GIST_REPO_READ_ORG_GITHUB_TOKEN && 'rapids-sccache-devs' || '' }}"
rw_sccache_region: "${{ vars.AWS_REGION }}"
sccache_dist_scheduler_url: "sccache.gha-runners.nvidia.com"
2 changes: 1 addition & 1 deletion features/src/openmpi/devcontainer-feature.json
@@ -1,7 +1,7 @@
{
"name": "OpenMPI",
"id": "openmpi",
"version": "25.6.2",
"version": "25.6.3",
"description": "A feature to install OpenMPI with optional CUDA and UCX support",
"options": {
"version": {
2 changes: 1 addition & 1 deletion features/src/openmpi/install.sh
@@ -82,7 +82,7 @@ build_and_install_openmpi() {
local -a cuda_args=();
if test "${ENABLE_CUDA:-}" = 1; then
cuda_args+=(--with-cuda="${CUDA_HOME:-/usr/local/cuda}");
cuda_args+=(--with-cuda-libdir="${CUDA_HOME:-/usr/local/cuda}/lib64/stubs}");
cuda_args+=(--with-cuda-libdir="${CUDA_HOME:-/usr/local/cuda}/lib64/stubs");
fi

IFS=" " read -r -a openmpi_dev_deps <<< "$(install_openmpi_deps)";
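The one-character fix above removes a stray closing brace that leaked into the configure flag: the old expansion produced a path ending in "}", so configure probed a directory that does not exist. A quick sketch of the difference:

CUDA_HOME=/usr/local/cuda;
echo "${CUDA_HOME:-/usr/local/cuda}/lib64/stubs}";  # old: /usr/local/cuda/lib64/stubs}
echo "${CUDA_HOME:-/usr/local/cuda}/lib64/stubs";   # new: /usr/local/cuda/lib64/stubs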
11 changes: 7 additions & 4 deletions features/src/rapids-build-utils/install.sh
@@ -96,10 +96,13 @@ done

# Install bash_completion script
if command -v devcontainer-utils-generate-bash-completion >/dev/null 2>&1; then
devcontainer-utils-generate-bash-completion \
--out-file /etc/bash_completion.d/rapids-build-utils-completions \
${commands[@]/#/--command rapids-} \
;
read -ra commands <<< "${commands[*]/#/--command rapids-}";
if test "${#commands[@]}" -gt 0; then
devcontainer-utils-generate-bash-completion \
--out-file /etc/bash_completion.d/rapids-build-utils-completions \
"${commands[@]}" \
;
fi
fi

find /opt/rapids-build-utils \
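The new guard avoids invoking the completion generator with zero --command flags when `commands` is empty, and likely also sidesteps older bash treating an empty-array expansion as unbound under `set -u` (an assumption; the PR does not state the motivation). A minimal sketch of the guarded pattern:

commands=();  # nothing discovered
read -ra commands <<< "${commands[*]/#/--command rapids-}";  # prefix each element, then re-split
if test "${#commands[@]}" -gt 0; then
  echo "would run: devcontainer-utils-generate-bash-completion ${commands[*]}";
fi
# prints nothing: the generator call is skipped entirely when the array is empty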
@@ -17,12 +17,15 @@ generate_completions() {
. devcontainer-utils-debug-output 'rapids_build_utils_debug' 'generate-scripts';

readarray -t commands < <(find "${TMP_SCRIPT_DIR}"/ -maxdepth 1 -type f -exec basename {} \;);

devcontainer-utils-generate-bash-completion \
--out-file "$(realpath -m "${COMPLETION_FILE}")" \
--template "$(realpath -m "${COMPLETION_TMPL}")" \
${commands[@]/#/--command } \
;
read -ra commands <<< "${commands[*]/#/--command }";

if test "${#commands[@]}" -gt 0; then
devcontainer-utils-generate-bash-completion \
--out-file "$(realpath -m "${COMPLETION_FILE}")" \
--template "$(realpath -m "${COMPLETION_TMPL}")" \
"${commands[@]}" \
;
fi
fi
}

@@ -63,7 +66,7 @@ generate_script() {
}

generate_all_script_impl() {
local bin="${SCRIPT}-all";
local bin="${PREFIX:-${SCRIPT}}-${SUFFIX:-all}";
if test -n "${bin:+x}" && ! test -f "${TMP_SCRIPT_DIR}/${bin}"; then
(
cat - \
@@ -378,8 +381,18 @@ generate_scripts() {
NAME="${cloned_repos[0]:-${repo_names[0]:-}}" \
NAMES="${repo_names[*]@Q}" \
SCRIPT="${script}" \
PREFIX="${script}" \
generate_all_script ;
done;
done
for kind in "cpp" "python"; do
# Generate a script to run a type of build for all repos
NAME="${cloned_repos[0]:-${repo_names[0]:-}}" \
NAMES="${repo_names[*]@Q}" \
SCRIPT="${kind}.build" \
PREFIX="build" \
SUFFIX="all-${kind}" \
generate_all_script ;
done
fi
}

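With PREFIX=build and SUFFIX=all-<kind>, generate_all_script_impl now derives the script name as "${PREFIX}-${SUFFIX}", so each devcontainer gains build-all-cpp and build-all-python aggregates next to the existing build-all. A hedged usage sketch; the names follow from the variables above, and the -j flag is illustrative:

build-all-cpp -j"$(nproc)";  # run only the C++ builds across every cloned repo
build-all-python;            # run only the Python builds across every cloned repo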
@@ -4,10 +4,6 @@
# rapids-get-num-archs-jobs-and-load [OPTION]...
#
# Compute an appropriate total number of jobs, load, and CUDA archs to build in parallel.
# This routine scales the input `-j` with respect to the `-a` and `-m` values, taking into account the
# amount of available system memory (free mem + swap), in order to balance the job and arch parallelism.
#
# note: This wouldn't be necessary if `nvcc` interacted with the POSIX jobserver.
#
# Boolean options:
# -h,--help Print this text.
@@ -18,16 +14,6 @@
# -j,--parallel <num> Run <num> parallel compilation jobs.
# --max-archs <num> Build at most <num> CUDA archs in parallel.
# (default: 3)
# --max-total-system-memory <num> An upper-bound on the amount of total system memory (in GiB) to use during
# C++ and CUDA device compilations.
# Smaller values yield fewer parallel C++ and CUDA device compilations.
# (default: all available memory)
# --max-device-obj-memory-usage <num> An upper-bound on the amount of memory each CUDA device object compilation
# is expected to take. This is used to estimate the number of parallel device
# object compilations that can be launched without hitting the system memory
# limit.
# Higher values yield fewer parallel CUDA device object compilations.
# (default: 1)

# shellcheck disable=SC1091
. rapids-generate-docstring;
@@ -41,33 +27,22 @@ get_num_archs_jobs_and_load() {
# shellcheck disable=SC1091
. devcontainer-utils-debug-output 'rapids_build_utils_debug' 'get-num-archs-jobs-and-load';

# The return value of nproc is (who knew!) constrained by the
# values of OMP_NUM_THREADS and/or OMP_THREAD_LIMIT
# Since we want the physical number of processors here, pass --all
local -r n_cpus="$(nproc --all)";
# nproc --all returns 2x the number of threads in Ubuntu 24.04+,
# so instead we count the number of processors in /proc/cpuinfo
local -r n_cpus="$(grep -cP 'processor\s+:' /proc/cpuinfo)";

if test ${#j[@]} -gt 0 && ! test -n "${j:+x}"; then
j="${n_cpus}";
fi

parallel="${j:-${JOBS:-${PARALLEL_LEVEL:-1}}}";
max_archs="${max_archs:-${MAX_DEVICE_OBJ_TO_COMPILE_IN_PARALLEL:-${arch:-}}}";
max_device_obj_memory_usage="${max_device_obj_memory_usage:-${MAX_DEVICE_OBJ_MEMORY_USAGE:-1Gi}}";

local num_re="^[0-9]+$";

# Assume un-suffixed inputs means gibibytes
if [[ "${max_device_obj_memory_usage}" =~ ${num_re} ]]; then
max_device_obj_memory_usage="${max_device_obj_memory_usage}Gi";
fi

max_device_obj_memory_usage="$(numfmt --from=auto "${max_device_obj_memory_usage}")";

local n_arch="${archs:-1}";

# currently: 70-real;75-real;80-real;86-real;90
# see: https://github.com/rapidsai/rapids-cmake/blob/branch-24.04/rapids-cmake/cuda/set_architectures.cmake#L54
local n_arch_rapids=5;
# currently: 70-real;75-real;80-real;86-real;90-real;100-real;120
# see: https://github.com/rapidsai/rapids-cmake/blob/branch-25.04/rapids-cmake/cuda/set_architectures.cmake#L59
local n_arch_rapids=7;

if ! test -n "${archs:+x}" && test -n "${INFER_NUM_DEVICE_ARCHITECTURES:+x}"; then
archs="$(rapids-select-cmake-define CMAKE_CUDA_ARCHITECTURES "${OPTS[@]}" || echo)";
@@ -101,31 +76,8 @@
n_arch=$((n_arch > max_archs ? max_archs : n_arch));
fi

local mem_for_device_objs="$((n_arch * max_device_obj_memory_usage))";
local mem_total="${max_total_system_memory:-${MAX_TOTAL_SYSTEM_MEMORY:-}}";

if ! test -n "${mem_total:+x}"; then
local -r free_mem="$(free --bytes | grep -E '^Mem:' | tr -s '[:space:]' | cut -d' ' -f7 || echo '0')";
local -r freeswap="$(free --bytes | grep -E '^Swap:' | tr -s '[:space:]' | cut -d' ' -f4 || echo '0')";
mem_total="$((free_mem + freeswap))";
# Assume un-suffixed inputs means gibibytes
elif [[ "${mem_total}" =~ ${num_re} ]]; then
mem_total="${mem_total}Gi";
fi
mem_total="$(numfmt --from=auto "${mem_total}")";

local n_load=$((parallel > n_cpus ? n_cpus : parallel));
# shellcheck disable=SC2155
local n_jobs="$(
echo "
scale=0
max_cpu=(${n_load} / ${n_arch} / 2 * 3)
max_mem=(${mem_total} / ${mem_for_device_objs})
if(max_cpu < max_mem) max_cpu else max_mem
" | bc
)"
n_jobs=$((n_jobs < 1 ? 1 : n_jobs));
n_jobs=$((n_arch > 1 ? n_jobs : n_load));
local n_jobs="$((parallel < 1 ? 1 : parallel))";

echo "declare n_arch=${n_arch}";
echo "declare n_jobs=${n_jobs}";
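n_jobs is now simply the requested parallelism floored at one: with distributed compilation, remote workers rather than local memory bound the job count, so the old free-memory heuristic (and its bc arithmetic) is gone. Since the function emits `declare` statements, a consumer is presumably expected to eval its output; a sketch of that assumed calling convention, inferred from the echoes above:

eval "$(rapids-get-num-archs-jobs-and-load -j "$(nproc)" --max-archs 3)";
echo "building ${n_arch} arch(s) with ${n_jobs} parallel jobs";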
@@ -44,7 +44,6 @@ cpp_lib_dirs() {

local j=0;
for ((j=0; j < ${!cpp_length:-0}; j+=1)); do
# local cpp_name="${repo}_cpp_${j}_name";
local cpp_sub_dir="${repo}_cpp_${j}_sub_dir";
echo ~/"${!repo_path:-}/${!cpp_sub_dir:-}";
done
@@ -1,11 +1,12 @@
#!/usr/bin/env bash

if ! test -n "${SKIP_RAPIDS_BUILD_UTILS_POST_START_COMMAND:+x}"; then
rapids-generate-scripts;
rapids-update-build-dir-links -j;
rapids-make-vscode-workspace --update;
rapids-merge-compile-commands-json > ~/compile_commands.json;
rapids-generate-scripts;
rapids-update-build-dir-links -j &
rapids-make-vscode-workspace --update &
rapids-merge-compile-commands-json > ~/compile_commands.json &
if test -n "${PYTHON_PACKAGE_MANAGER:+x}"; then
rapids-make-"${PYTHON_PACKAGE_MANAGER}"-env "$@" || true;
fi
wait
fi
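Backgrounding the three generators and joining with a final `wait` runs them concurrently; `rapids-generate-scripts` stays in the foreground, presumably because the later steps consume the scripts it writes. The shape in miniature, with hypothetical task names:

produce_inputs;          # must complete before anything that reads its output
consume_a & consume_b &  # independent steps run concurrently
wait;                    # join all background jobs before the post-start hook returns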