4
4
# rapids-get-num-archs-jobs-and-load [OPTION]...
5
5
#
6
6
# Compute an appropriate total number of jobs, load, and CUDA archs to build in parallel.
7
- # This routine scales the input `-j` with respect to the `-a` and `-m` values, taking into account the
8
- # amount of available system memory (free mem + swap), in order to balance the job and arch parallelism.
9
- #
10
- # note: This wouldn't be necessary if `nvcc` interacted with the POSIX jobserver.
11
7
#
12
8
# Boolean options:
13
9
# -h,--help Print this text.
18
14
# -j,--parallel <num> Run <num> parallel compilation jobs.
19
15
# --max-archs <num> Build at most <num> CUDA archs in parallel.
20
16
# (default: 3)
21
- # --max-total-system-memory <num> An upper-bound on the amount of total system memory (in GiB) to use during
22
- # C++ and CUDA device compilations.
23
- # Smaller values yield fewer parallel C++ and CUDA device compilations.
24
- # (default: all available memory)
25
- # --max-device-obj-memory-usage <num> An upper-bound on the amount of memory each CUDA device object compilation
26
- # is expected to take. This is used to estimate the number of parallel device
27
- # object compilations that can be launched without hitting the system memory
28
- # limit.
29
- # Higher values yield fewer parallel CUDA device object compilations.
30
- # (default: 1)
31
17
32
18
# shellcheck disable=SC1091
33
19
. rapids-generate-docstring;
@@ -41,33 +27,22 @@ get_num_archs_jobs_and_load() {
41
27
# shellcheck disable=SC1091
42
28
. devcontainer-utils-debug-output ' rapids_build_utils_debug' ' get-num-archs-jobs-and-load' ;
43
29
44
- # The return value of nproc is (who knew!) constrained by the
45
- # values of OMP_NUM_THREADS and/or OMP_THREAD_LIMIT
46
- # Since we want the physical number of processors here, pass --all
47
- local -r n_cpus=" $( nproc --all) " ;
30
+ # nproc --all returns 2x the number of physical cores in Ubuntu24.04+,
31
+ # so instead we count the number of processors in /proc/cpuinfo
32
+ local -r n_cpus=" $( grep -cP ' processor\s+:' /proc/cpuinfo) " ;
48
33
49
34
if test ${# j[@]} -gt 0 && test -z " ${j:- } " ; then
50
35
j=" ${n_cpus} " ;
51
36
fi
52
37
53
38
parallel=" ${j:- ${JOBS:- ${PARALLEL_LEVEL:- 1} } } " ;
54
39
max_archs=" ${max_archs:- ${MAX_DEVICE_OBJ_TO_COMPILE_IN_PARALLEL:- ${arch:- } } } " ;
55
- max_device_obj_memory_usage=" ${max_device_obj_memory_usage:- ${MAX_DEVICE_OBJ_MEMORY_USAGE:- 1Gi} } " ;
56
-
57
- local num_re=" ^[0-9]+$" ;
58
-
59
- # Assume un-suffixed inputs means gibibytes
60
- if [[ " ${max_device_obj_memory_usage} " =~ ${num_re} ]]; then
61
- max_device_obj_memory_usage=" ${max_device_obj_memory_usage} Gi" ;
62
- fi
63
-
64
- max_device_obj_memory_usage=" $( numfmt --from=auto " ${max_device_obj_memory_usage} " ) " ;
65
40
66
41
local n_arch=" ${archs:- 1} " ;
67
42
68
- # currently: 70-real;75-real;80-real;86-real;90
69
- # see: https://github.com/rapidsai/rapids-cmake/blob/branch-24.04/rapids-cmake/cuda/set_architectures.cmake#L54
70
- local n_arch_rapids=5 ;
43
+ # currently: 70-real;75-real;80-real;86-real;90-real;100-real;120
44
+ # see: https://github.com/rapidsai/rapids-cmake/blob/branch-25.04/rapids-cmake/cuda/set_architectures.cmake#L59
45
+ local n_arch_rapids=7 ;
71
46
72
47
if test -z " ${archs:- } " \
73
48
&& test -n " ${INFER_NUM_DEVICE_ARCHITECTURES:- } " ; then
@@ -102,31 +77,8 @@ get_num_archs_jobs_and_load() {
102
77
n_arch=$(( n_arch > max_archs ? max_archs : n_arch)) ;
103
78
fi
104
79
105
- local mem_for_device_objs=" $(( n_arch * max_device_obj_memory_usage)) " ;
106
- local mem_total=" ${max_total_system_memory:- ${MAX_TOTAL_SYSTEM_MEMORY:- } } " ;
107
-
108
- if test -z " ${mem_total} " ; then
109
- local -r free_mem=" $( free --bytes | grep -E ' ^Mem:' | tr -s ' [:space:]' | cut -d' ' -f7 || echo ' 0' ) " ;
110
- local -r freeswap=" $( free --bytes | grep -E ' ^Swap:' | tr -s ' [:space:]' | cut -d' ' -f4 || echo ' 0' ) " ;
111
- mem_total=" $(( free_mem + freeswap)) " ;
112
- # Assume un-suffixed inputs means gibibytes
113
- elif [[ " ${mem_total} " =~ ${num_re} ]]; then
114
- mem_total=" ${mem_total} Gi" ;
115
- fi
116
- mem_total=" $( numfmt --from=auto " ${mem_total} " ) " ;
117
-
118
80
local n_load=$(( parallel > n_cpus ? n_cpus : parallel)) ;
119
- # shellcheck disable=SC2155
120
- local n_jobs=" $(
121
- echo "
122
- scale=0
123
- max_cpu=(${n_load} / ${n_arch} / 2 * 3)
124
- max_mem=(${mem_total} / ${mem_for_device_objs} )
125
- if(max_cpu < max_mem) max_cpu else max_mem
126
- " | bc
127
- ) "
128
- n_jobs=$(( n_jobs < 1 ? 1 : n_jobs)) ;
129
- n_jobs=$(( n_arch > 1 ? n_jobs : n_load)) ;
81
+ local n_jobs=" $(( parallel < 1 ? 1 : parallel)) " ;
130
82
131
83
echo " declare n_arch=${n_arch} " ;
132
84
echo " declare n_jobs=${n_jobs} " ;
0 commit comments