Skip to content

Commit f847607

Browse files
authored
compute estimated jobs to avoid OOM (vllm-project#219)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
1 parent 377e7eb commit f847607

1 file changed

Lines changed: 25 additions & 2 deletions

File tree

setup.py

Lines changed: 25 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -97,12 +97,35 @@ def compute_num_jobs(self):
9797
num_jobs = int(num_jobs)
9898
logger.info("Using MAX_JOBS=%d as the number of jobs.", num_jobs)
9999
else:
100+
# Estimate the number of jobs. Each compile process may take ~8GB
101+
# of memory, so we limit jobs to avoid OOM on memory-constrained
102+
# machines.
103+
import psutil
104+
mem_bytes = psutil.virtual_memory().total
105+
100106
try:
101107
# os.sched_getaffinity() isn't universally available, so fall
102108
# back to os.cpu_count() if we get an error here.
103-
num_jobs = len(os.sched_getaffinity(0))
109+
cpu_jobs = len(os.sched_getaffinity(0))
104110
except AttributeError:
105-
num_jobs = os.cpu_count()
111+
cpu_jobs = os.cpu_count() or 1
112+
113+
if mem_bytes is not None:
114+
# Assume each compile process may require ~8GB.
115+
mem_jobs = max(1, mem_bytes // (8 * 1024**3))
116+
num_jobs = max(1, min(cpu_jobs, int(mem_jobs)))
117+
logger.info(
118+
"Auto-detected: cpu core: %d, memory_limit: %d, using: %d",
119+
cpu_jobs,
120+
mem_jobs,
121+
num_jobs,
122+
)
123+
else:
124+
num_jobs = max(1, cpu_jobs)
125+
logger.info(
126+
"Could not determine system memory. Using cpu core: %d",
127+
num_jobs,
128+
)
106129

107130
get_oneapi_version()
108131

0 commit comments

Comments
 (0)