Skip to content

Commit f58351e

Browse files
author
Hoang Thanh Loi
committed
feat(mem): report real container gpu memory and enforce limits accordingly
1 parent 950c62f commit f58351e

File tree

5 files changed

+40
-11
lines changed

5 files changed

+40
-11
lines changed

src/allocator/allocator.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,9 @@ int oom_check(const int dev, size_t addon) {
4242
else
4343
d=dev;
4444
uint64_t limit = get_current_device_memory_limit(d);
45-
size_t _usage = get_gpu_memory_usage(d);
45+
// Use real NVML-reported memory usage instead of internally tracked value
46+
// This ensures OOM is triggered based on actual GPU memory consumption
47+
size_t _usage = get_gpu_memory_real_usage(d);
4648

4749
if (limit == 0) {
4850
return 0;

src/cuda/memory.c

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -493,17 +493,22 @@ CUresult cuMemGetInfo_v2(size_t* free, size_t* total) {
493493
LOG_DEBUG("cuMemGetInfo_v2");
494494
ENSURE_INITIALIZED();
495495
CHECK_DRV_API(cuCtxGetDevice(&dev));
496-
size_t usage = get_current_device_memory_usage(dev);
496+
// Use real NVML-reported memory usage for accurate free memory calculation
497+
size_t usage = get_gpu_memory_real_usage(dev);
497498
size_t limit = get_current_device_memory_limit(dev);
498499
if (limit == 0) {
499500
CUDA_OVERRIDE_CALL(cuda_library_entry,cuMemGetInfo_v2, free, total);
500501
LOG_INFO("orig free=%ld total=%ld", *free, *total);
501-
*free = *total - usage;
502+
*free = (*total > usage) ? (*total - usage) : 0;
502503
LOG_INFO("after free=%ld total=%ld", *free, *total);
503504
return CUDA_SUCCESS;
504505
} else if (limit < usage) {
505506
LOG_WARN("limit < usage; usage=%ld, limit=%ld", usage, limit);
506-
return CUDA_ERROR_INVALID_VALUE;
507+
// Return 0 free memory instead of error when over limit
508+
CUDA_OVERRIDE_CALL(cuda_library_entry,cuMemGetInfo_v2, free, total);
509+
*free = 0;
510+
*total = limit;
511+
return CUDA_SUCCESS;
507512
} else {
508513
CUDA_OVERRIDE_CALL(cuda_library_entry,cuMemGetInfo_v2, free, total);
509514
LOG_INFO("orig free=%ld total=%ld limit=%ld usage=%ld",

src/multiprocess/multiprocess_memory_limit.c

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ int _record_kernel_interval = 1;
5252

5353
void do_init_device_memory_limits(uint64_t*, int);
5454
void exit_withlock(int exitcode);
55+
uint64_t nvml_get_device_memory_usage(const int dev);
5556

5657
void set_current_gpu_status(int status){
5758
int i;
@@ -255,6 +256,20 @@ size_t get_gpu_memory_usage(const int dev) {
255256
return total;
256257
}
257258

259+
/*
 * Return the "real" GPU memory usage for device `dev`.
 *
 * Combines two sources:
 *  - nvml_get_device_memory_usage(dev): live usage reported by NVML for
 *    processes belonging to this container (returns 0 on NVML errors);
 *  - get_gpu_memory_usage(dev): the internally tracked (accounted) usage.
 *
 * The maximum of the two is returned so that OOM checks stay conservative:
 * NVML reflects actual GPU consumption, while the tracked value covers
 * allocations NVML may not (yet) attribute to us. If NVML fails and reports
 * 0, this naturally falls back to the tracked value.
 *
 * Params:  dev - CUDA device index.
 * Returns: usage in bytes.
 */
size_t get_gpu_memory_real_usage(const int dev) {
    // Per-call tracing is LOG_DEBUG, not LOG_INFO: this runs on hot paths
    // (oom_check, cuMemGetInfo_v2, nvmlDeviceGetMemoryInfo hook) and would
    // otherwise flood the log with two lines per query.
    LOG_DEBUG("get_gpu_memory_real_usage dev=%d", dev);
    ensure_initialized();
    // Query NVML directly for real-time memory usage instead of using a
    // cached monitor value, so OOM checks see current actual GPU memory.
    size_t nvml_usage = nvml_get_device_memory_usage(dev);
    size_t tracked_usage = get_gpu_memory_usage(dev);
    // Be conservative: take whichever source reports more.
    size_t real_usage = (nvml_usage > tracked_usage) ? nvml_usage : tracked_usage;
    // %zu is the portable format specifier for size_t (C99).
    LOG_DEBUG("get_gpu_memory_real_usage dev=%d nvml_usage=%zu tracked_usage=%zu real_usage=%zu",
              dev, nvml_usage, tracked_usage, real_usage);
    return real_usage;
}
272+
258273
int set_gpu_device_memory_monitor(int32_t pid,int dev,size_t monitor){
259274
//LOG_WARN("set_gpu_device_memory_monitor:%d %d %lu",pid,dev,monitor);
260275
int i;
@@ -307,13 +322,15 @@ uint64_t nvml_get_device_memory_usage(const int dev) {
307322
ret = nvmlDeviceGetHandleByIndex(dev, &ndev);
308323
if (ret != NVML_SUCCESS) {
309324
LOG_ERROR("NVML get device %d error, %s", dev, nvmlErrorString(ret));
325+
return 0;
310326
}
311327
unsigned int pcnt = SHARED_REGION_MAX_PROCESS_NUM;
312328
nvmlProcessInfo_v1_t infos[SHARED_REGION_MAX_PROCESS_NUM];
313329
LOG_DEBUG("before nvmlDeviceGetComputeRunningProcesses");
314330
ret = nvmlDeviceGetComputeRunningProcesses(ndev, &pcnt, infos);
315331
if (ret != NVML_SUCCESS) {
316332
LOG_ERROR("NVML get process error, %s", nvmlErrorString(ret));
333+
return 0;
317334
}
318335
int i = 0;
319336
uint64_t usage = 0;
@@ -322,9 +339,13 @@ uint64_t nvml_get_device_memory_usage(const int dev) {
322339
for (; i < pcnt; i++) {
323340
int slot = 0;
324341
for (; slot < region->proc_num; slot++) {
325-
if (infos[i].pid != region->procs[slot].pid)
342+
// NVML returns host PIDs, so we need to compare with hostpid, not pid
343+
// pid is the container PID (from getpid()), hostpid is the real host PID
344+
if (infos[i].pid != region->procs[slot].hostpid)
326345
continue;
327346
usage += infos[i].usedGpuMemory;
347+
LOG_DEBUG("nvml_get_device_memory_usage: matched hostpid=%d, usedGpuMemory=%lu",
348+
infos[i].pid, infos[i].usedGpuMemory);
328349
}
329350
}
330351
unlock_shrreg();
@@ -851,8 +872,8 @@ uint64_t get_current_device_memory_usage(const int dev) {
851872
if (dev < 0 || dev >= CUDA_DEVICE_MAX_COUNT) {
852873
LOG_ERROR("Illegal device id: %d", dev);
853874
}
854-
result = get_gpu_memory_usage(dev);
855-
// result= nvml_get_device_memory_usage(dev);
875+
// Use real NVML-reported memory usage for accurate memory tracking
876+
result = get_gpu_memory_real_usage(dev);
856877
finish=clock();
857878
LOG_DEBUG("get_current_device_memory_usage:tick=%lu result=%lu\n",finish-start,result);
858879
return result;

src/multiprocess/multiprocess_memory_limit.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,7 @@ int set_host_pid(int hostpid);
133133
uint64_t get_current_device_memory_monitor(const int dev);
134134
uint64_t get_current_device_memory_usage(const int dev);
135135
size_t get_gpu_memory_usage(const int dev);
136+
size_t get_gpu_memory_real_usage(const int dev);
136137

137138
// Priority-related
138139
int get_current_priority();

src/nvml/hook.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -338,10 +338,10 @@ nvmlReturn_t _nvmlDeviceGetMemoryInfo(nvmlDevice_t device,void* memory,int versi
338338
if (cudadev < 0) {
339339
return NVML_SUCCESS;
340340
}
341+
// get_current_device_memory_usage now returns real NVML-reported usage
341342
size_t usage = get_current_device_memory_usage(cudadev);
342-
size_t monitor = get_current_device_memory_monitor(cudadev);
343343
size_t limit = get_current_device_memory_limit(cudadev);
344-
LOG_DEBUG("usage=%ld limit=%ld monitor=%ld", usage, limit, monitor);
344+
LOG_DEBUG("usage=%ld limit=%ld", usage, limit);
345345
if (limit == 0) {
346346
switch (version) {
347347
case 1:
@@ -354,12 +354,12 @@ nvmlReturn_t _nvmlDeviceGetMemoryInfo(nvmlDevice_t device,void* memory,int versi
354354
} else {
355355
switch (version) {
356356
case 1:
357-
((nvmlMemory_t*)memory)->free = (limit-usage);
357+
((nvmlMemory_t*)memory)->free = (limit > usage) ? (limit - usage) : 0;
358358
((nvmlMemory_t*)memory)->total = limit;
359359
((nvmlMemory_t*)memory)->used = usage;
360360
return NVML_SUCCESS;
361361
case 2:
362-
((nvmlMemory_v2_t *)memory)->free = (limit-usage);
362+
((nvmlMemory_v2_t *)memory)->free = (limit > usage) ? (limit - usage) : 0;
363363
((nvmlMemory_v2_t *)memory)->total = limit;
364364
((nvmlMemory_v2_t *)memory)->used = usage;
365365
return NVML_SUCCESS;

0 commit comments

Comments
 (0)