@@ -52,6 +52,7 @@ int _record_kernel_interval = 1;
5252
5353void do_init_device_memory_limits (uint64_t * , int );
5454void exit_withlock (int exitcode );
55+ uint64_t nvml_get_device_memory_usage (const int dev );
5556
5657void set_current_gpu_status (int status ){
5758 int i ;
@@ -255,6 +256,20 @@ size_t get_gpu_memory_usage(const int dev) {
255256 return total ;
256257}
257258
/**
 * Return a conservative figure for the memory currently in use on a GPU.
 *
 * Two sources are consulted on every call:
 *   - nvml_get_device_memory_usage(dev): a fresh NVML query, so the value is
 *     not a cached monitor reading;
 *   - get_gpu_memory_usage(dev): the usage this library has accounted for.
 * The larger of the two is returned so that limit/OOM checks do not
 * under-estimate usage when one source lags behind the other. The NVML
 * helper returns 0 when its NVML calls fail; in that case the tracked
 * value is what the caller gets.
 *
 * @param dev  Device index, forwarded unchanged to both helpers.
 * @return     max(NVML-reported usage, tracked usage), in the unit those
 *             helpers report (presumably bytes -- confirm against callers).
 */
size_t get_gpu_memory_real_usage(const int dev) {
    LOG_INFO("get_gpu_memory_real_usage dev=%d", dev);
    ensure_initialized();

    /* The NVML helper returns uint64_t; make the narrowing to size_t explicit. */
    const size_t nvml_usage = (size_t)nvml_get_device_memory_usage(dev);
    const size_t tracked_usage = get_gpu_memory_usage(dev);

    /* Be conservative: whichever source reports more memory in use wins. */
    const size_t real_usage =
        (nvml_usage > tracked_usage) ? nvml_usage : tracked_usage;

    /* %zu is the conversion specifier that matches size_t; %lu only happens
     * to work where size_t is unsigned long. */
    LOG_INFO("get_gpu_memory_real_usage dev=%d nvml_usage=%zu tracked_usage=%zu real_usage=%zu",
             dev, nvml_usage, tracked_usage, real_usage);
    return real_usage;
}

272+
258273int set_gpu_device_memory_monitor (int32_t pid ,int dev ,size_t monitor ){
259274 //LOG_WARN("set_gpu_device_memory_monitor:%d %d %lu",pid,dev,monitor);
260275 int i ;
@@ -307,13 +322,15 @@ uint64_t nvml_get_device_memory_usage(const int dev) {
307322 ret = nvmlDeviceGetHandleByIndex (dev , & ndev );
308323 if (ret != NVML_SUCCESS ) {
309324 LOG_ERROR ("NVML get device %d error, %s" , dev , nvmlErrorString (ret ));
325+ return 0 ;
310326 }
311327 unsigned int pcnt = SHARED_REGION_MAX_PROCESS_NUM ;
312328 nvmlProcessInfo_v1_t infos [SHARED_REGION_MAX_PROCESS_NUM ];
313329 LOG_DEBUG ("before nvmlDeviceGetComputeRunningProcesses" );
314330 ret = nvmlDeviceGetComputeRunningProcesses (ndev , & pcnt , infos );
315331 if (ret != NVML_SUCCESS ) {
316332 LOG_ERROR ("NVML get process error, %s" , nvmlErrorString (ret ));
333+ return 0 ;
317334 }
318335 int i = 0 ;
319336 uint64_t usage = 0 ;
@@ -322,9 +339,13 @@ uint64_t nvml_get_device_memory_usage(const int dev) {
322339 for (; i < pcnt ; i ++ ) {
323340 int slot = 0 ;
324341 for (; slot < region -> proc_num ; slot ++ ) {
325- if (infos [i ].pid != region -> procs [slot ].pid )
342+ // NVML returns host PIDs, so we need to compare with hostpid, not pid
343+ // pid is the container PID (from getpid()), hostpid is the real host PID
344+ if (infos [i ].pid != region -> procs [slot ].hostpid )
326345 continue ;
327346 usage += infos [i ].usedGpuMemory ;
347+ LOG_DEBUG ("nvml_get_device_memory_usage: matched hostpid=%d, usedGpuMemory=%lu" ,
348+ infos [i ].pid , infos [i ].usedGpuMemory );
328349 }
329350 }
330351 unlock_shrreg ();
@@ -851,8 +872,8 @@ uint64_t get_current_device_memory_usage(const int dev) {
851872 if (dev < 0 || dev >= CUDA_DEVICE_MAX_COUNT ) {
852873 LOG_ERROR ("Illegal device id: %d" , dev );
853874 }
854- result = get_gpu_memory_usage ( dev );
855- // result= nvml_get_device_memory_usage (dev);
875+ // Use real NVML-reported memory usage for accurate memory tracking
876+ result = get_gpu_memory_real_usage (dev );
856877 finish = clock ();
857878 LOG_DEBUG ("get_current_device_memory_usage:tick=%lu result=%lu\n" ,finish - start ,result );
858879 return result ;
0 commit comments