Description
A suspected NVIDIA driver bug.
I'm using Python's subprocess library to run the following command once per second:
['nvidia-smi', '--query-gpu=memory.used', '--format=csv,noheader,nounits']
Full script:
"""Poll nvidia-smi once per second and append GPU utilization/memory to a log file."""
import subprocess
import time

# Destination log file; the "logs/" directory must already exist,
# otherwise open(..., "a") raises FileNotFoundError.
LOG_FILE_PATH = "logs/gpu_log.log"


def query_gpu(field):
    """Return one nvidia-smi --query-gpu field as a stripped string.

    field: a query name such as "utilization.gpu" or "memory.used".
    Raises subprocess.CalledProcessError if nvidia-smi exits non-zero
    (check=True), and FileNotFoundError if nvidia-smi is not on PATH.
    """
    result = subprocess.run(
        ["nvidia-smi", f"--query-gpu={field}", "--format=csv,noheader,nounits"],
        capture_output=True, text=True, check=True)
    return result.stdout.strip()


def log_gpu_stats():
    """Take one sample and append it to the log file (also echoed to stdout)."""
    gpu_utilization = query_gpu("utilization.gpu")
    gpu_memory_used = query_gpu("memory.used")
    current_time = time.strftime("%Y-%m-%d %H:%M:%S")
    log_entry = f"{current_time}: GPU Utilization: {gpu_utilization}%, GPU Memory Used: {gpu_memory_used} MiB\n"
    with open(LOG_FILE_PATH, "a", encoding="utf-8") as log_file:
        log_file.write(log_entry)
    print(log_entry.strip())


if __name__ == "__main__":
    # The description says the command runs once per second; loop forever
    # with a 1 s pause between samples (Ctrl-C to stop).
    while True:
        log_gpu_stats()
        time.sleep(1)
After running model inference for an extended period of time, an unexpected problem occurred and calls to the graphics card began to fail.
In the meantime, the python model inference script errors out:
2025-04-09 08:58:51.810 | ERROR | pkg.analyze.model:get_screen_status:179 - 图标状态判别模型出错: CUDA error: unknown error
Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.
2025-04-09 08:58:52.151 | ERROR | pkg.analyze.model:get_screen_status:180 - Traceback: Traceback (most recent call last):
File “ultralytics\engine\predictor.py”, line 259, in stream_inference
preds = self.inference(im, *args, **kwargs)
File “ultralytics\engine\predictor.py”, line 143, in inference
return self.model(im, augment=self.args.augment, visualize=visualize, embed=self.args.embed, *args, **kwargs)
File “torch\nn\modules\module.py”, line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File “torch\nn\modules\module.py”, line 1541, in _call_impl
return forward_call(*args, **kwargs)
File “ultralytics\nn\autobackend.py”, line 524, in forward
y = self.model(im, augment=augment, visualize=visualize, embed=embed)
File “torch\nn\modules\module.py”, line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File “torch\nn\modules\module.py”, line 1541, in _call_impl
return forward_call(*args, **kwargs)
File “ultralytics\nn\tasks.py”, line 114, in forward
return self.predict(x, *args, **kwargs)
File “ultralytics\nn\tasks.py”, line 132, in predict
return self._predict_once(x, profile, visualize, embed)
File “ultralytics\nn\tasks.py”, line 153, in _predict_once
x = m(x) # run
File “torch\nn\modules\module.py”, line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File “torch\nn\modules\module.py”, line 1541, in _call_impl
return forward_call(*args, **kwargs)
File “ultralytics\nn\modules\block.py”, line 239, in forward
y.extend(m(y[-1]) for m in self.m)
File "ultralytics\nn\modules\block.py", line 239, in <genexpr>
y.extend(m(y[-1]) for m in self.m)
File “torch\nn\modules\module.py”, line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File “torch\nn\modules\module.py”, line 1541, in _call_impl
return forward_call(*args, **kwargs)
File “ultralytics\nn\modules\block.py”, line 348, in forward
return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x))
File “torch\nn\modules\module.py”, line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File “torch\nn\modules\module.py”, line 1541, in _call_impl
return forward_call(*args, **kwargs)
File “ultralytics\nn\modules\conv.py”, line 55, in forward_fuse
return self.act(self.conv(x))
File “torch\nn\modules\module.py”, line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File “torch\nn\modules\module.py”, line 1541, in _call_impl
return forward_call(*args, **kwargs)
File “torch\nn\modules\conv.py”, line 460, in forward
return self._conv_forward(input, self.weight, self.bias)
File “torch\nn\modules\conv.py”, line 456, in _conv_forward
return F.conv2d(input, weight, bias, self.stride,
RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File “pkg\analyze\model.py”, line 175, in get_screen_status
File "ultralytics\engine\model.py", line 180, in __call__
return self.predict(source, stream, **kwargs)
File “ultralytics\engine\model.py”, line 558, in predict
return self.predictor.predict_cli(source=source) if is_cli else self.predictor(source=source, stream=stream)
File "ultralytics\engine\predictor.py", line 173, in __call__
return list(self.stream_inference(source, model, *args, **kwargs)) # merge list of Result into one
File "torch\utils\_contextlib.py", line 35, in generator_context
response = gen.send(None)
File “ultralytics\engine\predictor.py”, line 258, in stream_inference
with profilers[1]:
File "ultralytics\utils\ops.py", line 51, in __exit__
self.dt = self.time() - self.start # delta-time
File “ultralytics\utils\ops.py”, line 61, in time
torch.cuda.synchronize(self.device)
File "torch\cuda\__init__.py", line 792, in synchronize
return torch._C._cuda_synchronize()
RuntimeError: CUDA error: unknown error
Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.
Running environment:
Windows 11
Nvidia Driver Version 572.80
CUDA Toolkit 12.6