Skip to content

Cluster fails to start due to NVMLError_NoPermission when accessing GPU information #9093

@alifyasa

Description

@alifyasa

Describe the issue:

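The cluster fails to start because `SystemMonitor.__init__` raises `NVMLError_NoPermission` when it queries GPU memory information through `pynvml.nvmlDeviceGetMemoryInfo` (called from `distributed/diagnostics/nvml.py`).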

Example (reproduction code, immediately followed by the resulting traceback):

from dask.distributed import LocalCluster, Client

n_workers = 4
cluster = LocalCluster(
    n_workers=n_workers,
    memory_limit='10GB',
    threads_per_worker=4,
    death_timeout=600
)
NVMLError_NoPermission                    Traceback (most recent call last)
File ~/Devito-fwi/ta/lib/python3.11/site-packages/distributed/deploy/spec.py:330, in SpecCluster._start(self)
    329     cls = import_term(cls)
--> 330 self.scheduler = cls(**self.scheduler_spec.get("options", {}))
    331 self.scheduler = await self.scheduler

File ~/Devito-fwi/ta/lib/python3.11/site-packages/distributed/scheduler.py:4070, in Scheduler.__init__(self, loop, services, service_kwargs, allowed_failures, extensions, validate, scheduler_file, security, worker_ttl, idle_timeout, interface, host, port, protocol, dashboard_address, dashboard, http_prefix, preload, preload_argv, plugins, contact_address, transition_counter_max, jupyter, **kwargs)
   4056 SchedulerState.__init__(
   4057     self,
   4058     aliases=aliases,
   (...)   4068     transition_counter_max=transition_counter_max,
   4069 )
-> 4070 ServerNode.__init__(
   4071     self,
   4072     handlers=self.handlers,
   4073     stream_handlers=merge(worker_handlers, client_handlers),
   4074     connection_limit=connection_limit,
   4075     deserialize=False,
   4076     connection_args=self.connection_args,
   4077     **kwargs,
   4078 )
   4080 if self.worker_ttl:

File ~/Devito-fwi/ta/lib/python3.11/site-packages/distributed/core.py:307, in Server.__init__(self, handlers, blocked_handlers, stream_handlers, connection_limit, deserialize, serializers, deserializers, connection_args, timeout, io_loop, local_directory, needs_workdir)
    306 self.deserialize = deserialize
--> 307 self.monitor = SystemMonitor()
    308 self._ongoing_background_tasks = AsyncTaskGroup()

File ~/Devito-fwi/ta/lib/python3.11/site-packages/distributed/system_monitor.py:129, in SystemMonitor.__init__(self, maxlen, monitor_disk_io, monitor_host_cpu, monitor_gil_contention)
    128 if nvml.device_get_count() > 0:
--> 129     gpu_extra = nvml.one_time()
    130     self.gpu_name = gpu_extra["name"]

File ~/Devito-fwi/ta/lib/python3.11/site-packages/distributed/diagnostics/nvml.py:378, in one_time()
    376 h = _pynvml_handles()
    377 return {
--> 378     "memory-total": _get_memory_total(h),
    379     "name": _get_name(h),
    380 }

File ~/Devito-fwi/ta/lib/python3.11/site-packages/distributed/diagnostics/nvml.py:353, in _get_memory_total(h)
    352 try:
--> 353     return pynvml.nvmlDeviceGetMemoryInfo(h).total
    354 except pynvml.NVMLError_NotSupported:

File ~/Devito-fwi/ta/lib/python3.11/site-packages/pynvml.py:3522, in nvmlDeviceGetMemoryInfo(handle, version)
   3521 ret = fn(handle, byref(c_memory))
-> 3522 _nvmlCheckReturn(ret)
   3523 return c_memory

File ~/Devito-fwi/ta/lib/python3.11/site-packages/pynvml.py:1059, in _nvmlCheckReturn(ret)
   1058 if (ret != NVML_SUCCESS):
-> 1059     raise NVMLError(ret)
   1060 return ret

NVMLError_NoPermission: Insufficient Permissions

Anything else we need to know?:

I suggest catching `NVMLError_NoPermission` in `_get_memory_total` (in `distributed/diagnostics/nvml.py`), alongside the existing `NVMLError_NotSupported` handler, following the same pattern that was added in #5343.

Environment:

  1. Dask Version: 2025.5.0
  2. Python Version: 3.11.6
  3. Operating System: Linux 5.15.0-117-generic #127-Ubuntu SMP Fri Jul 5 20:13:28 UTC 2024 x86_64 x86_64 x86_64 GNU/Linux
  4. Install method: pip

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions