A `client.restart()` call failed in a test: one of the nannies timed out trying to join its worker process. Joining the worker process was added recently in #6427, but I don't think that change is causing the error. More likely, something has always been preventing worker processes from shutting down, and before that change we were simply leaking them.
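For context, the failure path in the traceback below is the nanny joining its worker process with a timeout. Here is a minimal sketch of that pattern (illustrative only; `join_child` and `exit_future` are made-up names, not distributed's API): the nanny asks the worker to exit and then waits on a future that is only resolved when the child process actually terminates, so if the child never exits, the wait raises `TimeoutError`.

```python
# Minimal sketch (not distributed's actual code) of the join-with-timeout
# pattern that the traceback goes through.
import asyncio


async def join_child(exit_future: asyncio.Future, timeout: float) -> None:
    # Shielding mirrors the pattern in the traceback: the wait can time out
    # without cancelling the underlying "process exited" future.
    await asyncio.wait_for(asyncio.shield(exit_future), timeout)


async def main() -> None:
    loop = asyncio.get_running_loop()
    # Never resolved: stands in for a worker process that refuses to shut down.
    exit_future: asyncio.Future = loop.create_future()

    try:
        await join_child(exit_future, timeout=0.5)
    except asyncio.TimeoutError:
        # Corresponds to the nanny's "Worker process still alive ... killing"
        # and "did not shut down within 20s" messages in the log below.
        print("child did not exit in time")


if __name__ == "__main__":
    asyncio.run(main())
```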
_______________ test_forgotten_futures_dont_clean_up_new_futures _______________
args = (), kwds = {}
@wraps(func)
def inner(*args, **kwds):
with self._recreate_cm():
> return func(*args, **kwds)
/usr/share/miniconda3/envs/dask-distributed/lib/python3.9/contextlib.py:79:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
/usr/share/miniconda3/envs/dask-distributed/lib/python3.9/contextlib.py:79: in inner
return func(*args, **kwds)
distributed/utils_test.py:1072: in test_func
return _run_and_close_tornado(async_fn_outer)
distributed/utils_test.py:376: in _run_and_close_tornado
return asyncio.run(inner_fn())
/usr/share/miniconda3/envs/dask-distributed/lib/python3.9/asyncio/runners.py:44: in run
return loop.run_until_complete(main)
/usr/share/miniconda3/envs/dask-distributed/lib/python3.9/asyncio/base_events.py:634: in run_until_complete
self.run_forever()
/usr/share/miniconda3/envs/dask-distributed/lib/python3.9/asyncio/base_events.py:601: in run_forever
self._run_once()
/usr/share/miniconda3/envs/dask-distributed/lib/python3.9/asyncio/base_events.py:1869: in _run_once
event_list = self._selector.select(timeout)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <selectors.EpollSelector object at 0x7f3cae960ac0>
timeout = 0.051000000000000004
def select(self, timeout=None):
if timeout is None:
timeout = -1
elif timeout <= 0:
timeout = 0
else:
# epoll_wait() has a resolution of 1 millisecond, round away
# from zero to wait *at least* timeout seconds.
timeout = math.ceil(timeout * 1e3) * 1e-3
# epoll_wait() expects `maxevents` to be greater than zero;
# we want to make sure that `select()` can be called when no
# FD is registered.
max_ev = max(len(self._fd_to_key), 1)
ready = []
try:
> fd_event_list = self._selector.poll(timeout, max_ev)
E Failed: Timeout >300.0s
/usr/share/miniconda3/envs/dask-distributed/lib/python3.9/selectors.py:469: Failed
----------------------------- Captured stdout call -----------------------------
Dumped cluster state to test_cluster_dump/test_forgotten_futures_dont_clean_up_new_futures.yaml
----------------------------- Captured stderr call -----------------------------
2022-08-05 17:47:42,354 - distributed.scheduler - INFO - State start
2022-08-05 17:47:42,357 - distributed.scheduler - INFO - Clear task state
2022-08-05 17:47:42,357 - distributed.scheduler - INFO - Scheduler at: tcp://127.0.0.1:39207
2022-08-05 17:47:42,357 - distributed.scheduler - INFO - dashboard at: 127.0.0.1:37049
2022-08-05 17:47:42,369 - distributed.nanny - INFO - Start Nanny at: 'tcp://127.0.0.1:46731'
2022-08-05 17:47:42,371 - distributed.nanny - INFO - Start Nanny at: 'tcp://127.0.0.1:45209'
2022-08-05 17:47:43,689 - distributed.worker - INFO - Start worker at: tcp://127.0.0.1:36343
2022-08-05 17:47:43,689 - distributed.worker - INFO - Listening to: tcp://127.0.0.1:36343
2022-08-05 17:47:43,689 - distributed.worker - INFO - dashboard at: 127.0.0.1:33211
2022-08-05 17:47:43,689 - distributed.worker - INFO - Waiting to connect to: tcp://127.0.0.1:39207
2022-08-05 17:47:43,689 - distributed.worker - INFO - -------------------------------------------------
2022-08-05 17:47:43,689 - distributed.worker - INFO - Threads: 1
2022-08-05 17:47:43,689 - distributed.worker - INFO - Memory: 6.78 GiB
2022-08-05 17:47:43,689 - distributed.worker - INFO - Local Directory: /tmp/dask-worker-space/worker-xxlsgbtk
2022-08-05 17:47:43,690 - distributed.worker - INFO - -------------------------------------------------
2022-08-05 17:47:43,713 - distributed.worker - INFO - Start worker at: tcp://127.0.0.1:41143
2022-08-05 17:47:43,713 - distributed.worker - INFO - Listening to: tcp://127.0.0.1:41143
2022-08-05 17:47:43,713 - distributed.worker - INFO - dashboard at: 127.0.0.1:42653
2022-08-05 17:47:43,713 - distributed.worker - INFO - Waiting to connect to: tcp://127.0.0.1:39207
2022-08-05 17:47:43,713 - distributed.worker - INFO - -------------------------------------------------
2022-08-05 17:47:43,713 - distributed.worker - INFO - Threads: 2
2022-08-05 17:47:43,713 - distributed.worker - INFO - Memory: 6.78 GiB
2022-08-05 17:47:43,713 - distributed.worker - INFO - Local Directory: /tmp/dask-worker-space/worker-d0g8lgzh
2022-08-05 17:47:43,713 - distributed.worker - INFO - -------------------------------------------------
2022-08-05 17:47:44,153 - distributed.scheduler - INFO - Register worker <WorkerState 'tcp://127.0.0.1:36343', name: 0, status: init, memory: 0, processing: 0>
2022-08-05 17:47:44,154 - distributed.scheduler - INFO - Starting worker compute stream, tcp://127.0.0.1:36343
2022-08-05 17:47:44,154 - distributed.core - INFO - Starting established connection
2022-08-05 17:47:44,155 - distributed.worker - INFO - Registered to: tcp://127.0.0.1:39207
2022-08-05 17:47:44,156 - distributed.worker - INFO - -------------------------------------------------
2022-08-05 17:47:44,156 - distributed.core - INFO - Starting established connection
2022-08-05 17:47:44,165 - distributed.scheduler - INFO - Register worker <WorkerState 'tcp://127.0.0.1:41143', name: 1, status: init, memory: 0, processing: 0>
2022-08-05 17:47:44,166 - distributed.scheduler - INFO - Starting worker compute stream, tcp://127.0.0.1:41143
2022-08-05 17:47:44,166 - distributed.core - INFO - Starting established connection
2022-08-05 17:47:44,166 - distributed.worker - INFO - Registered to: tcp://127.0.0.1:39207
2022-08-05 17:47:44,167 - distributed.worker - INFO - -------------------------------------------------
2022-08-05 17:47:44,167 - distributed.core - INFO - Starting established connection
2022-08-05 17:47:44,216 - distributed.scheduler - INFO - Receive client connection: Client-b5b70f25-14e6-11ed-8a5c-0022481c22d0
2022-08-05 17:47:44,217 - distributed.core - INFO - Starting established connection
2022-08-05 17:47:44,222 - distributed.scheduler - INFO - Releasing all requested keys
2022-08-05 17:47:44,222 - distributed.scheduler - INFO - Clear task state
2022-08-05 17:47:44,229 - distributed.nanny - INFO - Nanny asking worker to close
2022-08-05 17:47:44,230 - distributed.nanny - INFO - Nanny asking worker to close
2022-08-05 17:47:44,231 - distributed.worker - INFO - Stopping worker at tcp://127.0.0.1:36343
2022-08-05 17:47:44,233 - distributed.worker - INFO - Connection to scheduler broken. Closing without reporting. ID: Worker-67ebaa27-4cf9-44b0-b67e-cd90f5ea0525 Address tcp://127.0.0.1:36343 Status: Status.closing
2022-08-05 17:47:44,234 - distributed.worker - INFO - Stopping worker at tcp://127.0.0.1:41143
2022-08-05 17:47:44,236 - distributed.scheduler - INFO - Remove worker <WorkerState 'tcp://127.0.0.1:36343', name: 0, status: closing, memory: 0, processing: 1>
2022-08-05 17:47:44,236 - distributed.core - INFO - Removing comms to tcp://127.0.0.1:36343
2022-08-05 17:47:44,237 - distributed.batched - INFO - Batched Comm Closed <TCP (closed) Scheduler connection to worker local=tcp://127.0.0.1:39207 remote=tcp://127.0.0.1:41216>
Traceback (most recent call last):
File "/home/runner/work/distributed/distributed/distributed/batched.py", line 97, in _background_send
nbytes = yield self.comm.write(
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.9/site-packages/tornado/gen.py", line 762, in run
value = future.result()
File "/home/runner/work/distributed/distributed/distributed/comm/tcp.py", line 267, in write
raise CommClosedError()
distributed.comm.core.CommClosedError
2022-08-05 17:47:44,237 - distributed.worker - INFO - Connection to scheduler broken. Closing without reporting. ID: Worker-18dd4ccf-3dd1-4716-b73b-ad0d38f2d9b6 Address tcp://127.0.0.1:41143 Status: Status.closing
2022-08-05 17:47:44,238 - distributed.scheduler - INFO - Remove worker <WorkerState 'tcp://127.0.0.1:41143', name: 1, status: closing, memory: 0, processing: 1>
2022-08-05 17:47:44,238 - distributed.core - INFO - Removing comms to tcp://127.0.0.1:41143
2022-08-05 17:47:44,239 - distributed.scheduler - INFO - Lost all workers
2022-08-05 17:47:44,427 - distributed.nanny - WARNING - Restarting worker
2022-08-05 17:47:45,651 - distributed.worker - INFO - Start worker at: tcp://127.0.0.1:39227
2022-08-05 17:47:45,652 - distributed.worker - INFO - Listening to: tcp://127.0.0.1:39227
2022-08-05 17:47:45,652 - distributed.worker - INFO - dashboard at: 127.0.0.1:34551
2022-08-05 17:47:45,652 - distributed.worker - INFO - Waiting to connect to: tcp://127.0.0.1:39207
2022-08-05 17:47:45,652 - distributed.worker - INFO - -------------------------------------------------
2022-08-05 17:47:45,652 - distributed.worker - INFO - Threads: 1
2022-08-05 17:47:45,652 - distributed.worker - INFO - Memory: 6.78 GiB
2022-08-05 17:47:45,652 - distributed.worker - INFO - Local Directory: /tmp/dask-worker-space/worker-gplzg82s
2022-08-05 17:47:45,652 - distributed.worker - INFO - -------------------------------------------------
2022-08-05 17:47:46,097 - distributed.scheduler - INFO - Register worker <WorkerState 'tcp://127.0.0.1:39227', name: 0, status: init, memory: 0, processing: 0>
2022-08-05 17:47:46,098 - distributed.scheduler - INFO - Starting worker compute stream, tcp://127.0.0.1:39227
2022-08-05 17:47:46,099 - distributed.core - INFO - Starting established connection
2022-08-05 17:47:46,099 - distributed.worker - INFO - Registered to: tcp://127.0.0.1:39207
2022-08-05 17:47:46,099 - distributed.worker - INFO - -------------------------------------------------
2022-08-05 17:47:46,100 - distributed.core - INFO - Starting established connection
2022-08-05 17:47:57,031 - distributed.nanny - WARNING - Worker process still alive after 12.799998626708986 seconds, killing
2022-08-05 17:48:00,230 - distributed.core - ERROR - Exception while handling op kill
Traceback (most recent call last):
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.9/asyncio/tasks.py", line 490, in wait_for
return fut.result()
asyncio.exceptions.CancelledError
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/runner/work/distributed/distributed/distributed/core.py", line 770, in _handle_comm
result = await result
File "/home/runner/work/distributed/distributed/distributed/nanny.py", line 372, in kill
await self.process.kill(timeout=0.8 * (deadline - time()))
File "/home/runner/work/distributed/distributed/distributed/nanny.py", line 781, in kill
await process.join(max(0, deadline - time()))
File "/home/runner/work/distributed/distributed/distributed/process.py", line 304, in join
await asyncio.wait_for(asyncio.shield(self._exit_future), timeout)
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.9/asyncio/tasks.py", line 492, in wait_for
raise exceptions.TimeoutError() from exc
asyncio.exceptions.TimeoutError
2022-08-05 17:48:00,239 - distributed.core - ERROR - 1/2 nanny worker(s) did not shut down within 20s
Traceback (most recent call last):
File "/home/runner/work/distributed/distributed/distributed/utils.py", line 799, in wrapper
return await func(*args, **kwargs)
File "/home/runner/work/distributed/distributed/distributed/scheduler.py", line 5261, in restart
raise TimeoutError(
asyncio.exceptions.TimeoutError: 1/2 nanny worker(s) did not shut down within 20s
2022-08-05 17:48:00,239 - distributed.core - ERROR - Exception while handling op restart
Traceback (most recent call last):
File "/home/runner/work/distributed/distributed/distributed/core.py", line 770, in _handle_comm
result = await result
File "/home/runner/work/distributed/distributed/distributed/utils.py", line 799, in wrapper
return await func(*args, **kwargs)
File "/home/runner/work/distributed/distributed/distributed/scheduler.py", line 5261, in restart
raise TimeoutError(
asyncio.exceptions.TimeoutError: 1/2 nanny worker(s) did not shut down within 20s
2022-08-05 17:49:42,355 - distributed.scheduler - INFO - Remove client Client-b5b70f25-14e6-11ed-8a5c-0022481c22d0
2022-08-05 17:49:42,355 - distributed.scheduler - INFO - Remove client Client-b5b70f25-14e6-11ed-8a5c-0022481c22d0
2022-08-05 17:49:42,356 - distributed.scheduler - INFO - Close client connection: Client-b5b70f25-14e6-11ed-8a5c-0022481c22d0
2022-08-05 17:49:42,358 - distributed.nanny - INFO - Closing Nanny at 'tcp://127.0.0.1:46731'.
2022-08-05 17:49:42,358 - distributed.nanny - INFO - Nanny asking worker to close
2022-08-05 17:49:42,358 - distributed.nanny - INFO - Closing Nanny at 'tcp://127.0.0.1:45209'.
2022-08-05 17:49:42,360 - distributed.worker - INFO - Stopping worker at tcp://127.0.0.1:39227
2022-08-05 17:49:42,362 - distributed.worker - INFO - Connection to scheduler broken. Closing without reporting. ID: Worker-dd4b2b9e-cc40-477e-8834-9b8c475c2881 Address tcp://127.0.0.1:39227 Status: Status.closing
2022-08-05 17:49:42,362 - distributed.scheduler - INFO - Remove worker <WorkerState 'tcp://127.0.0.1:39227', name: 0, status: closing, memory: 0, processing: 0>
2022-08-05 17:49:42,363 - distributed.core - INFO - Removing comms to tcp://127.0.0.1:39227
2022-08-05 17:49:42,363 - distributed.scheduler - INFO - Lost all workers
+++++++++++++++++++++++++++++++++++ Timeout ++++++++++++++++++++++++++++++++++++
~~~~~~~~~~~~~~~~~~~~~ Stack of asyncio_1 (139898006791936) ~~~~~~~~~~~~~~~~~~~~~
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.9/threading.py", line 937, in _bootstrap
self._bootstrap_inner()
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.9/threading.py", line 980, in _bootstrap_inner
self.run()
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.9/threading.py", line 917, in run
self._target(*self._args, **self._kwargs)
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.9/concurrent/futures/thread.py", line 81, in _worker
work_item = work_queue.get(block=True)
Stack of AsyncProcess Dask Worker process (from Nanny) watch message queue (139898586732288)
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.9/threading.py", line 937, in _bootstrap
self._bootstrap_inner()
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.9/threading.py", line 980, in _bootstrap_inner
self.run()
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.9/threading.py", line 917, in run
self._target(*self._args, **self._kwargs)
File "/home/runner/work/distributed/distributed/distributed/process.py", line 216, in _watch_message_queue
msg = q.get()
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.9/queue.py", line 171, in get
self.not_empty.wait()
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.9/threading.py", line 312, in wait
waiter.acquire()
~~~~~~~~~~~~~~~~~~~~~ Stack of asyncio_0 (139898023573248) ~~~~~~~~~~~~~~~~~~~~~
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.9/threading.py", line 937, in _bootstrap
self._bootstrap_inner()
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.9/threading.py", line 980, in _bootstrap_inner
self.run()
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.9/threading.py", line 917, in run
self._target(*self._args, **self._kwargs)
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.9/concurrent/futures/thread.py", line 81, in _worker
work_item = work_queue.get(block=True)
~~~~~~~~~~~~~~ Stack of Dask-Callback-Thread_0 (139898141009664) ~~~~~~~~~~~~~~~
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.9/threading.py", line 937, in _bootstrap
self._bootstrap_inner()
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.9/threading.py", line 980, in _bootstrap_inner
self.run()
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.9/threading.py", line 917, in run
self._target(*self._args, **self._kwargs)
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.9/concurrent/futures/thread.py", line 81, in _worker
work_item = work_queue.get(block=True)
~~~~~~~~~~~~~~~~~~ Stack of Dask-Offload_0 (139899265611520) ~~~~~~~~~~~~~~~~~~~
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.9/threading.py", line 937, in _bootstrap
self._bootstrap_inner()
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.9/threading.py", line 980, in _bootstrap_inner
self.run()
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.9/threading.py", line 917, in run
self._target(*self._args, **self._kwargs)
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.9/concurrent/futures/thread.py", line 81, in _worker
work_item = work_queue.get(block=True)
Full CI log: https://github.com/dask/distributed/runs/7696136507?check_suite_focus=true#step:11:1439