Open
Description
I received this error after creating a cluster, running jobs on it for ~5 hours, and then trying to exit the interactive Python session.
As a result, I had to delete all of the cluster's instances manually.
# create cluster, let jobs run, and then ~5 hours later:
In [3]: exit
tornado.application - ERROR - Exception in callback functools.partial(<bound method IOLoop._discard_future_result of <tornado.platform.asyncio.AsyncIOLoop object at 0x7fb69c986760>>, <Task finished name='Task-130' coro=<SpecCluster._correct_state_internal() done, defined at /home/eczech/miniconda3/envs/cloudprovider/lib/python3.8/site-packages/distributed/deploy/spec.py:320> exception=OSError('Timed out during handshake while connecting to tcp://10.142.0.3:8786 after 10 s')>)
Traceback (most recent call last):
File "/home/eczech/miniconda3/envs/cloudprovider/lib/python3.8/site-packages/distributed/comm/core.py", line 319, in connect
handshake = await asyncio.wait_for(comm.read(), time_left())
File "/home/eczech/miniconda3/envs/cloudprovider/lib/python3.8/asyncio/tasks.py", line 498, in wait_for
raise exceptions.TimeoutError()
asyncio.exceptions.TimeoutError
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/eczech/miniconda3/envs/cloudprovider/lib/python3.8/site-packages/tornado/ioloop.py", line 741, in _run_callback
ret = callback()
File "/home/eczech/miniconda3/envs/cloudprovider/lib/python3.8/site-packages/tornado/ioloop.py", line 765, in _discard_future_result
future.result()
File "/home/eczech/miniconda3/envs/cloudprovider/lib/python3.8/site-packages/distributed/deploy/spec.py", line 401, in _close
await self._correct_state()
File "/home/eczech/miniconda3/envs/cloudprovider/lib/python3.8/site-packages/distributed/deploy/spec.py", line 328, in _correct_state_internal
await self.scheduler_comm.retire_workers(workers=list(to_close))
File "/home/eczech/miniconda3/envs/cloudprovider/lib/python3.8/site-packages/distributed/core.py", line 810, in send_recv_from_rpc
comm = await self.live_comm()
File "/home/eczech/miniconda3/envs/cloudprovider/lib/python3.8/site-packages/distributed/core.py", line 768, in live_comm
comm = await connect(
File "/home/eczech/miniconda3/envs/cloudprovider/lib/python3.8/site-packages/distributed/comm/core.py", line 324, in connect
raise IOError(
OSError: Timed out during handshake while connecting to tcp://10.142.0.3:8786 after 10 s
Error in atexit._run_exitfuncs:
Traceback (most recent call last):
File "/home/eczech/miniconda3/envs/cloudprovider/lib/python3.8/site-packages/distributed/comm/core.py", line 319, in connect
handshake = await asyncio.wait_for(comm.read(), time_left())
File "/home/eczech/miniconda3/envs/cloudprovider/lib/python3.8/asyncio/tasks.py", line 498, in wait_for
raise exceptions.TimeoutError()
asyncio.exceptions.TimeoutError
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/eczech/miniconda3/envs/cloudprovider/lib/python3.8/site-packages/distributed/deploy/spec.py", line 641, in close_clusters
cluster.close(timeout=10)
File "/home/eczech/miniconda3/envs/cloudprovider/lib/python3.8/site-packages/distributed/deploy/cluster.py", line 104, in close
return self.sync(self._close, callback_timeout=timeout)
File "/home/eczech/miniconda3/envs/cloudprovider/lib/python3.8/site-packages/distributed/deploy/cluster.py", line 183, in sync
return sync(self.loop, func, *args, **kwargs)
File "/home/eczech/miniconda3/envs/cloudprovider/lib/python3.8/site-packages/distributed/utils.py", line 340, in sync
raise exc.with_traceback(tb)
File "/home/eczech/miniconda3/envs/cloudprovider/lib/python3.8/site-packages/distributed/utils.py", line 324, in f
result[0] = yield future
File "/home/eczech/miniconda3/envs/cloudprovider/lib/python3.8/site-packages/tornado/gen.py", line 762, in run
value = future.result()
File "/home/eczech/miniconda3/envs/cloudprovider/lib/python3.8/site-packages/tornado/ioloop.py", line 741, in _run_callback
ret = callback()
File "/home/eczech/miniconda3/envs/cloudprovider/lib/python3.8/site-packages/tornado/ioloop.py", line 765, in _discard_future_result
future.result()
File "/home/eczech/miniconda3/envs/cloudprovider/lib/python3.8/site-packages/distributed/deploy/spec.py", line 401, in _close
await self._correct_state()
File "/home/eczech/miniconda3/envs/cloudprovider/lib/python3.8/site-packages/distributed/deploy/spec.py", line 328, in _correct_state_internal
await self.scheduler_comm.retire_workers(workers=list(to_close))
File "/home/eczech/miniconda3/envs/cloudprovider/lib/python3.8/site-packages/distributed/core.py", line 810, in send_recv_from_rpc
comm = await self.live_comm()
File "/home/eczech/miniconda3/envs/cloudprovider/lib/python3.8/site-packages/distributed/core.py", line 768, in live_comm
comm = await connect(
File "/home/eczech/miniconda3/envs/cloudprovider/lib/python3.8/site-packages/distributed/comm/core.py", line 324, in connect
raise IOError(
OSError: Timed out during handshake while connecting to tcp://10.142.0.3:8786 after 10 s
This is similar to #179, but the timeout here occurs when trying to communicate with the cluster's scheduler (the `retire_workers` call made while closing the cluster), not when using the GCP API.