I haven't seen this fail before, but it looks like an instance of #5186.
The key part is that a worker fails to come up and we see `OSError: [Errno 98] Address already in use`.
The fact that the nanny logs the same error three times would be fixed by #6427.
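For context, `EADDRINUSE` (errno 98) is the generic failure you get when a second listener tries to bind a host/port that another socket already holds. A minimal sketch of that failure mode in isolation (purely illustrative; the port number is hypothetical and this is not the test's repro):

```python
# Illustrative only: how OSError [Errno 98] arises when two sockets bind the
# same host/port. The address/port below is arbitrary, not from the test.
import socket

addr = ("127.0.0.1", 54321)

first = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
first.bind(addr)
first.listen()

second = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
try:
    second.bind(addr)  # raises OSError (EADDRINUSE; errno 98 on Linux)
except OSError as exc:
    print(exc)
finally:
    second.close()
    first.close()
```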
____________ test_contact_listen_address[tcp://127.0.0.2:---nanny] _____________
args = (), kwds = {'listen_address': 'tcp://127.0.0.2:', 'nanny': '--nanny'}
    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
>           return func(*args, **kwds)
/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/contextlib.py:79:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/utils_test.py:1199: in test_func
    return _run_and_close_tornado(async_fn_outer)
distributed/utils_test.py:480: in _run_and_close_tornado
    return asyncio.run(inner_fn())
/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/asyncio/runners.py:44: in run
    return loop.run_until_complete(main)
/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/asyncio/base_events.py:646: in run_until_complete
    return future.result()
distributed/utils_test.py:477: in inner_fn
    return await async_fn(*args, **kwargs)
distributed/utils_test.py:1196: in async_fn_outer
    return await asyncio.wait_for(async_fn(), timeout=timeout * 2)
/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/asyncio/tasks.py:445: in wait_for
    return fut.result()
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
async def async_fn():
    result = None
    with dask.config.set(config):
        workers = []
        s = False
        for _ in range(60):
            try:
                s, ws = await start_cluster(
                    nthreads,
                    scheduler,
                    security=security,
                    Worker=Worker,
                    scheduler_kwargs=scheduler_kwargs,
                    worker_kwargs=worker_kwargs,
                )
            except Exception as e:
                logger.error(
                    "Failed to start gen_cluster: "
                    f"{e.__class__.__name__}: {e}; retrying",
                    exc_info=True,
                )
                await asyncio.sleep(1)
            else:
                workers[:] = ws
                args = [s] + workers
                break
        if s is False:
            raise Exception("Could not start cluster")
        if client:
            c = await Client(
                s.address,
                security=security,
                asynchronous=True,
                **client_kwargs,
            )
            args = [c] + args
        try:
            coro = func(*args, *outer_args, **kwargs)
            task = asyncio.create_task(coro)
            coro2 = asyncio.wait_for(asyncio.shield(task), timeout)
            result = await coro2
            validate_state(s, *workers)
        except asyncio.TimeoutError:
            assert task
            buffer = io.StringIO()
            # This stack indicates where the coro/test is suspended
            task.print_stack(file=buffer)
            if cluster_dump_directory:
                await dump_cluster_state(
                    s,
                    ws,
                    output_dir=cluster_dump_directory,
                    func_name=func.__name__,
                )
            task.cancel()
            while not task.cancelled():
                await asyncio.sleep(0.01)
            # Hopefully, the hang has been caused by inconsistent
            # state, which should be much more meaningful than the
            # timeout
            validate_state(s, *workers)
            # Remove as much of the traceback as possible; it's
            # uninteresting boilerplate from utils_test and asyncio
            # and not from the code being tested.
>           raise asyncio.TimeoutError(
                f"Test timeout after {timeout}s.\n"
                "========== Test stack trace starts here ==========\n"
                f"{buffer.getvalue()}"
            ) from None
E asyncio.exceptions.TimeoutError: Test timeout after 30s.
E ========== Test stack trace starts here ==========
E Stack for <Task pending name='Task-1433' coro=<test_contact_listen_address() running at /home/runner/work/distributed/distributed/distributed/cli/tests/test_dask_worker.py:443> wait_for=<Future pending cb=[Task.task_wakeup()]>> (most recent call last):
E File "/home/runner/work/distributed/distributed/distributed/cli/tests/test_dask_worker.py", line 443, in test_contact_listen_address
E await c.wait_for_workers(1)
distributed/utils_test.py:1127: TimeoutError
----------------------------- Captured stdout call -----------------------------
Dumped cluster state to test_cluster_dump/test_contact_listen_address.yaml
----------------------------- Captured stderr call -----------------------------
2022-07-19 15:24:10,010 - distributed.scheduler - INFO - State start
2022-07-19 15:24:10,012 - distributed.scheduler - INFO - Clear task state
2022-07-19 15:24:10,012 - distributed.scheduler - INFO - Scheduler at: tcp://127.0.0.1:35175
2022-07-19 15:24:10,012 - distributed.scheduler - INFO - dashboard at: 127.0.0.1:42855
2022-07-19 15:24:10,020 - distributed.scheduler - INFO - Receive client connection: Client-d63bdbc0-0776-11ed-8b95-000d3aa688e3
2022-07-19 15:24:10,021 - distributed.core - INFO - Starting established connection
2022-07-19 15:24:10,659 - distributed.nanny - INFO - Start Nanny at: 'tcp://127.0.0.2:38265'
2022-07-19 15:24:11,358 - distributed.worker - INFO - Stopping worker at tcp://127.0.0.2:38265
2022-07-19 15:24:11,358 - distributed.worker - INFO - Closed worker has not yet started: Status.init
2022-07-19 15:24:11,360 - distributed.nanny - ERROR - Failed to start worker
Traceback (most recent call last):
File "/home/runner/work/distributed/distributed/distributed/core.py", line 480, in start
await asyncio.wait_for(self.start_unsafe(), timeout=timeout)
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/asyncio/tasks.py", line 408, in wait_for
return await fut
File "/home/runner/work/distributed/distributed/distributed/worker.py", line 1316, in start_unsafe
await self.listen(start_address, **kwargs)
File "/home/runner/work/distributed/distributed/distributed/core.py", line 658, in listen
listener = await listen(
File "/home/runner/work/distributed/distributed/distributed/comm/core.py", line 212, in _
await self.start()
File "/home/runner/work/distributed/distributed/distributed/comm/tcp.py", line 528, in start
sockets = netutil.bind_sockets(
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/netutil.py", line 161, in bind_sockets
sock.bind(sockaddr)
OSError: [Errno 98] Address already in use
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/runner/work/distributed/distributed/distributed/nanny.py", line 878, in run
await worker
File "/home/runner/work/distributed/distributed/distributed/core.py", line 488, in start
raise RuntimeError(f"{type(self).__name__} failed to start.") from exc
RuntimeError: Worker failed to start.
2022-07-19 15:24:11,379 - distributed.nanny - ERROR - Failed while trying to start worker process: Worker failed to start.
2022-07-19 15:24:11,379 - distributed.nanny - ERROR - Failed to connect to process
Traceback (most recent call last):
File "/home/runner/work/distributed/distributed/distributed/core.py", line 480, in start
await asyncio.wait_for(self.start_unsafe(), timeout=timeout)
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/asyncio/tasks.py", line 408, in wait_for
return await fut
File "/home/runner/work/distributed/distributed/distributed/worker.py", line 1316, in start_unsafe
await self.listen(start_address, **kwargs)
File "/home/runner/work/distributed/distributed/distributed/core.py", line 658, in listen
listener = await listen(
File "/home/runner/work/distributed/distributed/distributed/comm/core.py", line 212, in _
await self.start()
File "/home/runner/work/distributed/distributed/distributed/comm/tcp.py", line 528, in start
sockets = netutil.bind_sockets(
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/netutil.py", line 161, in bind_sockets
sock.bind(sockaddr)
OSError: [Errno 98] Address already in use
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/runner/work/distributed/distributed/distributed/nanny.py", line 686, in start
msg = await self._wait_until_connected(uid)
File "/home/runner/work/distributed/distributed/distributed/nanny.py", line 806, in _wait_until_connected
raise msg["exception"]
File "/home/runner/work/distributed/distributed/distributed/nanny.py", line 878, in run
await worker
File "/home/runner/work/distributed/distributed/distributed/core.py", line 488, in start
raise RuntimeError(f"{type(self).__name__} failed to start.") from exc
RuntimeError: Worker failed to start.
2022-07-19 15:24:11,381 - distributed.nanny - ERROR - Failed to start process
Traceback (most recent call last):
File "/home/runner/work/distributed/distributed/distributed/core.py", line 480, in start
await asyncio.wait_for(self.start_unsafe(), timeout=timeout)
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/asyncio/tasks.py", line 408, in wait_for
return await fut
File "/home/runner/work/distributed/distributed/distributed/worker.py", line 1316, in start_unsafe
await self.listen(start_address, **kwargs)
File "/home/runner/work/distributed/distributed/distributed/core.py", line 658, in listen
listener = await listen(
File "/home/runner/work/distributed/distributed/distributed/comm/core.py", line 212, in _
await self.start()
File "/home/runner/work/distributed/distributed/distributed/comm/tcp.py", line 528, in start
sockets = netutil.bind_sockets(
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/netutil.py", line 161, in bind_sockets
sock.bind(sockaddr)
OSError: [Errno 98] Address already in use
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/runner/work/distributed/distributed/distributed/nanny.py", line 436, in instantiate
result = await self.process.start()
File "/home/runner/work/distributed/distributed/distributed/nanny.py", line 686, in start
msg = await self._wait_until_connected(uid)
File "/home/runner/work/distributed/distributed/distributed/nanny.py", line 806, in _wait_until_connected
raise msg["exception"]
File "/home/runner/work/distributed/distributed/distributed/nanny.py", line 878, in run
await worker
File "/home/runner/work/distributed/distributed/distributed/core.py", line 488, in start
raise RuntimeError(f"{type(self).__name__} failed to start.") from exc
RuntimeError: Worker failed to start.
2022-07-19 15:24:11,382 - distributed.nanny - INFO - Closing Nanny at 'tcp://127.0.0.2:38265'.
2022-07-19 15:24:11,384 - distributed.dask_worker - INFO - End worker
Traceback (most recent call last):
File "/home/runner/work/distributed/distributed/distributed/core.py", line 480, in start
await asyncio.wait_for(self.start_unsafe(), timeout=timeout)
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/asyncio/tasks.py", line 408, in wait_for
return await fut
File "/home/runner/work/distributed/distributed/distributed/worker.py", line 1316, in start_unsafe
await self.listen(start_address, **kwargs)
File "/home/runner/work/distributed/distributed/distributed/core.py", line 658, in listen
listener = await listen(
File "/home/runner/work/distributed/distributed/distributed/comm/core.py", line 212, in _
await self.start()
File "/home/runner/work/distributed/distributed/distributed/comm/tcp.py", line 528, in start
sockets = netutil.bind_sockets(
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/netutil.py", line 161, in bind_sockets
sock.bind(sockaddr)
OSError: [Errno 98] Address already in use
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/runner/work/distributed/distributed/distributed/core.py", line 480, in start
await asyncio.wait_for(self.start_unsafe(), timeout=timeout)
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/asyncio/tasks.py", line 408, in wait_for
return await fut
File "/home/runner/work/distributed/distributed/distributed/nanny.py", line 363, in start_unsafe
response = await self.instantiate()
File "/home/runner/work/distributed/distributed/distributed/nanny.py", line 436, in instantiate
result = await self.process.start()
File "/home/runner/work/distributed/distributed/distributed/nanny.py", line 686, in start
msg = await self._wait_until_connected(uid)
File "/home/runner/work/distributed/distributed/distributed/nanny.py", line 806, in _wait_until_connected
raise msg["exception"]
File "/home/runner/work/distributed/distributed/distributed/nanny.py", line 878, in run
await worker
File "/home/runner/work/distributed/distributed/distributed/core.py", line 488, in start
raise RuntimeError(f"{type(self).__name__} failed to start.") from exc
RuntimeError: Worker failed to start.
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/usr/share/miniconda3/envs/dask-distributed/bin/dask-worker", line 33, in <module>
sys.exit(load_entry_point('distributed', 'console_scripts', 'dask-worker')())
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/site-packages/click/core.py", line 1130, in __call__
return self.main(*args, **kwargs)
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/site-packages/click/core.py", line 1055, in main
rv = self.invoke(ctx)
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/site-packages/click/core.py", line 1404, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/site-packages/click/core.py", line 760, in invoke
return __callback(*args, **kwargs)
File "/home/runner/work/distributed/distributed/distributed/cli/dask_worker.py", line 500, in main
asyncio.run(run())
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/asyncio/runners.py", line 44, in run
return loop.run_until_complete(main)
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/asyncio/base_events.py", line 646, in run_until_complete
return future.result()
File "/home/runner/work/distributed/distributed/distributed/cli/dask_worker.py", line 497, in run
[task.result() for task in done]
File "/home/runner/work/distributed/distributed/distributed/cli/dask_worker.py", line 497, in <listcomp>
[task.result() for task in done]
File "/home/runner/work/distributed/distributed/distributed/cli/dask_worker.py", line 472, in wait_for_nannies_to_finish
await asyncio.gather(*nannies)
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/asyncio/tasks.py", line 650, in _wrap_awaitable
return (yield from awaitable.__await__())
File "/home/runner/work/distributed/distributed/distributed/core.py", line 488, in start
raise RuntimeError(f"{type(self).__name__} failed to start.") from exc
RuntimeError: Nanny failed to start.
2022-07-19 15:24:40,060 - distributed.scheduler - INFO - Remove client Client-d63bdbc0-0776-11ed-8b95-000d3aa688e3
2022-07-19 15:24:40,061 - distributed.scheduler - INFO - Remove client Client-d63bdbc0-0776-11ed-8b95-000d3aa688e3
2022-07-19 15:24:40,062 - distributed.scheduler - INFO - Close client connection: Client-d63bdbc0-0776-11ed-8b95-000d3aa688e3
2022-07-19 15:24:40,062 - distributed.scheduler - INFO - Scheduler closing...
2022-07-19 15:24:40,063 - distributed.scheduler - INFO - Scheduler closing all comms