I thought this was only failing with queuing on, but it also just failed on a non-queue job.
The warnings that full garbage collections took 52% of CPU time recently, plus the lack of any logs showing the cluster even starting up, make me wonder if something else is going on here.
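For anyone trying to reproduce this locally, here is a minimal sketch. Treating `distributed.scheduler.worker-saturation` as the queuing toggle is my assumption about the current config layout; adjust it to whatever knob the CI job actually flips.

```python
import dask
import pytest

# Assumption: queuing is toggled via the "worker-saturation" scheduler setting
# ("inf" disables queuing; a finite value enables it). Check what the current
# environment uses before relying on this.
print(dask.config.get("distributed.scheduler.worker-saturation", default="<unset>"))

# Run only the test that timed out in CI (path taken from the traceback below).
pytest.main(
    [
        "distributed/tests/test_steal.py::test_steal_reschedule_reset_in_flight_occupancy",
        "-v",
    ]
)
```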
_______________ test_steal_reschedule_reset_in_flight_occupancy ________________
args = (), kwds = {}
    @wraps(func)
    def inner(*args, **kwds):
        with self._recreate_cm():
>           return func(*args, **kwds)
/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/contextlib.py:79:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/contextlib.py:79: in inner
return func(*args, **kwds)
distributed/utils_test.py:1084: in test_func
return _run_and_close_tornado(async_fn_outer)
distributed/utils_test.py:377: in _run_and_close_tornado
return asyncio.run(inner_fn())
/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/asyncio/runners.py:44: in run
return loop.run_until_complete(main)
/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/asyncio/base_events.py:646: in run_until_complete
return future.result()
distributed/utils_test.py:374: in inner_fn
return await async_fn(*args, **kwargs)
distributed/utils_test.py:1081: in async_fn_outer
return await asyncio.wait_for(async_fn(), timeout=timeout * 2)
/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/asyncio/tasks.py:445: in wait_for
return fut.result()
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
2022-09-02 22:18:17,877 - distributed.utils_perf - WARNING - full garbage collections took 52% CPU time recently (threshold: 10%)
2022-09-02 22:18:18,535 - distributed.utils_perf - WARNING - full garbage collections took 50% CPU time recently (threshold: 10%)
2022-09-02 22:18:19,211 - distributed.utils_perf - WARNING - full garbage collections took 48% CPU time recently (threshold: 10%)
    async def async_fn():
        result = None
        with dask.config.set(config):
            async with _cluster_factory() as (s, workers), _client_factory(
                s
            ) as c:
                args = [s] + workers
                if c is not None:
                    args = [c] + args
                try:
                    coro = func(*args, *outer_args, **kwargs)
                    task = asyncio.create_task(coro)
                    coro2 = asyncio.wait_for(asyncio.shield(task), timeout)
                    result = await coro2
                    validate_state(s, *workers)
                except asyncio.TimeoutError:
                    assert task
                    buffer = io.StringIO()
                    # This stack indicates where the coro/test is suspended
                    task.print_stack(file=buffer)
                    if cluster_dump_directory:
                        await dump_cluster_state(
                            s=s,
                            ws=workers,
                            output_dir=cluster_dump_directory,
                            func_name=func.__name__,
                        )
                    task.cancel()
                    while not task.cancelled():
                        await asyncio.sleep(0.01)
                    # Hopefully, the hang has been caused by inconsistent
                    # state, which should be much more meaningful than the
                    # timeout
                    validate_state(s, *workers)
                    # Remove as much of the traceback as possible; it's
                    # uninteresting boilerplate from utils_test and asyncio
                    # and not from the code being tested.
>                   raise asyncio.TimeoutError(
                        f"Test timeout after {timeout}s.\n"
                        "========== Test stack trace starts here ==========\n"
                        f"{buffer.getvalue()}"
                    ) from None
E asyncio.exceptions.TimeoutError: Test timeout after 30s.
E ========== Test stack trace starts here ==========
E Stack for <Task pending name='Task-109389' coro=<test_steal_reschedule_reset_in_flight_occupancy() running at /home/runner/work/distributed/distributed/distributed/tests/test_steal.py:1105> wait_for=<Future pending cb=[Task.task_wakeup()]>> (most recent call last):
E File "/home/runner/work/distributed/distributed/distributed/tests/test_steal.py", line 1105, in test_steal_reschedule_reset_in_flight_occupancy
E await asyncio.sleep(0.01)
distributed/utils_test.py:1018: TimeoutError
----------------------------- Captured stdout call -----------------------------
Dumped cluster state to test_cluster_dump/test_steal_reschedule_reset_in_flight_occupancy.yaml
- generated xml file: /home/runner/work/distributed/distributed/reports/pytest.xml -
https://github.com/dask/distributed/runs/8164596075?check_suite_focus=true#step:16:1305
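For context on what produces this error: the harness code quoted in the traceback shields the test task so that, on timeout, it can still print the task's stack and dump cluster state before cancelling it. Below is a stripped-down sketch of that shield-and-cancel pattern; the `hanging_test_body` coroutine is just a stand-in for the real test body, not anything from the repo.

```python
import asyncio
import io


async def hanging_test_body():
    # Stand-in for a test body that never finishes, like the CI hang above.
    await asyncio.sleep(3600)


async def run_with_timeout(timeout: float = 0.1) -> None:
    task = asyncio.create_task(hanging_test_body())
    try:
        # shield() keeps `task` alive when wait_for times out, so its stack
        # can still be inspected before it is cancelled.
        await asyncio.wait_for(asyncio.shield(task), timeout)
    except asyncio.TimeoutError:
        buffer = io.StringIO()
        task.print_stack(file=buffer)  # where the coroutine is suspended
        task.cancel()
        while not task.cancelled():
            await asyncio.sleep(0.01)
        raise asyncio.TimeoutError(
            f"Test timeout after {timeout}s.\n{buffer.getvalue()}"
        ) from None


try:
    asyncio.run(run_with_timeout())
except asyncio.TimeoutError as exc:
    print(exc)
```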