
Commit 4056592

Workaround for test failures
Shutting down a spec cluster shortly after scaling it down and back up results in `KeyError`s. As a workaround, explicitly synchronize after scaling down, such that all distributed components have "settled" and we can be more confident that a shutdown in the near future will succeed. (As I understand it, "remove" operations from the scale-down are still arriving when the shutdown happens, and they are no longer expected, because the state was changed by scaling up again in between, forgetting about the workers that were in the process of being removed.) See dask/distributed#9064 for the upstream reproducer and proposed fix.
1 parent 7f463ae commit 4056592
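
As a rough illustration of the race and of the settle-wait workaround described above, here is a minimal sketch against a plain dask.distributed `LocalCluster` (a spec cluster) rather than the LiberTEM executor; the worker counts, the 0.1 s poll interval and the 3 second timeout are illustrative only and simply mirror the values used in the tests below.

import time

from dask.distributed import Client, LocalCluster

cluster = LocalCluster(n_workers=2, threads_per_worker=1)
client = Client(cluster)

# scale down (roughly what snoozing does)
cluster.scale(0)

# workaround: poll until the scheduler has actually processed the worker
# removals, so that late "remove" operations cannot race with a scale-up
# or shutdown that happens right afterwards
t0 = time.monotonic()
while len(cluster.workers) > 0 and time.monotonic() < t0 + 3:
    time.sleep(0.1)

# scale back up ("unsnooze") and shut down; per the description above,
# doing this shortly after the scale-down without the wait could fail
# with a KeyError (see dask/distributed#9064)
cluster.scale(2)
client.wait_for_workers(2)
client.close()
cluster.close()

The tests below express the same loop against `ctx.executor.client.cluster.workers` (or `executor_state.executor._wrapped.client.cluster.workers`), with 1 as the target worker count, since they expect a single worker to remain after snoozing.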

File tree

  tests/executor/test_dask.py
  tests/server/test_state.py

2 files changed: +60 additions, -1 deletion

tests/executor/test_dask.py

Lines changed: 10 additions & 1 deletion
@@ -297,7 +297,16 @@ def test_local_cluster_snooze():
     num_workers = len(ctx.executor.get_available_workers())
     assert num_workers == 2 + 1 # +service worker
     ctx.executor.snooze_manager.snooze()
-    time.sleep(1.)
+
+    # workaround: cannot call `ctx.close` before the `snooze` operation has
+    # completely finished, so we need to wait here
+    # NOTE: once this is fixed upstream in distributed, this should be
+    # removed, as it also means we are not testing the real sequence here,
+    # which doesn't wait!
+    t0 = time.monotonic()
+    while len(ctx.executor.client.cluster.workers) > 1 and time.monotonic() < t0 + 3:
+        time.sleep(0.1)
+
     assert len(ctx.executor.get_available_workers()) == 1
     ctx.executor.snooze_manager.unsnooze()
     assert len(ctx.executor.get_available_workers()) == 2 + 1

tests/server/test_state.py

Lines changed: 50 additions & 0 deletions
@@ -3,6 +3,7 @@
 import pytest
 import queue
 from unittest import mock
+import time

 from libertem.executor.base import AsyncAdapter
 from libertem.executor.dask import DaskJobExecutor
@@ -144,6 +145,18 @@ async def test_get_executor_unsnooze():
     executor_state.executor.snooze_manager.snooze()
     assert executor_state.executor.snooze_manager.is_snoozing

+    # workaround: cannot call `executor.close` before the `snooze`
+    # operation has completely finished, so we need to wait here.
+    # NOTE: once this is fixed upstream in distributed, this should be
+    # removed, as it also means we are not testing the real sequence here,
+    # which doesn't wait!
+    t0 = time.monotonic()
+    while (
+        len(executor_state.executor._wrapped.client.cluster.workers) > 1
+        and time.monotonic() < t0 + 3
+    ):
+        time.sleep(0.1)
+
     # Getting the executor brings it out of snooze
     await executor_state.get_executor()
     assert not executor_state.executor.snooze_manager.is_snoozing
@@ -189,6 +202,18 @@ async def test_snooze_explicit_keep_alive():
     snoozer.snooze()
     assert snoozer.is_snoozing

+    # workaround: cannot call `executor.close` before the `snooze`
+    # operation has completely finished, so we need to wait here.
+    # NOTE: once this is fixed upstream in distributed, this should be
+    # removed, as it also means we are not testing the real sequence here,
+    # which doesn't wait!
+    t0 = time.monotonic()
+    while (
+        len(executor_state.executor._wrapped.client.cluster.workers) > 1
+        and time.monotonic() < t0 + 3
+    ):
+        time.sleep(0.1)
+
     snoozer.unsnooze()
     assert not snoozer.is_snoozing
     # these two work without raising an exception:
@@ -225,6 +250,18 @@ async def test_snooze_by_activity(local_cluster_url):
     # opportunities to snooze in between:
     assert snoozer.is_snoozing

+    # workaround: cannot call `executor.close` before the `snooze`
+    # operation has completely finished, so we need to wait here.
+    # NOTE: once this is fixed upstream in distributed, this should be
+    # removed, as it also means we are not testing the real sequence here,
+    # which doesn't wait!
+    t0 = time.monotonic()
+    while (
+        len(executor_state.executor._wrapped.client.cluster.workers) > 1
+        and time.monotonic() < t0 + 3
+    ):
+        time.sleep(0.1)
+
     # and this should directly unsnooze the executor
     # (we need to change the timeout etc. here, before we trigger the unsnooze,
     # to make sure we don't directly snooze again):
@@ -260,6 +297,19 @@ async def test_messages():
     await executor_state.set_executor(executor, params)

     await asyncio.sleep(0.1)
+
+    # workaround: cannot call `executor.close` before the `snooze` operation has
+    # completely finished, so we need to wait here
+    # NOTE: once this is fixed upstream in distributed, this should be
+    # removed, as it also means we are not testing the real sequence here,
+    # which doesn't wait!
+    t0 = time.monotonic()
+    while (
+        len(executor_state.executor._wrapped.client.cluster.workers) > 1
+        and time.monotonic() < t0 + 3
+    ):
+        await asyncio.sleep(0.1)
+
     # these two work without raising an exception:
     await executor_state.get_executor()
     messages = []
