test: fix three pre-existing flakies surfaced by full-suite runs

kovtcharov · itomek · commit 54f2d74e3613 · 2026-04-29T13:13:34.000-04:00
These were marked "known flakies, pre-existing on main" in the merge PR, but every one was a real test bug worth nailing down rather than papering over. All three reproduced on bare main HEAD. test_sse_confirmation (3 tests): the tests polled ``handler._confirm_result is None`` to detect when the worker thread had registered itself. But _confirm_result is initialised to ``False`` (not None), so the polling loop exited immediately — resolve fired before the worker's confirm_tool_execution set up _confirm_event, and the worker's own setdefault then overwrote the resolved state with a fresh unset event. Net result: the worker waited for an event that no one would ever set, hit the internal 90 s confirmation timeout, and the test failed with "thread still alive". Fix: poll ``handler._confirm_event is None`` instead. _confirm_event starts as None and only becomes non-None inside confirm_tool_execution, so it correctly tracks the registration moment. test_semaphore_exhausted_returns_429: the test created a SECOND asyncio event loop with ``asyncio.new_event_loop()`` and acquired the semaphore on it, then handed the half-locked semaphore to TestClient (which runs on its OWN loop). ``asyncio.Semaphore`` doesn't promise cross-loop sanity — the waiter list is loop-bound, so acquire() on TestClient's loop saw inconsistent state under contention. Fix: use ``Semaphore(0)`` — exhausted from birth, no second loop. Also patch ``asyncio.wait_for`` to a 0.2 s timeout in the chat router so the test goes from 60 s → 0.6 s. test_llm_command_with_server: the health check accepted any 200, even when ``all_models_loaded == []``. Worse: even with a model loaded, ``gaia llm`` defaults to whatever the global default is — post-PR-#865 that's Gemma-4-E4B-it-GGUF. CI runners almost never have Gemma preloaded, so Lemonade returned 500, the OpenAI client retried with exponential backoff, and the subprocess timed out at 60 s. 
Fix: extend the health check to require at least one entry of type ``llm`` or ``vlm`` in ``all_models_loaded`` and return that entry's ``model_name``. The test then passes ``--model <loaded_one>`` so we don't trip the auto-load on a model the runner doesn't have. Verified: full unit suite 1630 passed / 0 failed / 15 skipped.
diff --git a/tests/unit/chat/ui/test_chat_concurrency.py b/tests/unit/chat/ui/test_chat_concurrency.py
@@ -97,34 +97,50 @@ class TestGlobalSemaphore429:
     """Tests for global concurrency semaphore returning 429 Too Many Requests."""
 
     def test_semaphore_exhausted_returns_429(self):
-        """When the global semaphore is exhausted, a request gets 429."""
-        # Create app with semaphore of size 1
+        """When the global semaphore is exhausted, a request gets 429.
+
+        Implementation notes:
+
+        - We use ``Semaphore(0)`` rather than a Semaphore(1) we manually
+          exhaust on a side event loop. The old approach tied the
+          semaphore's internal waiter list to a loop different from the
+          one TestClient uses, producing flaky cross-loop behaviour
+          (sometimes 429 in 60 s, sometimes hung). A semaphore with
+          starting value 0 is exhausted from birth and works on any loop.
+
+        - The endpoint waits up to ``timeout=60.0`` for the semaphore
+          (see chat.py — long enough to cushion sequential workloads
+          like the eval runner). We patch ``asyncio.wait_for`` to a
+          tiny timeout so the test runs in <1 s instead of >60 s.
+        """
+        from unittest.mock import patch as _patch
+
         app = create_app(db_path=":memory:")
         db = app.state.db
         sid = db.create_session(title="Test")["id"]
 
-        # Replace semaphore with one that's already exhausted
-        sem = asyncio.Semaphore(1)
+        # Permanently-exhausted semaphore — no waiters needed.
+        app.state.chat_semaphore = asyncio.Semaphore(0)
 
-        async def _exhaust():
-            await sem.acquire()
+        # Wrap the real wait_for with a forced short timeout so the production
+        # 60 s gate doesn't dominate the test; acquire and the TimeoutError
+        # raise run unchanged. NOTE: the patch target resolves to the shared
+        # ``asyncio`` module, so all wait_for callers in the ``with`` are hit.
+        real_wait_for = asyncio.wait_for
 
-        loop = asyncio.new_event_loop()
-        loop.run_until_complete(_exhaust())
+        async def _fast_wait_for(awaitable, timeout):  # noqa: ARG001
+            return await real_wait_for(awaitable, timeout=0.2)
 
-        app.state.chat_semaphore = sem
+        with _patch("gaia.ui.routers.chat.asyncio.wait_for", _fast_wait_for):
+            client = TestClient(app)
+            resp = client.post(
+                "/api/chat/send",
+                json={"session_id": sid, "message": "blocked", "stream": False},
+            )
 
-        client = TestClient(app)
-        resp = client.post(
-            "/api/chat/send",
-            json={"session_id": sid, "message": "blocked", "stream": False},
-        )
         assert resp.status_code == 429
         assert "busy" in resp.json()["detail"]
 
-        sem.release()
-        loop.close()
-
     def test_semaphore_released_after_non_streaming_request(self, client, session_id):
         """After a non-streaming request completes, the semaphore is released."""
         with patch("gaia.ui.server._get_chat_response") as mock:
diff --git a/tests/unit/chat/ui/test_sse_confirmation.py b/tests/unit/chat/ui/test_sse_confirmation.py
@@ -122,9 +122,15 @@ def run_confirm():
         t = threading.Thread(target=run_confirm)
         t.start()
 
-        # Wait for the confirmation to be set up
+        # Wait for the worker to have set up _confirm_event before we resolve.
+        # Polling _confirm_result was wrong — it's initialised to False (not
+        # None), so ``is None`` never holds and resolve fired before the
+        # worker registered its event, then the worker's own setup
+        # overwrote the resolved state. _confirm_event starts at None and
+        # is only set inside confirm_tool_execution, so polling it for
+        # not-None correctly tracks the registration moment.
         deadline = time.time() + 2.0
-        while handler._confirm_result is None and time.time() < deadline:
+        while handler._confirm_event is None and time.time() < deadline:
             time.sleep(0.05)
 
         handler.resolve_tool_confirmation(approved=True)
@@ -144,7 +150,7 @@ def run_confirm():
         t.start()
 
         deadline = time.time() + 2.0
-        while handler._confirm_result is None and time.time() < deadline:
+        while handler._confirm_event is None and time.time() < deadline:
             time.sleep(0.05)
 
         handler.resolve_tool_confirmation(approved=True)
@@ -173,8 +179,10 @@ def run_confirm():
         t = threading.Thread(target=run_confirm)
         t.start()
 
+        # See note in test_approve_returns_true: poll _confirm_event, not
+        # _confirm_result. The latter is False from the start.
         deadline = time.time() + 2.0
-        while handler._confirm_result is None and time.time() < deadline:
+        while handler._confirm_event is None and time.time() < deadline:
             time.sleep(0.05)
 
         handler.resolve_tool_confirmation(approved=False)
diff --git a/tests/unit/test_llm.py b/tests/unit/test_llm.py
@@ -36,20 +36,50 @@ def _check_command_availability(self):
         return True
 
     def _check_lemonade_server_health(self):
-        """Check if lemonade server is running and accessible."""
+        """Check Lemonade is up AND return the name of a loaded chat model.
+
+        Two failure modes the bare /health status code doesn't catch:
+
+        1. Lemonade up with no model loaded → ``status: ok`` but
+           ``all_models_loaded == []`` and ``/v1/chat/completions`` 500s.
+        2. Lemonade up with a model loaded but ``gaia llm`` defaults to a
+           DIFFERENT model (post-PR-#865 the default flipped to
+           Gemma 4 E4B) — Lemonade then 500s on the unloaded one and the
+           OpenAI client retries until our 60 s test timeout. CI runners
+           rarely preload Gemma; locally most devs preload Qwen.
+
+        Returns the first chat-capable ``model_name`` (so the test can
+        pass it via ``--model`` and avoid case 2), or ``None`` to skip.
+        """
         try:
             response = requests.get("http://localhost:13305/api/v1/health", timeout=5)
-            if response.status_code == 200:
-                print("OK: Lemonade server health check passed")
-                return True
-            else:
+            if response.status_code != 200:
                 print(
                     f"ERROR: Lemonade server health check failed with status: {response.status_code}"
                 )
-                return False
+                return None
+            data = response.json()
+            chat_models = [
+                m
+                for m in data.get("all_models_loaded", [])
+                if m.get("type") in ("llm", "vlm")
+            ]
+            if not chat_models:
+                print(
+                    "ERROR: Lemonade is up but no chat-capable model is loaded "
+                    f"(all_models_loaded={data.get('all_models_loaded')}). "
+                    "Cannot exercise the LLM CLI without a model — skipping."
+                )
+                return None
+            chosen = chat_models[0].get("model_name")
+            print(
+                "OK: Lemonade server health check passed; chat models loaded: "
+                f"{[m.get('model_name') for m in chat_models]}; using {chosen}"
+            )
+            return chosen
         except requests.exceptions.RequestException as e:
             print(f"ERROR: Lemonade server health check failed with exception: {e}")
-            return False
+            return None
 
     def test_llm_command_with_server(self):
         """Test LLM command with running server."""
@@ -58,28 +88,33 @@ def test_llm_command_with_server(self):
         if not self._check_command_availability():
             self.skipTest("gaia command is not available")
 
-        # Check if server is accessible
-        if not self._check_lemonade_server_health():
-            self.skipTest("Lemonade server is not running")
+        # Check if server is accessible AND has a chat model loaded.
+        # Returns the loaded model name so we can pass it via --model and
+        # avoid the "default model not loaded" 500-retry hang.
+        loaded_model = self._check_lemonade_server_health()
+        if loaded_model is None:
+            self.skipTest("Lemonade server is not running or no chat model is loaded")
 
         try:
-            # Test with explicit --base-url (without /api/v1 to test normalization)
-            print(
-                "Executing command: gaia llm 'What is 1+1?' --max-tokens 20 --base-url http://localhost:13305"
-            )
+            # Test with explicit --base-url (without /api/v1 to test
+            # normalization) and --model targeting the actually-loaded
+            # checkpoint (so we don't trip Lemonade's auto-load on a
+            # different default which otherwise 500-retries to timeout).
+            cmd = [
+                "gaia",
+                "llm",
+                "What is 1+1?",
+                "--max-tokens",
+                "20",
+                "--base-url",
+                "http://localhost:13305",
+                "--model",
+                loaded_model,
+            ]
+            print(f"Executing command: {' '.join(cmd)}")
 
-            # Test the LLM command with explicit --base-url
-            # This validates both the CLI arg and the base_url normalization
             result = subprocess.run(
-                [
-                    "gaia",
-                    "llm",
-                    "What is 1+1?",
-                    "--max-tokens",
-                    "20",
-                    "--base-url",
-                    "http://localhost:13305",
-                ],
+                cmd,
                 capture_output=True,
                 text=True,
                 timeout=60,