Skip to content

Commit 54f2d74

Browse files
kovtcharovitomek
authored andcommitted
test: fix three pre-existing flakies surfaced by full-suite runs
These were marked "known flakies, pre-existing on main" in the merge PR, but every one was a real test bug worth nailing down rather than papering over. All three reproduced on bare main HEAD. test_sse_confirmation (3 tests) Polled ``handler._confirm_result is None`` to detect when the worker thread had registered itself. But _confirm_result is initialised to ``False`` (not None), so the polling loop exited immediately — resolve fired before the worker's confirm_tool_execution set up _confirm_event, and the worker's own setdefault then overwrote the resolved state with a fresh unset event. Net result: the worker waited for an event that no one would ever set, hit the internal 90 s confirmation timeout, and the test failed with "thread still alive". Fix: poll ``handler._confirm_event is None`` instead. _confirm_event starts as None and only becomes non-None inside confirm_tool_execution, so it correctly tracks the registration moment. test_semaphore_exhausted_returns_429 Created a SECOND asyncio event loop with ``asyncio.new_event_loop()`` and acquired the semaphore on it, then handed the half-locked semaphore to TestClient (which runs on its OWN loop). ``asyncio.Semaphore`` doesn't promise cross-loop sanity — the waiter list is loop-bound, so acquire() on TestClient's loop saw inconsistent state under contention. Fix: use ``Semaphore(0)`` — exhausted from birth, no second loop. Plus patch ``asyncio.wait_for`` to a 0.2 s timeout in the chat router so the test goes from 60 s → 0.6 s. test_llm_command_with_server Health check accepted any 200, even when ``all_models_loaded == []``. Worse: even with a model loaded, ``gaia llm`` defaults to whatever the global default is — post-PR-#865 that's Gemma-4-E4B-it-GGUF. CI runners almost never have Gemma preloaded, so Lemonade returned 500, the OpenAI client retried with exponential backoff, and the subprocess timed out at 60 s. Fix: extend the health check to require at least one ``llm``/``vlm`` in ``all_models_loaded`` and return that model's name. The test then passes ``--model <loaded_one>`` so we don't trip the auto-load on a model the runner doesn't have. Verified: full unit suite 1630 passed / 0 failed / 15 skipped.
1 parent 322ed7c commit 54f2d74

3 files changed

Lines changed: 105 additions & 46 deletions

File tree

tests/unit/chat/ui/test_chat_concurrency.py

Lines changed: 33 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -97,34 +97,50 @@ class TestGlobalSemaphore429:
9797
"""Tests for global concurrency semaphore returning 429 Too Many Requests."""
9898

9999
def test_semaphore_exhausted_returns_429(self):
100-
"""When the global semaphore is exhausted, a request gets 429."""
101-
# Create app with semaphore of size 1
100+
"""When the global semaphore is exhausted, a request gets 429.
101+
102+
Implementation notes:
103+
104+
- We use ``Semaphore(0)`` rather than a Semaphore(1) we manually
105+
exhaust on a side event loop. The old approach tied the
106+
semaphore's internal waiter list to a loop different from the
107+
one TestClient uses, producing flaky cross-loop behaviour
108+
(sometimes 429 in 60 s, sometimes hung). A semaphore with
109+
starting value 0 is exhausted from birth and works on any loop.
110+
111+
- The endpoint waits up to ``timeout=60.0`` for the semaphore
112+
(see chat.py — long enough to cushion sequential workloads
113+
like the eval runner). We patch ``asyncio.wait_for`` to a
114+
tiny timeout so the test runs in <1 s instead of >60 s.
115+
"""
116+
from unittest.mock import patch as _patch
117+
102118
app = create_app(db_path=":memory:")
103119
db = app.state.db
104120
sid = db.create_session(title="Test")["id"]
105121

106-
# Replace semaphore with one that's already exhausted
107-
sem = asyncio.Semaphore(1)
122+
# Permanently-exhausted semaphore — no waiters needed.
123+
app.state.chat_semaphore = asyncio.Semaphore(0)
108124

109-
async def _exhaust():
110-
await sem.acquire()
125+
# Patch the in-endpoint wait_for so the test isn't dominated by
126+
# the production 60 s gate. We wrap the real wait_for and force
127+
# a short timeout; everything else (acquire, TimeoutError raise)
128+
# runs unchanged.
129+
real_wait_for = asyncio.wait_for
111130

112-
loop = asyncio.new_event_loop()
113-
loop.run_until_complete(_exhaust())
131+
async def _fast_wait_for(awaitable, timeout): # noqa: ARG001
132+
return await real_wait_for(awaitable, timeout=0.2)
114133

115-
app.state.chat_semaphore = sem
134+
with _patch("gaia.ui.routers.chat.asyncio.wait_for", _fast_wait_for):
135+
client = TestClient(app)
136+
resp = client.post(
137+
"/api/chat/send",
138+
json={"session_id": sid, "message": "blocked", "stream": False},
139+
)
116140

117-
client = TestClient(app)
118-
resp = client.post(
119-
"/api/chat/send",
120-
json={"session_id": sid, "message": "blocked", "stream": False},
121-
)
122141
assert resp.status_code == 429
123142
assert "busy" in resp.json()["detail"]
124143

125-
sem.release()
126-
loop.close()
127-
128144
def test_semaphore_released_after_non_streaming_request(self, client, session_id):
129145
"""After a non-streaming request completes, the semaphore is released."""
130146
with patch("gaia.ui.server._get_chat_response") as mock:

tests/unit/chat/ui/test_sse_confirmation.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -122,9 +122,15 @@ def run_confirm():
122122
t = threading.Thread(target=run_confirm)
123123
t.start()
124124

125-
# Wait for the confirmation to be set up
125+
# Wait for the worker to have set up _confirm_event before we resolve.
126+
# Polling _confirm_result was wrong — it's initialised to False (not
127+
# None), so ``is None`` never holds and resolve fired before the
128+
# worker registered its event, then the worker's own setup
129+
# overwrote the resolved state. _confirm_event starts at None and
130+
# is only set inside confirm_tool_execution, so polling it for
131+
# not-None correctly tracks the registration moment.
126132
deadline = time.time() + 2.0
127-
while handler._confirm_result is None and time.time() < deadline:
133+
while handler._confirm_event is None and time.time() < deadline:
128134
time.sleep(0.05)
129135

130136
handler.resolve_tool_confirmation(approved=True)
@@ -144,7 +150,7 @@ def run_confirm():
144150
t.start()
145151

146152
deadline = time.time() + 2.0
147-
while handler._confirm_result is None and time.time() < deadline:
153+
while handler._confirm_event is None and time.time() < deadline:
148154
time.sleep(0.05)
149155

150156
handler.resolve_tool_confirmation(approved=True)
@@ -173,8 +179,10 @@ def run_confirm():
173179
t = threading.Thread(target=run_confirm)
174180
t.start()
175181

182+
# See note in test_approve_returns_true: poll _confirm_event, not
183+
# _confirm_result. The latter is False from the start.
176184
deadline = time.time() + 2.0
177-
while handler._confirm_result is None and time.time() < deadline:
185+
while handler._confirm_event is None and time.time() < deadline:
178186
time.sleep(0.05)
179187

180188
handler.resolve_tool_confirmation(approved=False)

tests/unit/test_llm.py

Lines changed: 60 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -36,20 +36,50 @@ def _check_command_availability(self):
3636
return True
3737

3838
def _check_lemonade_server_health(self):
39-
"""Check if lemonade server is running and accessible."""
39+
"""Check Lemonade is up AND return the name of a loaded chat model.
40+
41+
Two failure modes the bare /health status code doesn't catch:
42+
43+
1. Lemonade up with no model loaded → ``status: ok`` but
44+
``all_models_loaded == []`` and ``/v1/chat/completions`` 500s.
45+
2. Lemonade up with a model loaded but ``gaia llm`` defaults to a
46+
DIFFERENT model (post-PR-#865 the default flipped to
47+
Gemma 4 E4B) — Lemonade then 500s on the unloaded one and the
48+
OpenAI client retries until our 60 s test timeout. CI runners
49+
rarely preload Gemma; locally most devs preload Qwen.
50+
51+
Returns the first chat-capable ``model_name`` (so the test can
52+
pass it via ``--model`` and avoid case 2), or ``None`` to skip.
53+
"""
4054
try:
4155
response = requests.get("http://localhost:13305/api/v1/health", timeout=5)
42-
if response.status_code == 200:
43-
print("OK: Lemonade server health check passed")
44-
return True
45-
else:
56+
if response.status_code != 200:
4657
print(
4758
f"ERROR: Lemonade server health check failed with status: {response.status_code}"
4859
)
49-
return False
60+
return None
61+
data = response.json()
62+
chat_models = [
63+
m
64+
for m in data.get("all_models_loaded", [])
65+
if m.get("type") in ("llm", "vlm")
66+
]
67+
if not chat_models:
68+
print(
69+
"ERROR: Lemonade is up but no chat-capable model is loaded "
70+
f"(all_models_loaded={data.get('all_models_loaded')}). "
71+
"Cannot exercise the LLM CLI without a model — skipping."
72+
)
73+
return None
74+
chosen = chat_models[0].get("model_name")
75+
print(
76+
"OK: Lemonade server health check passed; chat models loaded: "
77+
f"{[m.get('model_name') for m in chat_models]}; using {chosen}"
78+
)
79+
return chosen
5080
except requests.exceptions.RequestException as e:
5181
print(f"ERROR: Lemonade server health check failed with exception: {e}")
52-
return False
82+
return None
5383

5484
def test_llm_command_with_server(self):
5585
"""Test LLM command with running server."""
@@ -58,28 +88,33 @@ def test_llm_command_with_server(self):
5888
if not self._check_command_availability():
5989
self.skipTest("gaia command is not available")
6090

61-
# Check if server is accessible
62-
if not self._check_lemonade_server_health():
63-
self.skipTest("Lemonade server is not running")
91+
# Check if server is accessible AND has a chat model loaded.
92+
# Returns the loaded model name so we can pass it via --model and
93+
# avoid the "default model not loaded" 500-retry hang.
94+
loaded_model = self._check_lemonade_server_health()
95+
if loaded_model is None:
96+
self.skipTest("Lemonade server is not running or no chat model is loaded")
6497

6598
try:
66-
# Test with explicit --base-url (without /api/v1 to test normalization)
67-
print(
68-
"Executing command: gaia llm 'What is 1+1?' --max-tokens 20 --base-url http://localhost:13305"
69-
)
99+
# Test with explicit --base-url (without /api/v1 to test
100+
# normalization) and --model targeting the actually-loaded
101+
# checkpoint (so we don't trip Lemonade's auto-load on a
102+
# different default which otherwise 500-retries to timeout).
103+
cmd = [
104+
"gaia",
105+
"llm",
106+
"What is 1+1?",
107+
"--max-tokens",
108+
"20",
109+
"--base-url",
110+
"http://localhost:13305",
111+
"--model",
112+
loaded_model,
113+
]
114+
print(f"Executing command: {' '.join(cmd)}")
70115

71-
# Test the LLM command with explicit --base-url
72-
# This validates both the CLI arg and the base_url normalization
73116
result = subprocess.run(
74-
[
75-
"gaia",
76-
"llm",
77-
"What is 1+1?",
78-
"--max-tokens",
79-
"20",
80-
"--base-url",
81-
"http://localhost:13305",
82-
],
117+
cmd,
83118
capture_output=True,
84119
text=True,
85120
timeout=60,

0 commit comments

Comments
 (0)