|
26 | 26 | import pytest |
27 | 27 |
|
28 | 28 | from gaia.agents.email.config import ConfigurationError, EmailAgentConfig |
| 29 | +from gaia.llm.lemonade_client import MODELS, ModelType |
29 | 30 |
|
30 | 31 |
|
31 | 32 | class TestNoCloudLlmFields: |
@@ -119,3 +120,174 @@ def test_default_db_path_under_home(self, tmp_path, monkeypatch): |
119 | 120 | def test_explicit_db_path_overrides(self, tmp_path): |
120 | 121 | cfg = EmailAgentConfig(db_path=str(tmp_path / "x.db")) |
121 | 122 | assert cfg.resolved_db_path() == str(tmp_path / "x.db") |
| 123 | + |
| 124 | + |
class TestGemmaE2BCatalogEntry:
    """The Gemma-4 E2B model MUST be registered in the MODELS catalog so the
    email agent can select it without falling back to the larger E4B model.

    Issue #1282: register E2B as a first-class catalog option.
    """

    E2B_KEY = "gemma-4-e2b"
    # The NPU-native FastFlowLM build (checkpoint gemma4-it:e2b), validated on
    # the Strix Halo box (device=npu, recipe=flm). NOT the llama.cpp GGUF
    # variant — only the FLM build runs on the NPU (issue #1282).
    E2B_MODEL_ID = "gemma4-it-e2b-FLM"
    # Context window the FLM build serves on the NPU (validated on the Strix
    # Halo box). The catalog's min_ctx_size must equal this: a larger value
    # would be a requirement the hardware cannot satisfy.
    NPU_CTX_SIZE = 4096

    def test_e2b_key_exists_in_catalog(self):
        """``gemma-4-e2b`` key must be present in MODELS."""
        assert self.E2B_KEY in MODELS, (
            f"{self.E2B_KEY!r} not found in MODELS — did you add the catalog "
            "entry in src/gaia/llm/lemonade_client.py? (issue #1282)"
        )

    def test_e2b_model_id_is_flm_npu_build(self):
        """model_id must be the NPU-native FLM build (validated on hardware)."""
        req = MODELS[self.E2B_KEY]
        assert req.model_id == self.E2B_MODEL_ID, (
            f"Expected model_id={self.E2B_MODEL_ID!r}, got {req.model_id!r}. "
            "This must be the FLM (NPU) build, not the GGUF (llama.cpp) variant "
            "— only FLM runs on the Strix Halo NPU. Verify against the box's "
            "`/api/v1/models` before changing this."
        )

    def test_e2b_is_llm_type(self):
        """The E2B entry must be an LLM (not VLM/embed) for email triage."""
        req = MODELS[self.E2B_KEY]
        assert (
            req.model_type == ModelType.LLM
        ), f"Expected model_type=ModelType.LLM, got {req.model_type!r}"

    def test_e2b_tool_calling_disabled_for_flm_build(self):
        """The FLM/NPU build does NOT serve native OpenAI tool calls.

        Verified on Strix Halo hardware: passing an OpenAI ``tools`` payload to
        the FLM server 500-errors ("type must be string, but is object"). So
        this entry must declare ``tool_calling=False`` — the agent then uses the
        embedded-JSON tool path. Email triage itself parses a JSON object from a
        plain completion, so it is unaffected either way.
        """
        req = MODELS[self.E2B_KEY]
        assert req.tool_calling is False, (
            f"{self.E2B_KEY} is the FLM/NPU build, which 500-errors on a native "
            "tools payload — it must declare tool_calling=False so the agent "
            "uses the embedded-JSON tool path."
        )

    def test_e2b_min_ctx_size_matches_npu_window(self):
        """min_ctx_size must match what the E2B/FLM NPU build actually serves.

        Validated on the Strix Halo box: the FLM build serves ctx_size=4096.
        The triage classifier clips email bodies to 4000 chars, so a single
        email + the triage system prompt fit in the 4 K window. Asserting a
        larger minimum than the hardware serves would be a faked requirement
        that the model can't satisfy, so the check is an equality: a plain
        ``>=`` would silently accept exactly that faked requirement.
        """
        req = MODELS[self.E2B_KEY]
        assert req.min_ctx_size == self.NPU_CTX_SIZE, (
            f"min_ctx_size={req.min_ctx_size} does not match the "
            f"{self.NPU_CTX_SIZE}-token NPU window the FLM build serves."
        )

    def test_e2b_display_name_set(self):
        """display_name must be a non-empty string."""
        req = MODELS[self.E2B_KEY]
        assert (
            isinstance(req.display_name, str) and req.display_name.strip()
        ), f"{self.E2B_KEY}.display_name is empty — set a human-readable label."

    def test_email_agent_config_accepts_e2b_model_id(self):
        """``EmailAgentConfig(model_id=<e2b_id>)`` must be constructable and
        pass the base_url allowlist check (no ``base_url`` means local default).
        """
        cfg = EmailAgentConfig(model_id=self.E2B_MODEL_ID)
        cfg.validate()  # MUST NOT raise — default base_url is None (local)
        assert cfg.model_id == self.E2B_MODEL_ID

    def test_email_agent_config_e2b_rejects_cloud_base_url(self):
        """Specifying the E2B model_id must not open a path to a cloud LLM.

        The AC3 allowlist must still block cloud ``base_url`` even when the
        caller explicitly requests the E2B model.
        """
        cfg = EmailAgentConfig(
            model_id=self.E2B_MODEL_ID,
            base_url="https://api.openai.com/v1",
        )
        # ``match`` does a regex search on the exception text, so the error
        # must mention the AC3 rule that rejected the URL.
        with pytest.raises(ConfigurationError, match="AC3"):
            cfg.validate()
| 221 | + |
| 222 | + |
class TestGemmaE2BLazyDownload:
    """Registering the E2B model in the catalog MUST NOT trigger a model
    download at import / config-construction time. The download must remain
    lazy — deferred to first use via ``_ensure_model_loaded`` /
    ``_preload_on_idle_server``.

    This protects the critical install path: ``gaia init`` and a fresh
    ``import gaia.llm.lemonade_client`` MUST NOT pull multi-GB weights.
    """

    # Upper bound for the isolated import probe. Generous for a cold import,
    # but finite so a blocking import-time call fails the test instead of
    # hanging the whole session.
    IMPORT_PROBE_TIMEOUT_S = 120

    def test_importing_lemonade_client_does_no_network_or_subprocess(self):
        """Importing the module MUST cross no network/subprocess boundary.

        Guards the #1282 AC "no large download in the critical install path":
        merely declaring the E2B entry in ``MODELS`` must not pull weights or
        probe the server at import time. The import runs in an ISOLATED
        subprocess with the real chokepoints a download/server-spawn must
        cross — ``requests`` (the HTTP adapter every ``requests`` call funnels
        through) and ``subprocess`` (``Popen``/``run`` for the server) —
        instrumented to fail loudly. Running it out-of-process is deliberate:
        an in-process ``sys.modules`` pop + re-import rebuilds the module's
        classes and corrupts their identity for every later test in the
        session. A fresh interpreter also makes the guard stronger (a brand-new
        class object means patching the module's own ``load_model`` would be
        vacuous).

        Raises:
            subprocess.TimeoutExpired: if the probe interpreter does not exit
                within ``IMPORT_PROBE_TIMEOUT_S`` seconds.
        """
        import subprocess
        import sys
        import textwrap

        probe = textwrap.dedent("""
            import requests.adapters
            import subprocess

            def _boom(*_a, **_k):
                raise AssertionError("import-time network/subprocess call")

            requests.adapters.HTTPAdapter.send = _boom
            subprocess.Popen = _boom
            subprocess.run = _boom

            import gaia.llm.lemonade_client  # noqa: F401 — must not trip _boom
            print("IMPORT_OK")
            """)
        # The child inherits this process's environment by default, so the
        # probe resolves ``gaia`` the same way the test session does.
        result = subprocess.run(
            [sys.executable, "-c", probe],
            capture_output=True,
            text=True,
            check=False,
            timeout=self.IMPORT_PROBE_TIMEOUT_S,
        )
        # A non-zero exit can also be a plain import failure, so the message
        # names both possibilities and includes the return code.
        assert result.returncode == 0 and "IMPORT_OK" in result.stdout, (
            "importing gaia.llm.lemonade_client failed or triggered an "
            "import-time network/subprocess call (the install path must stay "
            f"lazy):\nreturncode={result.returncode}\n"
            f"stdout={result.stdout!r}\nstderr={result.stderr!r}"
        )

    def test_email_agent_config_construction_does_not_call_load_model(
        self, monkeypatch
    ):
        """``EmailAgentConfig(model_id=<e2b>)`` must not trigger a download.

        Construction is purely a dataclass assignment; the download happens
        later when the agent's LLM client first sends a request.
        """
        import unittest.mock as mock

        # Reuse the catalog test's constant so the two classes cannot drift
        # apart on the model id.
        e2b_model_id = TestGemmaE2BCatalogEntry.E2B_MODEL_ID

        with mock.patch("gaia.llm.lemonade_client.LemonadeClient.load_model") as m:
            cfg = EmailAgentConfig(model_id=e2b_model_id)
            cfg.validate()
            m.assert_not_called()
0 commit comments