Skip to content

Commit c8b9aef

Browse files
authored
feat(email): on-device E2B (NPU/FLM) model integration (#1282) (#1433)
Closes #1282

Users on Ryzen AI NPU hardware couldn't pick the lighter, faster on-device model for email triage — only the larger E4B was in the catalog. Now `gemma4-it-e2b-FLM` (the NPU-native FastFlowLM build) is a first-class catalog model, selectable via `EmailAgentConfig(model_id="gemma4-it-e2b-FLM")`.

Validated on real Strix Halo NPU hardware: `device=npu`, `recipe=flm`, served at `:13305`, ~24 tok/s decode / ~1s TTFT. The model downloads lazily on first use, so it never enters the install path — confirmed it is NOT pulled by `gaia init --profile all`.

## Test plan

- [ ] `python -m pytest tests/unit/agents/test_email_agent_local_llm_enforcement.py -q` — catalog entry (FLM id, ctx 4096), local-only (AC3) enforcement, and a real no-import-time-download guard
- [ ] On a Strix Halo NPU box: load `gemma4-it-e2b-FLM` via Lemonade; confirm `device=npu` / `recipe=flm` in `/api/v1/health`; run `LEMONADE_MODEL=gemma4-it-e2b-FLM python -m pytest tests/integration/test_email_bench_throughput.py`
1 parent e3a67dd commit c8b9aef

2 files changed

Lines changed: 194 additions & 0 deletions

File tree

src/gaia/llm/lemonade_client.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,28 @@ class LemonadeStatus:
214214
min_ctx_size=65536,
215215
tool_calling=True,
216216
),
217+
# --- Gemma 4 E2B: primary on-device NPU model for email triage ---
218+
# Issue #1282. This is the NPU-native FastFlowLM build (checkpoint
219+
# ``gemma4-it:e2b``), NOT the llama.cpp GGUF variant — only the FLM build
220+
# runs on the Strix Halo NPU. Validated on hardware: device=npu,
221+
# recipe=flm, served at :13305, ctx_size=4096 (the NPU window). The triage
222+
# classifier clips email bodies to 4000 chars, so a single email + the
223+
# triage system prompt fit. The E2B *FLM* accuracy baseline is a follow-up:
224+
# baseline_accuracy_e2b.json was recorded on the GGUF build, a different
225+
# variant.
226+
# tool_calling=False: unlike the GGUF builds (native tool calls via
227+
# --jinja), the FLM/NPU server 500-errors on an OpenAI ``tools`` payload
228+
# ("type must be string, but is object" — verified on hardware). The agent
229+
# therefore uses the embedded-JSON tool path for this model. Email triage
230+
# itself parses a JSON object from a plain completion (no native tool
231+
# calls), so triage is unaffected.
232+
"gemma-4-e2b": ModelRequirement(
233+
model_type=ModelType.LLM,
234+
model_id="gemma4-it-e2b-FLM",
235+
display_name="Gemma 4 E2B (NPU/FLM)",
236+
min_ctx_size=4096,
237+
tool_calling=False,
238+
),
217239
# --- Legacy Qwen models: kept so existing pinned sessions/configs don't break ---
218240
"qwen3.5-35b": ModelRequirement(
219241
model_type=ModelType.LLM,

tests/unit/agents/test_email_agent_local_llm_enforcement.py

Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
import pytest
2727

2828
from gaia.agents.email.config import ConfigurationError, EmailAgentConfig
29+
from gaia.llm.lemonade_client import MODELS, ModelType
2930

3031

3132
class TestNoCloudLlmFields:
@@ -119,3 +120,174 @@ def test_default_db_path_under_home(self, tmp_path, monkeypatch):
119120
def test_explicit_db_path_overrides(self, tmp_path):
120121
cfg = EmailAgentConfig(db_path=str(tmp_path / "x.db"))
121122
assert cfg.resolved_db_path() == str(tmp_path / "x.db")
123+
124+
125+
class TestGemmaE2BCatalogEntry:
    """The Gemma-4 E2B model MUST be registered in the MODELS catalog so the
    email agent can select it without falling back to the larger E4B model.

    Issue #1282: register E2B as a first-class catalog option.
    """

    E2B_KEY = "gemma-4-e2b"
    # The NPU-native FastFlowLM build (checkpoint gemma4-it:e2b), validated on
    # the Strix Halo box (device=npu, recipe=flm). NOT the llama.cpp GGUF
    # variant — only the FLM build runs on the NPU (issue #1282).
    E2B_MODEL_ID = "gemma4-it-e2b-FLM"
    # Context window the FLM build was observed to serve on the NPU box. The
    # catalog's ``min_ctx_size`` must equal this: a smaller value understates
    # the window, a larger one is a requirement the hardware cannot satisfy.
    E2B_NPU_CTX_SIZE = 4096

    def test_e2b_key_exists_in_catalog(self):
        """``gemma-4-e2b`` key must be present in MODELS."""
        assert self.E2B_KEY in MODELS, (
            f"{self.E2B_KEY!r} not found in MODELS — did you add the catalog entry in "
            "src/gaia/llm/lemonade_client.py? (issue #1282)"
        )

    def test_e2b_model_id_is_flm_npu_build(self):
        """model_id must be the NPU-native FLM build (validated on hardware)."""
        req = MODELS[self.E2B_KEY]
        assert req.model_id == self.E2B_MODEL_ID, (
            f"Expected model_id={self.E2B_MODEL_ID!r}, got {req.model_id!r}. "
            "This must be the FLM (NPU) build, not the GGUF (llama.cpp) variant "
            "— only FLM runs on the Strix Halo NPU. Verify against the box's "
            "`/api/v1/models` before changing this."
        )

    def test_e2b_is_llm_type(self):
        """The E2B entry must be an LLM (not VLM/embed) for email triage."""
        req = MODELS[self.E2B_KEY]
        assert (
            req.model_type == ModelType.LLM
        ), f"Expected model_type=ModelType.LLM, got {req.model_type!r}"

    def test_e2b_tool_calling_disabled_for_flm_build(self):
        """The FLM/NPU build does NOT serve native OpenAI tool calls.

        Verified on Strix Halo hardware: passing an OpenAI ``tools`` payload to
        the FLM server 500-errors ("type must be string, but is object"). So
        this entry must declare ``tool_calling=False`` — the agent then uses the
        embedded-JSON tool path. Email triage itself parses a JSON object from a
        plain completion, so it is unaffected either way.
        """
        req = MODELS[self.E2B_KEY]
        assert req.tool_calling is False, (
            "gemma-4-e2b is the FLM/NPU build, which 500-errors on a native "
            "tools payload — it must declare tool_calling=False so the agent "
            "uses the embedded-JSON tool path."
        )

    def test_e2b_min_ctx_size_matches_npu_window(self):
        """min_ctx_size must match what the E2B/FLM NPU build actually serves.

        Validated on the Strix Halo box: the FLM build serves ctx_size=4096.
        The triage classifier clips email bodies to 4000 chars, so a single
        email + the triage system prompt fit in the 4 K window. Asserting a
        larger minimum than the hardware serves would be a faked requirement
        that the model can't satisfy, so the check is an exact match rather
        than a lower bound.
        """
        req = MODELS[self.E2B_KEY]
        assert req.min_ctx_size == self.E2B_NPU_CTX_SIZE, (
            f"min_ctx_size={req.min_ctx_size} does not match the "
            f"{self.E2B_NPU_CTX_SIZE}-token NPU window the FLM build serves."
        )

    def test_e2b_display_name_set(self):
        """display_name must be a non-empty string."""
        req = MODELS[self.E2B_KEY]
        assert (
            isinstance(req.display_name, str) and req.display_name.strip()
        ), "gemma-4-e2b.display_name is empty — set a human-readable label."

    def test_email_agent_config_accepts_e2b_model_id(self):
        """``EmailAgentConfig(model_id=<e2b_id>)`` must be constructable and
        pass the base_url allowlist check (no ``base_url`` means local default).
        """
        cfg = EmailAgentConfig(model_id=self.E2B_MODEL_ID)
        cfg.validate()  # MUST NOT raise — default base_url is None (local)
        assert cfg.model_id == self.E2B_MODEL_ID

    def test_email_agent_config_e2b_rejects_cloud_base_url(self):
        """Specifying the E2B model_id must not open a path to a cloud LLM.

        The AC3 allowlist must still block cloud ``base_url`` even when the
        caller explicitly requests the E2B model.
        """
        cfg = EmailAgentConfig(
            model_id=self.E2B_MODEL_ID,
            base_url="https://api.openai.com/v1",
        )
        with pytest.raises(ConfigurationError) as exc:
            cfg.validate()
        assert "AC3" in str(exc.value)
221+
222+
223+
class TestGemmaE2BLazyDownload:
    """Registering the E2B model in the catalog MUST NOT trigger a model
    download at import / config-construction time. The download must remain
    lazy — deferred to first use via ``_ensure_model_loaded`` /
    ``_preload_on_idle_server``.

    This protects the critical install path: ``gaia init`` and a fresh
    ``import gaia.llm.lemonade_client`` MUST NOT pull multi-GB weights.
    """

    # Lemonade id of the NPU-native FLM build under test (issue #1282).
    E2B_MODEL_ID = "gemma4-it-e2b-FLM"
    # Upper bound for the isolated import probe. Without it, an import that
    # blocks (rather than raising) would hang the whole test session.
    IMPORT_PROBE_TIMEOUT_S = 120

    def test_importing_lemonade_client_does_no_network_or_subprocess(self):
        """Importing the module MUST cross no network/subprocess boundary.

        Guards the #1282 AC "no large download in the critical install path":
        merely declaring the E2B entry in ``MODELS`` must not pull weights or
        probe the server at import time. The import runs in an ISOLATED
        subprocess with the real chokepoints a download/server-spawn must
        cross — ``requests`` (the HTTP adapter every ``requests`` call funnels
        through) and ``subprocess`` (``Popen``/``run`` for the server) —
        instrumented to fail loudly. Running it out-of-process is deliberate:
        an in-process ``sys.modules`` pop + re-import rebuilds the module's
        classes and corrupts their identity for every later test in the
        session. A fresh interpreter also makes the guard stronger (a brand-new
        class object means patching the module's own ``load_model`` would be
        vacuous).

        The child is bounded by ``IMPORT_PROBE_TIMEOUT_S``; if the import
        blocks, ``subprocess.run`` raises ``TimeoutExpired`` and the test
        fails instead of hanging.
        """
        import os
        import subprocess
        import sys
        import textwrap

        probe = textwrap.dedent("""
            import requests.adapters
            import subprocess

            def _boom(*_a, **_k):
                raise AssertionError("import-time network/subprocess call")

            requests.adapters.HTTPAdapter.send = _boom
            subprocess.Popen = _boom
            subprocess.run = _boom

            import gaia.llm.lemonade_client  # noqa: F401 — must not trip _boom
            print("IMPORT_OK")
            """)
        result = subprocess.run(
            [sys.executable, "-c", probe],
            capture_output=True,
            text=True,
            env=os.environ.copy(),
            timeout=self.IMPORT_PROBE_TIMEOUT_S,
        )
        assert result.returncode == 0 and "IMPORT_OK" in result.stdout, (
            "importing gaia.llm.lemonade_client triggered an import-time "
            "network/subprocess call (the install path must stay lazy):\n"
            f"stdout={result.stdout!r}\nstderr={result.stderr!r}"
        )

    def test_email_agent_config_construction_does_not_call_load_model(
        self, monkeypatch
    ):
        """``EmailAgentConfig(model_id=<e2b>)`` must not trigger a download.

        Construction is purely a dataclass assignment; the download happens
        later when the agent's LLM client first sends a request.
        ``LemonadeClient.load_model`` is swapped for a recording mock through
        the ``monkeypatch`` fixture, which restores the real method at
        teardown.
        """
        import unittest.mock as mock

        load_model_spy = mock.MagicMock(name="LemonadeClient.load_model")
        monkeypatch.setattr(
            "gaia.llm.lemonade_client.LemonadeClient.load_model", load_model_spy
        )

        cfg = EmailAgentConfig(model_id=self.E2B_MODEL_ID)
        cfg.validate()
        load_model_spy.assert_not_called()

0 commit comments

Comments
 (0)