feat(email): on-device E2B (NPU/FLM) model integration (#1282) (#1433)

itomek · web-flow · commit c8b9aefe5fcf · 2026-06-04T15:49:08.000Z
Closes #1282

Users on Ryzen AI NPU hardware couldn't pick the lighter, faster on-device model for email triage — only the larger E4B was in the catalog. Now `gemma4-it-e2b-FLM` (the NPU-native FastFlowLM build) is a first-class catalog model, selectable via `EmailAgentConfig(model_id="gemma4-it-e2b-FLM")`.

Validated on real Strix Halo NPU hardware: `device=npu`, `recipe=flm`, served at `:13305`, ~24 tok/s decode / ~1s TTFT. The model downloads lazily on first use, so it never enters the install path — confirmed it is NOT pulled by `gaia init --profile all`.

## Test plan

- [ ] `python -m pytest tests/unit/agents/test_email_agent_local_llm_enforcement.py -q` — catalog entry (FLM id, ctx 4096), local-only (AC3) enforcement, and a real no-import-time-download guard
- [ ] On a Strix Halo NPU box: load `gemma4-it-e2b-FLM` via Lemonade; confirm `device=npu` / `recipe=flm` in `/api/v1/health`; run `LEMONADE_MODEL=gemma4-it-e2b-FLM python -m pytest tests/integration/test_email_bench_throughput.py`
diff --git a/src/gaia/llm/lemonade_client.py b/src/gaia/llm/lemonade_client.py
@@ -214,6 +214,28 @@ class LemonadeStatus:
         min_ctx_size=65536,
         tool_calling=True,
     ),
+    # --- Gemma 4 E2B: primary on-device NPU model for email triage ---
+    # Issue #1282. This is the NPU-native FastFlowLM build (checkpoint
+    # ``gemma4-it:e2b``), NOT the llama.cpp GGUF variant — only the FLM build
+    # runs on the Strix Halo NPU. Validated on hardware: device=npu,
+    # recipe=flm, served at :13305, ctx_size=4096 (the NPU window). The triage
+    # classifier clips email bodies to 4000 chars, so a single email + the
+    # triage system prompt fit in that window. An accuracy baseline for the FLM
+    # build is still a follow-up: the existing baseline_accuracy_e2b.json was
+    # recorded on the GGUF build, a different variant.
+    # tool_calling=False: unlike the GGUF builds (native tool calls via
+    # --jinja), the FLM/NPU server 500-errors on an OpenAI ``tools`` payload
+    # ("type must be string, but is object" — verified on hardware). The agent
+    # therefore uses the embedded-JSON tool path for this model. Email triage
+    # itself parses a JSON object from a plain completion (no native tool
+    # calls), so triage is unaffected.
+    "gemma-4-e2b": ModelRequirement(
+        model_type=ModelType.LLM,
+        model_id="gemma4-it-e2b-FLM",
+        display_name="Gemma 4 E2B (NPU/FLM)",
+        min_ctx_size=4096,
+        tool_calling=False,
+    ),
     # --- Legacy Qwen models: kept so existing pinned sessions/configs don't break ---
     "qwen3.5-35b": ModelRequirement(
         model_type=ModelType.LLM,
diff --git a/tests/unit/agents/test_email_agent_local_llm_enforcement.py b/tests/unit/agents/test_email_agent_local_llm_enforcement.py
@@ -26,6 +26,7 @@
 import pytest
 
 from gaia.agents.email.config import ConfigurationError, EmailAgentConfig
+from gaia.llm.lemonade_client import MODELS, ModelType
 
 
 class TestNoCloudLlmFields:
@@ -119,3 +120,174 @@ def test_default_db_path_under_home(self, tmp_path, monkeypatch):
     def test_explicit_db_path_overrides(self, tmp_path):
         cfg = EmailAgentConfig(db_path=str(tmp_path / "x.db"))
         assert cfg.resolved_db_path() == str(tmp_path / "x.db")
+
+
+class TestGemmaE2BCatalogEntry:
+    """The Gemma-4 E2B model MUST be registered in the MODELS catalog so the
+    email agent can select it without falling back to the larger E4B model.
+
+    Issue #1282: register E2B as a first-class catalog option.
+    """
+
+    E2B_KEY = "gemma-4-e2b"
+    # The NPU-native FastFlowLM build (checkpoint gemma4-it:e2b), validated on
+    # the Strix Halo box (device=npu, recipe=flm). NOT the llama.cpp GGUF
+    # variant — only the FLM build runs on the NPU (issue #1282).
+    E2B_MODEL_ID = "gemma4-it-e2b-FLM"
+
+    def test_e2b_key_exists_in_catalog(self):
+        """``gemma-4-e2b`` key must be present in MODELS."""
+        assert self.E2B_KEY in MODELS, (
+            f"{self.E2B_KEY!r} not found in MODELS — did you add the catalog entry "
+            "in src/gaia/llm/lemonade_client.py? (issue #1282)"
+        )
+
+    def test_e2b_model_id_is_flm_npu_build(self):
+        """model_id must be the NPU-native FLM build (validated on hardware)."""
+        req = MODELS[self.E2B_KEY]
+        assert req.model_id == self.E2B_MODEL_ID, (
+            f"Expected model_id={self.E2B_MODEL_ID!r}, got {req.model_id!r}. "
+            "This must be the FLM (NPU) build, not the GGUF (llama.cpp) variant "
+            "— only FLM runs on the Strix Halo NPU. Verify against the box's "
+            "`/api/v1/models` before changing this."
+        )
+
+    def test_e2b_is_llm_type(self):
+        """The E2B entry must be an LLM (not VLM/embed) for email triage."""
+        req = MODELS[self.E2B_KEY]
+        assert (
+            req.model_type == ModelType.LLM
+        ), f"Expected model_type=ModelType.LLM, got {req.model_type!r}"
+
+    def test_e2b_tool_calling_disabled_for_flm_build(self):
+        """The FLM/NPU build does NOT serve native OpenAI tool calls.
+
+        Verified on Strix Halo hardware: passing an OpenAI ``tools`` payload to
+        the FLM server 500-errors ("type must be string, but is object"). So
+        this entry must declare ``tool_calling=False`` — the agent then uses the
+        embedded-JSON tool path. Email triage itself parses a JSON object from a
+        plain completion, so it is unaffected either way.
+        """
+        req = MODELS[self.E2B_KEY]
+        assert req.tool_calling is False, (
+            "gemma-4-e2b is the FLM/NPU build, which 500-errors on a native "
+            "tools payload — it must declare tool_calling=False so the agent "
+            "uses the embedded-JSON tool path."
+        )
+
+    def test_e2b_min_ctx_size_matches_npu_window(self):
+        """min_ctx_size must match what the E2B/FLM NPU build actually serves.
+
+        Validated on the Strix Halo box: the FLM build serves ctx_size=4096.
+        The triage classifier clips email bodies to 4000 chars, so a single
+        email + the triage system prompt fit in the 4 K window. The match is
+        exact on purpose: a larger minimum than the hardware serves would be a
+        faked requirement that the model can't satisfy.
+        """
+        req = MODELS[self.E2B_KEY]
+        assert req.min_ctx_size == 4096, (
+            f"min_ctx_size={req.min_ctx_size} does not match the 4 K NPU window "
+            "the FLM build serves (expected exactly 4096)."
+        )
+
+    def test_e2b_display_name_set(self):
+        """display_name must be a non-empty string."""
+        req = MODELS[self.E2B_KEY]
+        assert (
+            isinstance(req.display_name, str) and req.display_name.strip()
+        ), "gemma-4-e2b.display_name is empty — set a human-readable label."
+
+    def test_email_agent_config_accepts_e2b_model_id(self):
+        """``EmailAgentConfig(model_id=<e2b_id>)`` must be constructable and
+        pass the base_url allowlist check (no ``base_url`` means local default).
+        """
+        cfg = EmailAgentConfig(model_id=self.E2B_MODEL_ID)
+        cfg.validate()  # MUST NOT raise — default base_url is None (local)
+        assert cfg.model_id == self.E2B_MODEL_ID
+
+    def test_email_agent_config_e2b_rejects_cloud_base_url(self):
+        """Specifying the E2B model_id must not open a path to a cloud LLM.
+
+        The AC3 allowlist must still block cloud ``base_url`` even when the
+        caller explicitly requests the E2B model.
+        """
+        cfg = EmailAgentConfig(
+            model_id=self.E2B_MODEL_ID,
+            base_url="https://api.openai.com/v1",
+        )
+        with pytest.raises(ConfigurationError) as exc:
+            cfg.validate()
+        assert "AC3" in str(exc.value)
+
+
+class TestGemmaE2BLazyDownload:
+    """Registering the E2B model in the catalog MUST NOT trigger a model
+    download at import / config-construction time.  The download must remain
+    lazy — deferred to first use via ``_ensure_model_loaded`` /
+    ``_preload_on_idle_server``.
+
+    This protects the critical install path: ``gaia init`` and a fresh
+    ``import gaia.llm.lemonade_client`` MUST NOT pull multi-GB weights.
+    """
+
+    E2B_MODEL_ID = "gemma4-it-e2b-FLM"  # NPU-native FLM build (issue #1282)
+
+    def test_importing_lemonade_client_does_no_network_or_subprocess(self):
+        """Importing the module MUST cross no network/subprocess boundary.
+
+        Guards the #1282 AC "no large download in the critical install path":
+        merely declaring the E2B entry in ``MODELS`` must not pull weights or
+        probe the server at import time. The import runs in an ISOLATED
+        subprocess with the real chokepoints a download/server-spawn must
+        cross — ``requests`` (the HTTP adapter every ``requests`` call funnels
+        through) and ``subprocess`` (``Popen``/``run`` for the server) —
+        instrumented to fail loudly. Running it out-of-process is deliberate:
+        an in-process ``sys.modules`` pop + re-import rebuilds the module's
+        classes and corrupts their identity for every later test in the
+        session. A fresh interpreter also makes the guard stronger (a brand-new
+        class object means patching the module's own ``load_model`` would be
+        vacuous).
+        """
+        import subprocess
+        import sys
+        import textwrap
+
+        probe = textwrap.dedent("""
+            import requests.adapters
+            import subprocess
+
+            def _boom(*_a, **_k):
+                raise AssertionError("import-time network/subprocess call")
+
+            requests.adapters.HTTPAdapter.send = _boom
+            subprocess.Popen = _boom
+            subprocess.run = _boom
+
+            import gaia.llm.lemonade_client  # noqa: F401 — must not trip _boom
+            print("IMPORT_OK")
+            """)
+        # Bounded so a hung import fails the test instead of stalling the suite.
+        result = subprocess.run(
+            [sys.executable, "-c", probe],
+            capture_output=True,
+            text=True,
+            timeout=120,
+        )
+        assert result.returncode == 0 and "IMPORT_OK" in result.stdout, (
+            "importing gaia.llm.lemonade_client triggered an import-time "
+            "network/subprocess call (the install path must stay lazy):\n"
+            f"stdout={result.stdout!r}\nstderr={result.stderr!r}"
+        )
+
+    def test_email_agent_config_construction_does_not_call_load_model(self):
+        """``EmailAgentConfig(model_id=<e2b>)`` must not trigger a download.
+
+        Construction is purely a dataclass assignment; the download happens
+        later when the agent's LLM client first sends a request.
+        """
+        import unittest.mock as mock
+
+        with mock.patch("gaia.llm.lemonade_client.LemonadeClient.load_model") as m:
+            cfg = EmailAgentConfig(model_id=self.E2B_MODEL_ID)
+            cfg.validate()
+        m.assert_not_called()