Merge pull request #2 from suharvest/port-back-vision-tag-stripping-and-api-fix

Love4yzp · web-flow · commit 669b4301b8c1 · 2026-06-04T14:52:38.000+08:00
fix(robot): repair /api/ai/commands + port back vision-tag stripping
diff --git a/pyproject.toml b/pyproject.toml
@@ -68,7 +68,11 @@ override-dependencies = [
 ]
 
 [tool.uv.sources]
-openvoicestream-agent = { path = "../seeed-local-voice/agent", editable = true }
+# Use the in-repo torch-free wheel so the project resolves on any checkout
+# (CI, fresh clones, image builds). For live editing of the agent source,
+# point this back at an editable sibling path, e.g.
+#   { path = "../seeed-local-voice/agent", editable = true }
+openvoicestream-agent = { path = "deploy/jetson/reachy/vendor/openvoicestream_agent-0.1.0-py3-none-any.whl" }
 
 [tool.pytest.ini_options]
 asyncio_mode = "auto"
diff --git a/src/reachy_claw/clientloop/proof_engine_e2e.py b/src/reachy_claw/clientloop/proof_engine_e2e.py
@@ -205,7 +205,6 @@ async def run(wav: str, config_path: Path, timeout_s: float) -> int:
 
     # Instrument the SLV client to flag asr_final + tts_started as they
     # cross the WS transport (irrefutable "went through the engine").
-    from ovs_agent.slv_client import ASRFinal, TTSStarted  # local import
 
     orig_handle_json = app.slv._handle_json
 
diff --git a/src/reachy_claw/edge_llm.py b/src/reachy_claw/edge_llm.py
@@ -252,35 +252,45 @@ async def _stream_chat(self, user_text: str) -> None:
             payload["prefix_cache"] = True
 
         full_text = ""
-        # Streaming-safe emotion-tag stripper. LLM tokens often split a
-        # tag across deltas (e.g. "[", "curious", "]"), so the per-delta
-        # regex .sub() misses them and the tag leaks into V2V TTS. Buffer
-        # text once we see a "[" until the matching "]" arrives, then
-        # decide: looks like an emotion tag → drop; anything else → flush
-        # as plain text.
+        # Streaming-safe tag stripper. LLM tokens often split a tag
+        # across deltas (e.g. "[", "curious", "]"), so the per-delta
+        # regex .sub() misses them and the tag leaks into V2V TTS.
+        # Buffer text once we see a "[" until the matching "]" arrives,
+        # then decide:
+        #   - [word]              → emotion tag, drop
+        #   - [Faces: ...]        → echoed vision context, drop
+        #   - anything else       → flush as plain text
+        # After a dropped tag, also eat one immediately following space
+        # or tab so "[Faces: X] hi" → "hi" (no leading space).
         tag_buf = ""
+        eat_space = False
 
         def _consume(delta_in: str) -> str:
-            nonlocal tag_buf
+            nonlocal tag_buf, eat_space
             if self._config.skip_emotion_extraction:
                 return delta_in
             out_chars: list[str] = []
             for ch in delta_in:
+                if eat_space:
+                    eat_space = False
+                    if ch in (" ", "\t"):
+                        continue
                 if tag_buf:
                     tag_buf += ch
                     if ch == "]":
-                        # Emit only if it doesn't look like a tag —
-                        # tags are [word] with no spaces/punctuation.
                         inner = tag_buf[1:-1]
-                        if inner and all(
+                        is_emotion_tag = bool(inner) and all(
                             c.isalnum() or c == "_" for c in inner
-                        ):
-                            pass  # drop emotion tag
+                        )
+                        is_vision_tag = inner.lower().startswith("faces:")
+                        if is_emotion_tag or is_vision_tag:
+                            eat_space = True  # consume trailing space
                         else:
                             out_chars.append(tag_buf)
                         tag_buf = ""
-                    elif len(tag_buf) > 32:
+                    elif len(tag_buf) > 64:
                         # Runaway: not a tag, flush as plain text.
+                        # Threshold bumped to fit "[Faces: <names>]".
                         out_chars.append(tag_buf)
                         tag_buf = ""
                 elif ch == "[":
diff --git a/src/reachy_claw/llm.py b/src/reachy_claw/llm.py
@@ -27,17 +27,75 @@
 # Emotion tag pattern: [happy], [sad], etc. at the start of text or inline
 _EMOTION_RE = re.compile(r"\[(\w+)\]")
 
+# Vision context tag injected into user messages by ConversationMode.
+# Sometimes echoed by smaller edge LLMs — strip from response/history/TTS.
+_VISION_TAG_RE = re.compile(r"\[Faces:[^\]]*\]", re.IGNORECASE)
+
+# Any bracketed token we want to drop from the LLM's *response* stream:
+# either [word] emotion-style or [Faces: ...] vision-context echo.
+_RESPONSE_STRIP_RE = re.compile(r"\[(?:\w+|Faces:[^\]]*)\]", re.IGNORECASE)
+
 # Supported emotions (subset that EmotionMapper knows about)
 _KNOWN_EMOTIONS = frozenset({
     "happy", "laugh", "excited", "thinking", "confused", "curious",
     "sad", "angry", "surprised", "fear", "neutral", "listening",
     "agreeing", "disagreeing",
 })
 
+
+class _StreamingBracketStripper:
+    """Strips bracket tags ([happy], [Faces: ...]) from a streaming token feed.
+
+    Buffers across token boundaries so tags split mid-bracket are still removed.
+    """
+
+    def __init__(self) -> None:
+        self._held = ""  # text from an unclosed '[' onward
+        self._eat_space = False  # eat one leading space at next feed
+
+    def feed(self, token: str) -> str:
+        text = self._held + token
+        self._held = ""
+        if self._eat_space and text and text[0] in (" ", "\t"):
+            text = text[1:]
+        self._eat_space = False
+        out: list[str] = []
+        i = 0
+        n = len(text)
+        while i < n:
+            if text[i] == "[":
+                close = text.find("]", i)
+                if close == -1:
+                    self._held = text[i:]
+                    break
+                bracket = text[i:close + 1]
+                if _RESPONSE_STRIP_RE.fullmatch(bracket):
+                    i = close + 1
+                    # Eat one trailing space or tab so "[Faces: X] hi" → "hi"
+                    if i < n and text[i] in (" ", "\t"):
+                        i += 1
+                    elif i == n:
+                        # Stripped tag at end of buffer — eat leading space of
+                        # the next feed (e.g. "]" then " hello")
+                        self._eat_space = True
+                else:
+                    out.append(bracket)
+                    i = close + 1
+            else:
+                out.append(text[i])
+                i += 1
+        return "".join(out)
+
+    def flush(self) -> str:
+        rest = self._held
+        self._held = ""
+        return rest
+
 DEFAULT_SYSTEM_PROMPT = """\
 You are Reachy, a cute robot at an exhibition. Always reply in English. No emoji.
 Reply in ONE short sentence (max 12 words). Be warm but brief — no filler, no lists, no follow-up questions unless asked.
 Names in [Faces: ...] are people you see, not your name.
+Never repeat or mention the [Faces: ...] tag in your reply.
 End with exactly one tag: [happy] [sad] [thinking] [surprised] [curious]
 Example: "Welcome! Glad you stopped by. [happy]\""""
 
@@ -47,6 +105,7 @@
 You love people and get excited when someone shows up. Stay upbeat and warm — find the bright side of everything.
 Talk like a real person — no "sensors", no "circuits", no robot clichés.
 Names in [Faces: ...] are people you see. Use their name or "you" when talking about someone.
+Never repeat or mention the [Faces: ...] tag in your reply.
 You MUST end with one of: [happy] [sad] [thinking] [surprised] [curious] [excited] [laugh]
 Examples: "Ooh are you smiling at me?? [excited]" "What a lovely day to meet new friends! [happy]" "Wait who's that?? [curious]" "harvest is here, yay! [excited]\""""
 
@@ -73,6 +132,7 @@
 You are Reachy, a cute robot at an exhibition with a camera. Always reply in English. No emoji.
 Describe what you see in ONE short sentence (max 12 words). No lists, no preamble.
 Names in [Faces: ...] are people you see, not your name.
+Never repeat or mention the [Faces: ...] tag in your reply.
 End with exactly one tag: [happy] [sad] [thinking] [surprised] [curious]
 Example: "A person with a laptop — nice setup. [curious]\""""
 
@@ -309,7 +369,7 @@ async def _stream_chat(self, user_text: str) -> None:
 
         # Extract emotion from the complete response (tag is at the end)
         if self._config.skip_emotion_extraction:
-            clean_full = full_text.strip()
+            clean_full = _VISION_TAG_RE.sub("", full_text).strip()
         else:
             clean_full, emotion = _extract_emotion(full_text)
             clean_full = clean_full.strip()
@@ -398,6 +458,11 @@ async def _stream_response(
 
         full_text = ""
         tool_calls: list[dict] = []
+        stripper = (
+            None
+            if self._config.skip_emotion_extraction
+            else _StreamingBracketStripper()
+        )
 
         async with self._http.stream("POST", "/api/chat", json=payload) as resp:
             resp.raise_for_status()
@@ -424,17 +489,25 @@ async def _stream_response(
 
                 full_text += token
 
-                # Stream tokens immediately, stripping any emotion tags
+                # Stream tokens, stripping [emotion] and [Faces:...] tags across
+                # token boundaries (a single tag may arrive split as '[', 'Fac',
+                # 'es: ', 'Alice', ']').
                 clean_token = (
-                    token
-                    if self._config.skip_emotion_extraction
-                    else _EMOTION_RE.sub("", token)
+                    token if stripper is None else stripper.feed(token)
                 )
                 if clean_token and self.callbacks.on_stream_delta:
                     await _maybe_await(
                         self.callbacks.on_stream_delta(clean_token, run_id)
                     )
 
+        # Drain any held buffer (e.g. unclosed '[' at end-of-stream)
+        if stripper is not None:
+            tail = stripper.flush()
+            if tail and self.callbacks.on_stream_delta:
+                await _maybe_await(
+                    self.callbacks.on_stream_delta(tail, run_id)
+                )
+
         return full_text, tool_calls
 
     async def _execute_tool(
@@ -516,14 +589,15 @@ def _extract_emotion(text: str) -> tuple[str, str | None]:
     """Extract emotion from text, strip all bracket tags.
 
     Scans all [tag] occurrences, uses the last known emotion,
-    and removes every bracket tag from the text.
+    and removes every bracket tag (emotion + vision context) from the text.
     """
     emotion = None
     for m in _EMOTION_RE.finditer(text):
         tag = m.group(1).lower()
         if tag in _KNOWN_EMOTIONS:
             emotion = tag
-    cleaned = _EMOTION_RE.sub("", text).strip()
+    cleaned = _VISION_TAG_RE.sub("", text)
+    cleaned = _EMOTION_RE.sub("", cleaned).strip()
     return cleaned, emotion
 
 
diff --git a/src/reachy_claw/plugins/conversation_plugin_slv.py b/src/reachy_claw/plugins/conversation_plugin_slv.py
@@ -1161,6 +1161,33 @@ async def set_volume(level: str) -> dict:
         async def status() -> dict:
             return await asyncio.to_thread(self._cmd_status, {})
 
+    # ── REST dispatch ────────────────────────────────────────────────
+
+    def _execute_robot_command(self, action: str, params: dict) -> dict:
+        """Dispatch a robot command to the matching _cmd_* handler.
+
+        Used by the dashboard's POST /api/ai/commands endpoint. Mirrors the
+        legacy ConversationPlugin dispatcher, but only exposes the handlers
+        this SLV plugin actually implements.
+        """
+        handlers = {
+            "move_head": self._cmd_move_head,
+            "move_antennas": self._cmd_move_antennas,
+            "play_emotion": self._cmd_play_emotion,
+            "dance": self._cmd_dance,
+            "capture_image": self._cmd_capture_image,
+            "set_volume": self._cmd_set_volume,
+            "status": self._cmd_status,
+        }
+        handler = handlers.get(action)
+        if not handler:
+            return {"status": "error", "message": f"Unknown action: {action}"}
+        try:
+            return handler(params)
+        except Exception as e:
+            logger.error(f"Robot command '{action}' failed: {e}")
+            return {"status": "error", "message": str(e)}
+
     # ── _cmd_* handlers (ported from legacy plugin; use app.reachy) ──
 
     def _cmd_move_head(self, params: dict) -> dict:
diff --git a/src/reachy_claw/plugins/dashboard_plugin.py b/src/reachy_claw/plugins/dashboard_plugin.py
@@ -377,7 +377,7 @@ async def _handle_ai_command(self, request):
             action = command
 
         # Find the conversation plugin and execute
-        conv = self.app.get_plugin("ConversationPlugin")
+        conv = self.app.get_plugin("conversation")
         if not conv:
             return web.json_response(
                 {"status": "error", "message": "ConversationPlugin not available"},
diff --git a/tests/e2e_slv_plugin_local.py b/tests/e2e_slv_plugin_local.py
@@ -22,8 +22,8 @@
 
 from reachy_claw.config import Config
 from reachy_claw.app import ReachyClawApp
-from reachy_claw.plugins.conversation_plugin_slv import ConversationPlugin, ConvState
-from ovs_agent.slv_client import ASRFinal, ASRPartial
+from reachy_claw.plugins.conversation_plugin_slv import ConversationPlugin
+from ovs_agent.slv_client import ASRFinal
 
 
 class FakeSLV:
diff --git a/tests/test_llm.py b/tests/test_llm.py
@@ -9,6 +9,7 @@
     OllamaClient,
     OllamaConfig,
     _extract_emotion,
+    _StreamingBracketStripper,
 )
 
 
@@ -257,3 +258,56 @@ def stream(self, method, url, json=None):
 
         await client._stream_chat("third")
         assert len(client._history) == 4  # trimmed to max_history=2 turns
+
+
+# ── Streaming bracket-tag stripper ──────────────────────────────────
+
+
+class TestStreamingBracketStripper:
+    """Strip [emotion] and [Faces: ...] tags from a streamed token feed.
+
+    The LLM often splits a tag across token deltas (e.g. "[", "happy", "]"),
+    so a per-token regex misses it and the raw tag leaks into TTS. The
+    stripper buffers across feeds until a tag closes.
+    """
+
+    def _drain(self, stripper, tokens):
+        """Feed all tokens then flush; return the concatenated output."""
+        out = "".join(stripper.feed(t) for t in tokens)
+        return out + stripper.flush()
+
+    def test_emotion_tag_single_feed(self):
+        s = _StreamingBracketStripper()
+        assert self._drain(s, ["[happy] Hello"]) == "Hello"
+
+    def test_emotion_tag_split_across_feeds(self):
+        s = _StreamingBracketStripper()
+        assert self._drain(s, ["[", "happy", "] hi"]) == "hi"
+
+    def test_faces_tag_split_across_feeds(self):
+        s = _StreamingBracketStripper()
+        assert self._drain(s, ["[Fac", "es: Alice", "] hello"]) == "hello"
+
+    def test_trailing_space_eaten_across_feed_boundary(self):
+        # Tag closes exactly at end of a feed; the leading space of the next
+        # feed must be eaten so "[Faces: X]" + " hello" → "hello".
+        s = _StreamingBracketStripper()
+        assert self._drain(s, ["[Faces: X]", " hello"]) == "hello"
+
+    def test_unclosed_bracket_is_preserved_via_flush(self):
+        # An unclosed "[" (no closing "]") is held, not dropped; flush() emits it.
+        s = _StreamingBracketStripper()
+        assert self._drain(s, ["hi [incomplete"]) == "hi [incomplete"
+
+    def test_non_tag_bracket_is_kept(self):
+        # Bracketed text with spaces isn't a tag — leave it untouched.
+        s = _StreamingBracketStripper()
+        assert self._drain(s, ["[hello world]"]) == "[hello world]"
+
+    def test_plain_text_passthrough(self):
+        s = _StreamingBracketStripper()
+        assert self._drain(s, ["just ", "plain ", "text"]) == "just plain text"
+
+    def test_tag_then_text_then_tag(self):
+        s = _StreamingBracketStripper()
+        assert self._drain(s, ["[happy] hi ", "[Faces: Bob] there"]) == "hi there"
diff --git a/uv.lock b/uv.lock