Skip to content

Commit 669b430

Browse files
authored
Merge pull request #2 from suharvest/port-back-vision-tag-stripping-and-api-fix
fix(robot): repair /api/ai/commands + port back vision-tag stripping
2 parents 5875cd8 + d40ed18 commit 669b430

9 files changed

Lines changed: 2948 additions & 2791 deletions

File tree

pyproject.toml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,11 @@ override-dependencies = [
6868
]
6969

7070
[tool.uv.sources]
71-
openvoicestream-agent = { path = "../seeed-local-voice/agent", editable = true }
71+
# Use the in-repo torch-free wheel so the project resolves on any checkout
72+
# (CI, fresh clones, image builds). For live editing of the agent source,
73+
# point this back at an editable sibling path, e.g.
74+
# { path = "../seeed-local-voice/agent", editable = true }
75+
openvoicestream-agent = { path = "deploy/jetson/reachy/vendor/openvoicestream_agent-0.1.0-py3-none-any.whl" }
7276

7377
[tool.pytest.ini_options]
7478
asyncio_mode = "auto"

src/reachy_claw/clientloop/proof_engine_e2e.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -205,7 +205,6 @@ async def run(wav: str, config_path: Path, timeout_s: float) -> int:
205205

206206
# Instrument the SLV client to flag asr_final + tts_started as they
207207
# cross the WS transport (irrefutable "went through the engine").
208-
from ovs_agent.slv_client import ASRFinal, TTSStarted # local import
209208

210209
orig_handle_json = app.slv._handle_json
211210

src/reachy_claw/edge_llm.py

Lines changed: 23 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -252,35 +252,45 @@ async def _stream_chat(self, user_text: str) -> None:
252252
payload["prefix_cache"] = True
253253

254254
full_text = ""
255-
# Streaming-safe emotion-tag stripper. LLM tokens often split a
256-
# tag across deltas (e.g. "[", "curious", "]"), so the per-delta
257-
# regex .sub() misses them and the tag leaks into V2V TTS. Buffer
258-
# text once we see a "[" until the matching "]" arrives, then
259-
# decide: looks like an emotion tag → drop; anything else → flush
260-
# as plain text.
255+
# Streaming-safe tag stripper. LLM tokens often split a tag
256+
# across deltas (e.g. "[", "curious", "]"), so the per-delta
257+
# regex .sub() misses them and the tag leaks into V2V TTS.
258+
# Buffer text once we see a "[" until the matching "]" arrives,
259+
# then decide:
260+
# - [word] → emotion tag, drop
261+
# - [Faces: ...] → echoed vision context, drop
262+
# - anything else → flush as plain text
263+
# After a dropped tag, also eat the next single whitespace char
264+
# so "[Faces: X] hi" → "hi" (no leading space).
261265
tag_buf = ""
266+
eat_space = False
262267

263268
def _consume(delta_in: str) -> str:
264-
nonlocal tag_buf
269+
nonlocal tag_buf, eat_space
265270
if self._config.skip_emotion_extraction:
266271
return delta_in
267272
out_chars: list[str] = []
268273
for ch in delta_in:
274+
if eat_space:
275+
eat_space = False
276+
if ch in (" ", "\t"):
277+
continue
269278
if tag_buf:
270279
tag_buf += ch
271280
if ch == "]":
272-
# Emit only if it doesn't look like a tag —
273-
# tags are [word] with no spaces/punctuation.
274281
inner = tag_buf[1:-1]
275-
if inner and all(
282+
is_emotion_tag = bool(inner) and all(
276283
c.isalnum() or c == "_" for c in inner
277-
):
278-
pass # drop emotion tag
284+
)
285+
is_vision_tag = inner.lower().startswith("faces:")
286+
if is_emotion_tag or is_vision_tag:
287+
eat_space = True # consume trailing space
279288
else:
280289
out_chars.append(tag_buf)
281290
tag_buf = ""
282-
elif len(tag_buf) > 32:
291+
elif len(tag_buf) > 64:
283292
# Runaway: not a tag, flush as plain text.
293+
# Threshold bumped to fit "[Faces: <names>]".
284294
out_chars.append(tag_buf)
285295
tag_buf = ""
286296
elif ch == "[":

src/reachy_claw/llm.py

Lines changed: 81 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -27,17 +27,75 @@
2727
# Emotion tag pattern: [happy], [sad], etc. at the start of text or inline
2828
_EMOTION_RE = re.compile(r"\[(\w+)\]")
2929

30+
# Vision context tag injected into user messages by ConversationMode.
31+
# Sometimes echoed by smaller edge LLMs — strip from response/history/TTS.
32+
_VISION_TAG_RE = re.compile(r"\[Faces:[^\]]*\]", re.IGNORECASE)
33+
34+
# Any bracketed token we want to drop from the LLM's *response* stream:
35+
# either [word] emotion-style or [Faces: ...] vision-context echo.
36+
_RESPONSE_STRIP_RE = re.compile(r"\[(?:\w+|Faces:[^\]]*)\]", re.IGNORECASE)
37+
3038
# Supported emotions (subset that EmotionMapper knows about)
3139
_KNOWN_EMOTIONS = frozenset({
3240
"happy", "laugh", "excited", "thinking", "confused", "curious",
3341
"sad", "angry", "surprised", "fear", "neutral", "listening",
3442
"agreeing", "disagreeing",
3543
})
3644

45+
46+
class _StreamingBracketStripper:
    """Strip bracket tags ([happy], [Faces: ...]) from a streaming token feed.

    Text from an unclosed "[" onward is buffered between calls to ``feed`` so
    that a tag split across several tokens is still recognised. A bracketed
    span is dropped only when the whole "[...]" fully matches
    ``_RESPONSE_STRIP_RE``; any other bracketed text is passed through.

    Args:
        max_held: Longest unclosed "[..." run that is buffered while waiting
            for the closing "]". Once exceeded, the "[" is released as plain
            text so a stray bracket cannot stall the stream until ``flush``.
            The default matches the 64-character runaway limit used by the
            edge LLM's streaming tag stripper.
    """

    def __init__(self, max_held: int = 64) -> None:
        self._held = ""           # text from an unclosed '[' onward
        self._eat_space = False   # eat one leading space at next feed
        self._max_held = max_held

    def feed(self, token: str) -> str:
        """Consume one streamed token and return the text that is safe to emit.

        Args:
            token: The next chunk of raw LLM output (may be empty).

        Returns:
            The cleaned text; may be empty while a possible tag is buffered.
        """
        text = self._held + token
        self._held = ""
        if not text:
            # Nothing to scan: keep a pending eat-space request alive instead
            # of silently cancelling it on an empty token.
            return ""
        if self._eat_space and text[0] in (" ", "\t"):
            text = text[1:]
        self._eat_space = False

        out: list[str] = []
        i = 0
        n = len(text)
        while i < n:
            start = text.find("[", i)
            if start == -1:
                out.append(text[i:])
                break
            out.append(text[i:start])  # plain run before the bracket

            close = text.find("]", start)
            if close == -1:
                if n - start > self._max_held:
                    # Runaway: this '[' is not going to become a tag. Emit it
                    # literally and rescan what follows, so a later '[' can
                    # still start a real tag.
                    out.append("[")
                    i = start + 1
                    continue
                self._held = text[start:]
                break

            bracket = text[start:close + 1]
            if _RESPONSE_STRIP_RE.fullmatch(bracket):
                i = close + 1
                # Eat one trailing whitespace so "[Faces: X] hi" -> "hi"
                if i < n and text[i] in (" ", "\t"):
                    i += 1
                elif i == n:
                    # Stripped tag at end of buffer: eat the leading space of
                    # the next feed (e.g. "]" then " hello").
                    self._eat_space = True
                continue

            # Not a tag. If another '[' opens inside the span, only the text
            # before it is known to be plain; restart from the inner '[' so a
            # real tag such as "[foo [happy]" is not leaked.
            inner = bracket.find("[", 1)
            if inner == -1:
                out.append(bracket)
                i = close + 1
            else:
                out.append(bracket[:inner])
                i = start + inner
        return "".join(out)

    def flush(self) -> str:
        """Return any text still buffered (an unclosed "[...") and reset state."""
        rest = self._held
        self._held = ""
        self._eat_space = False
        return rest
93+
3794
DEFAULT_SYSTEM_PROMPT = """\
3895
You are Reachy, a cute robot at an exhibition. Always reply in English. No emoji.
3996
Reply in ONE short sentence (max 12 words). Be warm but brief — no filler, no lists, no follow-up questions unless asked.
4097
Names in [Faces: ...] are people you see, not your name.
98+
Never repeat or mention the [Faces: ...] tag in your reply.
4199
End with exactly one tag: [happy] [sad] [thinking] [surprised] [curious]
42100
Example: "Welcome! Glad you stopped by. [happy]\""""
43101

@@ -47,6 +105,7 @@
47105
You love people and get excited when someone shows up. Stay upbeat and warm — find the bright side of everything.
48106
Talk like a real person — no "sensors", no "circuits", no robot clichés.
49107
Names in [Faces: ...] are people you see. Use their name or "you" when talking about someone.
108+
Never repeat or mention the [Faces: ...] tag in your reply.
50109
You MUST end with one of: [happy] [sad] [thinking] [surprised] [curious] [excited] [laugh]
51110
Examples: "Ooh are you smiling at me?? [excited]" "What a lovely day to meet new friends! [happy]" "Wait who's that?? [curious]" "harvest is here, yay! [excited]\""""
52111

@@ -73,6 +132,7 @@
73132
You are Reachy, a cute robot at an exhibition with a camera. Always reply in English. No emoji.
74133
Describe what you see in ONE short sentence (max 12 words). No lists, no preamble.
75134
Names in [Faces: ...] are people you see, not your name.
135+
Never repeat or mention the [Faces: ...] tag in your reply.
76136
End with exactly one tag: [happy] [sad] [thinking] [surprised] [curious]
77137
Example: "A person with a laptop — nice setup. [curious]\""""
78138

@@ -309,7 +369,7 @@ async def _stream_chat(self, user_text: str) -> None:
309369

310370
# Extract emotion from the complete response (tag is at the end)
311371
if self._config.skip_emotion_extraction:
312-
clean_full = full_text.strip()
372+
clean_full = _VISION_TAG_RE.sub("", full_text).strip()
313373
else:
314374
clean_full, emotion = _extract_emotion(full_text)
315375
clean_full = clean_full.strip()
@@ -398,6 +458,11 @@ async def _stream_response(
398458

399459
full_text = ""
400460
tool_calls: list[dict] = []
461+
stripper = (
462+
None
463+
if self._config.skip_emotion_extraction
464+
else _StreamingBracketStripper()
465+
)
401466

402467
async with self._http.stream("POST", "/api/chat", json=payload) as resp:
403468
resp.raise_for_status()
@@ -424,17 +489,25 @@ async def _stream_response(
424489

425490
full_text += token
426491

427-
# Stream tokens immediately, stripping any emotion tags
492+
# Stream tokens, stripping [emotion] and [Faces:...] tags across
493+
# token boundaries (a single tag may arrive split as '[', 'Fac',
494+
# 'es: ', 'Alice', ']').
428495
clean_token = (
429-
token
430-
if self._config.skip_emotion_extraction
431-
else _EMOTION_RE.sub("", token)
496+
token if stripper is None else stripper.feed(token)
432497
)
433498
if clean_token and self.callbacks.on_stream_delta:
434499
await _maybe_await(
435500
self.callbacks.on_stream_delta(clean_token, run_id)
436501
)
437502

503+
# Drain any held buffer (e.g. unclosed '[' at end-of-stream)
504+
if stripper is not None:
505+
tail = stripper.flush()
506+
if tail and self.callbacks.on_stream_delta:
507+
await _maybe_await(
508+
self.callbacks.on_stream_delta(tail, run_id)
509+
)
510+
438511
return full_text, tool_calls
439512

440513
async def _execute_tool(
@@ -516,14 +589,15 @@ def _extract_emotion(text: str) -> tuple[str, str | None]:
516589
"""Extract emotion from text, strip all bracket tags.
517590
518591
Scans all [tag] occurrences, uses the last known emotion,
519-
and removes every bracket tag from the text.
592+
and removes every bracket tag (emotion + vision context) from the text.
520593
"""
521594
emotion = None
522595
for m in _EMOTION_RE.finditer(text):
523596
tag = m.group(1).lower()
524597
if tag in _KNOWN_EMOTIONS:
525598
emotion = tag
526-
cleaned = _EMOTION_RE.sub("", text).strip()
599+
cleaned = _VISION_TAG_RE.sub("", text)
600+
cleaned = _EMOTION_RE.sub("", cleaned).strip()
527601
return cleaned, emotion
528602

529603

src/reachy_claw/plugins/conversation_plugin_slv.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1161,6 +1161,33 @@ async def set_volume(level: str) -> dict:
11611161
async def status() -> dict:
11621162
return await asyncio.to_thread(self._cmd_status, {})
11631163

1164+
# ── REST dispatch ────────────────────────────────────────────────
1165+
1166+
def _execute_robot_command(self, action: str, params: dict) -> dict:
    """Run the ``_cmd_*`` handler registered for ``action``.

    Entry point for the dashboard's POST /api/ai/commands endpoint. Only
    the handlers implemented by this SLV plugin are reachable.

    Args:
        action: Command name, e.g. "move_head" or "status".
        params: Argument dict handed unchanged to the handler.

    Returns:
        The handler's result dict, or an error dict with "status" set to
        "error" and a "message" when the action is unknown or the handler
        raises.
    """
    dispatch = {
        "move_head": self._cmd_move_head,
        "move_antennas": self._cmd_move_antennas,
        "play_emotion": self._cmd_play_emotion,
        "dance": self._cmd_dance,
        "capture_image": self._cmd_capture_image,
        "set_volume": self._cmd_set_volume,
        "status": self._cmd_status,
    }

    # Guard clause: reject anything that is not in the table above.
    if action not in dispatch:
        return {"status": "error", "message": f"Unknown action: {action}"}

    run = dispatch[action]
    try:
        result = run(params)
    except Exception as exc:
        # Report the failure to the caller instead of propagating it.
        logger.error(f"Robot command '{action}' failed: {exc}")
        return {"status": "error", "message": str(exc)}
    return result
1190+
11641191
# ── _cmd_* handlers (ported from legacy plugin; use app.reachy) ──
11651192

11661193
def _cmd_move_head(self, params: dict) -> dict:

src/reachy_claw/plugins/dashboard_plugin.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -377,7 +377,7 @@ async def _handle_ai_command(self, request):
377377
action = command
378378

379379
# Find the conversation plugin and execute
380-
conv = self.app.get_plugin("ConversationPlugin")
380+
conv = self.app.get_plugin("conversation")
381381
if not conv:
382382
return web.json_response(
383383
{"status": "error", "message": "ConversationPlugin not available"},

tests/e2e_slv_plugin_local.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,8 @@
2222

2323
from reachy_claw.config import Config
2424
from reachy_claw.app import ReachyClawApp
25-
from reachy_claw.plugins.conversation_plugin_slv import ConversationPlugin, ConvState
26-
from ovs_agent.slv_client import ASRFinal, ASRPartial
25+
from reachy_claw.plugins.conversation_plugin_slv import ConversationPlugin
26+
from ovs_agent.slv_client import ASRFinal
2727

2828

2929
class FakeSLV:

tests/test_llm.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
OllamaClient,
1010
OllamaConfig,
1111
_extract_emotion,
12+
_StreamingBracketStripper,
1213
)
1314

1415

@@ -257,3 +258,56 @@ def stream(self, method, url, json=None):
257258

258259
await client._stream_chat("third")
259260
assert len(client._history) == 4 # trimmed to max_history=2 turns
261+
262+
263+
# ── Streaming bracket-tag stripper ──────────────────────────────────
264+
265+
266+
class TestStreamingBracketStripper:
    """Checks that [emotion] and [Faces: ...] tags are removed from a token stream.

    A tag frequently arrives split over several deltas (e.g. "[", "happy",
    "]"), which a per-token regex cannot catch, so the raw tag would reach
    TTS. The stripper is expected to buffer across feeds until the tag closes.
    """

    def _drain(self, stripper, tokens):
        """Feed every token, flush, and return everything emitted as one string."""
        pieces = [stripper.feed(tok) for tok in tokens]
        pieces.append(stripper.flush())
        return "".join(pieces)

    def test_emotion_tag_single_feed(self):
        result = self._drain(_StreamingBracketStripper(), ["[happy] Hello"])
        assert result == "Hello"

    def test_emotion_tag_split_across_feeds(self):
        result = self._drain(_StreamingBracketStripper(), ["[", "happy", "] hi"])
        assert result == "hi"

    def test_faces_tag_split_across_feeds(self):
        tokens = ["[Fac", "es: Alice", "] hello"]
        result = self._drain(_StreamingBracketStripper(), tokens)
        assert result == "hello"

    def test_trailing_space_eaten_across_feed_boundary(self):
        # The tag ends exactly where one feed ends, so the space that opens
        # the following feed has to be swallowed: "[Faces: X]" + " hello".
        tokens = ["[Faces: X]", " hello"]
        result = self._drain(_StreamingBracketStripper(), tokens)
        assert result == "hello"

    def test_unclosed_bracket_is_preserved_via_flush(self):
        # An opening "[" that never closes is buffered, then returned by flush().
        result = self._drain(_StreamingBracketStripper(), ["hi [incomplete"])
        assert result == "hi [incomplete"

    def test_non_tag_bracket_is_kept(self):
        # Brackets around text containing a space are not a tag; pass through.
        result = self._drain(_StreamingBracketStripper(), ["[hello world]"])
        assert result == "[hello world]"

    def test_plain_text_passthrough(self):
        tokens = ["just ", "plain ", "text"]
        result = self._drain(_StreamingBracketStripper(), tokens)
        assert result == "just plain text"

    def test_tag_then_text_then_tag(self):
        tokens = ["[happy] hi ", "[Faces: Bob] there"]
        result = self._drain(_StreamingBracketStripper(), tokens)
        assert result == "hi there"

0 commit comments

Comments
 (0)