Merge pull request nesquena#4106 from nesquena/stage-4016b

nesquena-hermes · web-flow · commit ae90cf620ba9 · 2026-06-13T00:47:20.000-07:00
Release MY (v0.51.386): voice mode survives dropped speechSynthesis onend (nesquena#3983)
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,12 @@
 
 ## [Unreleased]
 
+## [v0.51.386] — 2026-06-13 — Release MY (voice mode survives a dropped speechSynthesis onend, #3983)
+
+### Fixed
+
+- **Hands-free voice mode no longer dead-ends after the first browser-TTS reply (#3983).** Chromium intermittently drops the `speechSynthesis` utterance's `onend` event, which left voice mode stuck "speaking" and never re-armed listening. A watchdog now forces a return to listening if `onend` never fires, with the recovery handles cleared on normal completion and on deactivation. The fix is scoped to the browser `speechSynthesis` path — the Edge `Audio` branch (which has a reliable `onended`) is untouched. (#3983)
+
 ## [v0.51.385] — 2026-06-13 — Release MX (profile-cookie env var aligned to HERMES_WEBUI_ prefix, #803)
 
 ### Changed
diff --git a/static/boot.js b/static/boot.js
@@ -853,8 +853,48 @@ window._micPendingSend=window._micPendingSend||false;
   // a different session's last assistant reply if the user navigated away
   // between send and stream completion. (Opus pre-release advisor.)
   let _voiceModeThinkingSid=null;
+  let _browserTtsKeepAlive=null;
+  let _browserTtsWatchdog=null;
+  let _browserTtsSuppressNextErrorRearm=false;
   const SILENCE_MS=1800; // auto-send after 1.8s silence
 
+  function _clearBrowserTtsRecovery(){ // cancel both browser-TTS recovery timers; does nothing when neither is armed
+    if(_browserTtsKeepAlive){
+      clearInterval(_browserTtsKeepAlive);
+      _browserTtsKeepAlive=null; // reset so the truthiness guard skips this handle on the next call
+    }
+    if(_browserTtsWatchdog){
+      clearTimeout(_browserTtsWatchdog);
+      _browserTtsWatchdog=null; // same reset for the one-shot watchdog handle
+    }
+  }
+
+  function _armBrowserTtsRecovery(clean, rate){ // clean: text about to be spoken; rate: utterance rate used to scale the watchdog timeout
+    _clearBrowserTtsRecovery(); // drop any timers left over from a previous arm before creating new ones
+    _browserTtsSuppressNextErrorRearm=false;
+    const safeRate=(Number.isFinite(rate)&&rate>0)?rate:1; // non-finite or non-positive rate falls back to 1
+    // Chromium can drop utter.onend on later turns, so force a recovery path.
+    const watchdogMs=Math.max(4000,Math.round((String(clean||'').length/(12*safeRate))*1000)+10000); // 12 chars/sec at rate 1 plus 10s slack; NOTE(review): the 4000 floor can never bind since 10000 is always added
+    _browserTtsWatchdog=setTimeout(()=>{
+      if(!_voiceModeActive||_voiceModeState!=='speaking') return; // only recover while voice mode is still in the speaking state
+      _browserTtsSuppressNextErrorRearm=true; // utter.onerror checks this flag and skips its own delayed re-arm
+      try{ speechSynthesis.cancel(); }catch(_){} // best effort: a throwing cancel must not block the return to listening
+      _clearBrowserTtsRecovery();
+      _startListening();
+    },watchdogMs);
+    _browserTtsKeepAlive=setInterval(()=>{
+      if(!_voiceModeActive||_voiceModeState!=='speaking'){
+        _clearBrowserTtsRecovery(); // state has moved on, so stop both timers from inside the interval
+        return;
+      }
+      if(!speechSynthesis.speaking) return; // nothing is being spoken right now, so skip this tick
+      try{
+        speechSynthesis.pause(); // pause then resume every 10s; presumably a nudge against Chromium stalling long utterances - TODO confirm
+        speechSynthesis.resume();
+      }catch(_){}
+    },10000);
+  }
+
   function _setState(state){
     _voiceModeState=state;
     indicator.className='voice-mode-indicator '+state;
@@ -867,6 +907,7 @@ window._micPendingSend=window._micPendingSend||false;
 
   function _startListening(){
     if(!_voiceModeActive) return;
+    _clearBrowserTtsRecovery();
     _setState('listening');
 
     _recognition=new SpeechRecognition();
@@ -1057,14 +1098,27 @@ window._micPendingSend=window._micPendingSend||false;
     if(!isNaN(savedPitch)) utter.pitch=Math.min(2,Math.max(0,savedPitch));
 
     utter.onend=()=>{
+      _browserTtsSuppressNextErrorRearm=false;
+      _clearBrowserTtsRecovery();
       // After speaking, go back to listening
-      if(_voiceModeActive) setTimeout(()=>_startListening(),500);
+      if(_voiceModeActive&&_voiceModeState==='speaking') setTimeout(()=>_startListening(),500);
     };
     utter.onerror=()=>{
+      _clearBrowserTtsRecovery();
+      if(_browserTtsSuppressNextErrorRearm){
+        _browserTtsSuppressNextErrorRearm=false;
+        return;
+      }
       if(_voiceModeActive) setTimeout(()=>_startListening(),1000);
     };
 
-    speechSynthesis.speak(utter);
+    _armBrowserTtsRecovery(clean, utter.rate);
+    try{
+      speechSynthesis.speak(utter);
+    }catch(_){
+      _clearBrowserTtsRecovery();
+      if(_voiceModeActive) setTimeout(()=>_startListening(),1000);
+    }
   }
 
   // Hook into response completion — observe when the agent finishes
@@ -1121,10 +1175,12 @@ window._micPendingSend=window._micPendingSend||false;
     _voiceModeActive=false;
     _voiceModeState='idle';
     _voiceModeThinkingSid=null;
+    _browserTtsSuppressNextErrorRearm=false;
     modeBtn.classList.remove('active');
     _setButtonTooltip(modeBtn, t('voice_mode_toggle'));
     bar.style.display='none';
     clearTimeout(_silenceTimer);
+    _clearBrowserTtsRecovery();
     try{ if(_recognition) _recognition.abort(); }catch(_){}
     _recognition=null;
     if(typeof stopTTS==='function') stopTTS();
diff --git a/tests/test_issue3983_browser_tts_watchdog.py b/tests/test_issue3983_browser_tts_watchdog.py
@@ -0,0 +1,82 @@
+from pathlib import Path
+import re
+
+
+REPO = Path(__file__).resolve().parents[1]
+
+
+def _extract_function(src: str, name: str) -> str:  # returns the text from "function <name>(" through its matching closing brace
+    anchor = f"function {name}("
+    start = src.find(anchor)  # first occurrence only; a later redefinition would be ignored
+    assert start != -1, f"{name}() must exist"
+    body_start = src.find("{", start)  # the first brace after the anchor is taken as the body opener
+    assert body_start != -1, f"{name}() must have a body"
+    depth = 1  # the opening brace at body_start is already counted
+    idx = body_start + 1
+    while depth and idx < len(src):  # naive matching: braces inside strings or comments are counted too
+        if src[idx] == "{":
+            depth += 1
+        elif src[idx] == "}":
+            depth -= 1
+        idx += 1
+    assert depth == 0, f"{name}() body must balance braces"  # fails if the source ended before the body closed
+    return src[start:idx]  # idx is one past the matching closing brace
+
+
+def test_boot_js_declares_browser_tts_recovery_helpers():  # static check: boot.js declares the recovery state and both helpers
+    src = (REPO / "static" / "boot.js").read_text(encoding="utf-8")
+    assert "let _browserTtsKeepAlive=null;" in src  # exact-substring match, so reformatting boot.js would fail this
+    assert "let _browserTtsWatchdog=null;" in src
+    assert "let _browserTtsSuppressNextErrorRearm=false;" in src
+    assert "function _clearBrowserTtsRecovery()" in src
+    assert "function _armBrowserTtsRecovery(clean, rate)" in src  # pins the parameter names and spacing too
+
+
+def test_browser_tts_watchdog_rearms_listening_if_onend_drops():  # static check on the body of _armBrowserTtsRecovery only
+    src = (REPO / "static" / "boot.js").read_text(encoding="utf-8")
+    arm_body = _extract_function(src, "_armBrowserTtsRecovery")
+    assert "_browserTtsWatchdog=setTimeout" in arm_body  # the one-shot watchdog is armed here
+    assert "_voiceModeState!=='speaking'" in arm_body  # recovery is gated on the speaking state
+    assert "_browserTtsSuppressNextErrorRearm=true;" in arm_body
+    assert "speechSynthesis.cancel()" in arm_body
+    assert "_startListening();" in arm_body
+    assert "_browserTtsKeepAlive=setInterval" in arm_body  # the periodic keep-alive is armed here
+    assert "speechSynthesis.pause();" in arm_body
+    assert "speechSynthesis.resume();" in arm_body
+
+
+def test_browser_tts_callbacks_and_deactivate_clear_recovery_handles():  # static check on _speakResponse and _deactivate bodies
+    src = (REPO / "static" / "boot.js").read_text(encoding="utf-8")
+    speak_body = _extract_function(src, "_speakResponse")
+    assert "const utter=new SpeechSynthesisUtterance(clean);" in speak_body
+    assert "utter.onend=()=>{" in speak_body
+    assert "utter.onerror=()=>{" in speak_body
+    assert speak_body.count("_clearBrowserTtsRecovery();") >= 2, (  # counts calls anywhere in the body, not per callback
+        "Both browser TTS completion callbacks must clear watchdog/keep-alive handles."
+    )
+    assert "_browserTtsSuppressNextErrorRearm=false;" in speak_body
+    assert "_voiceModeActive&&_voiceModeState==='speaking'" in speak_body  # the state-gated re-arm condition
+    assert "if(_browserTtsSuppressNextErrorRearm){" in speak_body
+    assert "_armBrowserTtsRecovery(clean, utter.rate);" in speak_body
+
+    deactivate_body = _extract_function(src, "_deactivate")  # second half: deactivation must also tear the timers down
+    assert "_clearBrowserTtsRecovery();" in deactivate_body, (
+        "_deactivate() must clear browser TTS watchdog/keep-alive handles."
+    )
+    assert "_browserTtsSuppressNextErrorRearm=false;" in deactivate_body
+
+
+def test_edge_audio_branch_stays_separate():  # static check: the Edge audio branch does not call the speechSynthesis workaround
+    src = (REPO / "static" / "boot.js").read_text(encoding="utf-8")
+    edge_match = re.search(
+        r'if\(engine==="edge"\)\{(.*?)\n\s+return;\n\s+\}',  # lazy capture up to the first own-line return; followed by a closing-brace line
+        src,
+        re.DOTALL,  # lets .*? span multiple lines of the branch body
+    )
+    assert edge_match, "Edge audio branch must exist"
+    edge_body = edge_match.group(1)
+    assert "const audio = new Audio(url);" in edge_body
+    assert "audio.onended = () => {" in edge_body
+    assert "_armBrowserTtsRecovery" not in edge_body, (
+        "The browser speechSynthesis workaround must not be injected into the Edge audio branch."
+    )