fix(audio): ALSA capture/playback retries and timeline event matching

tokk-nv · tokk-nv · commit 9cc3e784a2aa · 2026-03-14T11:32:26.000-07:00
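- capture: retry arecord up to 8 consecutive times with exponential backoff
  (0.5s doubling, capped at 5s) when the device drops (e.g. USB bus
  contention); the retry count resets once audio resumes; log resume/give-up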
- playback: try aplay up to 3 times (0.3s delay between attempts) when it
  fails to start
- voice_pipeline: back off for 3s after a server speaker start failure to
  avoid log spam and repeated start attempts
- app.js: match LLM/TTS timeline events by timestamp range instead of
  array index so missing events don't misalign subsequent turns

Made-with: Cursor
diff --git a/src/multi_modal_ai_studio/devices/capture.py b/src/multi_modal_ai_studio/devices/capture.py
@@ -22,6 +22,11 @@
 CHUNK_BYTES = CHUNK_SAMPLES * 2
 
 
+MAX_CAPTURE_RETRIES = 8
+RETRY_BACKOFF_BASE = 0.5  # seconds; doubles each attempt up to a cap
+RETRY_BACKOFF_MAX = 5.0
+
+
 def _capture_alsa(
     device: str,
     out_queue: "queue.Queue[Optional[bytes]]",
@@ -31,63 +36,104 @@ def _capture_alsa(
     """Capture from ALSA device via arecord; put PCM chunks in out_queue. Runs in thread.
     Uses plughw when device is hw:X,Y so ALSA can do sample-rate conversion (many USB mics only support 48kHz).
     If proc_holder is a list, the subprocess is stored as proc_holder[0] so the caller can terminate it to release the device quickly.
+
+    Auto-restarts arecord up to MAX_CAPTURE_RETRIES times when the device
+    disappears transiently (e.g. USB bus contention with a camera).
     """
     dev = (device or "default").strip()
     if dev.startswith("hw:") and not dev.startswith("plughw:"):
         dev = "plug" + dev
         logger.debug("ALSA using %s for rate conversion (requested 16kHz)", dev)
     cmd = ["arecord", "-D", dev, "-f", "S16_LE", "-r", str(SAMPLE_RATE), "-c", str(CHANNELS), "-t", "raw"]
-    logger.info("ALSA capture starting: %s (device=%s)", " ".join(cmd), device)
-    try:
-        proc = subprocess.Popen(
-            cmd,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            bufsize=CHUNK_BYTES,
-        )
-        if proc_holder is not None:
-            proc_holder.append(proc)
-    except FileNotFoundError:
-        logger.warning("arecord not found; cannot capture from ALSA device %s", device)
-        out_queue.put(None)
-        return
-    except Exception as e:
-        logger.warning("Failed to start arecord for %s: %s", device, e)
-        out_queue.put(None)
-        return
-    first_chunk = True
-    try:
-        while not stop_event.is_set() and proc.poll() is None:
-            chunk = proc.stdout.read(CHUNK_BYTES)
-            if not chunk:
-                try:
-                    err = proc.stderr.read().decode("utf-8", errors="replace").strip() if proc.stderr else ""
-                    if err:
-                        logger.warning("ALSA capture read empty (device %s). arecord stderr: %s", device, err)
-                    else:
-                        logger.warning("ALSA capture read returned empty (device %s); check device/sample rate", device)
-                except Exception:
-                    logger.warning("ALSA capture read returned empty (device %s)", device)
-                break
-            if first_chunk:
-                first_chunk = False
-                logger.info("ALSA first PCM chunk received from %s (%d bytes); pipeline will get amplitude", device, len(chunk))
-            out_queue.put(chunk)
-    except Exception as e:
-        logger.warning("ALSA capture read error for %s: %s", device, e)
-    finally:
+
+    retries = 0
+    ever_produced_chunk = False
+
+    while not stop_event.is_set():
+        logger.info("ALSA capture starting: %s (device=%s)", " ".join(cmd), device)
         try:
-            proc.terminate()
-            proc.wait(timeout=1)
-        except Exception:
-            pass
-        out_queue.put(None)
-        if proc_holder is not None and proc_holder and proc_holder[0] is proc:
+            proc = subprocess.Popen(
+                cmd,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                bufsize=CHUNK_BYTES,
+            )
+            if proc_holder is not None:
+                if proc_holder:
+                    proc_holder.clear()
+                proc_holder.append(proc)
+        except FileNotFoundError:
+            logger.warning("arecord not found; cannot capture from ALSA device %s", device)
+            out_queue.put(None)
+            return
+        except Exception as e:
+            logger.warning("Failed to start arecord for %s: %s", device, e)
+            if retries >= MAX_CAPTURE_RETRIES:
+                logger.error("ALSA capture giving up after %d retries for %s", retries, device)
+                out_queue.put(None)
+                return
+            retries += 1
+            delay = min(RETRY_BACKOFF_BASE * (2 ** (retries - 1)), RETRY_BACKOFF_MAX)
+            logger.info("ALSA capture retry %d/%d in %.1fs for %s", retries, MAX_CAPTURE_RETRIES, delay, device)
+            stop_event.wait(delay)
+            continue
+
+        first_chunk_this_run = True
+        died_unexpectedly = False
+        try:
+            while not stop_event.is_set() and proc.poll() is None:
+                chunk = proc.stdout.read(CHUNK_BYTES)
+                if not chunk:
+                    try:
+                        err = proc.stderr.read().decode("utf-8", errors="replace").strip() if proc.stderr else ""
+                        if err:
+                            logger.warning("ALSA capture read empty (device %s). arecord stderr: %s", device, err)
+                        else:
+                            logger.warning("ALSA capture read returned empty (device %s); check device/sample rate", device)
+                    except Exception:
+                        logger.warning("ALSA capture read returned empty (device %s)", device)
+                    died_unexpectedly = True
+                    break
+                if first_chunk_this_run:
+                    first_chunk_this_run = False
+                    if not ever_produced_chunk:
+                        logger.info("ALSA first PCM chunk received from %s (%d bytes); pipeline will get amplitude", device, len(chunk))
+                    else:
+                        logger.info("ALSA capture resumed from %s (%d bytes) after retry", device, len(chunk))
+                    retries = 0
+                    ever_produced_chunk = True
+                out_queue.put(chunk)
+        except Exception as e:
+            logger.warning("ALSA capture read error for %s: %s", device, e)
+            died_unexpectedly = True
+        finally:
             try:
-                proc_holder.clear()
+                proc.terminate()
+                proc.wait(timeout=1)
             except Exception:
                 pass
-        if first_chunk:
+            if proc_holder is not None and proc_holder and proc_holder[0] is proc:
+                try:
+                    proc_holder.clear()
+                except Exception:
+                    pass
+
+        if stop_event.is_set():
+            break
+
+        if died_unexpectedly and retries < MAX_CAPTURE_RETRIES:
+            retries += 1
+            delay = min(RETRY_BACKOFF_BASE * (2 ** (retries - 1)), RETRY_BACKOFF_MAX)
+            logger.warning(
+                "ALSA capture died unexpectedly for %s; retry %d/%d in %.1fs",
+                device, retries, MAX_CAPTURE_RETRIES, delay,
+            )
+            stop_event.wait(delay)
+            continue
+
+        if died_unexpectedly:
+            logger.error("ALSA capture giving up after %d retries for %s", retries, device)
+        elif first_chunk_this_run and not ever_produced_chunk:
             try:
                 err = proc.stderr.read().decode("utf-8", errors="replace").strip() if proc.stderr else ""
                 if err:
@@ -96,6 +142,9 @@ def _capture_alsa(
                     logger.warning("ALSA capture ended without sending any chunks (device %s); check arecord -D %s", device, dev)
             except Exception:
                 logger.warning("ALSA capture ended without sending any chunks (device %s)", device)
+        break
+
+    out_queue.put(None)
 
 
 def _capture_pyaudio(
diff --git a/src/multi_modal_ai_studio/devices/playback.py b/src/multi_modal_ai_studio/devices/playback.py
@@ -15,6 +15,8 @@
 logger = logging.getLogger(__name__)
 
 CHANNELS = 1
+PLAYBACK_RETRIES = 3
+PLAYBACK_RETRY_DELAY = 0.3  # seconds between retries
 
 
 def start_server_speaker_playback(
@@ -28,6 +30,9 @@ def start_server_speaker_playback(
     then close stdin when done so aplay exits. Use plughw when device is
     hw:X,Y so ALSA can do sample-rate conversion if needed.
 
+    Makes up to PLAYBACK_RETRIES start attempts to ride out transient device
+    errors (e.g. USB audio device momentarily unavailable).
+
     Args:
         device: ALSA device (e.g. hw:2,0).
         sample_rate: PCM sample rate in Hz (e.g. 24000 from TTS).
@@ -52,28 +57,46 @@ def start_server_speaker_playback(
         "-c", str(CHANNELS),
         "-t", "raw",
     ]
-    logger.info("ALSA playback starting: %s (device=%s, rate=%s)", " ".join(cmd), device, sample_rate)
-    try:
-        proc = subprocess.Popen(
-            cmd,
-            stdin=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-        )
-        # If same device is used for mic (arecord) and speaker, aplay may exit with "Device or resource busy"
-        time.sleep(0.15)
-        if proc.poll() is not None:
-            err = (proc.stderr.read().decode("utf-8", errors="replace").strip() if proc.stderr else "") or "(no stderr)"
-            logger.warning("aplay exited immediately for %s: %s", device, err)
+
+    last_err = ""
+    for attempt in range(1, PLAYBACK_RETRIES + 1):
+        logger.info("ALSA playback starting: %s (device=%s, rate=%s)", " ".join(cmd), device, sample_rate)
+        try:
+            proc = subprocess.Popen(
+                cmd,
+                stdin=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+            )
+            time.sleep(0.15)
+            if proc.poll() is not None:
+                last_err = (proc.stderr.read().decode("utf-8", errors="replace").strip() if proc.stderr else "") or "(no stderr)"
+                if attempt < PLAYBACK_RETRIES:
+                    logger.warning(
+                        "aplay exited immediately for %s (attempt %d/%d): %s — retrying in %.1fs",
+                        device, attempt, PLAYBACK_RETRIES, last_err, PLAYBACK_RETRY_DELAY,
+                    )
+                    time.sleep(PLAYBACK_RETRY_DELAY)
+                    continue
+                logger.warning("aplay exited immediately for %s after %d attempts: %s", device, attempt, last_err)
+                return None
+            if proc_holder is not None:
+                proc_holder.append(proc)
+            return proc
+        except FileNotFoundError:
+            logger.warning("aplay not found; cannot play to ALSA device %s", device)
             return None
-        if proc_holder is not None:
-            proc_holder.append(proc)
-        return proc
-    except FileNotFoundError:
-        logger.warning("aplay not found; cannot play to ALSA device %s", device)
-        return None
-    except Exception as e:
-        logger.warning("Failed to start aplay for %s: %s", device, e)
-        return None
+        except Exception as e:
+            last_err = str(e)
+            if attempt < PLAYBACK_RETRIES:
+                logger.warning(
+                    "Failed to start aplay for %s (attempt %d/%d): %s — retrying in %.1fs",
+                    device, attempt, PLAYBACK_RETRIES, e, PLAYBACK_RETRY_DELAY,
+                )
+                time.sleep(PLAYBACK_RETRY_DELAY)
+                continue
+            logger.warning("Failed to start aplay for %s after %d attempts: %s", device, attempt, e)
+            return None
+    return None
 
 
 def stop_server_speaker_playback(proc: Optional[subprocess.Popen]) -> None:
diff --git a/src/multi_modal_ai_studio/webui/static/app.js b/src/multi_modal_ai_studio/webui/static/app.js
@@ -3562,10 +3562,13 @@ function drawTimelineEvents(ctx, timeline, lanes, LANE_HEIGHTS, laneYOffsets, LA
     const llmCompletes = timeline.filter(e => e.event_type === 'llm_complete').sort((a, b) => (a.timestamp || 0) - (b.timestamp || 0));
 
     // LLM: prefill (start → first token) and generate (first token → complete). Like Live RIVA WebUI; first-token boundary from pipeline.
+    // Match llm_first_token and llm_complete to each llm_start by timestamp
+    // range (not array index) so missing events don't shift all subsequent turns.
     llmStarts.forEach((startEv, i) => {
-        const firstToken = llmFirstTokens[i];
-        const complete = llmCompletes[i];
         const startTime = startEv.timestamp || 0;
+        const nextStartTime = (i + 1 < llmStarts.length) ? (llmStarts[i + 1].timestamp || 0) : Infinity;
+        const complete = llmCompletes.find(e => (e.timestamp || 0) >= startTime && (e.timestamp || 0) < nextStartTime) || null;
+        const firstToken = llmFirstTokens.find(e => (e.timestamp || 0) >= startTime && (e.timestamp || 0) < nextStartTime) || null;
         const endTime = (complete && (complete.timestamp || 0)) || startTime;
         if (firstToken && (firstToken.timestamp || 0) > startTime) {
             inferredRectangles.push({
@@ -3740,9 +3743,12 @@ function drawTimelineEvents(ctx, timeline, lanes, LANE_HEIGHTS, laneYOffsets, LA
     });
 
     // TTS lane: one magenta rectangle per turn from tts_start to tts_complete (tts_first_audio shown as thin vertical line later)
+    // Match by timestamp range (not array index) so missing events don't misalign.
     ttsStarts.forEach((startEv, i) => {
-        const complete = ttsCompletes[i];
-        if (complete && (complete.timestamp || 0) > (startEv.timestamp || 0)) {
+        const startTime = startEv.timestamp || 0;
+        const nextStartTime = (i + 1 < ttsStarts.length) ? (ttsStarts[i + 1].timestamp || 0) : Infinity;
+        const complete = ttsCompletes.find(e => (e.timestamp || 0) >= startTime && (e.timestamp || 0) < nextStartTime) || null;
+        if (complete && (complete.timestamp || 0) > startTime) {
             inferredRectangles.push({
                 event_type: 'tts_segment',
                 lane: 'tts',
diff --git a/src/multi_modal_ai_studio/webui/voice_pipeline.py b/src/multi_modal_ai_studio/webui/voice_pipeline.py
@@ -1397,10 +1397,11 @@ async def turn_executor() -> None:
                 last_tts_amplitude_time = 0.0
                 tts_amplitude_interval = 0.05
                 server_speaker_proc = None
+                _speaker_fail_until = 0.0  # backoff: skip aplay retries until this epoch
                 tts_consumer_error: Optional[Exception] = None
 
                 async def _send_tts_audio(chunk):
-                    nonlocal tts_first_sent, ts_tts_first, last_tts_amplitude_time, server_speaker_proc
+                    nonlocal tts_first_sent, ts_tts_first, last_tts_amplitude_time, server_speaker_proc, _speaker_fail_until
                     if not tts_first_sent:
                         ts_tts_first = (time.time() - session.timeline.start_time) if session.timeline.start_time else 0
                         ref_label = "llm_first_token" if use_stream_tts else "llm_complete"
@@ -1414,11 +1415,12 @@ async def _send_tts_audio(chunk):
                     )
                     _out_device = session.config.devices.audio_output_device
                     if _use_speaker and chunk.audio:
-                        if server_speaker_proc is None:
+                        if server_speaker_proc is None and time.time() >= _speaker_fail_until:
                             server_speaker_proc = start_server_speaker_playback(_out_device, chunk.sample_rate)
                             if server_speaker_proc is None:
+                                _speaker_fail_until = time.time() + 3.0
                                 logger.warning(
-                                    "Server speaker playback could not start for %s; check aplay and device",
+                                    "Server speaker playback could not start for %s; check aplay and device (suppressing retries for 3s)",
                                     _out_device,
                                 )
                         if server_speaker_proc is not None and server_speaker_proc.stdin and not server_speaker_proc.stdin.closed: