Skip to content

Commit 9cc3e78

Browse files
committed
fix(audio): ALSA capture/playback retries and timeline event matching
- capture: retry arecord up to 8 times with backoff when device drops (e.g. USB bus contention); log resume/give-up - playback: retry aplay up to 3 times with 0.3s delay on start failure - voice_pipeline: backoff 3s after server speaker start failure to avoid log spam and repeated retries - app.js: match LLM/TTS timeline events by timestamp range instead of array index so missing events don't misalign subsequent turns Made-with: Cursor
1 parent d08e3ba commit 9cc3e78

4 files changed

Lines changed: 155 additions & 75 deletions

File tree

src/multi_modal_ai_studio/devices/capture.py

Lines changed: 96 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,11 @@
2222
CHUNK_BYTES = CHUNK_SAMPLES * 2
2323

2424

25+
MAX_CAPTURE_RETRIES = 8
26+
RETRY_BACKOFF_BASE = 0.5 # seconds; doubles each attempt up to a cap
27+
RETRY_BACKOFF_MAX = 5.0
28+
29+
2530
def _capture_alsa(
2631
device: str,
2732
out_queue: "queue.Queue[Optional[bytes]]",
@@ -31,63 +36,104 @@ def _capture_alsa(
3136
"""Capture from ALSA device via arecord; put PCM chunks in out_queue. Runs in thread.
3237
Uses plughw when device is hw:X,Y so ALSA can do sample-rate conversion (many USB mics only support 48kHz).
3338
If proc_holder is a list, the subprocess is stored as proc_holder[0] so the caller can terminate it to release the device quickly.
39+
40+
Auto-restarts arecord up to MAX_CAPTURE_RETRIES times when the device
41+
disappears transiently (e.g. USB bus contention with a camera).
3442
"""
3543
dev = (device or "default").strip()
3644
if dev.startswith("hw:") and not dev.startswith("plughw:"):
3745
dev = "plug" + dev
3846
logger.debug("ALSA using %s for rate conversion (requested 16kHz)", dev)
3947
cmd = ["arecord", "-D", dev, "-f", "S16_LE", "-r", str(SAMPLE_RATE), "-c", str(CHANNELS), "-t", "raw"]
40-
logger.info("ALSA capture starting: %s (device=%s)", " ".join(cmd), device)
41-
try:
42-
proc = subprocess.Popen(
43-
cmd,
44-
stdout=subprocess.PIPE,
45-
stderr=subprocess.PIPE,
46-
bufsize=CHUNK_BYTES,
47-
)
48-
if proc_holder is not None:
49-
proc_holder.append(proc)
50-
except FileNotFoundError:
51-
logger.warning("arecord not found; cannot capture from ALSA device %s", device)
52-
out_queue.put(None)
53-
return
54-
except Exception as e:
55-
logger.warning("Failed to start arecord for %s: %s", device, e)
56-
out_queue.put(None)
57-
return
58-
first_chunk = True
59-
try:
60-
while not stop_event.is_set() and proc.poll() is None:
61-
chunk = proc.stdout.read(CHUNK_BYTES)
62-
if not chunk:
63-
try:
64-
err = proc.stderr.read().decode("utf-8", errors="replace").strip() if proc.stderr else ""
65-
if err:
66-
logger.warning("ALSA capture read empty (device %s). arecord stderr: %s", device, err)
67-
else:
68-
logger.warning("ALSA capture read returned empty (device %s); check device/sample rate", device)
69-
except Exception:
70-
logger.warning("ALSA capture read returned empty (device %s)", device)
71-
break
72-
if first_chunk:
73-
first_chunk = False
74-
logger.info("ALSA first PCM chunk received from %s (%d bytes); pipeline will get amplitude", device, len(chunk))
75-
out_queue.put(chunk)
76-
except Exception as e:
77-
logger.warning("ALSA capture read error for %s: %s", device, e)
78-
finally:
48+
49+
retries = 0
50+
ever_produced_chunk = False
51+
52+
while not stop_event.is_set():
53+
logger.info("ALSA capture starting: %s (device=%s)", " ".join(cmd), device)
7954
try:
80-
proc.terminate()
81-
proc.wait(timeout=1)
82-
except Exception:
83-
pass
84-
out_queue.put(None)
85-
if proc_holder is not None and proc_holder and proc_holder[0] is proc:
55+
proc = subprocess.Popen(
56+
cmd,
57+
stdout=subprocess.PIPE,
58+
stderr=subprocess.PIPE,
59+
bufsize=CHUNK_BYTES,
60+
)
61+
if proc_holder is not None:
62+
if proc_holder:
63+
proc_holder.clear()
64+
proc_holder.append(proc)
65+
except FileNotFoundError:
66+
logger.warning("arecord not found; cannot capture from ALSA device %s", device)
67+
out_queue.put(None)
68+
return
69+
except Exception as e:
70+
logger.warning("Failed to start arecord for %s: %s", device, e)
71+
if retries >= MAX_CAPTURE_RETRIES:
72+
logger.error("ALSA capture giving up after %d retries for %s", retries, device)
73+
out_queue.put(None)
74+
return
75+
retries += 1
76+
delay = min(RETRY_BACKOFF_BASE * (2 ** (retries - 1)), RETRY_BACKOFF_MAX)
77+
logger.info("ALSA capture retry %d/%d in %.1fs for %s", retries, MAX_CAPTURE_RETRIES, delay, device)
78+
stop_event.wait(delay)
79+
continue
80+
81+
first_chunk_this_run = True
82+
died_unexpectedly = False
83+
try:
84+
while not stop_event.is_set() and proc.poll() is None:
85+
chunk = proc.stdout.read(CHUNK_BYTES)
86+
if not chunk:
87+
try:
88+
err = proc.stderr.read().decode("utf-8", errors="replace").strip() if proc.stderr else ""
89+
if err:
90+
logger.warning("ALSA capture read empty (device %s). arecord stderr: %s", device, err)
91+
else:
92+
logger.warning("ALSA capture read returned empty (device %s); check device/sample rate", device)
93+
except Exception:
94+
logger.warning("ALSA capture read returned empty (device %s)", device)
95+
died_unexpectedly = True
96+
break
97+
if first_chunk_this_run:
98+
first_chunk_this_run = False
99+
if not ever_produced_chunk:
100+
logger.info("ALSA first PCM chunk received from %s (%d bytes); pipeline will get amplitude", device, len(chunk))
101+
else:
102+
logger.info("ALSA capture resumed from %s (%d bytes) after retry", device, len(chunk))
103+
retries = 0
104+
ever_produced_chunk = True
105+
out_queue.put(chunk)
106+
except Exception as e:
107+
logger.warning("ALSA capture read error for %s: %s", device, e)
108+
died_unexpectedly = True
109+
finally:
86110
try:
87-
proc_holder.clear()
111+
proc.terminate()
112+
proc.wait(timeout=1)
88113
except Exception:
89114
pass
90-
if first_chunk:
115+
if proc_holder is not None and proc_holder and proc_holder[0] is proc:
116+
try:
117+
proc_holder.clear()
118+
except Exception:
119+
pass
120+
121+
if stop_event.is_set():
122+
break
123+
124+
if died_unexpectedly and retries < MAX_CAPTURE_RETRIES:
125+
retries += 1
126+
delay = min(RETRY_BACKOFF_BASE * (2 ** (retries - 1)), RETRY_BACKOFF_MAX)
127+
logger.warning(
128+
"ALSA capture died unexpectedly for %s; retry %d/%d in %.1fs",
129+
device, retries, MAX_CAPTURE_RETRIES, delay,
130+
)
131+
stop_event.wait(delay)
132+
continue
133+
134+
if died_unexpectedly:
135+
logger.error("ALSA capture giving up after %d retries for %s", retries, device)
136+
elif first_chunk_this_run and not ever_produced_chunk:
91137
try:
92138
err = proc.stderr.read().decode("utf-8", errors="replace").strip() if proc.stderr else ""
93139
if err:
@@ -96,6 +142,9 @@ def _capture_alsa(
96142
logger.warning("ALSA capture ended without sending any chunks (device %s); check arecord -D %s", device, dev)
97143
except Exception:
98144
logger.warning("ALSA capture ended without sending any chunks (device %s)", device)
145+
break
146+
147+
out_queue.put(None)
99148

100149

101150
def _capture_pyaudio(

src/multi_modal_ai_studio/devices/playback.py

Lines changed: 44 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
logger = logging.getLogger(__name__)
1616

1717
CHANNELS = 1
18+
PLAYBACK_RETRIES = 3
19+
PLAYBACK_RETRY_DELAY = 0.3 # seconds between retries
1820

1921

2022
def start_server_speaker_playback(
@@ -28,6 +30,9 @@ def start_server_speaker_playback(
2830
then close stdin when done so aplay exits. Use plughw when device is
2931
hw:X,Y so ALSA can do sample-rate conversion if needed.
3032
33+
Retries up to PLAYBACK_RETRIES times for transient device errors
34+
(e.g. USB audio device momentarily unavailable).
35+
3136
Args:
3237
device: ALSA device (e.g. hw:2,0).
3338
sample_rate: PCM sample rate in Hz (e.g. 24000 from TTS).
@@ -52,28 +57,46 @@ def start_server_speaker_playback(
5257
"-c", str(CHANNELS),
5358
"-t", "raw",
5459
]
55-
logger.info("ALSA playback starting: %s (device=%s, rate=%s)", " ".join(cmd), device, sample_rate)
56-
try:
57-
proc = subprocess.Popen(
58-
cmd,
59-
stdin=subprocess.PIPE,
60-
stderr=subprocess.PIPE,
61-
)
62-
# If same device is used for mic (arecord) and speaker, aplay may exit with "Device or resource busy"
63-
time.sleep(0.15)
64-
if proc.poll() is not None:
65-
err = (proc.stderr.read().decode("utf-8", errors="replace").strip() if proc.stderr else "") or "(no stderr)"
66-
logger.warning("aplay exited immediately for %s: %s", device, err)
60+
61+
last_err = ""
62+
for attempt in range(1, PLAYBACK_RETRIES + 1):
63+
logger.info("ALSA playback starting: %s (device=%s, rate=%s)", " ".join(cmd), device, sample_rate)
64+
try:
65+
proc = subprocess.Popen(
66+
cmd,
67+
stdin=subprocess.PIPE,
68+
stderr=subprocess.PIPE,
69+
)
70+
time.sleep(0.15)
71+
if proc.poll() is not None:
72+
last_err = (proc.stderr.read().decode("utf-8", errors="replace").strip() if proc.stderr else "") or "(no stderr)"
73+
if attempt < PLAYBACK_RETRIES:
74+
logger.warning(
75+
"aplay exited immediately for %s (attempt %d/%d): %s — retrying in %.1fs",
76+
device, attempt, PLAYBACK_RETRIES, last_err, PLAYBACK_RETRY_DELAY,
77+
)
78+
time.sleep(PLAYBACK_RETRY_DELAY)
79+
continue
80+
logger.warning("aplay exited immediately for %s after %d attempts: %s", device, attempt, last_err)
81+
return None
82+
if proc_holder is not None:
83+
proc_holder.append(proc)
84+
return proc
85+
except FileNotFoundError:
86+
logger.warning("aplay not found; cannot play to ALSA device %s", device)
6787
return None
68-
if proc_holder is not None:
69-
proc_holder.append(proc)
70-
return proc
71-
except FileNotFoundError:
72-
logger.warning("aplay not found; cannot play to ALSA device %s", device)
73-
return None
74-
except Exception as e:
75-
logger.warning("Failed to start aplay for %s: %s", device, e)
76-
return None
88+
except Exception as e:
89+
last_err = str(e)
90+
if attempt < PLAYBACK_RETRIES:
91+
logger.warning(
92+
"Failed to start aplay for %s (attempt %d/%d): %s — retrying in %.1fs",
93+
device, attempt, PLAYBACK_RETRIES, e, PLAYBACK_RETRY_DELAY,
94+
)
95+
time.sleep(PLAYBACK_RETRY_DELAY)
96+
continue
97+
logger.warning("Failed to start aplay for %s after %d attempts: %s", device, attempt, e)
98+
return None
99+
return None
77100

78101

79102
def stop_server_speaker_playback(proc: Optional[subprocess.Popen]) -> None:

src/multi_modal_ai_studio/webui/static/app.js

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3562,10 +3562,13 @@ function drawTimelineEvents(ctx, timeline, lanes, LANE_HEIGHTS, laneYOffsets, LA
35623562
const llmCompletes = timeline.filter(e => e.event_type === 'llm_complete').sort((a, b) => (a.timestamp || 0) - (b.timestamp || 0));
35633563

35643564
// LLM: prefill (start → first token) and generate (first token → complete). Like Live RIVA WebUI; first-token boundary from pipeline.
3565+
// Match llm_first_token and llm_complete to each llm_start by timestamp
3566+
// range (not array index) so missing events don't shift all subsequent turns.
35653567
llmStarts.forEach((startEv, i) => {
3566-
const firstToken = llmFirstTokens[i];
3567-
const complete = llmCompletes[i];
35683568
const startTime = startEv.timestamp || 0;
3569+
const nextStartTime = (i + 1 < llmStarts.length) ? (llmStarts[i + 1].timestamp || 0) : Infinity;
3570+
const complete = llmCompletes.find(e => (e.timestamp || 0) >= startTime && (e.timestamp || 0) < nextStartTime) || null;
3571+
const firstToken = llmFirstTokens.find(e => (e.timestamp || 0) >= startTime && (e.timestamp || 0) < nextStartTime) || null;
35693572
const endTime = (complete && (complete.timestamp || 0)) || startTime;
35703573
if (firstToken && (firstToken.timestamp || 0) > startTime) {
35713574
inferredRectangles.push({
@@ -3740,9 +3743,12 @@ function drawTimelineEvents(ctx, timeline, lanes, LANE_HEIGHTS, laneYOffsets, LA
37403743
});
37413744

37423745
// TTS lane: one magenta rectangle per turn from tts_start to tts_complete (tts_first_audio shown as thin vertical line later)
3746+
// Match by timestamp range (not array index) so missing events don't misalign.
37433747
ttsStarts.forEach((startEv, i) => {
3744-
const complete = ttsCompletes[i];
3745-
if (complete && (complete.timestamp || 0) > (startEv.timestamp || 0)) {
3748+
const startTime = startEv.timestamp || 0;
3749+
const nextStartTime = (i + 1 < ttsStarts.length) ? (ttsStarts[i + 1].timestamp || 0) : Infinity;
3750+
const complete = ttsCompletes.find(e => (e.timestamp || 0) >= startTime && (e.timestamp || 0) < nextStartTime) || null;
3751+
if (complete && (complete.timestamp || 0) > startTime) {
37463752
inferredRectangles.push({
37473753
event_type: 'tts_segment',
37483754
lane: 'tts',

src/multi_modal_ai_studio/webui/voice_pipeline.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1397,10 +1397,11 @@ async def turn_executor() -> None:
13971397
last_tts_amplitude_time = 0.0
13981398
tts_amplitude_interval = 0.05
13991399
server_speaker_proc = None
1400+
_speaker_fail_until = 0.0 # backoff: skip aplay retries until this epoch
14001401
tts_consumer_error: Optional[Exception] = None
14011402

14021403
async def _send_tts_audio(chunk):
1403-
nonlocal tts_first_sent, ts_tts_first, last_tts_amplitude_time, server_speaker_proc
1404+
nonlocal tts_first_sent, ts_tts_first, last_tts_amplitude_time, server_speaker_proc, _speaker_fail_until
14041405
if not tts_first_sent:
14051406
ts_tts_first = (time.time() - session.timeline.start_time) if session.timeline.start_time else 0
14061407
ref_label = "llm_first_token" if use_stream_tts else "llm_complete"
@@ -1414,11 +1415,12 @@ async def _send_tts_audio(chunk):
14141415
)
14151416
_out_device = session.config.devices.audio_output_device
14161417
if _use_speaker and chunk.audio:
1417-
if server_speaker_proc is None:
1418+
if server_speaker_proc is None and time.time() >= _speaker_fail_until:
14181419
server_speaker_proc = start_server_speaker_playback(_out_device, chunk.sample_rate)
14191420
if server_speaker_proc is None:
1421+
_speaker_fail_until = time.time() + 3.0
14201422
logger.warning(
1421-
"Server speaker playback could not start for %s; check aplay and device",
1423+
"Server speaker playback could not start for %s; check aplay and device (suppressing retries for 3s)",
14221424
_out_device,
14231425
)
14241426
if server_speaker_proc is not None and server_speaker_proc.stdin and not server_speaker_proc.stdin.closed:

0 commit comments

Comments
 (0)