Skip to content

Commit 6b2f158

Browse files
wehosHongzhi Wenclaude
authored
fix: suppress mic input during proactive nudge injection (Project-N-E-K-O#662)
* fix: suppress mic input during proactive nudge injection When the mic is physically muted but not UI-muted, silent audio frames keep flowing into the input_audio_buffer alongside the nudge audio. This prevents the server-side VAD from detecting end-of-speech, so the model never triggers a response. Add a _proactive_injecting flag that gates stream_audio() during stream_proactive() execution, ensuring the nudge audio exclusively occupies the buffer. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * fix: keep VAD updates running during proactive injection Move the _proactive_injecting guard after the VAD update block so client-side speech detection stays live. If the user speaks during nudge injection the abort check in stream_proactive still fires. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Hongzhi Wen <cartabio.coder1@gmail.com> Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 72916f4 commit 6b2f158

1 file changed

Lines changed: 72 additions & 61 deletions

File tree

main_logic/omni_realtime_client.py

Lines changed: 72 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -237,7 +237,8 @@ def __init__(
237237
self._image_description = "[实时屏幕截图或相机画面正在分析中。先不要瞎编内容,可以稍等片刻。在此期间不要用搜索功能应付。等收到画面分析结果后再描述画面。]"
238238
self._latest_image_b64 = None # Cached latest screenshot for proactive injection
239239
self._proactive_image_consumed = True # Whether the cached image has been used by a proactive nudge
240-
240+
self._proactive_injecting = False # True while stream_proactive is injecting audio — suppresses mic input
241+
241242
# Silence detection for auto-closing inactive sessions
242243
# 只在 GLM 和 free API 时启用90秒静默超时,Qwen 和 Step 放行
243244
self._last_speech_time = None
@@ -821,15 +822,15 @@ async def update_session(self, config: Dict[str, Any]) -> None:
821822

822823
async def stream_audio(self, audio_chunk: bytes) -> None:
823824
"""Stream raw audio data to the API.
824-
825+
825826
Supports two input modes:
826827
- 48kHz from PC: Apply RNNoise then downsample to 16kHz
827828
- 16kHz from mobile: Pass through directly (no RNNoise)
828829
"""
829830
# 检查是否已发生致命错误,如果是则直接返回
830831
if self._fatal_error_occurred:
831832
return
832-
833+
833834
current_time = time.time()
834835
# 本地音量判定:用原始输入做 RMS,避免 VAD 延迟时误清 buffer
835836
raw_samples = np.frombuffer(audio_chunk, dtype=np.int16)
@@ -885,11 +886,15 @@ async def stream_audio(self, audio_chunk: bytes) -> None:
885886
self._client_vad_last_speech_time = current_time
886887
self._client_vad_active = True
887888

889+
# Suppress mic → server during proactive nudge injection (VAD above still updates)
890+
if self._proactive_injecting:
891+
return
892+
888893
# 静音清 buffer:有 RNNoise 以 RNNoise 为准,否则 VAD + 连续本地静音(见 _should_clear_audio_buffer_on_silence)
889894
if self._should_clear_audio_buffer_on_silence(current_time, use_rnnoise_path):
890895
self._silence_reset_pending = False
891896
await self.clear_audio_buffer()
892-
897+
893898
# Gemini uses different API
894899
if self._is_gemini:
895900
await self._stream_audio_gemini(audio_chunk)
@@ -1210,6 +1215,9 @@ async def stream_proactive(self, instruction: str = "", *, language: str = "zh")
12101215
})
12111216
logger.info("stream_proactive: injected vision text description for non-native backend")
12121217

1218+
# ── Suppress mic input during injection ────────────────────────
1219+
self._proactive_injecting = True
1220+
12131221
# ── Send audio chunks (same pacing as hot-swap flush) ─────────
12141222
# 320 bytes = 10 ms @16 kHz 16-bit mono, ×5 multiplier → 1600 bytes
12151223
chunk_size = 320 * 5 # 1600 bytes = 50 ms of audio
@@ -1224,69 +1232,72 @@ async def stream_proactive(self, instruction: str = "", *, language: str = "zh")
12241232
mid_chunk = total_chunks // 2 # Insert image at the midpoint
12251233
image_injected = False
12261234

1227-
for chunk_idx, i in enumerate(range(0, len(pcm_data), chunk_size)):
1228-
# Abort if AI starts responding, or user speaking (only when RNNoise VAD active)
1229-
if self._is_responding or (self._rnnoise_vad_active and self._client_vad_active):
1230-
logger.info("stream_proactive: aborted — user spoke or response started")
1231-
await self.clear_audio_buffer()
1232-
return False
1233-
1234-
chunk = pcm_data[i : i + chunk_size]
1235-
if self._is_gemini:
1236-
if self._gemini_session:
1237-
await self._gemini_session.send_realtime_input(
1238-
audio={"data": chunk, "mime_type": "audio/pcm"}
1239-
)
1240-
else:
1241-
audio_b64 = base64.b64encode(chunk).decode()
1242-
await self.send_event({
1243-
"type": "input_audio_buffer.append",
1244-
"audio": audio_b64,
1245-
})
1246-
1247-
# Inject cached screenshot at midpoint (only for native-image backends)
1248-
if can_inject_image and not image_injected and chunk_idx >= mid_chunk and snapshot_image_b64:
1235+
try:
1236+
for chunk_idx, i in enumerate(range(0, len(pcm_data), chunk_size)):
1237+
# Abort if AI starts responding, or user speaking (only when RNNoise VAD active)
1238+
if self._is_responding or (self._rnnoise_vad_active and self._client_vad_active):
1239+
logger.info("stream_proactive: aborted — user spoke or response started")
1240+
await self.clear_audio_buffer()
1241+
return False
1242+
1243+
chunk = pcm_data[i : i + chunk_size]
12491244
if self._is_gemini:
12501245
if self._gemini_session:
1251-
image_bytes = base64.b64decode(snapshot_image_b64)
12521246
await self._gemini_session.send_realtime_input(
1253-
media={"data": image_bytes, "mime_type": "image/jpeg"}
1247+
audio={"data": chunk, "mime_type": "audio/pcm"}
12541248
)
1255-
elif "gpt" in self._model_lower:
1256-
await self.send_event({
1257-
"type": "conversation.item.create",
1258-
"item": {
1259-
"type": "message",
1260-
"role": "user",
1261-
"content": [{
1262-
"type": "input_image",
1263-
"image_url": "data:image/jpeg;base64," + snapshot_image_b64,
1264-
}],
1265-
},
1266-
})
1267-
elif "qwen" in self._model_lower or ("lanlan.app" in self.base_url and "free" in self._model_lower):
1268-
await self.send_event({
1269-
"type": "input_image_buffer.append",
1270-
"image": snapshot_image_b64,
1271-
})
1272-
elif "glm" in self._model_lower:
1249+
else:
1250+
audio_b64 = base64.b64encode(chunk).decode()
12731251
await self.send_event({
1274-
"type": "input_audio_buffer.append_video_frame",
1275-
"video_frame": snapshot_image_b64,
1252+
"type": "input_audio_buffer.append",
1253+
"audio": audio_b64,
12761254
})
1277-
image_injected = True
1278-
logger.info("stream_proactive: injected screenshot at chunk %d/%d", chunk_idx, total_chunks)
1279-
1280-
await asyncio.sleep(sleep_interval)
1281-
1282-
# Mark vision context consumed only if the shared image hasn't been
1283-
# replaced by a newer frame from stream_image() during our async loop.
1284-
if has_vision and self._latest_image_b64 == snapshot_image_b64:
1285-
self._proactive_image_consumed = True
1286-
logger.info("stream_proactive: audio injection complete (%s%s), waiting for VAD → response",
1287-
"vision" if has_vision else "general",
1288-
"+image" if image_injected else "")
1289-
return True
1255+
1256+
# Inject cached screenshot at midpoint (only for native-image backends)
1257+
if can_inject_image and not image_injected and chunk_idx >= mid_chunk and snapshot_image_b64:
1258+
if self._is_gemini:
1259+
if self._gemini_session:
1260+
image_bytes = base64.b64decode(snapshot_image_b64)
1261+
await self._gemini_session.send_realtime_input(
1262+
media={"data": image_bytes, "mime_type": "image/jpeg"}
1263+
)
1264+
elif "gpt" in self._model_lower:
1265+
await self.send_event({
1266+
"type": "conversation.item.create",
1267+
"item": {
1268+
"type": "message",
1269+
"role": "user",
1270+
"content": [{
1271+
"type": "input_image",
1272+
"image_url": "data:image/jpeg;base64," + snapshot_image_b64,
1273+
}],
1274+
},
1275+
})
1276+
elif "qwen" in self._model_lower or ("lanlan.app" in self.base_url and "free" in self._model_lower):
1277+
await self.send_event({
1278+
"type": "input_image_buffer.append",
1279+
"image": snapshot_image_b64,
1280+
})
1281+
elif "glm" in self._model_lower:
1282+
await self.send_event({
1283+
"type": "input_audio_buffer.append_video_frame",
1284+
"video_frame": snapshot_image_b64,
1285+
})
1286+
image_injected = True
1287+
logger.info("stream_proactive: injected screenshot at chunk %d/%d", chunk_idx, total_chunks)
1288+
1289+
await asyncio.sleep(sleep_interval)
1290+
1291+
# Mark vision context consumed only if the shared image hasn't been
1292+
# replaced by a newer frame from stream_image() during our async loop.
1293+
if has_vision and self._latest_image_b64 == snapshot_image_b64:
1294+
self._proactive_image_consumed = True
1295+
logger.info("stream_proactive: audio injection complete (%s%s), waiting for VAD → response",
1296+
"vision" if has_vision else "general",
1297+
"+image" if image_injected else "")
1298+
return True
1299+
finally:
1300+
self._proactive_injecting = False
12901301

12911302
async def cancel_response(self) -> None:
12921303
"""Cancel the current response."""

0 commit comments

Comments
 (0)