@@ -237,7 +237,8 @@ def __init__(
237237 self ._image_description = "[实时屏幕截图或相机画面正在分析中。先不要瞎编内容,可以稍等片刻。在此期间不要用搜索功能应付。等收到画面分析结果后再描述画面。]"
238238 self ._latest_image_b64 = None # Cached latest screenshot for proactive injection
239239 self ._proactive_image_consumed = True # Whether the cached image has been used by a proactive nudge
240-
240+ self ._proactive_injecting = False # True while stream_proactive is injecting audio — suppresses mic input
241+
241242 # Silence detection for auto-closing inactive sessions
242243 # 只在 GLM 和 free API 时启用90秒静默超时,Qwen 和 Step 放行
243244 self ._last_speech_time = None
@@ -821,15 +822,15 @@ async def update_session(self, config: Dict[str, Any]) -> None:
821822
822823 async def stream_audio (self , audio_chunk : bytes ) -> None :
823824 """Stream raw audio data to the API.
824-
825+
825826 Supports two input modes:
826827 - 48kHz from PC: Apply RNNoise then downsample to 16kHz
827828 - 16kHz from mobile: Pass through directly (no RNNoise)
828829 """
829830 # 检查是否已发生致命错误,如果是则直接返回
830831 if self ._fatal_error_occurred :
831832 return
832-
833+
833834 current_time = time .time ()
834835 # 本地音量判定:用原始输入做 RMS,避免 VAD 延迟时误清 buffer
835836 raw_samples = np .frombuffer (audio_chunk , dtype = np .int16 )
@@ -885,11 +886,15 @@ async def stream_audio(self, audio_chunk: bytes) -> None:
885886 self ._client_vad_last_speech_time = current_time
886887 self ._client_vad_active = True
887888
889+ # Suppress mic → server during proactive nudge injection (VAD above still updates)
890+ if self ._proactive_injecting :
891+ return
892+
888893 # 静音清 buffer:有 RNNoise 以 RNNoise 为准,否则 VAD + 连续本地静音(见 _should_clear_audio_buffer_on_silence)
889894 if self ._should_clear_audio_buffer_on_silence (current_time , use_rnnoise_path ):
890895 self ._silence_reset_pending = False
891896 await self .clear_audio_buffer ()
892-
897+
893898 # Gemini uses different API
894899 if self ._is_gemini :
895900 await self ._stream_audio_gemini (audio_chunk )
@@ -1210,6 +1215,9 @@ async def stream_proactive(self, instruction: str = "", *, language: str = "zh")
12101215 })
12111216 logger .info ("stream_proactive: injected vision text description for non-native backend" )
12121217
1218+ # ── Suppress mic input during injection ────────────────────────
1219+ self ._proactive_injecting = True
1220+
12131221 # ── Send audio chunks (same pacing as hot-swap flush) ─────────
12141222 # 320 bytes = 10 ms @16 kHz 16-bit mono, ×5 multiplier → 1600 bytes
12151223 chunk_size = 320 * 5 # 1600 bytes = 50 ms of audio
@@ -1224,69 +1232,72 @@ async def stream_proactive(self, instruction: str = "", *, language: str = "zh")
12241232 mid_chunk = total_chunks // 2 # Insert image at the midpoint
12251233 image_injected = False
12261234
1227- for chunk_idx , i in enumerate (range (0 , len (pcm_data ), chunk_size )):
1228- # Abort if AI starts responding, or user speaking (only when RNNoise VAD active)
1229- if self ._is_responding or (self ._rnnoise_vad_active and self ._client_vad_active ):
1230- logger .info ("stream_proactive: aborted — user spoke or response started" )
1231- await self .clear_audio_buffer ()
1232- return False
1233-
1234- chunk = pcm_data [i : i + chunk_size ]
1235- if self ._is_gemini :
1236- if self ._gemini_session :
1237- await self ._gemini_session .send_realtime_input (
1238- audio = {"data" : chunk , "mime_type" : "audio/pcm" }
1239- )
1240- else :
1241- audio_b64 = base64 .b64encode (chunk ).decode ()
1242- await self .send_event ({
1243- "type" : "input_audio_buffer.append" ,
1244- "audio" : audio_b64 ,
1245- })
1246-
1247- # Inject cached screenshot at midpoint (only for native-image backends)
1248- if can_inject_image and not image_injected and chunk_idx >= mid_chunk and snapshot_image_b64 :
1235+ try :
1236+ for chunk_idx , i in enumerate (range (0 , len (pcm_data ), chunk_size )):
1237+ # Abort if AI starts responding, or user speaking (only when RNNoise VAD active)
1238+ if self ._is_responding or (self ._rnnoise_vad_active and self ._client_vad_active ):
1239+ logger .info ("stream_proactive: aborted — user spoke or response started" )
1240+ await self .clear_audio_buffer ()
1241+ return False
1242+
1243+ chunk = pcm_data [i : i + chunk_size ]
12491244 if self ._is_gemini :
12501245 if self ._gemini_session :
1251- image_bytes = base64 .b64decode (snapshot_image_b64 )
12521246 await self ._gemini_session .send_realtime_input (
1253- media = {"data" : image_bytes , "mime_type" : "image/jpeg " }
1247+ audio = {"data" : chunk , "mime_type" : "audio/pcm " }
12541248 )
1255- elif "gpt" in self ._model_lower :
1256- await self .send_event ({
1257- "type" : "conversation.item.create" ,
1258- "item" : {
1259- "type" : "message" ,
1260- "role" : "user" ,
1261- "content" : [{
1262- "type" : "input_image" ,
1263- "image_url" : "data:image/jpeg;base64," + snapshot_image_b64 ,
1264- }],
1265- },
1266- })
1267- elif "qwen" in self ._model_lower or ("lanlan.app" in self .base_url and "free" in self ._model_lower ):
1268- await self .send_event ({
1269- "type" : "input_image_buffer.append" ,
1270- "image" : snapshot_image_b64 ,
1271- })
1272- elif "glm" in self ._model_lower :
1249+ else :
1250+ audio_b64 = base64 .b64encode (chunk ).decode ()
12731251 await self .send_event ({
1274- "type" : "input_audio_buffer.append_video_frame " ,
1275- "video_frame " : snapshot_image_b64 ,
1252+ "type" : "input_audio_buffer.append " ,
1253+ "audio " : audio_b64 ,
12761254 })
1277- image_injected = True
1278- logger .info ("stream_proactive: injected screenshot at chunk %d/%d" , chunk_idx , total_chunks )
1279-
1280- await asyncio .sleep (sleep_interval )
1281-
1282- # Mark vision context consumed only if the shared image hasn't been
1283- # replaced by a newer frame from stream_image() during our async loop.
1284- if has_vision and self ._latest_image_b64 == snapshot_image_b64 :
1285- self ._proactive_image_consumed = True
1286- logger .info ("stream_proactive: audio injection complete (%s%s), waiting for VAD → response" ,
1287- "vision" if has_vision else "general" ,
1288- "+image" if image_injected else "" )
1289- return True
1255+
1256+ # Inject cached screenshot at midpoint (only for native-image backends)
1257+ if can_inject_image and not image_injected and chunk_idx >= mid_chunk and snapshot_image_b64 :
1258+ if self ._is_gemini :
1259+ if self ._gemini_session :
1260+ image_bytes = base64 .b64decode (snapshot_image_b64 )
1261+ await self ._gemini_session .send_realtime_input (
1262+ media = {"data" : image_bytes , "mime_type" : "image/jpeg" }
1263+ )
1264+ elif "gpt" in self ._model_lower :
1265+ await self .send_event ({
1266+ "type" : "conversation.item.create" ,
1267+ "item" : {
1268+ "type" : "message" ,
1269+ "role" : "user" ,
1270+ "content" : [{
1271+ "type" : "input_image" ,
1272+ "image_url" : "data:image/jpeg;base64," + snapshot_image_b64 ,
1273+ }],
1274+ },
1275+ })
1276+ elif "qwen" in self ._model_lower or ("lanlan.app" in self .base_url and "free" in self ._model_lower ):
1277+ await self .send_event ({
1278+ "type" : "input_image_buffer.append" ,
1279+ "image" : snapshot_image_b64 ,
1280+ })
1281+ elif "glm" in self ._model_lower :
1282+ await self .send_event ({
1283+ "type" : "input_audio_buffer.append_video_frame" ,
1284+ "video_frame" : snapshot_image_b64 ,
1285+ })
1286+ image_injected = True
1287+ logger .info ("stream_proactive: injected screenshot at chunk %d/%d" , chunk_idx , total_chunks )
1288+
1289+ await asyncio .sleep (sleep_interval )
1290+
1291+ # Mark vision context consumed only if the shared image hasn't been
1292+ # replaced by a newer frame from stream_image() during our async loop.
1293+ if has_vision and self ._latest_image_b64 == snapshot_image_b64 :
1294+ self ._proactive_image_consumed = True
1295+ logger .info ("stream_proactive: audio injection complete (%s%s), waiting for VAD → response" ,
1296+ "vision" if has_vision else "general" ,
1297+ "+image" if image_injected else "" )
1298+ return True
1299+ finally :
1300+ self ._proactive_injecting = False
12901301
12911302 async def cancel_response (self ) -> None :
12921303 """Cancel the current response."""
0 commit comments