Skip to content

Commit 99e48af

Browse files
Hongzhi Wen and claude committed
fix(openai-realtime): 防打断丢 input_transcript + 清 response.done 状态 + bump model IDs
- omni_realtime_client.py: 把 conversation.item.input_audio_transcription.completed 移到 top-level elif,打断场景下不再被 _interrupted 挡掉导致用户转录丢失
- omni_realtime_client.py: response.done 补 self._print_input_transcript = False, 防止空响应/被打断时该标志泄漏到下一轮、干扰输出转录路由
- omni_realtime_client.py: 移除默认 system instructions 末尾硬编码的"卡哇伊声音"指令
- api_providers.json: gpt-realtime-mini-2025-12-15 → gpt-realtime-1.5; doubao emotion_model 补齐 -260215 版本号

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 3346798 commit 99e48af

2 files changed

Lines changed: 7 additions & 7 deletions

File tree

config/api_providers.json

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,7 @@
2828
"name": "GPT-Realtime(OpenAI)",
2929
"description": "智能水平最高,但国内无法使用且价格昂贵",
3030
"core_url": "wss://api.openai.com/v1/realtime",
31-
"core_model": "gpt-realtime-mini-2025-12-15"
31+
"core_model": "gpt-realtime-1.5"
3232
},
3333
"step": {
3434
"key": "step",
@@ -176,7 +176,7 @@
176176
"conversation_model": "doubao-seed-2-0-lite-260215",
177177
"summary_model": "doubao-seed-2-0-lite-260215",
178178
"correction_model": "doubao-seed-2-0-lite-260215",
179-
"emotion_model": "doubao-seed-2-0-mini",
179+
"emotion_model": "doubao-seed-2-0-mini-260215",
180180
"vision_model": "doubao-seed-2-0-lite-260215",
181181
"agent_model": "doubao-seed-2-0-pro-260215"
182182
},

main_logic/omni_realtime_client.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -548,7 +548,7 @@ async def connect(self, instructions: str, native_audio=True) -> None:
548548
await self.update_session({
549549
"type": "realtime",
550550
"model": self.model,
551-
"instructions": instructions + '\n请使用卡哇伊的声音与用户交流。\n',
551+
"instructions": instructions,
552552
"output_modalities": ['audio'] if 'audio' in self._modalities else ['text'],
553553
"audio": {
554554
"input": {
@@ -1503,6 +1503,7 @@ async def handle_messages(self) -> None:
15031503
self._audio_delta_count = 0
15041504
# 确保 buffer 被清空
15051505
self._output_transcript_buffer = ""
1506+
self._print_input_transcript = False
15061507
self._image_recognized_this_turn = False
15071508
self._image_sent_this_turn = False
15081509
if self.on_response_done:
@@ -1538,6 +1539,9 @@ async def handle_messages(self) -> None:
15381539
self._client_vad_last_speech_time = time.time()
15391540
elif event_type == "conversation.item.input_audio_transcription.completed":
15401541
self._print_input_transcript = True
1542+
transcript = event.get("transcript", "")
1543+
if self.on_input_transcript:
1544+
await self.on_input_transcript(transcript)
15411545
elif event_type in ["response.audio_transcript.done", "response.output_audio_transcript.done"]:
15421546
self._print_input_transcript = False
15431547
if self._output_transcript_buffer and self.on_output_transcript and not self._skip_until_next_response and not self._interrupted:
@@ -1558,10 +1562,6 @@ async def handle_messages(self) -> None:
15581562
if self.on_audio_delta:
15591563
audio_bytes = base64.b64decode(event["delta"])
15601564
await self.on_audio_delta(audio_bytes)
1561-
elif event_type == "conversation.item.input_audio_transcription.completed":
1562-
transcript = event.get("transcript", "")
1563-
if self.on_input_transcript:
1564-
await self.on_input_transcript(transcript)
15651565
elif event_type in ["response.audio_transcript.done", "response.output_audio_transcript.done"]:
15661566
if self.on_output_transcript and self._is_first_transcript_chunk:
15671567
transcript = event.get("transcript", "")

0 commit comments

Comments
 (0)