import time
import asyncio
import base64
import os
import signal
import wave
from io import BytesIO
from typing import Optional
import pyaudio
from rtclient import RTLowLevelClient
from rtclient.models import (
InputAudioBufferAppendMessage,
ServerVAD,
SessionUpdateMessage,
SessionUpdateParams,
InputVideoFrameAppendMessage
)
shutdown_event: Optional[asyncio.Event] = None
base_timestamp = int(time.time() * 1000)
VIDEO_INTERVAL = 500 # 每500ms发送一帧,2fps
def encode_image_to_base64(image_path: str) -> str:
"""
将图片文件转换为base64编码
Args:
image_path: 图片文件路径
Returns:
base64编码的字符串
"""
try:
with open(image_path, 'rb') as image_file:
return base64.b64encode(image_file.read()).decode('utf-8')
except Exception as e:
print(f"图片文件处理错误: {str(e)}")
return None
async def send_video(client: RTLowLevelClient, image_file_path):
image_base64 = encode_image_to_base64(image_file_path)
"""异步发送视频帧"""
video_timestamp = base_timestamp
for _ in range(2): # 2fps
video_message = InputVideoFrameAppendMessage(
video_frame=image_base64,
client_timestamp=video_timestamp
)
await client.send(video_message)
video_timestamp += VIDEO_INTERVAL
await asyncio.sleep(VIDEO_INTERVAL / 1000)
def handle_shutdown(sig=None, frame=None):
"""处理关闭信号"""
if shutdown_event:
print("\n正在关闭程序...")
shutdown_event.set()
async def send_audio(client: RTLowLevelClient):
"""
使用麦克风实时捕获音频并发送
"""
try:
# 初始化pyaudio
p = pyaudio.PyAudio()
# 设置音频流参数
format = pyaudio.paInt16 # 16位深度
channels = 1 # 单声道
rate = 16000 # 采样率16kHz
frame_size = 1536 # 固定帧大小(采样点数)
step_ms = 32 # 发送间隔(毫秒)
step_samples = int(rate * step_ms / 1000) # 每步采样点数
bytes_per_sample = 2 # 16位深度,2字节
# 打开音频流
stream = p.open(format=format,
channels=channels,
rate=rate,
input=True,
frames_per_buffer=step_samples)
print("开始捕获麦克风音频...")
while not shutdown_event.is_set():
# 读取音频数据
frame_bytes = stream.read(step_samples, exception_on_overflow=False)
# 构造WAV格式
wav_io = BytesIO()
with wave.open(wav_io, 'wb') as wav_out:
wav_out.setnchannels(channels)
wav_out.setsampwidth(bytes_per_sample)
wav_out.setframerate(rate)
wav_out.writeframes(frame_bytes)
# 发送数据
wav_io.seek(0)
base64_data = base64.b64encode(wav_io.getvalue()).decode('utf-8')
message = InputAudioBufferAppendMessage(
audio=base64_data,
client_timestamp=int(asyncio.get_event_loop().time() * 1000)
)
try:
await client.send(message)
# await asyncio.sleep(step_ms / 1000) # 等待下一帧
except Exception as e:
print(f"发送失败: {e}")
break
except Exception as e:
print(f"音频处理失败: {e}")
finally:
if stream:
stream.stop_stream()
stream.close()
if p:
p.terminate()
async def receive_messages(client: RTLowLevelClient):
try:
while not client.closed:
if shutdown_event.is_set():
print("正在停止消息接收...")
break
try:
message = await asyncio.wait_for(client.recv(), timeout=1.0)
if message is None:
continue
msg_type = message.type if hasattr(message, 'type') else message.get('type')
if msg_type is None:
print("收到未知类型的消息:", message)
continue
match msg_type:
case "session.created":
print("会话创建消息")
print(f" Session Id: {message.session.id}")
case "error":
print("错误消息")
print(f" Error: {message.error}")
case "session.updated":
print("会话更新消息")
print(f"updated session: {message.session}")
case "input_audio_buffer.speech_started":
print("语音开始消息")
case "input_audio_buffer.speech_stopped":
print("语音结束消息")
case "input_audio_buffer.committed":
print("输入音频缓冲区提交消息")
case "conversation.item.created":
print("会话项目创建消息")
case "conversation.item.input_audio_transcription.completed":
print("输入音频转写完成消息")
print(f" Transcript: {message.transcript}")
case "response.created":
print("响应创建消息")
print(f" Response Id: {message.response.id}")
case "response.done":
print("响应完成消息")
if hasattr(message, 'response'):
print(f" Response Id: {message.response.id}")
print(f" Status: {message.response.status}")
case "response.audio.delta":
print("模型音频增量消息")
print(f" Response Id: {message.response_id}")
if message.delta:
print(f" Delta Length: {len(message.delta)}")
else:
print(" Delta: None")
case "response.audio_transcript.delta":
print("模型音频文本增量消息")
print(f" Response Id: {message.response_id}")
print(f" Delta: {message.delta if message.delta else 'None'}")
case "response.function_call_arguments.done":
print("函数调用参数完成消息")
print(f" Response Id: {message.response_id}")
print(f" Arguments: {message.arguments if message.arguments else 'None'}")
case "response.audio.done":
print("模型音频完成消息")
case "response.audio_transcript.done":
print("模型音频文本完成消息")
case "heartbeat":
print("心跳消息")
case _:
print(f"未处理的消息类型: {msg_type}")
print(message)
except TimeoutError:
continue
except Exception as e:
if not shutdown_event.is_set():
print(f"接收消息时发生错误: {[e]}")
break
finally:
if not client.closed:
await client.close()
print("WebSocket连接已关闭")
def get_env_var(var_name: str) -> str:
value = os.environ.get(var_name)
if not value:
raise OSError(f"环境变量 '{var_name}' 未设置或为空。")
return value
async def with_zhipu(image_path):
global shutdown_event
shutdown_event = asyncio.Event()
for sig in (signal.SIGINT, signal.SIGTERM):
signal.signal(sig, handle_shutdown)
api_key = 'key'
try:
async with RTLowLevelClient(url="wss://open.bigmodel.cn/api/paas/v4/realtime",
headers={"Authorization": f"Bearer {api_key}"}) as client:
if shutdown_event.is_set():
return
session_message = SessionUpdateMessage(
session=SessionUpdateParams(
input_audio_format="wav",
output_audio_format="pcm",
modalities={"audio", "text"},
turn_detection=ServerVAD(),
beta_fields={
"chat_mode": "video_passive",
"tts_source": "e2e",
"auto_search": False
},
tools=[]
)
)
await client.send(session_message)
if shutdown_event.is_set():
return
send_audio_task = asyncio.create_task(send_audio(client))
send_image_task = asyncio.create_task(send_video(client, image_path))
receive_task = asyncio.create_task(receive_messages(client))
try:
await asyncio.gather(send_audio_task, send_image_task, receive_task)
except Exception as e:
print(f"任务执行出错: {e}")
for task in [send_audio_task, send_image_task, receive_task]:
if not task.done():
task.cancel()
try:
await task
except asyncio.CancelledError:
pass
except Exception as e:
print(f"发生错误: {e}")
finally:
if shutdown_event.is_set():
print("程序已完成退出")
if __name__ == "__main__":
image_path = 'programmer.jpg'
try:
asyncio.run(with_zhipu(image_path))
except KeyboardInterrupt:
print("\n程序被用户中断")
except Exception as e:
print(f"程序执行出错: {e}")
finally:
print("程序已退出")