Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 42 additions & 30 deletions quickstarts/Get_started_LiveAPI.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@
"""

import asyncio
import base64
import io
import os
import sys
Expand Down Expand Up @@ -75,13 +74,12 @@
CHUNK_SIZE = 1024

# --- Model Configuration ---
MODEL = "models/gemini-2.5-flash-native-audio-preview-12-2025"
MODEL = "gemini-3.1-flash-live-preview"
Comment thread
Giom-V marked this conversation as resolved.
DEFAULT_MODE = "camera"


client = genai.Client(
api_key=os.environ.get("GEMINI_API_KEY"),
http_options={"api_version": "v1beta"},
api_key=os.environ.get("GOOGLE_API_KEY"),
)

# Live session configuration
Expand All @@ -91,12 +89,15 @@
response_modalities=["AUDIO"],
speech_config=types.SpeechConfig(
voice_config=types.VoiceConfig(
prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name = "Zephyr")
prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Zephyr")
)
),
# Enable transcription of both user speech and model audio output.
input_audio_transcription=types.AudioTranscriptionConfig(),
output_audio_transcription=types.AudioTranscriptionConfig(),
context_window_compression=types.ContextWindowCompressionConfig(
trigger_tokens = 25600,
sliding_window = types.SlidingWindow(target_tokens=12800),
trigger_tokens=25600,
sliding_window=types.SlidingWindow(target_tokens=12800),
),
)

Expand Down Expand Up @@ -136,7 +137,7 @@ async def listen_audio(self):
data = await asyncio.to_thread(self.audio_stream.read, CHUNK_SIZE, **kwargs)
payload = {
"data": data,
"mime_type": "audio/pcm"
"mime_type": "audio/pcm;rate=16000"
Comment thread
thorwebdev marked this conversation as resolved.
}
# To reduce latency, instead of waiting to push into the queue we pop the oldest item if it's full.
# This helps keep the audio stream real-time.
Expand Down Expand Up @@ -175,21 +176,33 @@ async def play_audio(self):
async def receive_audio(self):
"""Read from the websocket and write PCM chunks to the output queue."""
try:
# session.receive() yields responses for one turn then returns.
# The outer while True re-enters it for every subsequent turn.
while True:
turn = self.session.receive()
async for response in turn:
if data := response.data:
self.audio_in_queue.put_nowait(data)
async for response in self.session.receive():
server_content = response.server_content
if server_content is None:
continue
if text := response.text:
print(text, end="")

# If you interrupt the model, it sends a turn_complete.
# For interruptions to work, we need to stop playback.
# So empty out the audio queue because it may have loaded
# much more audio than has played yet.
while not self.audio_in_queue.empty():
self.audio_in_queue.get_nowait()

# Clear the playback queue on interruption, but don't skip
# the rest of this response — a transcription may arrive
# on the same message.
if server_content.interrupted:
while not self.audio_in_queue.empty():
self.audio_in_queue.get_nowait()

# Process ALL parts in each server event — a single event
# can contain multiple content parts simultaneously.
if server_content.model_turn:
for part in server_content.model_turn.parts:
if part.inline_data:
self.audio_in_queue.put_nowait(part.inline_data.data)

if server_content.input_transcription:
print(f"\nYou: {server_content.input_transcription.text}", end="")

if server_content.output_transcription:
print(f"\nGemini: {server_content.output_transcription.text}", end="")
Comment thread
thorwebdev marked this conversation as resolved.
except asyncio.CancelledError:
pass

Expand All @@ -215,7 +228,7 @@ def _capture_frame(self, cap):

mime_type = "image/jpeg"
image_bytes = image_io.read()
return {"mime_type": mime_type, "data": base64.b64encode(image_bytes).decode()}
return {"mime_type": mime_type, "data": image_bytes}

async def capture_frames(self):
cap = await asyncio.to_thread(
Expand Down Expand Up @@ -249,7 +262,7 @@ def _capture_screen(self):

mime_type = "image/jpeg"
image_bytes = image_io.read()
return {"mime_type": mime_type, "data": base64.b64encode(image_bytes).decode()}
return {"mime_type": mime_type, "data": image_bytes}

async def capture_screen(self):
try:
Expand All @@ -270,26 +283,25 @@ async def send_text(self):
while True:
text = await asyncio.to_thread(
input,
"message > ",
"speak or type 'q' to quit > ",
)
if text.lower() == "q":
print("👋 Exiting on user request...")
break
await self.session.send_client_content(
turns=types.Content(parts=[types.Part(text=text or "")]),
turn_complete=True,
)
await self.session.send_realtime_input(text=text)
Comment thread
thorwebdev marked this conversation as resolved.
except asyncio.CancelledError:
pass

async def send_realtime(self):
try:
while True:
msg = await self.out_queue.get()
blob = types.Blob(data=msg["data"], mime_type=msg["mime_type"])
if msg["mime_type"].startswith("audio/"):
await self.session.send_realtime_input(audio=msg)
await self.session.send_realtime_input(audio=blob)
else:
await self.session.send_realtime_input(media=msg)
# Use video= (not the deprecated media=) for image/video frames.
await self.session.send_realtime_input(video=blob)
Comment thread
thorwebdev marked this conversation as resolved.
except asyncio.CancelledError:
pass

Expand Down
Loading