Skip to content

Commit dd1b649

Browse files
author
Tarek Mahmoud Sayed
committed
Fix Gemini realtime provider for gemini-3.1-flash-live-preview
- Use SendRealtimeInputAsync for all input types (text, image, audio) to avoid interleaving with SendClientContentAsync, which causes a WebSocket close
- Fix VAD handling: use ActivityStart/ActivityEnd framing when VAD is disabled, AudioStreamEnd when VAD is enabled for push-to-talk
- Fix image input: send as Video blob without activity framing; use a minimal text trigger in CreateResponse since Gemini treats images as streaming context
- Fix function calling: convert MEAI JsonSchema to Google Schema type with proper uppercase type names (STRING, OBJECT, etc.)
- Text input auto-triggers a model response without framing
1 parent dc649bd commit dd1b649

File tree

2 files changed

+150
-93
lines changed

2 files changed

+150
-93
lines changed

Google.GenAI/GoogleGenAIRealtimeSession.cs

Lines changed: 150 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -60,13 +60,15 @@ public sealed class GoogleGenAIRealtimeSession : IRealtimeClientSession
6060
// audio frames → ActivityEnd) that must be atomic.
6161
private readonly SemaphoreSlim _sendLock = new(1, 1);
6262

63-
// Track whether audio was sent via SendRealtimeInputAsync to avoid mixing with SendClientContentAsync.
64-
private bool _lastInputWasRealtime;
65-
6663
// Track whether a tool response was just sent. After SendToolResponseAsync, the server
6764
// automatically continues generating — sending TurnComplete would be unexpected.
6865
private bool _lastSendWasToolResponse;
6966

67+
// Track whether the last content sent was media (image/video/audio via CreateConversationItem)
68+
// that does not auto-trigger a model response. Unlike text, media input requires an explicit
69+
// ActivityEnd signal in CreateResponse to prompt the model to respond.
70+
private bool _pendingMediaNeedsTrigger;
71+
7072
// Maps function call IDs to function names. Populated when ToolCall messages arrive,
7173
// consumed when sending FunctionResponse back to the server.
7274
private readonly ConcurrentDictionary<string, string> _callIdToFunctionName = new();
@@ -175,15 +177,27 @@ await _asyncSession.SendToolResponseAsync(
175177
if (_lastSendWasToolResponse)
176178
{
177179
// After a tool response, Gemini automatically continues generating.
178-
// Do not send TurnComplete — it would cause the server to close the connection.
180+
// Do not send ActivityEnd — it would be unexpected.
179181
_lastSendWasToolResponse = false;
180182
}
181-
else if (!_lastInputWasRealtime)
183+
else if (_pendingMediaNeedsTrigger)
182184
{
183-
await _asyncSession.SendClientContentAsync(
184-
new LiveSendClientContentParameters { TurnComplete = true },
185+
// Media inputs (image, video) via SendRealtimeInputAsync are added to
186+
// the model's context but don't auto-trigger a response. Gemini's Live API
187+
// has no equivalent to OpenAI's CreateResponse command — the only way to
188+
// trigger a response is via text input. Send a minimal whitespace text to
189+
// prompt the model to respond about the media in context without biasing
190+
// the response content.
191+
_pendingMediaNeedsTrigger = false;
192+
await _asyncSession.SendRealtimeInputAsync(
193+
new LiveSendRealtimeInputParameters
194+
{
195+
Text = " ",
196+
},
185197
cancellationToken).ConfigureAwait(false);
186198
}
199+
// For text input: auto-triggers, no signal needed.
200+
// For audio commit: ActivityEnd/AudioStreamEnd already sent in HandleAudioCommitAsync.
187201
break;
188202

189203
default:
@@ -350,11 +364,13 @@ private async Task HandleAudioCommitAsync(CancellationToken cancellationToken)
350364
_audioBufferSize = 0;
351365
}
352366

353-
_lastInputWasRealtime = true;
367+
_lastSendWasToolResponse = false;
368+
_pendingMediaNeedsTrigger = false;
354369

355-
// When VAD is disabled, explicit ActivityStart/ActivityEnd framing is required.
356-
// ActivityStart marks the beginning of user speech; ActivityEnd triggers model response.
357-
// When VAD is enabled, the server auto-detects speech boundaries — skip framing.
370+
// When VAD is disabled, explicit ActivityStart/ActivityEnd framing is required
371+
// to mark speech boundaries and trigger the model to respond.
372+
// When VAD is enabled, the server auto-detects speech boundaries —
373+
// sending explicit framing conflicts with automatic detection.
358374
if (!_vadEnabled)
359375
{
360376
await _asyncSession.SendRealtimeInputAsync(
@@ -387,8 +403,10 @@ await _asyncSession.SendRealtimeInputAsync(
387403
}
388404
}
389405

390-
// When VAD is disabled, signal end of user activity — this triggers the model to respond.
391-
// When VAD is enabled, the server detects end of speech automatically.
406+
// When VAD is disabled, signal end of user activity to trigger the model's response.
407+
// When VAD is enabled, send AudioStreamEnd to indicate the mic was turned off and the
408+
// server should process the buffered audio. AudioStreamEnd is specifically designed for
409+
// the push-to-talk pattern with automatic activity detection.
392410
if (!_vadEnabled)
393411
{
394412
await _asyncSession.SendRealtimeInputAsync(
@@ -398,6 +416,15 @@ await _asyncSession.SendRealtimeInputAsync(
398416
},
399417
cancellationToken).ConfigureAwait(false);
400418
}
419+
else
420+
{
421+
await _asyncSession.SendRealtimeInputAsync(
422+
new LiveSendRealtimeInputParameters
423+
{
424+
AudioStreamEnd = true
425+
},
426+
cancellationToken).ConfigureAwait(false);
427+
}
401428
}
402429

403430
private Task SendAudioFrameAsync(byte[] data, CancellationToken cancellationToken)
@@ -451,68 +478,68 @@ private async Task HandleConversationItemCreateAsync(
451478
return;
452479
}
453480

454-
// Otherwise, treat as text/content conversation input
455-
var parts = new List<Part>();
481+
// Send text and media via SendRealtimeInputAsync without activity framing.
482+
// Text auto-triggers a model response. Images/audio are treated as streaming
483+
// context by Gemini's Live API — they do NOT auto-trigger a response.
484+
// When only media is sent (no accompanying text), we append a brief text prompt
485+
// so the model knows to respond about the media content.
486+
bool hasText = false;
487+
bool hasMedia = false;
456488
foreach (var content in itemCreate.Item.Contents)
457489
{
458490
if (content is TextContent textContent && !string.IsNullOrEmpty(textContent.Text))
459491
{
460-
parts.Add(new Part { Text = textContent.Text });
492+
hasText = true;
493+
_lastSendWasToolResponse = false;
494+
await _asyncSession.SendRealtimeInputAsync(
495+
new LiveSendRealtimeInputParameters
496+
{
497+
Text = textContent.Text,
498+
},
499+
cancellationToken).ConfigureAwait(false);
461500
}
462501
else if (content is DataContent dataContent)
463502
{
464-
if (dataContent.HasTopLevelMediaType("audio"))
503+
if (dataContent.HasTopLevelMediaType("image"))
465504
{
466-
parts.Add(new Part
467-
{
468-
InlineData = new Blob
505+
hasMedia = true;
506+
_lastSendWasToolResponse = false;
507+
await _asyncSession.SendRealtimeInputAsync(
508+
new LiveSendRealtimeInputParameters
469509
{
470-
Data = ExtractDataBytes(dataContent),
471-
MimeType = dataContent.MediaType ?? "audio/pcm",
472-
}
473-
});
510+
Video = new Blob
511+
{
512+
Data = ExtractDataBytes(dataContent),
513+
MimeType = dataContent.MediaType ?? "image/jpeg",
514+
}
515+
},
516+
cancellationToken).ConfigureAwait(false);
474517
}
475-
else if (dataContent.HasTopLevelMediaType("image"))
518+
else if (dataContent.HasTopLevelMediaType("audio"))
476519
{
477-
byte[] imageBytes = ExtractDataBytes(dataContent);
478-
parts.Add(new Part
479-
{
480-
InlineData = new Blob
520+
hasMedia = true;
521+
_lastSendWasToolResponse = false;
522+
await _asyncSession.SendRealtimeInputAsync(
523+
new LiveSendRealtimeInputParameters
481524
{
482-
Data = imageBytes,
483-
MimeType = dataContent.MediaType ?? "image/png",
484-
}
485-
});
525+
Audio = new Blob
526+
{
527+
Data = ExtractDataBytes(dataContent),
528+
MimeType = dataContent.MediaType ?? _inputAudioMimeType,
529+
}
530+
},
531+
cancellationToken).ConfigureAwait(false);
486532
}
487533
}
488534
}
489535

490-
if (parts.Count == 0)
536+
if (hasMedia && !hasText)
491537
{
492-
return;
538+
// Gemini treats media as streaming context (like a video frame) and won't
539+
// respond until it receives a text/voice prompt. Send a brief text to
540+
// trigger a response about the media content.
541+
_pendingMediaNeedsTrigger = true;
493542
}
494-
495-
string role = itemCreate.Item.Role?.Value switch
496-
{
497-
"assistant" => "model",
498-
_ => "user",
499-
};
500-
501-
_lastInputWasRealtime = false;
502-
_lastSendWasToolResponse = false;
503-
await _asyncSession.SendClientContentAsync(
504-
new LiveSendClientContentParameters
505-
{
506-
Turns = new List<Content>
507-
{
508-
new Content
509-
{
510-
Parts = parts,
511-
Role = role,
512-
}
513-
},
514-
},
515-
cancellationToken).ConfigureAwait(false);
516543
}
517544

518545
internal static byte[] ExtractDataBytes(DataContent content)
@@ -733,16 +760,79 @@ internal static FunctionDeclaration ToGoogleFunctionDeclaration(AIFunction aiFun
733760
Description = aiFunction.Description,
734761
};
735762

736-
// Map the JSON schema for parameters
763+
// Convert the MEAI JSON schema to a Google Schema object.
764+
// Google's API expects the Schema type with uppercase type names (STRING, OBJECT, etc.),
765+
// not raw JSON schema with lowercase types. Using Parameters instead of ParametersJsonSchema
766+
// ensures compatibility with the Live API's function calling.
737767
if (aiFunction.JsonSchema is JsonElement schemaElement &&
738-
schemaElement.ValueKind != JsonValueKind.Undefined)
768+
schemaElement.ValueKind == JsonValueKind.Object)
739769
{
740-
declaration.ParametersJsonSchema = schemaElement;
770+
declaration.Parameters = ConvertJsonSchemaToGoogleSchema(schemaElement);
741771
}
742772

743773
return declaration;
744774
}
745775

776+
/// <summary>
777+
/// Recursively converts a standard JSON Schema <see cref="JsonElement"/> to a Google GenAI
778+
/// <see cref="Schema"/> object, mapping lowercase type names to Google's uppercase enum values.
779+
/// </summary>
780+
internal static Schema ConvertJsonSchemaToGoogleSchema(JsonElement element)
781+
{
782+
var schema = new Schema();
783+
784+
if (element.TryGetProperty("type", out var typeValue))
785+
{
786+
schema.Type = typeValue.GetString()?.ToLowerInvariant() switch
787+
{
788+
"object" => Google.GenAI.Types.Type.Object,
789+
"string" => Google.GenAI.Types.Type.String,
790+
"integer" => Google.GenAI.Types.Type.Integer,
791+
"number" => Google.GenAI.Types.Type.Number,
792+
"boolean" => Google.GenAI.Types.Type.Boolean,
793+
"array" => Google.GenAI.Types.Type.Array,
794+
_ => null
795+
};
796+
}
797+
798+
if (element.TryGetProperty("description", out var desc) &&
799+
desc.ValueKind == JsonValueKind.String)
800+
{
801+
schema.Description = desc.GetString();
802+
}
803+
804+
if (element.TryGetProperty("properties", out var props) &&
805+
props.ValueKind == JsonValueKind.Object)
806+
{
807+
schema.Properties = new Dictionary<string, Schema>();
808+
foreach (var prop in props.EnumerateObject())
809+
{
810+
schema.Properties[prop.Name] = ConvertJsonSchemaToGoogleSchema(prop.Value);
811+
}
812+
}
813+
814+
if (element.TryGetProperty("required", out var req) &&
815+
req.ValueKind == JsonValueKind.Array)
816+
{
817+
schema.Required = new List<string>();
818+
foreach (var item in req.EnumerateArray())
819+
{
820+
if (item.ValueKind == JsonValueKind.String)
821+
{
822+
schema.Required.Add(item.GetString()!);
823+
}
824+
}
825+
}
826+
827+
if (element.TryGetProperty("items", out var items) &&
828+
items.ValueKind == JsonValueKind.Object)
829+
{
830+
schema.Items = ConvertJsonSchemaToGoogleSchema(items);
831+
}
832+
833+
return schema;
834+
}
835+
746836
#endregion
747837
}
748838

Google.GenAI/packages.lock.json

Lines changed: 0 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -37,27 +37,6 @@
3737
"Microsoft.NETCore.Platforms": "1.1.0"
3838
}
3939
},
40-
"System.Collections.Immutable": {
41-
"type": "Direct",
42-
"requested": "[9.0.0, )",
43-
"resolved": "9.0.0",
44-
"contentHash": "QhkXUl2gNrQtvPmtBTQHb0YsUrDiDQ2QS09YbtTTiSjGcf7NBqtYbrG/BE06zcBPCKEwQGzIv13IVdXNOSub2w==",
45-
"dependencies": {
46-
"System.Memory": "4.5.5",
47-
"System.Runtime.CompilerServices.Unsafe": "6.0.0"
48-
}
49-
},
50-
"System.Net.ServerSentEvents": {
51-
"type": "Direct",
52-
"requested": "[9.0.0, )",
53-
"resolved": "9.0.0",
54-
"contentHash": "VTWjeyx9nPb4+hkjGcAaDw1nOckypMtvABmxSWm6PPYwrXoIiVG3jwtNlAGhaGVjDkBrERABox67wYTAcHxg7Q==",
55-
"dependencies": {
56-
"Microsoft.Bcl.AsyncInterfaces": "9.0.0",
57-
"System.Memory": "4.5.5",
58-
"System.Threading.Tasks.Extensions": "4.5.4"
59-
}
60-
},
6140
"Google.Apis": {
6241
"type": "Transitive",
6342
"resolved": "1.69.0",
@@ -202,18 +181,6 @@
202181
"resolved": "2.5.2",
203182
"contentHash": "vm4xrNt+i6OVRQ8vhfCcmDIUg3qvjyCTkSTNVTDFohsG6CXEpMaVFkidECL6yRYpHDnz4TqXhDoEQAcnHCu/tw=="
204183
},
205-
"System.Collections.Immutable": {
206-
"type": "Direct",
207-
"requested": "[9.0.0, )",
208-
"resolved": "9.0.0",
209-
"contentHash": "QhkXUl2gNrQtvPmtBTQHb0YsUrDiDQ2QS09YbtTTiSjGcf7NBqtYbrG/BE06zcBPCKEwQGzIv13IVdXNOSub2w=="
210-
},
211-
"System.Net.ServerSentEvents": {
212-
"type": "Direct",
213-
"requested": "[9.0.0, )",
214-
"resolved": "9.0.0",
215-
"contentHash": "VTWjeyx9nPb4+hkjGcAaDw1nOckypMtvABmxSWm6PPYwrXoIiVG3jwtNlAGhaGVjDkBrERABox67wYTAcHxg7Q=="
216-
},
217184
"Google.Apis": {
218185
"type": "Transitive",
219186
"resolved": "1.69.0",

0 commit comments

Comments
 (0)