@@ -60,13 +60,15 @@ public sealed class GoogleGenAIRealtimeSession : IRealtimeClientSession
6060 // audio frames → ActivityEnd) that must be atomic.
6161 private readonly SemaphoreSlim _sendLock = new ( 1 , 1 ) ;
6262
63- // Track whether audio was sent via SendRealtimeInputAsync to avoid mixing with SendClientContentAsync.
64- private bool _lastInputWasRealtime ;
65-
6663 // Track whether a tool response was just sent. After SendToolResponseAsync, the server
6764 // automatically continues generating — sending TurnComplete would be unexpected.
6865 private bool _lastSendWasToolResponse ;
6966
67+ // Track whether the last content sent was media (image/video/audio via CreateConversationItem)
68+ // that does not auto-trigger a model response. Unlike text, media input needs a follow-up
69+ // trigger: a minimal whitespace text input is sent when this flag is consumed.
70+ private bool _pendingMediaNeedsTrigger ;
71+
7072 // Maps function call IDs to function names. Populated when ToolCall messages arrive,
7173 // consumed when sending FunctionResponse back to the server.
7274 private readonly ConcurrentDictionary < string , string > _callIdToFunctionName = new ( ) ;
@@ -175,15 +177,27 @@ await _asyncSession.SendToolResponseAsync(
175177 if ( _lastSendWasToolResponse )
176178 {
177179 // After a tool response, Gemini automatically continues generating.
178- // Do not send TurnComplete — it would cause the server to close the connection .
180+ // Do not send ActivityEnd — it would be unexpected .
179181 _lastSendWasToolResponse = false ;
180182 }
181- else if ( ! _lastInputWasRealtime )
183+ else if ( _pendingMediaNeedsTrigger )
182184 {
183- await _asyncSession . SendClientContentAsync (
184- new LiveSendClientContentParameters { TurnComplete = true } ,
185+ // Media inputs (image, video) via SendRealtimeInputAsync are added to
186+ // the model's context but don't auto-trigger a response. Gemini's Live API
187+ // has no equivalent to OpenAI's CreateResponse command — the only way to
188+ // trigger a response is via text input. Send a minimal whitespace text to
189+ // prompt the model to respond about the media in context without biasing
190+ // the response content.
191+ _pendingMediaNeedsTrigger = false ;
192+ await _asyncSession . SendRealtimeInputAsync (
193+ new LiveSendRealtimeInputParameters
194+ {
195+ Text = " " ,
196+ } ,
185197 cancellationToken ) . ConfigureAwait ( false ) ;
186198 }
199+ // For text input: auto-triggers, no signal needed.
200+ // For audio commit: ActivityEnd/AudioStreamEnd already sent in HandleAudioCommitAsync.
187201 break ;
188202
189203 default :
@@ -350,11 +364,13 @@ private async Task HandleAudioCommitAsync(CancellationToken cancellationToken)
350364 _audioBufferSize = 0 ;
351365 }
352366
353- _lastInputWasRealtime = true ;
367+ _lastSendWasToolResponse = false ;
368+ _pendingMediaNeedsTrigger = false ;
354369
355- // When VAD is disabled, explicit ActivityStart/ActivityEnd framing is required.
356- // ActivityStart marks the beginning of user speech; ActivityEnd triggers model response.
357- // When VAD is enabled, the server auto-detects speech boundaries — skip framing.
370+ // When VAD is disabled, explicit ActivityStart/ActivityEnd framing is required
371+ // to mark speech boundaries and trigger the model to respond.
372+ // When VAD is enabled, the server auto-detects speech boundaries —
373+ // sending explicit framing conflicts with automatic detection.
358374 if ( ! _vadEnabled )
359375 {
360376 await _asyncSession . SendRealtimeInputAsync (
@@ -387,8 +403,10 @@ await _asyncSession.SendRealtimeInputAsync(
387403 }
388404 }
389405
390- // When VAD is disabled, signal end of user activity — this triggers the model to respond.
391- // When VAD is enabled, the server detects end of speech automatically.
406+ // When VAD is disabled, signal end of user activity to trigger the model's response.
407+ // When VAD is enabled, send AudioStreamEnd to indicate the mic was turned off and the
408+ // server should process the buffered audio. AudioStreamEnd is specifically designed for
409+ // the push-to-talk pattern with automatic activity detection.
392410 if ( ! _vadEnabled )
393411 {
394412 await _asyncSession . SendRealtimeInputAsync (
@@ -398,6 +416,15 @@ await _asyncSession.SendRealtimeInputAsync(
398416 } ,
399417 cancellationToken ) . ConfigureAwait ( false ) ;
400418 }
419+ else
420+ {
421+ await _asyncSession . SendRealtimeInputAsync (
422+ new LiveSendRealtimeInputParameters
423+ {
424+ AudioStreamEnd = true
425+ } ,
426+ cancellationToken ) . ConfigureAwait ( false ) ;
427+ }
401428 }
402429
403430 private Task SendAudioFrameAsync ( byte [ ] data , CancellationToken cancellationToken )
@@ -451,68 +478,68 @@ private async Task HandleConversationItemCreateAsync(
451478 return ;
452479 }
453480
454- // Otherwise, treat as text/content conversation input
455- var parts = new List < Part > ( ) ;
481+ // Send text and media via SendRealtimeInputAsync without activity framing.
482+ // Text auto-triggers a model response. Images/audio are treated as streaming
483+ // context by Gemini's Live API — they do NOT auto-trigger a response.
484+ // When only media is sent (no accompanying text), we set _pendingMediaNeedsTrigger so that
485+ // a minimal text prompt can be sent later to make the model respond about the media content.
486+ bool hasText = false ;
487+ bool hasMedia = false ;
456488 foreach ( var content in itemCreate . Item . Contents )
457489 {
458490 if ( content is TextContent textContent && ! string . IsNullOrEmpty ( textContent . Text ) )
459491 {
460- parts . Add ( new Part { Text = textContent . Text } ) ;
492+ hasText = true ;
493+ _lastSendWasToolResponse = false ;
494+ await _asyncSession . SendRealtimeInputAsync (
495+ new LiveSendRealtimeInputParameters
496+ {
497+ Text = textContent . Text ,
498+ } ,
499+ cancellationToken ) . ConfigureAwait ( false ) ;
461500 }
462501 else if ( content is DataContent dataContent )
463502 {
464- if ( dataContent . HasTopLevelMediaType ( "audio " ) )
503+ if ( dataContent . HasTopLevelMediaType ( "image " ) )
465504 {
466- parts . Add ( new Part
467- {
468- InlineData = new Blob
505+ hasMedia = true ;
506+ _lastSendWasToolResponse = false ;
507+ await _asyncSession . SendRealtimeInputAsync (
508+ new LiveSendRealtimeInputParameters
469509 {
470- Data = ExtractDataBytes ( dataContent ) ,
471- MimeType = dataContent . MediaType ?? "audio/pcm" ,
472- }
473- } ) ;
510+ Video = new Blob
511+ {
512+ Data = ExtractDataBytes ( dataContent ) ,
513+ MimeType = dataContent . MediaType ?? "image/jpeg" ,
514+ }
515+ } ,
516+ cancellationToken ) . ConfigureAwait ( false ) ;
474517 }
475- else if ( dataContent . HasTopLevelMediaType ( "image " ) )
518+ else if ( dataContent . HasTopLevelMediaType ( "audio " ) )
476519 {
477- byte [ ] imageBytes = ExtractDataBytes ( dataContent ) ;
478- parts . Add ( new Part
479- {
480- InlineData = new Blob
520+ hasMedia = true ;
521+ _lastSendWasToolResponse = false ;
522+ await _asyncSession . SendRealtimeInputAsync (
523+ new LiveSendRealtimeInputParameters
481524 {
482- Data = imageBytes ,
483- MimeType = dataContent . MediaType ?? "image/png" ,
484- }
485- } ) ;
525+ Audio = new Blob
526+ {
527+ Data = ExtractDataBytes ( dataContent ) ,
528+ MimeType = dataContent . MediaType ?? _inputAudioMimeType ,
529+ }
530+ } ,
531+ cancellationToken ) . ConfigureAwait ( false ) ;
486532 }
487533 }
488534 }
489535
490- if ( parts . Count == 0 )
536+ if ( hasMedia && ! hasText )
491537 {
492- return ;
538+ // Gemini treats media as streaming context (like a video frame) and won't
539+ // respond until it receives a text/voice prompt. Only record that here; the
540+ // minimal trigger text is sent later, where _pendingMediaNeedsTrigger is consumed.
541+ _pendingMediaNeedsTrigger = true ;
493542 }
494-
495- string role = itemCreate . Item . Role ? . Value switch
496- {
497- "assistant" => "model" ,
498- _ => "user" ,
499- } ;
500-
501- _lastInputWasRealtime = false ;
502- _lastSendWasToolResponse = false ;
503- await _asyncSession . SendClientContentAsync (
504- new LiveSendClientContentParameters
505- {
506- Turns = new List < Content >
507- {
508- new Content
509- {
510- Parts = parts ,
511- Role = role ,
512- }
513- } ,
514- } ,
515- cancellationToken ) . ConfigureAwait ( false ) ;
516543 }
517544
518545 internal static byte [ ] ExtractDataBytes ( DataContent content )
@@ -733,16 +760,79 @@ internal static FunctionDeclaration ToGoogleFunctionDeclaration(AIFunction aiFun
733760 Description = aiFunction . Description ,
734761 } ;
735762
736- // Map the JSON schema for parameters
763+ // Convert the MEAI JSON schema to a Google Schema object.
764+ // Google's API expects the Schema type with uppercase type names (STRING, OBJECT, etc.),
765+ // not raw JSON schema with lowercase types. Using Parameters instead of ParametersJsonSchema
766+ // ensures compatibility with the Live API's function calling.
737767 if ( aiFunction . JsonSchema is JsonElement schemaElement &&
738- schemaElement . ValueKind != JsonValueKind . Undefined )
768+ schemaElement . ValueKind == JsonValueKind . Object )
739769 {
740- declaration . ParametersJsonSchema = schemaElement ;
770+ declaration . Parameters = ConvertJsonSchemaToGoogleSchema ( schemaElement ) ;
741771 }
742772
743773 return declaration ;
744774 }
745775
/// <summary>
/// Recursively converts a standard JSON Schema <see cref="JsonElement"/> to a Google GenAI
/// <see cref="Schema"/> object, mapping lowercase type names to Google's type enum values.
/// </summary>
/// <remarks>
/// Only the <c>type</c>, <c>description</c>, <c>properties</c>, <c>required</c> and <c>items</c>
/// keywords are translated; every other keyword is ignored. A <c>type</c> given as an array
/// (for example <c>["string", "null"]</c>) maps to its first non-null entry, and the presence
/// of <c>"null"</c> marks the schema as nullable.
/// </remarks>
/// <param name="element">The JSON Schema node to convert. Non-object nodes yield an empty schema.</param>
/// <returns>A new <see cref="Schema"/> populated from the recognized keywords of <paramref name="element"/>.</returns>
internal static Schema ConvertJsonSchemaToGoogleSchema(JsonElement element)
{
    var schema = new Schema();

    // JsonElement.TryGetProperty throws on anything but an object. Boolean schemas
    // (true/false) and other non-object nodes carry no keywords to translate anyway.
    if (element.ValueKind != JsonValueKind.Object)
    {
        return schema;
    }

    if (element.TryGetProperty("type", out var typeValue))
    {
        string? typeName = null;

        if (typeValue.ValueKind == JsonValueKind.String)
        {
            typeName = typeValue.GetString();
        }
        else if (typeValue.ValueKind == JsonValueKind.Array)
        {
            // Union form, e.g. ["integer", "null"]: calling GetString() on the array
            // would throw, so pick the first concrete type and record nullability.
            foreach (var candidate in typeValue.EnumerateArray())
            {
                if (candidate.ValueKind != JsonValueKind.String)
                {
                    continue;
                }

                var candidateName = candidate.GetString();
                if (string.Equals(candidateName, "null", StringComparison.OrdinalIgnoreCase))
                {
                    schema.Nullable = true;
                }
                else
                {
                    typeName ??= candidateName;
                }
            }
        }

        // Unknown or missing type names leave Type unset (null).
        schema.Type = typeName?.ToLowerInvariant() switch
        {
            "object" => Google.GenAI.Types.Type.Object,
            "string" => Google.GenAI.Types.Type.String,
            "integer" => Google.GenAI.Types.Type.Integer,
            "number" => Google.GenAI.Types.Type.Number,
            "boolean" => Google.GenAI.Types.Type.Boolean,
            "array" => Google.GenAI.Types.Type.Array,
            _ => null
        };
    }

    if (element.TryGetProperty("description", out var desc) &&
        desc.ValueKind == JsonValueKind.String)
    {
        schema.Description = desc.GetString();
    }

    if (element.TryGetProperty("properties", out var props) &&
        props.ValueKind == JsonValueKind.Object)
    {
        schema.Properties = new Dictionary<string, Schema>();
        foreach (var prop in props.EnumerateObject())
        {
            // Each property value is itself a schema node; convert it recursively.
            schema.Properties[prop.Name] = ConvertJsonSchemaToGoogleSchema(prop.Value);
        }
    }

    if (element.TryGetProperty("required", out var req) &&
        req.ValueKind == JsonValueKind.Array)
    {
        schema.Required = new List<string>();
        foreach (var item in req.EnumerateArray())
        {
            // Skip malformed (non-string) entries instead of failing the whole conversion.
            if (item.ValueKind == JsonValueKind.String)
            {
                schema.Required.Add(item.GetString()!);
            }
        }
    }

    if (element.TryGetProperty("items", out var items) &&
        items.ValueKind == JsonValueKind.Object)
    {
        schema.Items = ConvertJsonSchemaToGoogleSchema(items);
    }

    return schema;
}
835+
746836 #endregion
747837}
748838
0 commit comments