@@ -404,48 +404,23 @@ func registerRealtime(application *application.Application, model, intent string
404404 case types .ClientEventTypeInputAudioBufferCommit :
405405 log .Debug ().Msgf ("recv: %s" , msg )
406406
407- // TODO: Trigger transcription.
408- // TODO: Ignore this if VAD enabled or interrupt VAD?
407+ sessionLock .Lock ()
408+ td := session .TurnDetection .Type
409+ sessionLock .Unlock ()
409410
410- if session .TranscriptionOnly {
411+ // TODO: At the very least, we need to check the locking and timer state in the VAD goroutine before allowing this
412+ if td == types .ServerTurnDetectionTypeServerVad {
413+ sendNotImplemented (c , "input_audio_buffer.commit in conjunction with VAD" )
411414 continue
412415 }
413416
414- // Commit the audio buffer to the conversation as a new item
415- item := & types.MessageItem {
416- ID : generateItemID (),
417- Type : "message" ,
418- Status : "completed" ,
419- Role : "user" ,
420- Content : []types.MessageContentPart {
421- {
422- Type : "input_audio" ,
423- Audio : base64 .StdEncoding .EncodeToString (session .InputAudioBuffer ),
424- },
425- },
426- }
427-
428- // Add item to conversation
429- conversation .Lock .Lock ()
430- conversation .Items = append (conversation .Items , item )
431- conversation .Lock .Unlock ()
432-
433- // Reset InputAudioBuffer
434417 session .AudioBufferLock .Lock ()
418+ allAudio := make ([]byte , len (session .InputAudioBuffer ))
419+ copy (allAudio , session .InputAudioBuffer )
435420 session .InputAudioBuffer = nil
436421 session .AudioBufferLock .Unlock ()
437422
438- // Send item.created event
439- sendEvent (c , types.ConversationItemCreatedEvent {
440- ServerEventBase : types.ServerEventBase {
441- EventID : "event_TODO" ,
442- Type : "conversation.item.created" ,
443- },
444- Item : types.ResponseMessageItem {
445- Object : "realtime.item" ,
446- MessageItem : * item ,
447- },
448- })
423+ go commitUtterance (context .TODO (), allAudio , cfg , evaluator , session , conversation , c )
449424
450425 case types .ClientEventTypeConversationItemCreate :
451426 log .Debug ().Msgf ("recv: %s" , msg )
@@ -569,6 +544,8 @@ func updateTransSession(session *Session, update *types.ClientSession, cl *confi
569544 trUpd := update .InputAudioTranscription
570545 trCur := session .InputAudioTranscription
571546
547+ session .TranscriptionOnly = true
548+
572549 if trUpd != nil && trUpd .Model != "" && trUpd .Model != trCur .Model {
573550 pipeline := config.Pipeline {
574551 VAD : vadModel ,
@@ -601,6 +578,8 @@ func updateSession(session *Session, update *types.ClientSession, cl *config.Mod
601578 sessionLock .Lock ()
602579 defer sessionLock .Unlock ()
603580
581+ session .TranscriptionOnly = false
582+
604583 if update .Model != "" {
605584 pipeline := config.Pipeline {
606585 LLM : update .Model ,
@@ -808,34 +787,8 @@ func commitUtterance(ctx context.Context, utt []byte, cfg *config.ModelConfig, e
808787 // TODO: Update the prompt with transcription result?
809788 }
810789
811- // TODO: Commit the audio and/or transcribed text to the conversation
812- // Commit logic: create item, broadcast item.created, etc.
813- item := & types.MessageItem {
814- ID : generateItemID (),
815- Type : "message" ,
816- Status : "completed" ,
817- Role : "user" ,
818- Content : []types.MessageContentPart {
819- {
820- Type : types .MessageContentTypeInputAudio ,
821- Audio : base64 .StdEncoding .EncodeToString (utt ),
822- Transcript : transcript ,
823- },
824- },
825- }
826- conv .Lock .Lock ()
827- conv .Items = append (conv .Items , item )
828- conv .Lock .Unlock ()
829-
830- sendEvent (c , types.ConversationItemAddedEvent {
831- ServerEventBase : types.ServerEventBase {
832- Type : types .ServerEventTypeConversationItemAdded ,
833- },
834- Item : * item ,
835- })
836-
837790 // trigger the response generation
838- generateResponse (cfg , evaluator , session , conv , ResponseCreate {}, c , websocket .TextMessage )
791+ generateResponse (cfg , evaluator , session , transcript , conv , ResponseCreate {}, c , websocket .TextMessage )
839792}
840793
841794func runVAD (ctx context.Context , session * Session , adata []int16 ) ([]* proto.VADSegment , error ) {
@@ -859,15 +812,41 @@ func runVAD(ctx context.Context, session *Session, adata []int16) ([]*proto.VADS
859812}
860813
861814// generateResponse generates a response based on the conversation.
862- func generateResponse (config * config.ModelConfig , evaluator * templates.Evaluator , session * Session , conversation * Conversation , responseCreate ResponseCreate , c * websocket.Conn , mt int ) {
815+ func generateResponse (config * config.ModelConfig , evaluator * templates.Evaluator , session * Session , transcript string , conv * Conversation , responseCreate ResponseCreate , c * websocket.Conn , mt int ) {
863816
864817 log .Debug ().Msg ("Generating realtime response..." )
865818
819+ // TODO: Commit the audio and/or transcribed text to the conversation
820+ // Commit logic: create item, broadcast item.created, etc.
821+ item := & types.MessageItem {
822+ ID : generateItemID (),
823+ Type : "message" ,
824+ Status : "completed" ,
825+ Role : "user" ,
826+ Content : []types.MessageContentPart {
827+ {
828+ Type : types .MessageContentTypeInputAudio ,
829+ Audio : base64 .StdEncoding .EncodeToString (utt ),
830+ Transcript : transcript ,
831+ },
832+ },
833+ }
834+ conv .Lock .Lock ()
835+ conv .Items = append (conv .Items , item )
836+ conv .Lock .Unlock ()
837+
838+ sendEvent (c , types.ConversationItemAddedEvent {
839+ ServerEventBase : types.ServerEventBase {
840+ Type : types .ServerEventTypeConversationItemAdded ,
841+ },
842+ Item : * item ,
843+ })
844+
866845 // Compile the conversation history
867- conversation .Lock .Lock ()
846+ conv .Lock .Lock ()
868847 var conversationHistory []schema.Message
869848 var latestUserAudio string
870- for _ , item := range conversation .Items {
849+ for _ , item := range conv .Items {
871850 for _ , content := range item .Content {
872851 switch content .Type {
873852 case types .MessageContentTypeInputText , types .MessageContentTypeText :
@@ -888,7 +867,7 @@ func generateResponse(config *config.ModelConfig, evaluator *templates.Evaluator
888867 }
889868 }
890869
891- conversation .Lock .Unlock ()
870+ conv .Lock .Unlock ()
892871
893872 var generatedText string
894873 var generatedAudio []byte
@@ -987,9 +966,9 @@ func generateResponse(config *config.ModelConfig, evaluator *templates.Evaluator
987966 }
988967
989968 // Add item to conversation
990- conversation .Lock .Lock ()
991- conversation .Items = append (conversation .Items , item )
992- conversation .Lock .Unlock ()
969+ conv .Lock .Lock ()
970+ conv .Items = append (conv .Items , item )
971+ conv .Lock .Unlock ()
993972
994973 // Send item.created event
995974 sendEvent (c , OutgoingMessage {
@@ -1058,9 +1037,9 @@ func generateResponse(config *config.ModelConfig, evaluator *templates.Evaluator
10581037 }
10591038
10601039 // Add item to conversation
1061- conversation .Lock .Lock ()
1062- conversation .Items = append (conversation .Items , item )
1063- conversation .Lock .Unlock ()
1040+ conv .Lock .Lock ()
1041+ conv .Items = append (conv .Items , item )
1042+ conv .Lock .Unlock ()
10641043
10651044 // Send item.created event
10661045 sendEvent (c , OutgoingMessage {
0 commit comments