Skip to content

Commit 06fa944

Browse files
committed
fixup commit audio event and more
1 parent a24bc9c commit 06fa944

File tree

1 file changed

+50
-71
lines changed

1 file changed

+50
-71
lines changed

core/http/endpoints/openai/realtime.go

Lines changed: 50 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -404,48 +404,23 @@ func registerRealtime(application *application.Application, model, intent string
404404
case types.ClientEventTypeInputAudioBufferCommit:
405405
log.Debug().Msgf("recv: %s", msg)
406406

407-
// TODO: Trigger transcription.
408-
// TODO: Ignore this if VAD enabled or interrupt VAD?
407+
sessionLock.Lock()
408+
td := session.TurnDetection.Type
409+
sessionLock.Unlock()
409410

410-
if session.TranscriptionOnly {
411+
// TODO: At the very least, we need to check the locking and timer state in the VAD goroutine before allowing this
412+
if td == types.ServerTurnDetectionTypeServerVad {
413+
sendNotImplemented(c, "input_audio_buffer.commit in conjunction with VAD")
411414
continue
412415
}
413416

414-
// Commit the audio buffer to the conversation as a new item
415-
item := &types.MessageItem{
416-
ID: generateItemID(),
417-
Type: "message",
418-
Status: "completed",
419-
Role: "user",
420-
Content: []types.MessageContentPart{
421-
{
422-
Type: "input_audio",
423-
Audio: base64.StdEncoding.EncodeToString(session.InputAudioBuffer),
424-
},
425-
},
426-
}
427-
428-
// Add item to conversation
429-
conversation.Lock.Lock()
430-
conversation.Items = append(conversation.Items, item)
431-
conversation.Lock.Unlock()
432-
433-
// Reset InputAudioBuffer
434417
session.AudioBufferLock.Lock()
418+
allAudio := make([]byte, len(session.InputAudioBuffer))
419+
copy(allAudio, session.InputAudioBuffer)
435420
session.InputAudioBuffer = nil
436421
session.AudioBufferLock.Unlock()
437422

438-
// Send item.created event
439-
sendEvent(c, types.ConversationItemCreatedEvent{
440-
ServerEventBase: types.ServerEventBase{
441-
EventID: "event_TODO",
442-
Type: "conversation.item.created",
443-
},
444-
Item: types.ResponseMessageItem{
445-
Object: "realtime.item",
446-
MessageItem: *item,
447-
},
448-
})
423+
go commitUtterance(context.TODO(), allAudio, cfg, evaluator, session, conversation, c)
449424

450425
case types.ClientEventTypeConversationItemCreate:
451426
log.Debug().Msgf("recv: %s", msg)
@@ -569,6 +544,8 @@ func updateTransSession(session *Session, update *types.ClientSession, cl *confi
569544
trUpd := update.InputAudioTranscription
570545
trCur := session.InputAudioTranscription
571546

547+
session.TranscriptionOnly = true
548+
572549
if trUpd != nil && trUpd.Model != "" && trUpd.Model != trCur.Model {
573550
pipeline := config.Pipeline{
574551
VAD: vadModel,
@@ -601,6 +578,8 @@ func updateSession(session *Session, update *types.ClientSession, cl *config.Mod
601578
sessionLock.Lock()
602579
defer sessionLock.Unlock()
603580

581+
session.TranscriptionOnly = false
582+
604583
if update.Model != "" {
605584
pipeline := config.Pipeline{
606585
LLM: update.Model,
@@ -808,34 +787,8 @@ func commitUtterance(ctx context.Context, utt []byte, cfg *config.ModelConfig, e
808787
// TODO: Update the prompt with transcription result?
809788
}
810789

811-
// TODO: Commit the audio and/or transcribed text to the conversation
812-
// Commit logic: create item, broadcast item.created, etc.
813-
item := &types.MessageItem{
814-
ID: generateItemID(),
815-
Type: "message",
816-
Status: "completed",
817-
Role: "user",
818-
Content: []types.MessageContentPart{
819-
{
820-
Type: types.MessageContentTypeInputAudio,
821-
Audio: base64.StdEncoding.EncodeToString(utt),
822-
Transcript: transcript,
823-
},
824-
},
825-
}
826-
conv.Lock.Lock()
827-
conv.Items = append(conv.Items, item)
828-
conv.Lock.Unlock()
829-
830-
sendEvent(c, types.ConversationItemAddedEvent{
831-
ServerEventBase: types.ServerEventBase{
832-
Type: types.ServerEventTypeConversationItemAdded,
833-
},
834-
Item: *item,
835-
})
836-
837790
// trigger the response generation
838-
generateResponse(cfg, evaluator, session, conv, ResponseCreate{}, c, websocket.TextMessage)
791+
generateResponse(cfg, evaluator, session, transcript, conv, ResponseCreate{}, c, websocket.TextMessage)
839792
}
840793

841794
func runVAD(ctx context.Context, session *Session, adata []int16) ([]*proto.VADSegment, error) {
@@ -859,15 +812,41 @@ func runVAD(ctx context.Context, session *Session, adata []int16) ([]*proto.VADS
859812
}
860813

861814
// Function to generate a response based on the conversation
862-
func generateResponse(config *config.ModelConfig, evaluator *templates.Evaluator, session *Session, conversation *Conversation, responseCreate ResponseCreate, c *websocket.Conn, mt int) {
815+
func generateResponse(config *config.ModelConfig, evaluator *templates.Evaluator, session *Session, transcript string, conv *Conversation, responseCreate ResponseCreate, c *websocket.Conn, mt int) {
863816

864817
log.Debug().Msg("Generating realtime response...")
865818

819+
// TODO: Commit the audio and/or transcribed text to the conversation
820+
// Commit logic: create the item, append it to the conversation, and broadcast the item-added event.
821+
item := &types.MessageItem{
822+
ID: generateItemID(),
823+
Type: "message",
824+
Status: "completed",
825+
Role: "user",
826+
Content: []types.MessageContentPart{
827+
{
828+
Type: types.MessageContentTypeInputAudio,
829+
Audio: base64.StdEncoding.EncodeToString(utt),
830+
Transcript: transcript,
831+
},
832+
},
833+
}
834+
conv.Lock.Lock()
835+
conv.Items = append(conv.Items, item)
836+
conv.Lock.Unlock()
837+
838+
sendEvent(c, types.ConversationItemAddedEvent{
839+
ServerEventBase: types.ServerEventBase{
840+
Type: types.ServerEventTypeConversationItemAdded,
841+
},
842+
Item: *item,
843+
})
844+
866845
// Compile the conversation history
867-
conversation.Lock.Lock()
846+
conv.Lock.Lock()
868847
var conversationHistory []schema.Message
869848
var latestUserAudio string
870-
for _, item := range conversation.Items {
849+
for _, item := range conv.Items {
871850
for _, content := range item.Content {
872851
switch content.Type {
873852
case types.MessageContentTypeInputText, types.MessageContentTypeText:
@@ -888,7 +867,7 @@ func generateResponse(config *config.ModelConfig, evaluator *templates.Evaluator
888867
}
889868
}
890869

891-
conversation.Lock.Unlock()
870+
conv.Lock.Unlock()
892871

893872
var generatedText string
894873
var generatedAudio []byte
@@ -987,9 +966,9 @@ func generateResponse(config *config.ModelConfig, evaluator *templates.Evaluator
987966
}
988967

989968
// Add item to conversation
990-
conversation.Lock.Lock()
991-
conversation.Items = append(conversation.Items, item)
992-
conversation.Lock.Unlock()
969+
conv.Lock.Lock()
970+
conv.Items = append(conv.Items, item)
971+
conv.Lock.Unlock()
993972

994973
// Send item.created event
995974
sendEvent(c, OutgoingMessage{
@@ -1058,9 +1037,9 @@ func generateResponse(config *config.ModelConfig, evaluator *templates.Evaluator
10581037
}
10591038

10601039
// Add item to conversation
1061-
conversation.Lock.Lock()
1062-
conversation.Items = append(conversation.Items, item)
1063-
conversation.Lock.Unlock()
1040+
conv.Lock.Lock()
1041+
conv.Items = append(conv.Items, item)
1042+
conv.Lock.Unlock()
10641043

10651044
// Send item.created event
10661045
sendEvent(c, OutgoingMessage{

0 commit comments

Comments
 (0)