Skip to content

Commit d241f9b

Browse files
committed
feat: migrate to Realtime API GA
1 parent c4106e7 commit d241f9b

15 files changed

+3406
-1299
lines changed

api.go

Lines changed: 16 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -9,65 +9,35 @@ import (
99
"net/http"
1010
)
1111

12-
type CreateSessionRequest struct {
13-
ClientSession
14-
15-
// The Realtime model used for this session.
16-
Model string `json:"model"`
17-
}
18-
1912
type ClientSecret struct {
2013
// Ephemeral key usable in client environments to authenticate connections to the Realtime API. Use this in client-side environments rather than a standard API token, which should only be used server-side.
2114
Value string `json:"value"`
2215
// Timestamp for when the token expires. Currently, all tokens expire after one minute.
2316
ExpiresAt int64 `json:"expires_at"`
2417
}
2518

26-
type CreateSessionResponse struct {
27-
ServerSession
19+
type ExpiresAfter struct {
20+
// The anchor point for the client secret expiration, meaning that seconds will be added to the created_at time of the client secret to produce an expiration timestamp. Only created_at is currently supported.
21+
Anchor string `json:"anchor,omitzero"`
2822

29-
// Ephemeral key returned by the API.
30-
ClientSecret ClientSecret `json:"client_secret"`
23+
// The number of seconds from the anchor point to the expiration. Select a value between 10 and 7200 (2 hours). This defaults to 600 seconds (10 minutes) if not specified.
24+
Seconds int `json:"seconds,omitzero"`
3125
}
3226

33-
// CreateTranscriptionSessionRequest is the request for creating a transcription session.
34-
type CreateTranscriptionSessionRequest struct {
35-
// The set of items to include in the transcription.
36-
Include []string `json:"include,omitempty"`
37-
// The format of input audio. Options are "pcm16", "g711_ulaw", or "g711_alaw".
38-
InputAudioFormat AudioFormat `json:"input_audio_format,omitempty"`
39-
// Configuration for input audio noise reduction.
40-
InputAudioNoiseReduction *InputAudioNoiseReduction `json:"input_audio_noise_reduction,omitempty"`
41-
// Configuration for input audio transcription.
42-
InputAudioTranscription *InputAudioTranscription `json:"input_audio_transcription,omitempty"`
43-
44-
// Attention: Keep this field empty! It's shocking that this field is documented but not supported.
45-
// You may get error of "Unknown parameter: 'modalities'." if this field is not empty.
46-
// Issue reported: https://community.openai.com/t/unknown-parameter-modalities-when-creating-transcriptionsessions/1150141/6
47-
// Docs: https://platform.openai.com/docs/api-reference/realtime-sessions/create-transcription#realtime-sessions-create-transcription-modalities
48-
// The set of modalities the model can respond with. To disable audio, set this to ["text"].
49-
Modalities []Modality `json:"modalities,omitempty"`
50-
51-
// Configuration for turn detection.
52-
TurnDetection *ClientTurnDetection `json:"turn_detection,omitempty"`
27+
type CreateClientSecretRequest struct {
28+
// Configuration for the client secret expiration. Expiration refers to the time after which a client secret will no longer be valid for creating sessions. The session itself may continue after that time once started. A secret can be used to create multiple sessions until it expires.
29+
ExpiresAfter ExpiresAfter `json:"expires_after,omitzero"`
30+
31+
// Session configuration to use for the client secret. Choose either a realtime session or a transcription session.
32+
Session SessionUnion `json:"session,omitzero"`
5333
}
5434

55-
// CreateTranscriptionSessionResponse is the response from creating a transcription session.
56-
type CreateTranscriptionSessionResponse struct {
57-
// The unique ID of the session.
58-
ID string `json:"id"`
59-
// The object type, must be "realtime.transcription_session".
60-
Object string `json:"object"`
61-
// The format of input audio.
62-
InputAudioFormat AudioFormat `json:"input_audio_format,omitempty"`
63-
// Configuration of the transcription model.
64-
InputAudioTranscription *InputAudioTranscription `json:"input_audio_transcription,omitempty"`
65-
// The set of modalities.
66-
Modalities []Modality `json:"modalities,omitempty"`
67-
// Configuration for turn detection.
68-
TurnDetection *ServerTurnDetection `json:"turn_detection,omitempty"`
35+
type CreateClientSecretResponse struct {
6936
// Ephemeral key returned by the API.
70-
ClientSecret ClientSecret `json:"client_secret"`
37+
ClientSecret
38+
39+
// Session configuration associated with the client secret. It is either a realtime session or a transcription session.
40+
Session SessionUnion `json:"session,omitzero"`
7141
}
7242

7343
type OpenAIError struct {

api_integration_test.go

Lines changed: 67 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -9,27 +9,40 @@ import (
99
"github.com/stretchr/testify/require"
1010
)
1111

12-
func TestCreateSession(t *testing.T) {
12+
func TestCreateRealtimeSession(t *testing.T) {
1313
key := os.Getenv("OPENAI_API_KEY")
1414
if key == "" {
1515
t.Skip("OPENAI_API_KEY is not set")
1616
}
17-
client := openairt.NewClient(key)
18-
session, err := client.CreateSession(context.Background(), &openairt.CreateSessionRequest{
19-
Model: openairt.GPT4oRealtimePreview20241217,
20-
ClientSession: openairt.ClientSession{
21-
Modalities: []openairt.Modality{
22-
openairt.ModalityAudio,
23-
openairt.ModalityText,
17+
config := openairt.DefaultConfig(key)
18+
if baseUrl := os.Getenv("OPENAI_BASE_URL"); baseUrl != "" {
19+
config.BaseURL = baseUrl
20+
}
21+
client := openairt.NewClientWithConfig(config)
22+
session, err := client.CreateClientSecret(context.Background(), &openairt.CreateClientSecretRequest{
23+
ExpiresAfter: openairt.ExpiresAfter{
24+
Anchor: "created_at",
25+
Seconds: 600,
26+
},
27+
Session: openairt.SessionUnion{
28+
Realtime: &openairt.RealtimeSession{
29+
Model: openairt.GPTRealtime20250828,
30+
// If you specify ["audio", "text"], you'll get the error `Invalid modalities: ['audio', 'text']. Supported combinations are: ['text'] and ['audio'].`
31+
// That's because Realtime API GA no longer accepts both text and audio for the parameter.
32+
// Just pass ["audio"] and you can still receive transcription of the input/output audio.
33+
// See https://github.com/openai/openai-agents-python/issues/1771#issuecomment-3317018366.
34+
// OutputModalities: []openairt.Modality{
35+
// openairt.ModalityAudio,
36+
// },
37+
Instructions: "You are a friendly assistant.",
2438
},
25-
Instructions: "You are a friendly assistant.",
2639
},
2740
})
2841
require.NoError(t, err)
2942
require.NotEmpty(t, session.ClientSecret.Value)
3043
require.NotZero(t, session.ClientSecret.ExpiresAt)
31-
require.Equal(t, openairt.GPT4oRealtimePreview20241217, session.Model)
32-
require.Equal(t, "You are a friendly assistant.", session.Instructions)
44+
require.Equal(t, openairt.GPTRealtime20250828, session.Session.Realtime.Model)
45+
require.Equal(t, "You are a friendly assistant.", session.Session.Realtime.Instructions)
3346
t.Logf("session: %+v", session)
3447
}
3548

@@ -38,41 +51,55 @@ func TestCreateTranscriptionSession(t *testing.T) {
3851
if key == "" {
3952
t.Skip("OPENAI_API_KEY is not set")
4053
}
41-
client := openairt.NewClient(key)
42-
session, err := client.CreateTranscriptionSession(context.Background(), &openairt.CreateTranscriptionSessionRequest{
43-
InputAudioFormat: openairt.AudioFormatPcm16,
44-
InputAudioTranscription: &openairt.InputAudioTranscription{
45-
Model: openairt.GPT4oTranscribe,
46-
Language: "en",
47-
},
48-
InputAudioNoiseReduction: &openairt.InputAudioNoiseReduction{
49-
Type: openairt.NearFieldNoiseReduction,
54+
config := openairt.DefaultConfig(key)
55+
if baseUrl := os.Getenv("OPENAI_BASE_URL"); baseUrl != "" {
56+
config.BaseURL = baseUrl
57+
}
58+
client := openairt.NewClientWithConfig(config)
59+
session, err := client.CreateClientSecret(context.Background(), &openairt.CreateClientSecretRequest{
60+
ExpiresAfter: openairt.ExpiresAfter{
61+
Anchor: "created_at",
62+
Seconds: 600,
5063
},
51-
// Attention: Keep this field empty! It's shocking that this field is documented but not supported.
52-
// Modalities: []openairt.Modality{
53-
// openairt.ModalityText,
54-
// },
55-
TurnDetection: &openairt.ClientTurnDetection{
56-
Type: openairt.ClientTurnDetectionTypeServerVad,
57-
TurnDetectionParams: openairt.TurnDetectionParams{
58-
Threshold: 0.6,
59-
PrefixPaddingMs: 300,
60-
SilenceDurationMs: 500,
64+
Session: openairt.SessionUnion{
65+
Transcription: &openairt.TranscriptionSession{
66+
Audio: openairt.TranscriptionSessionAudio{
67+
Input: &openairt.SessionAudioInput{
68+
Format: openairt.AudioFormatUnion{
69+
PCM: &openairt.AudioFormatPCM{
70+
Rate: 24000,
71+
},
72+
},
73+
Transcription: openairt.AudioTranscription{
74+
Model: openairt.GPT4oTranscribe,
75+
Language: "en",
76+
},
77+
NoiseReduction: openairt.AudioNoiseReduction{
78+
Type: openairt.NoiseReductionNearField,
79+
},
80+
TurnDetection: openairt.TurnDetectionUnion{
81+
ServerVad: &openairt.ServerVad{
82+
Threshold: 0.6,
83+
PrefixPaddingMs: 300,
84+
SilenceDurationMs: 500,
85+
},
86+
},
87+
},
88+
},
6189
},
6290
},
63-
Include: []string{},
6491
})
6592
require.NoError(t, err)
6693
require.NotEmpty(t, session.ClientSecret.Value)
6794
require.NotZero(t, session.ClientSecret.ExpiresAt)
68-
require.Equal(t, "realtime.transcription_session", session.Object)
69-
require.Equal(t, openairt.AudioFormatPcm16, session.InputAudioFormat)
70-
require.Equal(t, openairt.GPT4oTranscribe, session.InputAudioTranscription.Model)
71-
require.Equal(t, "en", session.InputAudioTranscription.Language)
72-
require.Equal(t, openairt.ServerTurnDetectionTypeServerVad, session.TurnDetection.Type)
73-
require.InEpsilon(t, 0.6, session.TurnDetection.Threshold, 0.0001)
74-
require.Equal(t, 300, session.TurnDetection.PrefixPaddingMs)
75-
require.Equal(t, 500, session.TurnDetection.SilenceDurationMs)
76-
require.Empty(t, session.Modalities)
95+
require.Equal(t, "realtime.transcription_session", session.Session.Transcription.Object)
96+
require.Equal(t, int(24000), session.Session.Transcription.Audio.Input.Format.PCM.Rate)
97+
require.Equal(t, openairt.GPT4oTranscribe, session.Session.Transcription.Audio.Input.Transcription.Model)
98+
require.Equal(t, "en", session.Session.Transcription.Audio.Input.Transcription.Language)
99+
require.NotNil(t, session.Session.Transcription.Audio.Input.TurnDetection.ServerVad)
100+
require.Nil(t, session.Session.Transcription.Audio.Input.TurnDetection.SemanticVad)
101+
require.InEpsilon(t, 0.6, session.Session.Transcription.Audio.Input.TurnDetection.ServerVad.Threshold, 0.0001)
102+
require.Equal(t, int64(300), session.Session.Transcription.Audio.Input.TurnDetection.ServerVad.PrefixPaddingMs)
103+
require.Equal(t, int64(500), session.Session.Transcription.Audio.Input.TurnDetection.ServerVad.SilenceDurationMs)
77104
t.Logf("transcription session: %+v", session)
78105
}

0 commit comments

Comments
 (0)