@@ -3,6 +3,7 @@ package openai
33import (
44 "bytes"
55 "context"
6+ "encoding/json"
67 "fmt"
78 "io"
89 "net/http"
@@ -20,11 +21,12 @@ const (
type AudioResponseFormat string

// Values accepted in AudioRequest.Format. AudioRequest.HasJSONResponse reports
// true for the json, verbose_json and diarized_json formats (and for an empty
// format); the remaining formats are not treated as JSON bodies.
const (
	AudioResponseFormatJSON         AudioResponseFormat = "json"
	AudioResponseFormatText         AudioResponseFormat = "text"
	AudioResponseFormatSRT          AudioResponseFormat = "srt"
	AudioResponseFormatVerboseJSON  AudioResponseFormat = "verbose_json"
	AudioResponseFormatVTT          AudioResponseFormat = "vtt"
	AudioResponseFormatDiarizedJSON AudioResponseFormat = "diarized_json"
)
2931
3032type TranscriptionTimestampGranularity string
@@ -34,6 +36,43 @@ const (
3436 TranscriptionTimestampGranularitySegment TranscriptionTimestampGranularity = "segment"
3537)
3638
// AudioChunkingStrategyType names the chunking strategy used for audio transcription.
type AudioChunkingStrategyType string

// Chunking strategy types for audio transcription.
const (
	// AudioChunkingStrategyAuto lets the server normalize loudness and use VAD.
	AudioChunkingStrategyAuto AudioChunkingStrategyType = "auto"
	// AudioChunkingStrategyServerVAD selects VAD with custom parameters.
	AudioChunkingStrategyServerVAD AudioChunkingStrategyType = "server_vad"
)
47+
48+ // TranscriptionChunkingStrategy controls how audio is cut into chunks.
49+ // When Type is ChunkingStrategyAuto ("auto"), the form field contains the literal string "auto".
50+ // When Type is ChunkingStrategyServerVAD ("server_vad"), the form field contains a JSON object with VAD parameters.
51+ // Required for gpt-4o-transcribe-diarize model on audio longer than 30 seconds.
52+ type TranscriptionChunkingStrategy struct {
53+ // Type is AudioChunkingStrategyAuto or AudioChunkingStrategyServerVAD.
54+ Type AudioChunkingStrategyType `json:"type"`
55+ // PrefixPaddingMs is padding before detected speech (ms).
56+ PrefixPaddingMs int `json:"prefix_padding_ms,omitempty"`
57+ // SilenceDurationMs is silence threshold for chunk boundaries (ms).
58+ SilenceDurationMs int `json:"silence_duration_ms,omitempty"`
59+ // Threshold is VAD detection sensitivity (0.0-1.0).
60+ Threshold float32 `json:"threshold,omitempty"`
61+ }
62+
63+ // toFormValue returns the string representation for multipart form submission.
64+ // "auto" is sent as literal string; "server_vad" is sent as JSON object.
65+ func (s TranscriptionChunkingStrategy ) toFormValue () (string , error ) {
66+ if s .Type == AudioChunkingStrategyAuto {
67+ return string (AudioChunkingStrategyAuto ), nil
68+ }
69+ data , err := json .Marshal (s )
70+ if err != nil {
71+ return "" , err
72+ }
73+ return string (data ), nil
74+ }
75+
3776// AudioRequest represents a request structure for audio API.
3877type AudioRequest struct {
3978 Model string
@@ -49,6 +88,8 @@ type AudioRequest struct {
4988 Language string // Only for transcription.
5089 Format AudioResponseFormat
5190 TimestampGranularities []TranscriptionTimestampGranularity // Only for transcription.
91+ // ChunkingStrategy controls audio chunking. Required for diarization models on audio >30s.
92+ ChunkingStrategy * TranscriptionChunkingStrategy
5293}
5394
5495// AudioResponse represents a response structure for audio API.
@@ -79,6 +120,34 @@ type AudioResponse struct {
79120 httpHeader
80121}
81122
// AudioUsage represents usage statistics for audio API calls.
type AudioUsage struct {
	// Type is the kind of usage reported: "duration" or "tokens".
	Type string `json:"type"`
	// Seconds is the duration in seconds (for duration-based billing).
	// It is left out of encoded JSON when zero.
	// NOTE(review): decoded as an integer; confirm the API never reports
	// fractional seconds here.
	Seconds int `json:"seconds,omitempty"`
}
128+
// DiarizedSegment represents a speaker-annotated segment from diarized transcription.
type DiarizedSegment struct {
	// Type is the segment object type, e.g. "transcript.text.segment".
	Type string `json:"type"`
	// ID is the segment identifier (e.g., "seg_001").
	ID string `json:"id"`
	// Start is the segment start time in seconds.
	Start float64 `json:"start"`
	// End is the segment end time in seconds.
	End float64 `json:"end"`
	// Text is the transcript text for this segment.
	Text string `json:"text"`
	// Speaker is the speaker label (e.g., "agent", "A").
	Speaker string `json:"speaker"`
}
138+
139+ // DiarizedAudioResponse represents a diarized transcription response.
140+ // Returned when using gpt-4o-transcribe-diarize model with diarized_json format.
141+ type DiarizedAudioResponse struct {
142+ Task string `json:"task"` // "transcribe"
143+ Duration float64 `json:"duration"` // Audio duration in seconds
144+ Text string `json:"text"` // Full transcript with speaker prefixes
145+ Segments []DiarizedSegment `json:"segments"` // Speaker-annotated segments
146+ Usage * AudioUsage `json:"usage,omitempty"`
147+
148+ httpHeader
149+ }
150+
82151type audioTextResponse struct {
83152 Text string `json:"text"`
84153
@@ -100,6 +169,39 @@ func (c *Client) CreateTranscription(
100169 return c .callAudioAPI (ctx , request , "transcriptions" )
101170}
102171
172+ // CreateDiarizedTranscription transcribes audio with speaker diarization.
173+ // Use with gpt-4o-transcribe-diarize model and AudioResponseFormatDiarizedJSON format.
174+ // Requires ChunkingStrategy for audio longer than 30 seconds.
175+ func (c * Client ) CreateDiarizedTranscription (
176+ ctx context.Context ,
177+ request AudioRequest ,
178+ ) (response DiarizedAudioResponse , err error ) {
179+ var formBody bytes.Buffer
180+ builder := c .createFormBuilder (& formBody )
181+
182+ if err = audioMultipartForm (request , builder ); err != nil {
183+ return DiarizedAudioResponse {}, err
184+ }
185+
186+ urlSuffix := "/audio/transcriptions"
187+ req , err := c .newRequest (
188+ ctx ,
189+ http .MethodPost ,
190+ c .fullURL (urlSuffix , withModel (request .Model )),
191+ withBody (& formBody ),
192+ withContentType (builder .FormDataContentType ()),
193+ )
194+ if err != nil {
195+ return DiarizedAudioResponse {}, err
196+ }
197+
198+ err = c .sendRequest (req , & response )
199+ if err != nil {
200+ return DiarizedAudioResponse {}, err
201+ }
202+ return
203+ }
204+
103205// CreateTranslation — API call to translate audio into English.
104206func (c * Client ) CreateTranslation (
105207 ctx context.Context ,
@@ -148,7 +250,8 @@ func (c *Client) callAudioAPI(
148250
149251// HasJSONResponse returns true if the response format is JSON.
150252func (r AudioRequest ) HasJSONResponse () bool {
151- return r .Format == "" || r .Format == AudioResponseFormatJSON || r .Format == AudioResponseFormatVerboseJSON
253+ return r .Format == "" || r .Format == AudioResponseFormatJSON ||
254+ r .Format == AudioResponseFormatVerboseJSON || r .Format == AudioResponseFormatDiarizedJSON
152255}
153256
154257// audioMultipartForm creates a form with audio file contents and the name of the model to use for
@@ -196,19 +299,43 @@ func audioMultipartForm(request AudioRequest, b utils.FormBuilder) error {
196299 }
197300 }
198301
199- if len (request .TimestampGranularities ) > 0 {
200- for _ , tg := range request .TimestampGranularities {
201- err = b .WriteField ("timestamp_granularities[]" , string (tg ))
202- if err != nil {
203- return fmt .Errorf ("writing timestamp_granularities[]: %w" , err )
204- }
205- }
302+ if err = writeTimestampGranularities (request .TimestampGranularities , b ); err != nil {
303+ return err
304+ }
305+
306+ if err = writeChunkingStrategy (request .ChunkingStrategy , b ); err != nil {
307+ return err
206308 }
207309
208310 // Close the multipart writer
209311 return b .Close ()
210312}
211313
314+ // writeTimestampGranularities writes the timestamp_granularities[] fields if provided.
315+ func writeTimestampGranularities (granularities []TranscriptionTimestampGranularity , b utils.FormBuilder ) error {
316+ for _ , tg := range granularities {
317+ if err := b .WriteField ("timestamp_granularities[]" , string (tg )); err != nil {
318+ return fmt .Errorf ("writing timestamp_granularities[]: %w" , err )
319+ }
320+ }
321+ return nil
322+ }
323+
324+ // writeChunkingStrategy writes the chunking_strategy field if provided.
325+ func writeChunkingStrategy (cs * TranscriptionChunkingStrategy , b utils.FormBuilder ) error {
326+ if cs == nil {
327+ return nil
328+ }
329+ value , err := cs .toFormValue ()
330+ if err != nil {
331+ return fmt .Errorf ("marshaling chunking_strategy: %w" , err )
332+ }
333+ if err = b .WriteField ("chunking_strategy" , value ); err != nil {
334+ return fmt .Errorf ("writing chunking_strategy: %w" , err )
335+ }
336+ return nil
337+ }
338+
212339// createFileField creates the "file" form field from either an existing file or by using the reader.
213340func createFileField (request AudioRequest , b utils.FormBuilder ) error {
214341 if request .Reader != nil {
0 commit comments