Skip to content

Commit a85bdd8

Browse files
committed
feat(audio): add chunking_strategy field to AudioRequest
Closes #1101
1 parent 5d7a276 commit a85bdd8

File tree

2 files changed

+474
-13
lines changed

2 files changed

+474
-13
lines changed

audio.go

Lines changed: 140 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package openai
33
import (
44
"bytes"
55
"context"
6+
"encoding/json"
67
"fmt"
78
"io"
89
"net/http"
@@ -20,11 +21,12 @@ const (
2021
type AudioResponseFormat string
2122

2223
const (
23-
AudioResponseFormatJSON AudioResponseFormat = "json"
24-
AudioResponseFormatText AudioResponseFormat = "text"
25-
AudioResponseFormatSRT AudioResponseFormat = "srt"
26-
AudioResponseFormatVerboseJSON AudioResponseFormat = "verbose_json"
27-
AudioResponseFormatVTT AudioResponseFormat = "vtt"
24+
AudioResponseFormatJSON AudioResponseFormat = "json"
25+
AudioResponseFormatText AudioResponseFormat = "text"
26+
AudioResponseFormatSRT AudioResponseFormat = "srt"
27+
AudioResponseFormatVerboseJSON AudioResponseFormat = "verbose_json"
28+
AudioResponseFormatVTT AudioResponseFormat = "vtt"
29+
AudioResponseFormatDiarizedJSON AudioResponseFormat = "diarized_json"
2830
)
2931

3032
type TranscriptionTimestampGranularity string
@@ -34,6 +36,43 @@ const (
3436
TranscriptionTimestampGranularitySegment TranscriptionTimestampGranularity = "segment"
3537
)
3638

39+
// AudioChunkingStrategyType names a chunking strategy for audio transcription.
// It is a string type; the constants below hold the wire values "auto" and "server_vad".
type AudioChunkingStrategyType string

// Chunking strategy types for audio transcription.
const (
	AudioChunkingStrategyAuto      AudioChunkingStrategyType = "auto"       // Server normalizes loudness and uses VAD (voice activity detection)
	AudioChunkingStrategyServerVAD AudioChunkingStrategyType = "server_vad" // Custom VAD parameters supplied by the caller
)
47+
48+
// TranscriptionChunkingStrategy controls how audio is cut into chunks.
// When Type is AudioChunkingStrategyAuto ("auto"), the form field contains the literal string "auto".
// When Type is AudioChunkingStrategyServerVAD ("server_vad"), the form field contains a JSON object with VAD parameters.
// Required for gpt-4o-transcribe-diarize model on audio longer than 30 seconds.
//
// The three VAD parameters are tagged omitempty, so a field left at its zero
// value is omitted from the JSON object; an explicit 0 cannot be sent.
type TranscriptionChunkingStrategy struct {
	// Type is AudioChunkingStrategyAuto or AudioChunkingStrategyServerVAD.
	Type AudioChunkingStrategyType `json:"type"`
	// PrefixPaddingMs is padding before detected speech (ms).
	PrefixPaddingMs int `json:"prefix_padding_ms,omitempty"`
	// SilenceDurationMs is silence threshold for chunk boundaries (ms).
	SilenceDurationMs int `json:"silence_duration_ms,omitempty"`
	// Threshold is VAD detection sensitivity (0.0-1.0).
	Threshold float32 `json:"threshold,omitempty"`
}
62+
63+
// toFormValue returns the string representation for multipart form submission.
64+
// "auto" is sent as literal string; "server_vad" is sent as JSON object.
65+
func (s TranscriptionChunkingStrategy) toFormValue() (string, error) {
66+
if s.Type == AudioChunkingStrategyAuto {
67+
return string(AudioChunkingStrategyAuto), nil
68+
}
69+
data, err := json.Marshal(s)
70+
if err != nil {
71+
return "", err
72+
}
73+
return string(data), nil
74+
}
75+
3776
// AudioRequest represents a request structure for audio API.
3877
type AudioRequest struct {
3978
Model string
@@ -49,6 +88,8 @@ type AudioRequest struct {
4988
Language string // Only for transcription.
5089
Format AudioResponseFormat
5190
TimestampGranularities []TranscriptionTimestampGranularity // Only for transcription.
91+
// ChunkingStrategy controls audio chunking. Required for diarization models on audio >30s.
92+
ChunkingStrategy *TranscriptionChunkingStrategy
5293
}
5394

5495
// AudioResponse represents a response structure for audio API.
@@ -79,6 +120,34 @@ type AudioResponse struct {
79120
httpHeader
80121
}
81122

123+
// AudioUsage represents usage statistics for audio API calls.
//
// Only the duration-based shape is modelled here: Type plus Seconds.
// NOTE(review): Type may also be "tokens", but no token-count fields are
// declared, so such data would be dropped on decode — confirm whether needed.
// NOTE(review): Seconds is an int; encoding/json rejects a fractional JSON
// number for an int field — confirm the API always returns whole seconds.
type AudioUsage struct {
	Type    string `json:"type"`              // "duration" or "tokens"
	Seconds int    `json:"seconds,omitempty"` // Duration in seconds (for duration-based billing)
}
128+
129+
// DiarizedSegment represents a speaker-annotated segment from diarized transcription.
// It is the element type of DiarizedAudioResponse.Segments; every field is
// decoded from the JSON key named in its tag.
type DiarizedSegment struct {
	Type    string  `json:"type"`    // "transcript.text.segment"
	ID      string  `json:"id"`      // Segment identifier (e.g., "seg_001")
	Start   float64 `json:"start"`   // Start time in seconds
	End     float64 `json:"end"`     // End time in seconds
	Text    string  `json:"text"`    // Transcript text for this segment
	Speaker string  `json:"speaker"` // Speaker label (e.g., "agent", "A")
}
138+
139+
// DiarizedAudioResponse represents a diarized transcription response.
// Returned when using gpt-4o-transcribe-diarize model with diarized_json format.
// It is the result type of Client.CreateDiarizedTranscription.
type DiarizedAudioResponse struct {
	Task     string            `json:"task"`     // "transcribe"
	Duration float64           `json:"duration"` // Audio duration in seconds
	Text     string            `json:"text"`     // Full transcript with speaker prefixes
	Segments []DiarizedSegment `json:"segments"` // Speaker-annotated segments
	// Usage is a pointer tagged omitempty; it stays nil when the response has no "usage" key.
	Usage *AudioUsage `json:"usage,omitempty"`

	// httpHeader is declared elsewhere in this package.
	// NOTE(review): presumably exposes the HTTP response headers — confirm.
	httpHeader
}
150+
82151
type audioTextResponse struct {
83152
Text string `json:"text"`
84153

@@ -100,6 +169,39 @@ func (c *Client) CreateTranscription(
100169
return c.callAudioAPI(ctx, request, "transcriptions")
101170
}
102171

172+
// CreateDiarizedTranscription transcribes audio with speaker diarization.
173+
// Use with gpt-4o-transcribe-diarize model and AudioResponseFormatDiarizedJSON format.
174+
// Requires ChunkingStrategy for audio longer than 30 seconds.
175+
func (c *Client) CreateDiarizedTranscription(
176+
ctx context.Context,
177+
request AudioRequest,
178+
) (response DiarizedAudioResponse, err error) {
179+
var formBody bytes.Buffer
180+
builder := c.createFormBuilder(&formBody)
181+
182+
if err = audioMultipartForm(request, builder); err != nil {
183+
return DiarizedAudioResponse{}, err
184+
}
185+
186+
urlSuffix := "/audio/transcriptions"
187+
req, err := c.newRequest(
188+
ctx,
189+
http.MethodPost,
190+
c.fullURL(urlSuffix, withModel(request.Model)),
191+
withBody(&formBody),
192+
withContentType(builder.FormDataContentType()),
193+
)
194+
if err != nil {
195+
return DiarizedAudioResponse{}, err
196+
}
197+
198+
err = c.sendRequest(req, &response)
199+
if err != nil {
200+
return DiarizedAudioResponse{}, err
201+
}
202+
return
203+
}
204+
103205
// CreateTranslation — API call to translate audio into English.
104206
func (c *Client) CreateTranslation(
105207
ctx context.Context,
@@ -148,7 +250,8 @@ func (c *Client) callAudioAPI(
148250

149251
// HasJSONResponse returns true if the response format is JSON.
150252
func (r AudioRequest) HasJSONResponse() bool {
151-
return r.Format == "" || r.Format == AudioResponseFormatJSON || r.Format == AudioResponseFormatVerboseJSON
253+
return r.Format == "" || r.Format == AudioResponseFormatJSON ||
254+
r.Format == AudioResponseFormatVerboseJSON || r.Format == AudioResponseFormatDiarizedJSON
152255
}
153256

154257
// audioMultipartForm creates a form with audio file contents and the name of the model to use for
@@ -196,19 +299,43 @@ func audioMultipartForm(request AudioRequest, b utils.FormBuilder) error {
196299
}
197300
}
198301

199-
if len(request.TimestampGranularities) > 0 {
200-
for _, tg := range request.TimestampGranularities {
201-
err = b.WriteField("timestamp_granularities[]", string(tg))
202-
if err != nil {
203-
return fmt.Errorf("writing timestamp_granularities[]: %w", err)
204-
}
205-
}
302+
if err = writeTimestampGranularities(request.TimestampGranularities, b); err != nil {
303+
return err
304+
}
305+
306+
if err = writeChunkingStrategy(request.ChunkingStrategy, b); err != nil {
307+
return err
206308
}
207309

208310
// Close the multipart writer
209311
return b.Close()
210312
}
211313

314+
// writeTimestampGranularities writes the timestamp_granularities[] fields if provided.
315+
func writeTimestampGranularities(granularities []TranscriptionTimestampGranularity, b utils.FormBuilder) error {
316+
for _, tg := range granularities {
317+
if err := b.WriteField("timestamp_granularities[]", string(tg)); err != nil {
318+
return fmt.Errorf("writing timestamp_granularities[]: %w", err)
319+
}
320+
}
321+
return nil
322+
}
323+
324+
// writeChunkingStrategy writes the chunking_strategy field if provided.
325+
func writeChunkingStrategy(cs *TranscriptionChunkingStrategy, b utils.FormBuilder) error {
326+
if cs == nil {
327+
return nil
328+
}
329+
value, err := cs.toFormValue()
330+
if err != nil {
331+
return fmt.Errorf("marshaling chunking_strategy: %w", err)
332+
}
333+
if err = b.WriteField("chunking_strategy", value); err != nil {
334+
return fmt.Errorf("writing chunking_strategy: %w", err)
335+
}
336+
return nil
337+
}
338+
212339
// createFileField creates the "file" form field from either an existing file or by using the reader.
213340
func createFileField(request AudioRequest, b utils.FormBuilder) error {
214341
if request.Reader != nil {

0 commit comments

Comments
 (0)