Add support for word-level audio transcription timestamp granularity (#733)

agcom · web-flow · commit 3334a9c78a9d · 2024-05-07T16:40:07.000+04:00
* Add support for the `word` value of the audio transcription `timestamp_granularities` parameter

* Fix up handling of multiple timestamp granularities
diff --git a/audio.go b/audio.go
@@ -27,8 +27,14 @@ const (
 	AudioResponseFormatVTT         AudioResponseFormat = "vtt"
 )
 
+type TranscriptionTimestampGranularity string
+
+const (
+	TranscriptionTimestampGranularityWord    TranscriptionTimestampGranularity = "word"
+	TranscriptionTimestampGranularitySegment TranscriptionTimestampGranularity = "segment"
+)
+
 // AudioRequest represents a request structure for audio API.
-// ResponseFormat is not supported for now. We only return JSON text, which may be sufficient.
 type AudioRequest struct {
 	Model string
 
@@ -38,10 +44,11 @@ type AudioRequest struct {
 	// Reader is an optional io.Reader when you do not want to use an existing file.
 	Reader io.Reader
 
-	Prompt      string // For translation, it should be in English
-	Temperature float32
-	Language    string // For translation, just do not use it. It seems "en" works, not confirmed...
-	Format      AudioResponseFormat
+	Prompt                 string
+	Temperature            float32
+	Language               string // Only for transcription.
+	Format                 AudioResponseFormat
+	TimestampGranularities []TranscriptionTimestampGranularity // Only for transcription.
 }
 
 // AudioResponse represents a response structure for audio API.
@@ -62,6 +69,11 @@ type AudioResponse struct {
 		NoSpeechProb     float64 `json:"no_speech_prob"`
 		Transient        bool    `json:"transient"`
 	} `json:"segments"`
+	Words []struct {
+		Word  string  `json:"word"`
+		Start float64 `json:"start"`
+		End   float64 `json:"end"`
+	} `json:"words"`
 	Text string `json:"text"`
 
 	httpHeader
@@ -179,6 +191,15 @@ func audioMultipartForm(request AudioRequest, b utils.FormBuilder) error {
 		}
 	}
 
+	if len(request.TimestampGranularities) > 0 {
+		for _, tg := range request.TimestampGranularities {
+			err = b.WriteField("timestamp_granularities[]", string(tg))
+			if err != nil {
+				return fmt.Errorf("writing timestamp_granularities[]: %w", err)
+			}
+		}
+	}
+
 	// Close the multipart writer
 	return b.Close()
 }
diff --git a/audio_api_test.go b/audio_api_test.go
@@ -105,6 +105,10 @@ func TestAudioWithOptionalArgs(t *testing.T) {
 				Temperature: 0.5,
 				Language:    "zh",
 				Format:      openai.AudioResponseFormatSRT,
+				TimestampGranularities: []openai.TranscriptionTimestampGranularity{
+					openai.TranscriptionTimestampGranularitySegment,
+					openai.TranscriptionTimestampGranularityWord,
+				},
 			}
 			_, err := tc.createFn(ctx, req)
 			checks.NoError(t, err, "audio API error")
diff --git a/audio_test.go b/audio_test.go
@@ -24,6 +24,10 @@ func TestAudioWithFailingFormBuilder(t *testing.T) {
 		Temperature: 0.5,
 		Language:    "en",
 		Format:      AudioResponseFormatSRT,
+		TimestampGranularities: []TranscriptionTimestampGranularity{
+			TranscriptionTimestampGranularitySegment,
+			TranscriptionTimestampGranularityWord,
+		},
 	}
 
 	mockFailedErr := fmt.Errorf("mock form builder fail")
@@ -47,7 +51,7 @@ func TestAudioWithFailingFormBuilder(t *testing.T) {
 		return nil
 	}
 
-	failOn := []string{"model", "prompt", "temperature", "language", "response_format"}
+	failOn := []string{"model", "prompt", "temperature", "language", "response_format", "timestamp_granularities[]"}
 	for _, failingField := range failOn {
 		failForField = failingField
 		mockFailedErr = fmt.Errorf("mock form builder fail on field %s", failingField)