Skip to content

Commit d7db8b4

Browse files
Added timestamp_granularities parameter to the Audio API
1 parent a404763 commit d7db8b4

File tree

2 files changed

+63
-2
lines changed

2 files changed

+63
-2
lines changed

Sources/OpenAI/Public/Parameters/Audio/AudioTranscriptionParameters.swift

+9-2
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,9 @@ public struct AudioTranscriptionParameters: Encodable {
2424
let responseFormat: String?
2525
/// The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use [log probability](https://en.wikipedia.org/wiki/Log_probability) to automatically increase the temperature until certain thresholds are hit. Defaults to 0
2626
let temperature: Double?
27+
/// Defaults to segment
28+
/// The timestamp granularities to populate for this transcription. response_format must be set to verbose_json to use timestamp granularities. Either or both of these options are supported: word or segment. Note: There is no additional latency for segment timestamps, but generating word timestamps incurs additional latency.
29+
let timestampGranularities: [String]?
2730

2831
public enum Model: String {
2932
case whisperOne = "whisper-1"
@@ -36,6 +39,7 @@ public struct AudioTranscriptionParameters: Encodable {
3639
case responseFormat = "response_format"
3740
case temperature
3841
case language
42+
case timestampGranularities = "timestamp_granularities[]"
3943
}
4044

4145
public init(
@@ -45,7 +49,8 @@ public struct AudioTranscriptionParameters: Encodable {
4549
prompt: String? = nil,
4650
responseFormat: String? = nil,
4751
temperature: Double? = nil,
48-
language: String? = nil)
52+
language: String? = nil,
53+
timestampGranularities: [String]? = nil)
4954
{
5055
self.fileName = fileName
5156
self.file = file
@@ -54,6 +59,7 @@ public struct AudioTranscriptionParameters: Encodable {
5459
self.responseFormat = responseFormat
5560
self.temperature = temperature
5661
self.language = language
62+
self.timestampGranularities = timestampGranularities
5763
}
5864
}
5965

@@ -68,7 +74,8 @@ extension AudioTranscriptionParameters: MultipartFormDataParameters {
6874
.string(paramName: Self.CodingKeys.language.rawValue, value: language),
6975
.string(paramName: Self.CodingKeys.prompt.rawValue, value: prompt),
7076
.string(paramName: Self.CodingKeys.responseFormat.rawValue, value: responseFormat),
71-
.string(paramName: Self.CodingKeys.temperature.rawValue, value: temperature)
77+
.string(paramName: Self.CodingKeys.temperature.rawValue, value: temperature),
78+
.string(paramName: Self.CodingKeys.timestampGranularities.rawValue, value: timestampGranularities)
7279
]).build()
7380
}
7481
}

Sources/OpenAI/Public/ResponseModels/Audio/AudioObject.swift

+54
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,60 @@ import Foundation
1010
/// The [audio](https://platform.openai.com/docs/api-reference/audio) response.
1111
public struct AudioObject: Decodable {

   /// The language of the input audio. Present only in `verbose_json` responses.
   public let language: String?
   /// The duration of the input audio in seconds, as a string (e.g. "8.47").
   /// The API sends this as a JSON number in `verbose_json` responses; it is
   /// converted to `String` here to keep this property's type source compatible.
   public let duration: String?
   /// The transcribed text if the request uses the `transcriptions` API, or the translated text if the request uses the `translations` endpoint.
   public let text: String
   /// Extracted words and their corresponding timestamps.
   /// Populated only when the request uses `verbose_json` with `word` timestamp granularity.
   public let words: [Word]?
   /// Segments of the transcribed text and their corresponding details.
   /// Populated only for `verbose_json` responses.
   public let segments: [Segment]?

   /// A single transcribed word with its timing information.
   public struct Word: Decodable {

      /// The text content of the word.
      public let word: String
      /// Start time of the word in seconds.
      public let start: Double
      /// End time of the word in seconds.
      public let end: Double
   }

   /// A segment of the transcription with its timing and decoding diagnostics.
   public struct Segment: Decodable {
      /// Unique identifier of the segment.
      public let id: Int
      /// Seek offset of the segment.
      public let seek: Int
      /// Start time of the segment in seconds.
      public let start: Double
      /// End time of the segment in seconds.
      public let end: Double
      /// Text content of the segment.
      public let text: String
      /// Array of token IDs for the text content.
      public let tokens: [Int]
      /// Temperature parameter used for generating the segment.
      public let temperature: Double
      /// Average logprob of the segment. If the value is lower than -1, consider the logprobs failed.
      public let avgLogprob: Double
      /// Compression ratio of the segment. If the value is greater than 2.4, consider the compression failed.
      public let compressionRatio: Double
      /// Probability of no speech in the segment. If the value is higher than 1.0 and the avg_logprob is below -1, consider this segment silent.
      public let noSpeechProb: Double

      enum CodingKeys: String, CodingKey {
         case id
         case seek
         case start
         case end
         case text
         case tokens
         case temperature
         case avgLogprob = "avg_logprob"
         case compressionRatio = "compression_ratio"
         case noSpeechProb = "no_speech_prob"
      }
   }

   enum CodingKeys: String, CodingKey {
      case language
      case duration
      case text
      case words
      case segments
   }

   /// Custom decoding because the API returns `duration` as a JSON number
   /// (seconds) in `verbose_json` responses, while this type exposes it as
   /// `String?`. Decoding it as `String` directly would make the whole
   /// response fail with a type-mismatch error whenever `duration` is present,
   /// so both numeric and string representations are accepted here.
   public init(from decoder: Decoder) throws {
      let container = try decoder.container(keyedBy: CodingKeys.self)
      language = try container.decodeIfPresent(String.self, forKey: .language)
      if let numericDuration = try? container.decodeIfPresent(Double.self, forKey: .duration) {
         // Covers both "key absent" (nil) and "numeric value" cases.
         duration = numericDuration.map { String($0) }
      } else {
         // Fallback for servers/mocks that send the duration as a string.
         duration = try container.decodeIfPresent(String.self, forKey: .duration)
      }
      text = try container.decode(String.self, forKey: .text)
      words = try container.decodeIfPresent([Word].self, forKey: .words)
      segments = try container.decodeIfPresent([Segment].self, forKey: .segments)
   }
}

0 commit comments

Comments
 (0)