togethercomputer · stainless-app · Apr 6, 2026 · Apr 8, 2026 · Apr 8, 2026 · Apr 10, 2026
diff --git a/.release-please-manifest.json b/.release-please-manifest.json
@@ -1,3 +1,3 @@
 {
-  ".": "0.9.0"
+  ".": "0.10.0"
 }
diff --git a/.stats.yml b/.stats.yml
@@ -1,4 +1,4 @@
 configured_endpoints: 76
-openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/togetherai%2Ftogetherai-76f8801298719cc87e9dc4c64b321bcfd432416d76488499d340b4bb6bf81b9b.yml
-openapi_spec_hash: ce0b83ef0a5f174461bd7d13a379b636
-config_hash: 52d213100a0ca1a4b2cdcd2718936b51
+openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/togetherai/togetherai-e218fafc0c9b31bd98647d1e2de6decc55f8a7f9719b3b565f94939c2ebcf0df.yml
+openapi_spec_hash: 026cc585ef61f52d4d6c4b60b969e323
+config_hash: 6c214c91fad5ead4849be777fd9e8108
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,55 @@
 # Changelog
 
+## 0.10.0 (2026-05-08)
+
+Full Changelog: [v0.9.0...v0.10.0](https://github.com/togethercomputer/together-go/compare/v0.9.0...v0.10.0)
+
+### Features
+
+* add the pronunciation dict ([665cf91](https://github.com/togethercomputer/together-go/commit/665cf9167aa7be6f15d4db9fe0e4fdd7a2984930))
+* **api:** add auto-scale/schedule/OIDC params, SCHEDULED_CAPACITY billing to clusters ([7769868](https://github.com/togethercomputer/together-go/commit/77698685188a076d8bc5707f686f0a3d1d4b4f34))
+* **api:** add cached_input field to ModelObjectPricing ([fb7b530](https://github.com/togethercomputer/together-go/commit/fb7b53061ff0df2956ccd2bed461226057eefbbd))
+* **api:** add h100-40gb-mig, b200-192gb GPU types, remove a100-80gb from deployments ([e041234](https://github.com/togethercomputer/together-go/commit/e04123466ad7059b1819cc3c6c0a6a369d7d1115))
+* **api:** add max_seq_length field to finetuning ([0964ef9](https://github.com/togethercomputer/together-go/commit/0964ef98937683218fd6c68ced1f93607932344e))
+* **api:** add max_tokens and temperature parameters to eval judge ([c619446](https://github.com/togethercomputer/together-go/commit/c6194463afd28dac5df67be10ed2d60f195572cf))
+* **api:** add num_workers parameter to eval evaluation parameters ([d061b9d](https://github.com/togethercomputer/together-go/commit/d061b9da29801d5f77ceaa4572868615af9ae285))
+* **api:** allow string or object for audio_inputs/source_video in video params ([75b4202](https://github.com/togethercomputer/together-go/commit/75b42023f5e1843a6e38f72c7b7b64f180fad27e))
+* **api:** Update server url to .ai ([9ae586b](https://github.com/togethercomputer/together-go/commit/9ae586b932601012d6428f8c1eba6d692376af0d))
+* ENG-87042: document language on TTS WebSocket and simplify locale note ([454dc61](https://github.com/togethercomputer/together-go/commit/454dc61de881e2c95c859f76bbf8dd1b2015f213))
+* **go:** add default http client with timeout ([f235d6f](https://github.com/togethercomputer/together-go/commit/f235d6f5ef94f6bd7f798783e3a36c4276f39d85))
+* MOSH-2181: Add default note on max-seq-length ([ee06e5a](https://github.com/togethercomputer/together-go/commit/ee06e5a98c1808e45586cc7bb399611ff368cd78))
+* support setting headers via env ([56a6ac9](https://github.com/togethercomputer/together-go/commit/56a6ac9b1c4ed2814672466a02bce74886e12fdb))
+
+
+### Bug Fixes
+
+* better respect format tags from the spec ([a99fd75](https://github.com/togethercomputer/together-go/commit/a99fd75b164e618cdeb5c021eb0e6bb3da5ff8ee))
+* **client:** update base URL to api.together.ai in video service ([169e53b](https://github.com/togethercomputer/together-go/commit/169e53bfff2c83a4eedb15c7123d73d68fa9230f))
+* **go:** avoid panic when http.DefaultTransport is wrapped ([d3d1491](https://github.com/togethercomputer/together-go/commit/d3d1491d8b33d9749059fe9d63c21fe65e3ab5e0))
+* **types:** change language from enum to string in audio speech ([c7c3dd5](https://github.com/togethercomputer/together-go/commit/c7c3dd500bb4e7ca3489ed06e36313de91b8f3e8))
+* **types:** remove eval-sample/eval-output/eval-summary/batch-generated from FilePurpose ([27aa4f5](https://github.com/togethercomputer/together-go/commit/27aa4f53fe430898efb99c694347bf1aa8126ace))
+* **types:** remove task field from audiotranscription and audiotranslation responses ([c0e0c08](https://github.com/togethercomputer/together-go/commit/c0e0c08eeab810c01da3e5ba9cd0c6490506ca7a))
+* **types:** rename cuda_driver_version to cuda_version in cluster regions ([0d3c446](https://github.com/togethercomputer/together-go/commit/0d3c4461539623a60767f8690485ea21ec6bb7c5))
+* **types:** restructure driver_versions, make supported_instance_types required in betacluster ([1298ca1](https://github.com/togethercomputer/together-go/commit/1298ca1f4f905f5cbec9c8391ed018b4217bc08e))
+
+
+### Chores
+
+* avoid embedding reflect.Type for dead code elimination ([8f52477](https://github.com/togethercomputer/together-go/commit/8f52477443b1ea2088d1080ef79c3f2a064bbc87))
+* **internal:** more robust bootstrap script ([32acdb4](https://github.com/togethercomputer/together-go/commit/32acdb465404e7df8f886e6d6afdebf34c886c69))
+* redact api-key headers in debug logs ([8b4cdf7](https://github.com/togethercomputer/together-go/commit/8b4cdf78008d26bc23a44546dd4a4e9a73c6ee30))
+* **tests:** bump steady to v0.22.1 ([998c407](https://github.com/togethercomputer/together-go/commit/998c407c7892af0bc66889e11669541f11d03b7e))
+
+
+### Documentation
+
+* **api:** clarify prompt parameter docs in transcription/translation ([637afa2](https://github.com/togethercomputer/together-go/commit/637afa209837f412dab07a0ecc8322ff8a10f6d3))
+* **api:** document voice mixing support for audio speech voice parameter ([374f51b](https://github.com/togethercomputer/together-go/commit/374f51bb529fd6271283a4b25a63b5e4f80dfe6c))
+* **api:** update billing_type description in beta cluster creation ([a40ff7f](https://github.com/togethercomputer/together-go/commit/a40ff7f25557222d01444d78adfd2acabf62307d))
+* **api:** update sample_rate and response_encoding parameter docs in audiospeech ([e1b61d9](https://github.com/togethercomputer/together-go/commit/e1b61d9e4718f75de57600ed247287cde4830d9d))
+* **api:** update supported formats in audio transcription/translation file params ([3bf7862](https://github.com/togethercomputer/together-go/commit/3bf7862520e1abdc30136f58dc89dcff44ddad66))
+* improve examples ([28ad2a1](https://github.com/togethercomputer/together-go/commit/28ad2a14bcb68286110cd2947040e8f8851e0d3b))
+
 ## 0.9.0 (2026-04-03)
 
 Full Changelog: [v0.8.0...v0.9.0](https://github.com/togethercomputer/together-go/compare/v0.8.0...v0.9.0)

diff --git a/README.md b/README.md
@@ -28,7 +28,7 @@ Or to pin the version:
 <!-- x-release-please-start-version -->
 
 ```sh
-go get -u 'github.com/togethercomputer/together-go@v0.9.0'
+go get -u 'github.com/togethercomputer/together-go@v0.10.0'
 ```
 
 <!-- x-release-please-end -->

diff --git a/audiospeech.go b/audiospeech.go
@@ -74,23 +74,32 @@ type AudioSpeechNewParams struct {
 	// You can view the voices supported for each model using the /v1/voices endpoint
 	// sending the model name as the query parameter.
 	// [View all supported voices here](https://docs.together.ai/docs/text-to-speech#supported-voices).
+	//
+	// `hexgrad/Kokoro-82M` additionally supports voice mixing, where two or more
+	// voices are combined into a single blended voice by joining their names with `+`
+	// (e.g. `af_bella+af_heart`). Optional per-voice weights can be provided in
+	// parentheses (e.g. `af_bella(2)+af_heart(1)`). Other models require a single
+	// voice name.
 	Voice string `json:"voice" api:"required"`
-	// Sampling rate to use for the output audio. The default sampling rate for
-	// canopylabs/orpheus-3b-0.1-ft and hexgrad/Kokoro-82M is 24000 and for
-	// cartesia/sonic is 44100.
+	// Language or locale of input text. Accepts ISO 639-1 language codes (e.g., `en`,
+	// `fr`, `es`, `zh`) as well as locale codes for region-specific variants. Locale
+	// codes must be lowercase (e.g., `zh-hk` for Cantonese).
+	Language param.Opt[string] `json:"language,omitzero"`
+	// Sampling rate in Hz for the output audio. Cartesia and Minimax models respect
+	// this parameter. Orpheus and Kokoro models always output at 24000 Hz regardless
+	// of this setting.
 	SampleRate param.Opt[int64] `json:"sample_rate,omitzero"`
 	// Bitrate of the MP3 audio output in bits per second. Only applicable when
 	// response_format is mp3. Higher values produce better audio quality at larger
 	// file sizes. Default is 128000. Currently supported on Cartesia models.
 	//
 	// Any of 32000, 64000, 96000, 128000, 192000.
 	BitRate int64 `json:"bit_rate,omitzero"`
-	// Language of input text.
-	//
-	// Any of "en", "de", "fr", "es", "hi", "it", "ja", "ko", "nl", "pl", "pt", "ru",
-	// "sv", "tr", "zh".
-	Language AudioSpeechNewParamsLanguage `json:"language,omitzero"`
-	// Audio encoding of response
+	// Additional model-specific parameters that fine-tune speech generation behavior.
+	ExtraParams AudioSpeechNewParamsExtraParams `json:"extra_params,omitzero"`
+	// Audio encoding of response. Only applicable when response_format is raw or pcm.
+	// Cartesia models respect this parameter and support all values. Orpheus, Kokoro,
+	// and Minimax models always return pcm_s16le regardless of this setting.
 	//
 	// Any of "pcm_f32le", "pcm_s16le", "pcm_mulaw", "pcm_alaw".
 	ResponseEncoding AudioSpeechNewParamsResponseEncoding `json:"response_encoding,omitzero"`
@@ -123,28 +132,26 @@ const (
 	AudioSpeechNewParamsModelCanopylabsOrpheus3b0_1Ft AudioSpeechNewParamsModel = "canopylabs/orpheus-3b-0.1-ft"
 )
 
-// Language of input text.
-type AudioSpeechNewParamsLanguage string
+// Additional model-specific parameters that fine-tune speech generation behavior.
+type AudioSpeechNewParamsExtraParams struct {
+	// A list of pronunciation rules for specific characters or symbols. Each entry
+	// uses the format `"<source>/<replacement>"` (e.g., `["omg/oh my god"]`) to
+	// override how the model pronounces matching tokens.
+	PronunciationDict []string `json:"pronunciation_dict,omitzero"`
+	paramObj
+}
 
-const (
-	AudioSpeechNewParamsLanguageEn AudioSpeechNewParamsLanguage = "en"
-	AudioSpeechNewParamsLanguageDe AudioSpeechNewParamsLanguage = "de"
-	AudioSpeechNewParamsLanguageFr AudioSpeechNewParamsLanguage = "fr"
-	AudioSpeechNewParamsLanguageEs AudioSpeechNewParamsLanguage = "es"
-	AudioSpeechNewParamsLanguageHi AudioSpeechNewParamsLanguage = "hi"
-	AudioSpeechNewParamsLanguageIt AudioSpeechNewParamsLanguage = "it"
-	AudioSpeechNewParamsLanguageJa AudioSpeechNewParamsLanguage = "ja"
-	AudioSpeechNewParamsLanguageKo AudioSpeechNewParamsLanguage = "ko"
-	AudioSpeechNewParamsLanguageNl AudioSpeechNewParamsLanguage = "nl"
-	AudioSpeechNewParamsLanguagePl AudioSpeechNewParamsLanguage = "pl"
-	AudioSpeechNewParamsLanguagePt AudioSpeechNewParamsLanguage = "pt"
-	AudioSpeechNewParamsLanguageRu AudioSpeechNewParamsLanguage = "ru"
-	AudioSpeechNewParamsLanguageSv AudioSpeechNewParamsLanguage = "sv"
-	AudioSpeechNewParamsLanguageTr AudioSpeechNewParamsLanguage = "tr"
-	AudioSpeechNewParamsLanguageZh AudioSpeechNewParamsLanguage = "zh"
-)
+func (r AudioSpeechNewParamsExtraParams) MarshalJSON() (data []byte, err error) {
+	type shadow AudioSpeechNewParamsExtraParams
+	return param.MarshalObject(r, (*shadow)(&r))
+}
+func (r *AudioSpeechNewParamsExtraParams) UnmarshalJSON(data []byte) error {
+	return apijson.UnmarshalRoot(data, r)
+}
 
-// Audio encoding of response
+// Audio encoding of response. Only applicable when response_format is raw or pcm.
+// Cartesia models respect this parameter and support all values. Orpheus, Kokoro,
+// and Minimax models always return pcm_s16le regardless of this setting.
 type AudioSpeechNewParamsResponseEncoding string
 
 const (

diff --git a/audiospeech_test.go b/audiospeech_test.go
@@ -27,11 +27,14 @@ func TestAudioSpeechNewWithOptionalParams(t *testing.T) {
 		option.WithAPIKey("My API Key"),
 	)
 	resp, err := client.Audio.Speech.New(context.TODO(), together.AudioSpeechNewParams{
-		Input:            "input",
-		Model:            together.AudioSpeechNewParamsModelCanopylabsOrpheus3b0_1Ft,
-		Voice:            "voice",
-		BitRate:          32000,
-		Language:         together.AudioSpeechNewParamsLanguageEn,
+		Input:   "input",
+		Model:   together.AudioSpeechNewParamsModelCanopylabsOrpheus3b0_1Ft,
+		Voice:   "voice",
+		BitRate: 32000,
+		ExtraParams: together.AudioSpeechNewParamsExtraParams{
+			PronunciationDict: []string{"omg/oh my god"},
+		},
+		Language:         together.String("en"),
 		ResponseEncoding: together.AudioSpeechNewParamsResponseEncodingPcmF32le,
 		ResponseFormat:   together.AudioSpeechNewParamsResponseFormatMP3,
 		SampleRate:       together.Int(0),

diff --git a/audiotranscription.go b/audiotranscription.go
@@ -64,9 +64,6 @@ type AudioTranscriptionNewResponseUnion struct {
 	Segments []AudioTranscriptionNewResponseAudioTranscriptionVerboseJsonResponseSegment `json:"segments"`
 	// This field is from variant
 	// [AudioTranscriptionNewResponseAudioTranscriptionVerboseJsonResponse].
-	Task string `json:"task"`
-	// This field is from variant
-	// [AudioTranscriptionNewResponseAudioTranscriptionVerboseJsonResponse].
 	SpeakerSegments []AudioTranscriptionNewResponseAudioTranscriptionVerboseJsonResponseSpeakerSegment `json:"speaker_segments"`
 	// This field is from variant
 	// [AudioTranscriptionNewResponseAudioTranscriptionVerboseJsonResponse].
@@ -76,7 +73,6 @@ type AudioTranscriptionNewResponseUnion struct {
 		Duration        respjson.Field
 		Language        respjson.Field
 		Segments        respjson.Field
-		Task            respjson.Field
 		SpeakerSegments respjson.Field
 		Words           respjson.Field
 		raw             string
@@ -126,10 +122,6 @@ type AudioTranscriptionNewResponseAudioTranscriptionVerboseJsonResponse struct {
 	Language string `json:"language" api:"required"`
 	// Array of transcription segments
 	Segments []AudioTranscriptionNewResponseAudioTranscriptionVerboseJsonResponseSegment `json:"segments" api:"required"`
-	// The task performed
-	//
-	// Any of "transcribe", "translate".
-	Task string `json:"task" api:"required"`
 	// The transcribed text
 	Text string `json:"text" api:"required"`
 	// Array of transcription speaker segments (only when diarize is enabled)
@@ -141,7 +133,6 @@ type AudioTranscriptionNewResponseAudioTranscriptionVerboseJsonResponse struct {
 		Duration        respjson.Field
 		Language        respjson.Field
 		Segments        respjson.Field
-		Task            respjson.Field
 		Text            respjson.Field
 		SpeakerSegments respjson.Field
 		Words           respjson.Field
@@ -278,7 +269,7 @@ func (r *AudioTranscriptionNewResponseAudioTranscriptionVerboseJsonResponseWord)
 
 type AudioTranscriptionNewParams struct {
 	// Audio file upload or public HTTP/HTTPS URL. Supported formats .wav, .mp3, .m4a,
-	// .webm, .flac.
+	// .webm, .flac, .ogg, .opus, .aac.
 	File AudioTranscriptionNewParamsFileUnion `json:"file,omitzero" api:"required" format:"binary"`
 	// Whether to enable speaker diarization. When enabled, you will get the speaker id
 	// for each word in the transcription. In the response, in the words array, you
@@ -300,7 +291,10 @@ type AudioTranscriptionNewParams struct {
 	// Minimum number of speakers expected in the audio. Used to improve diarization
 	// accuracy when the approximate number of speakers is known.
 	MinSpeakers param.Opt[int64] `json:"min_speakers,omitzero"`
-	// Optional text to bias decoding.
+	// Optional text to bias decoding. Supported only on Whisper-family models (e.g.
+	// `openai/whisper-large-v3`). Other STT models (e.g.
+	// `nvidia/parakeet-tdt-0.6b-v3`) accept the field for API compatibility but ignore
+	// it.
 	Prompt param.Opt[string] `json:"prompt,omitzero"`
 	// Sampling temperature between 0.0 and 1.0
 	Temperature param.Opt[float64] `json:"temperature,omitzero"`

diff --git a/audiotranslation.go b/audiotranslation.go
@@ -64,16 +64,12 @@ type AudioTranslationNewResponseUnion struct {
 	Segments []AudioTranslationNewResponseAudioTranslationVerboseJsonResponseSegment `json:"segments"`
 	// This field is from variant
 	// [AudioTranslationNewResponseAudioTranslationVerboseJsonResponse].
-	Task string `json:"task"`
-	// This field is from variant
-	// [AudioTranslationNewResponseAudioTranslationVerboseJsonResponse].
 	Words []AudioTranslationNewResponseAudioTranslationVerboseJsonResponseWord `json:"words"`
 	JSON  struct {
 		Text     respjson.Field
 		Duration respjson.Field
 		Language respjson.Field
 		Segments respjson.Field
-		Task     respjson.Field
 		Words    respjson.Field
 		raw      string
 	} `json:"-"`
@@ -120,10 +116,6 @@ type AudioTranslationNewResponseAudioTranslationVerboseJsonResponse struct {
 	Language string `json:"language" api:"required"`
 	// Array of translation segments
 	Segments []AudioTranslationNewResponseAudioTranslationVerboseJsonResponseSegment `json:"segments" api:"required"`
-	// The task performed
-	//
-	// Any of "transcribe", "translate".
-	Task string `json:"task" api:"required"`
 	// The translated text
 	Text string `json:"text" api:"required"`
 	// Array of translation words (only when timestamp_granularities includes 'word')
@@ -133,7 +125,6 @@ type AudioTranslationNewResponseAudioTranslationVerboseJsonResponse struct {
 		Duration    respjson.Field
 		Language    respjson.Field
 		Segments    respjson.Field
-		Task        respjson.Field
 		Text        respjson.Field
 		Words       respjson.Field
 		ExtraFields map[string]respjson.Field
@@ -207,12 +198,15 @@ func (r *AudioTranslationNewResponseAudioTranslationVerboseJsonResponseWord) Unm
 
 type AudioTranslationNewParams struct {
 	// Audio file upload or public HTTP/HTTPS URL. Supported formats .wav, .mp3, .m4a,
-	// .webm, .flac.
+	// .webm, .flac, .ogg, .opus, .aac.
 	File AudioTranslationNewParamsFileUnion `json:"file,omitzero" api:"required" format:"binary"`
 	// Target output language. Optional ISO 639-1 language code. If omitted, language
 	// is set to English.
 	Language param.Opt[string] `json:"language,omitzero"`
-	// Optional text to bias decoding.
+	// Optional text to bias decoding. Supported only on Whisper-family models (e.g.
+	// `openai/whisper-large-v3`). Other STT models (e.g.
+	// `nvidia/parakeet-tdt-0.6b-v3`) accept the field for API compatibility but ignore
+	// it.
 	Prompt param.Opt[string] `json:"prompt,omitzero"`
 	// Sampling temperature between 0.0 and 1.0
 	Temperature param.Opt[float64] `json:"temperature,omitzero"`