Skip to content

Commit ad6b837

Browse files
authored
Merge pull request #88 from AkiChase/feature/stepfun-asr
Add StepFun ASR provider
1 parent 1cd7e9b commit ad6b837

32 files changed

Lines changed: 1727 additions & 61 deletions

Voxt/App/RecordingSessionSupport.swift

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -180,7 +180,7 @@ enum RecordingSessionSupport {
180180
break
181181
}
182182
switch remoteProvider {
183-
case .openAIWhisper, .glmASR:
183+
case .openAIWhisper, .glmASR, .stepFunASR:
184184
return 60
185185
case .doubaoASR, .aliyunBailianASR:
186186
return 8

Voxt/ModelIcons/stepfun.svg

Lines changed: 1 addition & 0 deletions
Loading

Voxt/Settings/FeatureModelCatalogBuilder.swift

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -469,6 +469,11 @@ struct FeatureModelCatalogBuilder {
469469
if RemoteASRRealtimeSupport.isAliyunRealtimeModel(configuration.model) {
470470
tags.append(localized("Realtime"))
471471
}
472+
case .stepFunASR:
473+
if RemoteASRRealtimeSupport.isStepFunRealtimeModel(configuration.model) {
474+
tags.append(localized("Realtime"))
475+
}
476+
tags.append(contentsOf: [localized("Accurate"), localized("Multilingual")])
472477
}
473478
return deduplicatedFeatureTags(tags)
474479
}

Voxt/Settings/ModelLogoView.swift

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,7 @@ enum ModelLogoKey: String {
2727
case volcengine
2828
case lmStudio
2929
case alibaba
30+
case stepFun
3031
case generic
3132

3233
var resourceName: String? {
@@ -81,6 +82,8 @@ enum ModelLogoKey: String {
8182
return "lmstudio"
8283
case .alibaba:
8384
return "alibaba"
85+
case .stepFun:
86+
return "stepfun"
8487
case .generic:
8588
return nil
8689
}
@@ -91,7 +94,7 @@ enum ModelLogoKey: String {
9194
case .apple, .openAI, .grok, .ollama, .openRouter, .lmStudio, .kimi:
9295
return true
9396
case .anthropic, .google, .gemini, .qwen, .zhipu, .deepSeek, .cohere, .granite, .fireRed, .sense,
94-
.mistral, .gemma, .meta, .nvidia, .minimax, .doubao, .volcengine, .alibaba, .generic:
97+
.mistral, .gemma, .meta, .nvidia, .minimax, .doubao, .volcengine, .alibaba, .stepFun, .generic:
9598
return false
9699
}
97100
}
@@ -148,6 +151,8 @@ enum ModelLogoKey: String {
148151
return "LM"
149152
case .alibaba:
150153
return ""
154+
case .stepFun:
155+
return ""
151156
case .generic:
152157
return "M"
153158
}
@@ -191,6 +196,9 @@ enum ModelLogoKey: String {
191196
if value.contains("doubao") || value.contains("豆包") {
192197
return .doubao
193198
}
199+
if value.contains("stepfun") || value.contains("阶跃") {
200+
return .stepFun
201+
}
194202
if value.contains("volc") || value.contains("火山") {
195203
return .volcengine
196204
}

Voxt/Settings/ModelSettingsCatalogBuilder.swift

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -268,6 +268,11 @@ struct ModelCatalogBuilder {
268268
if RemoteASRRealtimeSupport.isAliyunRealtimeModel(configuration.model) {
269269
tags.append(localizedModelCatalog("Realtime"))
270270
}
271+
case .stepFunASR:
272+
if RemoteASRRealtimeSupport.isStepFunRealtimeModel(configuration.model) {
273+
tags.append(localizedModelCatalog("Realtime"))
274+
}
275+
tags.append(contentsOf: [localizedModelCatalog("Accurate"), localizedModelCatalog("Multilingual")])
271276
}
272277
return deduplicatedTags(tags)
273278
}

Voxt/Settings/ModelSettingsView+ModelActions.swift

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -588,7 +588,7 @@ extension ModelSettingsView {
588588
return AppLocalization.localizedString("Doubao uses App ID + Access Token for streaming API.")
589589
case .aliyunBailianASR:
590590
return AppLocalization.localizedString("Aliyun ASR in Voxt uses realtime WebSocket only: Qwen models use /api-ws/v1/realtime, Fun/Paraformer models use /api-ws/v1/inference.")
591-
case .openAIWhisper, .glmASR:
591+
case .openAIWhisper, .glmASR, .stepFunASR:
592592
return nil
593593
}
594594
}

Voxt/Settings/OnboardingSettingsView+Data.swift

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -461,7 +461,7 @@ extension OnboardingSettingsView {
461461
return AppLocalization.localizedString("Doubao uses App ID + Access Token for streaming API.")
462462
case .aliyunBailianASR:
463463
return AppLocalization.localizedString("Aliyun ASR in Voxt uses realtime WebSocket only: Qwen models use /api-ws/v1/realtime, Fun/Paraformer models use /api-ws/v1/inference.")
464-
case .openAIWhisper, .glmASR:
464+
case .openAIWhisper, .glmASR, .stepFunASR:
465465
return nil
466466
}
467467
}

Voxt/Settings/RemoteProviderConfigurationSheet+Sections.swift

Lines changed: 19 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -501,23 +501,31 @@ extension RemoteProviderConfigurationSheet {
501501

502502
/// Penalty inputs for the generation settings form.
///
/// This span is a diff rendering: rows following a `NNN-` gutter are the previous layout
/// (Presence + Frequency side by side in an `HStack`, then Repetition), rows following a
/// `NNN+` gutter are the new layout. In the new layout, when `isStepFunLLMProvider` is true
/// only the "Frequency Penalty" field is shown; otherwise the Presence/Frequency row and the
/// "Repetition Penalty" field are shown as before.
var generationPenaltyFields: some View {
503503
VStack(alignment: .leading, spacing: 10) {
504-
HStack(alignment: .top, spacing: 12) {
505-
generationNumericField(
506-
title: AppLocalization.localizedString("Presence Penalty"),
507-
placeholder: "0",
508-
text: $generationPresencePenaltyText
509-
)
504+
if isStepFunLLMProvider {
510505
generationNumericField(
511506
title: AppLocalization.localizedString("Frequency Penalty"),
512507
placeholder: "0",
513508
text: $generationFrequencyPenaltyText
514509
)
510+
} else {
511+
HStack(alignment: .top, spacing: 12) {
512+
generationNumericField(
513+
title: AppLocalization.localizedString("Presence Penalty"),
514+
placeholder: "0",
515+
text: $generationPresencePenaltyText
516+
)
517+
generationNumericField(
518+
title: AppLocalization.localizedString("Frequency Penalty"),
519+
placeholder: "0",
520+
text: $generationFrequencyPenaltyText
521+
)
522+
}
523+
generationNumericField(
524+
title: AppLocalization.localizedString("Repetition Penalty"),
525+
placeholder: "1.1",
526+
text: $generationRepetitionPenaltyText
527+
)
515528
}
516-
generationNumericField(
517-
title: AppLocalization.localizedString("Repetition Penalty"),
518-
placeholder: "1.1",
519-
text: $generationRepetitionPenaltyText
520-
)
521529
}
522530
}
523531

Voxt/Settings/RemoteProviderConfigurationSheet+State.swift

Lines changed: 38 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,15 @@ extension RemoteProviderConfigurationSheet {
1818
llmProviderForPicker == .codex
1919
}
2020

21+
/// True when `llmProviderForPicker` is `.stepFun`.
var isStepFunLLMProvider: Bool {
22+
llmProviderForPicker == .stepFun
23+
}
24+
25+
/// True only for the StepFun provider when the resolved model name, after trimming
/// whitespace/newlines and lowercasing, is exactly "step-3.5-flash-2603".
///
/// NOTE(review): the model identifier is hard-coded here; any other StepFun model name
/// yields `false`.
var supportsStepFunReasoningEffort: Bool {
26+
isStepFunLLMProvider &&
27+
resolvedModelValue().trimmingCharacters(in: .whitespacesAndNewlines).lowercased() == "step-3.5-flash-2603"
28+
}
29+
2130
/// Mirrors `isOpenAILLMProvider`; the value is returned unchanged.
var usesOpenAIResponsesOptions: Bool {
2231
isOpenAILLMProvider
2332
}
@@ -143,9 +152,12 @@ extension RemoteProviderConfigurationSheet {
143152
}
144153

145154
/// Whether the thinking controls should be shown.
///
/// For StepFun the answer is derived from the menu itself: show only when
/// `generationThinkingModeMenuOptions` has more than one entry. For every other provider,
/// show when `generationCapabilities` is available and reports toggle, effort, or budget
/// support; with no capabilities the result is `false`.
///
/// This span is a diff rendering: the row after `148-` is the removed expression line and
/// the row after `160+` is its replacement.
var shouldShowGenerationThinking: Bool {
155+
if isStepFunLLMProvider {
156+
return generationThinkingModeMenuOptions.count > 1
157+
}
146158
guard let capabilities = generationCapabilities else { return false }
147159
return capabilities.supportsThinkingToggle ||
148-
capabilities.supportsThinkingEffort ||
160+
// NOTE(review): StepFun already returned above, so `!isStepFunLLMProvider` is always
// true on this path and the parenthesised term reduces to `supportsThinkingEffort`.
(capabilities.supportsThinkingEffort && (!isStepFunLLMProvider || supportsStepFunReasoningEffort)) ||
149161
capabilities.supportsThinkingBudget
150162
}
151163

@@ -165,13 +177,20 @@ extension RemoteProviderConfigurationSheet {
165177
}
166178

167179
var generationThinkingModeMenuOptions: [SettingsMenuOption<String>] {
180+
if isStepFunLLMProvider {
181+
var options = [SettingsMenuOption(value: LLMThinkingMode.off.rawValue, title: AppLocalization.localizedString("Off"))]
182+
if supportsStepFunReasoningEffort {
183+
options.append(SettingsMenuOption(value: LLMThinkingMode.effort.rawValue, title: AppLocalization.localizedString("Effort")))
184+
}
185+
return options
186+
}
168187
guard let capabilities = generationCapabilities else { return [] }
169188
var options = [SettingsMenuOption(value: LLMThinkingMode.providerDefault.rawValue, title: AppLocalization.localizedString("Default"))]
170189
if capabilities.supportsThinkingToggle {
171190
options.append(SettingsMenuOption(value: LLMThinkingMode.off.rawValue, title: AppLocalization.localizedString("Off")))
172191
options.append(SettingsMenuOption(value: LLMThinkingMode.on.rawValue, title: AppLocalization.localizedString("On")))
173192
}
174-
if capabilities.supportsThinkingEffort {
193+
if capabilities.supportsThinkingEffort && (!isStepFunLLMProvider || supportsStepFunReasoningEffort) {
175194
options.append(SettingsMenuOption(value: LLMThinkingMode.effort.rawValue, title: AppLocalization.localizedString("Effort")))
176195
}
177196
if capabilities.supportsThinkingBudget {
@@ -182,12 +201,15 @@ extension RemoteProviderConfigurationSheet {
182201

183202
/// Title of the menu option whose value equals `generationThinkingMode`.
///
/// When no option matches, the fallback in the added (`204+`) row is the localized "Off"
/// for StepFun and the localized "Default" otherwise; the removed (`185-`) row always fell
/// back to "Default".
var generationThinkingModeSelectedTitle: String {
184203
generationThinkingModeMenuOptions.first(where: { $0.value == generationThinkingMode })?.title
185-
?? AppLocalization.localizedString("Default")
204+
?? (isStepFunLLMProvider ? AppLocalization.localizedString("Off") : AppLocalization.localizedString("Default"))
186205
}
187206

188207
/// The stored thinking mode, constrained to what the current menu offers.
///
/// `generationThinkingMode` is parsed as an `LLMThinkingMode` (unparseable values become
/// `.providerDefault`). If the parsed mode's raw value is not among the values in
/// `generationThinkingModeMenuOptions`, the result is `.off` for StepFun and
/// `.providerDefault` for every other provider; otherwise the parsed mode is returned.
var sanitizedGenerationThinkingMode: LLMThinkingMode {
189208
let mode = LLMThinkingMode(rawValue: generationThinkingMode) ?? .providerDefault
190209
let supportedValues = Set(generationThinkingModeMenuOptions.map(\.value))
210+
// StepFun falls back to `.off` rather than `.providerDefault` when the mode is unsupported.
if isStepFunLLMProvider, !supportedValues.contains(mode.rawValue) {
211+
return .off
212+
}
191213
return supportedValues.contains(mode.rawValue) ? mode : .providerDefault
192214
}
193215

@@ -197,6 +219,8 @@ extension RemoteProviderConfigurationSheet {
197219
values = OpenAIReasoningEffort.supportedCases(forModel: resolvedModelValue())
198220
.filter { $0 != .automatic }
199221
.map(\.rawValue)
222+
} else if isStepFunLLMProvider {
223+
values = supportsStepFunReasoningEffort ? ["low", "high"] : []
200224
} else if isOllamaLLMProvider {
201225
values = [
202226
OllamaThinkMode.low.rawValue,
@@ -273,6 +297,9 @@ extension RemoteProviderConfigurationSheet {
273297
}
274298
let capabilities = LLMProviderCapabilityRegistry.capabilities(for: provider)
275299
var settings = LLMGenerationSettings()
300+
if isStepFunLLMProvider {
301+
settings.thinking = .off
302+
}
276303
settings.maxOutputTokens = capabilities.supportsMaxOutputTokens ? parsedOptionalInt(generationMaxOutputTokensText) : nil
277304
settings.temperature = capabilities.supportsTemperature ? parsedOptionalDouble(generationTemperatureText) : nil
278305
settings.topP = capabilities.supportsTopP ? parsedOptionalDouble(generationTopPText) : nil
@@ -281,9 +308,11 @@ extension RemoteProviderConfigurationSheet {
281308
settings.seed = capabilities.supportsSeed ? parsedOptionalInt(generationSeedText) : nil
282309
settings.stop = capabilities.supportsStopSequences ? parsedStopSequences() : []
283310
if capabilities.supportsPenalties {
284-
settings.presencePenalty = parsedOptionalDouble(generationPresencePenaltyText)
285311
settings.frequencyPenalty = parsedOptionalDouble(generationFrequencyPenaltyText)
286-
settings.repetitionPenalty = parsedOptionalDouble(generationRepetitionPenaltyText)
312+
if !isStepFunLLMProvider {
313+
settings.presencePenalty = parsedOptionalDouble(generationPresencePenaltyText)
314+
settings.repetitionPenalty = parsedOptionalDouble(generationRepetitionPenaltyText)
315+
}
287316
}
288317
if capabilities.supportsLogprobs {
289318
settings.logprobs = generationLogprobsEnabled
@@ -653,9 +682,11 @@ extension RemoteProviderConfigurationSheet {
653682
doubleFields.append((generationMinPText, AppLocalization.localizedString("Min P")))
654683
}
655684
if capabilities.supportsPenalties {
656-
doubleFields.append((generationPresencePenaltyText, AppLocalization.localizedString("Presence Penalty")))
657685
doubleFields.append((generationFrequencyPenaltyText, AppLocalization.localizedString("Frequency Penalty")))
658-
doubleFields.append((generationRepetitionPenaltyText, AppLocalization.localizedString("Repetition Penalty")))
686+
if !isStepFunLLMProvider {
687+
doubleFields.append((generationPresencePenaltyText, AppLocalization.localizedString("Presence Penalty")))
688+
doubleFields.append((generationRepetitionPenaltyText, AppLocalization.localizedString("Repetition Penalty")))
689+
}
659690
}
660691
for (text, fieldName) in doubleFields where !text.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty {
661692
guard Double(text.trimmingCharacters(in: .whitespacesAndNewlines)) != nil else {

Voxt/Support/ASRHintResolver.swift

Lines changed: 50 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@ struct ResolvedASRHintPayload {
77
var prompt: String?
88
var otherLanguages: [String] = []
99
var multilingualContext: String?
10+
var contextualPhrases: [String] = []
1011
}
1112

1213
struct ResolvedDictationSettings: Equatable {
@@ -37,6 +38,7 @@ enum ASRHintResolver {
3738
dictionaryTerms: dictionaryTerms
3839
)
3940
let otherLanguages = otherLanguageOptions.map(\.promptName)
41+
let contextualPhrases = ASRHintSettingsStore.contextualPhrases(from: settings)
4042
let usesExplicitSingleLanguageHint = settings.followsUserMainLanguage && otherLanguageOptions.isEmpty
4143
let mlxResolvedLanguage = settings.followsUserMainLanguage
4244
? resolvedMLXLanguageHint(
@@ -95,6 +97,17 @@ enum ASRHintResolver {
9597
prompt: nil,
9698
otherLanguages: otherLanguages
9799
)
100+
case .stepFunASR:
101+
let terms = resolvedStepFunTerms(
102+
contextualPhrases: contextualPhrases,
103+
dictionaryTerms: dictionaryTerms
104+
)
105+
return ResolvedASRHintPayload(
106+
language: usesExplicitSingleLanguageHint ? resolvedStepFunLanguage(mainLanguage) : nil,
107+
prompt: resolvedStepFunPrompt(terms: terms),
108+
otherLanguages: otherLanguages,
109+
contextualPhrases: terms
110+
)
98111
}
99112
}
100113

@@ -233,6 +246,39 @@ enum ASRHintResolver {
233246
"""
234247
}
235248

249+
/// Builds the StepFun term list from contextual phrases and dictionary terms.
///
/// - Parameters:
///   - contextualPhrases: Phrases placed first in the combined list.
///   - dictionaryTerms: A single string that is split on newline characters; each piece
///     becomes a candidate term appended after the contextual phrases.
/// - Returns: The combined list after it has been passed through `mergedTermLines`.
private static func resolvedStepFunTerms(
250+
contextualPhrases: [String],
251+
dictionaryTerms: String
252+
) -> [String] {
253+
mergedTermLines(
254+
contextualPhrases + dictionaryTerms.components(separatedBy: .newlines)
255+
)
256+
}
257+
258+
/// Builds the StepFun prompt text from a list of terms.
///
/// - Parameter terms: Terms to list in the prompt.
/// - Returns: `nil` when `terms` is empty; otherwise a multi-line string consisting of a
///   fixed instruction sentence followed by the terms joined with newlines.
private static func resolvedStepFunPrompt(
259+
terms: [String]
260+
) -> String? {
261+
guard !terms.isEmpty else { return nil }
262+
263+
return """
264+
Prefer these terms when they match the audio. Preserve names, product terms, technical terms, URLs, and code-like text exactly as spoken. Do not translate them.
265+
\(terms.joined(separator: "\n"))
266+
"""
267+
}
268+
269+
/// Normalizes a list of term lines: trims each entry, drops empty ones, and removes
/// duplicates while keeping the first occurrence in its original position.
///
/// - Parameter values: Raw term strings, possibly padded, empty, or repeated.
/// - Returns: Trimmed, non-empty terms with duplicates removed. Two terms count as
///   duplicates when they are equal after case- and diacritic-insensitive folding.
private static func mergedTermLines(_ values: [String]) -> [String] {
270+
var seen = Set<String>()
271+
var result: [String] = []
272+
for value in values {
273+
let trimmed = value.trimmingCharacters(in: .whitespacesAndNewlines)
274+
guard !trimmed.isEmpty else { continue }
275+
// NOTE(review): the key is folded with `.current` locale, so which terms collapse
// together may vary with the user's locale — confirm this is intended.
let key = trimmed.folding(options: [.caseInsensitive, .diacriticInsensitive], locale: .current)
276+
// `insert` reports whether the key was new; repeated keys are skipped.
guard seen.insert(key).inserted else { continue }
277+
result.append(trimmed)
278+
}
279+
return result
280+
}
281+
236282
/// Returns `language.baseLanguageCode` unchanged as the OpenAI language value.
private static func resolvedOpenAILanguage(_ language: UserMainLanguageOption) -> String {
237283
language.baseLanguageCode
238284
}
@@ -282,6 +328,10 @@ enum ASRHintResolver {
282328
return Array(deduped.prefix(3))
283329
}
284330

331+
/// Returns `language.baseLanguageCode` unchanged as the StepFun language value.
private static func resolvedStepFunLanguage(_ language: UserMainLanguageOption) -> String {
332+
language.baseLanguageCode
333+
}
334+
285335
private static func resolvedMLXLanguage(mainLanguage: UserMainLanguageOption, modelRepo: String?) -> String? {
286336
guard let modelRepo else { return nil }
287337
if modelRepo.localizedCaseInsensitiveContains("granite-4.0-1b-speech") {

0 commit comments

Comments
 (0)