Skip to content

Commit 60edbac

Browse files
committed
Improve SenseVoice integration and ASR settings
1 parent 25b3d7f commit 60edbac

27 files changed

Lines changed: 920 additions & 107 deletions

Voxt.xcodeproj/project.pbxproj

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
3C1A2B3C4D5E6F708090A0B0C /* AppIntents.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 3C1A2B3C4D5E6F708090A0B0A /* AppIntents.framework */; };
1212
A1B2C3D4E5F6078901234567 /* MLXAudioCore in Frameworks */ = {isa = PBXBuildFile; productRef = A1B2C3D4E5F6078901234568 /* MLXAudioCore */; };
1313
A1B2C3D4E5F6078901234569 /* MLXAudioSTT in Frameworks */ = {isa = PBXBuildFile; productRef = A1B2C3D4E5F607890123456A /* MLXAudioSTT */; };
14+
A1B2C3D4E5F6078901234570 /* MLXAudioVAD in Frameworks */ = {isa = PBXBuildFile; productRef = A1B2C3D4E5F6078901234571 /* MLXAudioVAD */; };
1415
B1C2D3E4F5060708090A0B0C /* Sparkle in Frameworks */ = {isa = PBXBuildFile; productRef = B1C2D3E4F5060708090A0B0D /* Sparkle */; };
1516
C1D2E3F4A5060708090A0B01 /* MLXLMCommon in Frameworks */ = {isa = PBXBuildFile; productRef = C1D2E3F4A5060708090A0B02 /* MLXLMCommon */; };
1617
C1D2E3F4A5060708090A0B06 /* MLXLLM in Frameworks */ = {isa = PBXBuildFile; productRef = C1D2E3F4A5060708090A0B07 /* MLXLLM */; };
@@ -77,6 +78,7 @@
7778
3C1A2B3C4D5E6F708090A0B0C /* AppIntents.framework in Frameworks */,
7879
A1B2C3D4E5F6078901234567 /* MLXAudioCore in Frameworks */,
7980
A1B2C3D4E5F6078901234569 /* MLXAudioSTT in Frameworks */,
81+
A1B2C3D4E5F6078901234570 /* MLXAudioVAD in Frameworks */,
8082
B1C2D3E4F5060708090A0B0C /* Sparkle in Frameworks */,
8183
C1D2E3F4A5060708090A0B01 /* MLXLMCommon in Frameworks */,
8284
C1D2E3F4A5060708090A0B06 /* MLXLLM in Frameworks */,
@@ -186,6 +188,7 @@
186188
packageProductDependencies = (
187189
A1B2C3D4E5F6078901234568 /* MLXAudioCore */,
188190
A1B2C3D4E5F607890123456A /* MLXAudioSTT */,
191+
A1B2C3D4E5F6078901234571 /* MLXAudioVAD */,
189192
B1C2D3E4F5060708090A0B0D /* Sparkle */,
190193
C1D2E3F4A5060708090A0B02 /* MLXLMCommon */,
191194
C1D2E3F4A5060708090A0B07 /* MLXLLM */,
@@ -253,7 +256,7 @@
253256
A1B2C3D4E5F607890123456B /* XCRemoteSwiftPackageReference "mlx-audio-swift" */,
254257
B1C2D3E4F5060708090A0B0E /* XCRemoteSwiftPackageReference "Sparkle" */,
255258
C1D2E3F4A5060708090A0B03 /* XCRemoteSwiftPackageReference "mlx-swift-lm" */,
256-
D1E2F3A4B5C60718293A4B5E /* XCRemoteSwiftPackageReference "WhisperKit" */,
259+
D1E2F3A4B5C60718293A4B5E /* XCRemoteSwiftPackageReference "argmax-oss-swift" */,
257260
E2A1C3B4D5F60718293A4B70 /* XCRemoteSwiftPackageReference "PermissionFlow" */,
258261
F1A2B3C4D5E6F708090A0B0E /* XCRemoteSwiftPackageReference "FaviconFinder" */,
259262
FA11DBA2FA11DBA2FA11DBA2 /* XCRemoteSwiftPackageReference "GRDB.swift" */,
@@ -911,7 +914,7 @@
911914
version = 3.31.3;
912915
};
913916
};
914-
D1E2F3A4B5C60718293A4B5E /* XCRemoteSwiftPackageReference "WhisperKit" */ = {
917+
D1E2F3A4B5C60718293A4B5E /* XCRemoteSwiftPackageReference "argmax-oss-swift" */ = {
915918
isa = XCRemoteSwiftPackageReference;
916919
repositoryURL = "https://github.com/argmaxinc/argmax-oss-swift.git";
917920
requirement = {
@@ -956,6 +959,11 @@
956959
package = A1B2C3D4E5F607890123456B /* XCRemoteSwiftPackageReference "mlx-audio-swift" */;
957960
productName = MLXAudioSTT;
958961
};
962+
A1B2C3D4E5F6078901234571 /* MLXAudioVAD */ = {
963+
isa = XCSwiftPackageProductDependency;
964+
package = A1B2C3D4E5F607890123456B /* XCRemoteSwiftPackageReference "mlx-audio-swift" */;
965+
productName = MLXAudioVAD;
966+
};
959967
B1C2D3E4F5060708090A0B0D /* Sparkle */ = {
960968
isa = XCSwiftPackageProductDependency;
961969
package = B1C2D3E4F5060708090A0B0E /* XCRemoteSwiftPackageReference "Sparkle" */;
@@ -973,7 +981,7 @@
973981
};
974982
D1E2F3A4B5C60718293A4B5D /* WhisperKit */ = {
975983
isa = XCSwiftPackageProductDependency;
976-
package = D1E2F3A4B5C60718293A4B5E /* XCRemoteSwiftPackageReference "WhisperKit" */;
984+
package = D1E2F3A4B5C60718293A4B5E /* XCRemoteSwiftPackageReference "argmax-oss-swift" */;
977985
productName = WhisperKit;
978986
};
979987
E2A1C3B4D5F60718293A4B6D /* PermissionFlow */ = {

Voxt/App/AppDelegate+PreferencesAndHistory.swift

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -337,16 +337,19 @@ extension AppDelegate {
337337
let remoteASRProviderInfo: String?
338338
let remoteASRModelInfo: String?
339339
let remoteASREndpointInfo: String?
340+
let senseVoiceMetadata: SenseVoiceTranscriptMetadata?
340341
if transcriptionEngine == .remote {
341342
let provider = remoteASRSelectedProvider
342343
let config = remoteASRConfigurations[provider.rawValue]
343344
remoteASRProviderInfo = provider.title
344345
remoteASRModelInfo = config?.model.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty == false ? config?.model : nil
345346
remoteASREndpointInfo = historyDisplayEndpoint(config?.endpoint)
347+
senseVoiceMetadata = nil
346348
} else {
347349
remoteASRProviderInfo = nil
348350
remoteASRModelInfo = nil
349351
remoteASREndpointInfo = nil
352+
senseVoiceMetadata = transcriptionEngine == .mlxAudio ? mlxTranscriber?.latestSenseVoiceMetadata : nil
350353
}
351354

352355
if historyKind == .rewrite,
@@ -360,6 +363,7 @@ extension AppDelegate {
360363
whisperWordTimings: transcriptionEngine == .whisperKit && whisperTimestampsEnabled
361364
? whisperTranscriber?.latestWordTimings
362365
: nil,
366+
senseVoiceMetadata: senseVoiceMetadata,
363367
dictionaryHitTerms: dictionaryHitTerms,
364368
dictionaryCorrectedTerms: dictionaryCorrectedTerms,
365369
dictionaryCorrectionSnapshots: dictionaryCorrectionSnapshots,
@@ -405,6 +409,7 @@ extension AppDelegate {
405409
whisperWordTimings: transcriptionEngine == .whisperKit && whisperTimestampsEnabled
406410
? whisperTranscriber?.latestWordTimings
407411
: nil,
412+
senseVoiceMetadata: senseVoiceMetadata,
408413
displayTitle: trimmedDisplayTitle?.isEmpty == false ? trimmedDisplayTitle : nil,
409414
transcriptionChatMessages: historyKind == .rewrite
410415
? TranscriptionHistoryConversationSupport.initialChatMessages(
@@ -452,6 +457,7 @@ extension AppDelegate {
452457
llmDurationSeconds: TimeInterval?,
453458
pendingAudioArchiveURL: URL?,
454459
whisperWordTimings: [WhisperHistoryWordTiming]?,
460+
senseVoiceMetadata: SenseVoiceTranscriptMetadata?,
455461
dictionaryHitTerms: [String],
456462
dictionaryCorrectedTerms: [String],
457463
dictionaryCorrectionSnapshots: [DictionaryCorrectionSnapshot],
@@ -499,6 +505,7 @@ extension AppDelegate {
499505
incoming: llmDurationSeconds
500506
),
501507
whisperWordTimings: whisperWordTimings ?? existingEntry.whisperWordTimings,
508+
senseVoiceMetadata: senseVoiceMetadata ?? existingEntry.senseVoiceMetadata,
502509
transcriptionChatMessages: rewriteConversationMessages.isEmpty
503510
? TranscriptionHistoryConversationSupport.bootstrapChatMessages(for: existingEntry)
504511
: rewriteConversationMessages,

Voxt/Settings/ModelSettingsView+ModelActions.swift

Lines changed: 25 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,30 @@
11
import AppKit
22
import SwiftUI
33

4+
/// Namespace for building the short, localized MLX configuration summary for a model repo.
enum MLXConfigurationSummarySupport {
5+
/// Returns a one-line summary of `tuning` for the model family resolved from `repo`.
///
/// - Parameters:
///   - repo: Model repo identifier; passed to `MLXModelFamily.family(for:)` to pick the family.
///   - tuning: Local tuning settings whose preset title, bias fields, or ITN flag are summarized.
/// - Returns: A display string whose contents depend on the family (see the per-case notes).
static func summary(for repo: String, tuning: MLXLocalTuningSettings) -> String {
6+
let family = MLXModelFamily.family(for: repo)
7+
switch family {
8+
case .qwen3ASR: // preset title plus a "Context On/Off" label; `hasContext` holds the label, not a Bool
9+
let hasContext = tuning.qwenContextBias.isEmpty
10+
? AppLocalization.localizedString("Context Off")
11+
: AppLocalization.localizedString("Context On")
12+
return AppLocalization.format("%@ · %@", tuning.preset.title, hasContext)
13+
case .graniteSpeech: // preset title plus a "Prompt On/Off" label; `hasPrompt` holds the label, not a Bool
14+
let hasPrompt = tuning.granitePromptBias.isEmpty
15+
? AppLocalization.localizedString("Prompt Off")
16+
: AppLocalization.localizedString("Prompt On")
17+
return AppLocalization.format("%@ · %@", tuning.preset.title, hasPrompt)
18+
case .senseVoice: // only the ITN state is shown; the preset title is not part of this summary
19+
return AppLocalization.localizedString(tuning.senseVoiceUseITN ? "ITN On" : "ITN Off")
20+
case .cohereTranscribe: // preset title only
21+
return tuning.preset.title
22+
case .generic: // preset title only
23+
return tuning.preset.title
24+
}
25+
}
26+
}
27+
428
extension ModelSettingsView {
529
func promptBinding(for storage: Binding<String>, kind: AppPromptKind) -> Binding<String> {
630
Binding(
@@ -532,26 +556,7 @@ extension ModelSettingsView {
532556

533557
var mlxConfigurationSummary: String {
534558
let tuning = resolvedMLXLocalTuningSettings(for: modelRepo)
535-
let family = MLXModelFamily.family(for: modelRepo)
536-
switch family {
537-
case .qwen3ASR:
538-
let hasContext = tuning.qwenContextBias.isEmpty
539-
? AppLocalization.localizedString("Context Off")
540-
: AppLocalization.localizedString("Context On")
541-
return AppLocalization.format("%@ · %@", tuning.preset.title, hasContext)
542-
case .graniteSpeech:
543-
let hasPrompt = tuning.granitePromptBias.isEmpty
544-
? AppLocalization.localizedString("Prompt Off")
545-
: AppLocalization.localizedString("Prompt On")
546-
return AppLocalization.format("%@ · %@", tuning.preset.title, hasPrompt)
547-
case .senseVoice:
548-
let itn = AppLocalization.localizedString(tuning.senseVoiceUseITN ? "ITN On" : "ITN Off")
549-
return AppLocalization.format("%@ · %@", tuning.preset.title, itn)
550-
case .cohereTranscribe:
551-
return tuning.preset.title
552-
case .generic:
553-
return tuning.preset.title
554-
}
559+
return MLXConfigurationSummarySupport.summary(for: modelRepo, tuning: tuning)
555560
}
556561

557562
var customLLMGenerationSummary: String {

Voxt/Settings/ModelSettingsView+Sections.swift

Lines changed: 40 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -330,6 +330,10 @@ private struct MLXASRConfigurationSheetView: View {
330330
).language ?? AppLocalization.localizedString("Automatic")
331331
}
332332

333+
/// Localized, fixed list of the language routes displayed for SenseVoice models
/// (Automatic plus zh, en, yue, ja, ko); it is not derived from any setting.
private var senseVoiceSupportedLanguageSummary: String {
334+
AppLocalization.localizedString("Automatic, zh, en, yue, ja, ko")
335+
}
336+
333337
var body: some View {
334338
VStack(alignment: .leading, spacing: 14) {
335339
Text(localized("MLX ASR Configuration"))
@@ -341,26 +345,28 @@ private struct MLXASRConfigurationSheetView: View {
341345
.font(.subheadline)
342346
.foregroundStyle(.secondary)
343347

344-
VStack(alignment: .leading, spacing: 8) {
345-
Text(localized("Preset"))
346-
.font(.subheadline.weight(.medium))
347-
SettingsMenuPicker(
348-
selection: Binding(
349-
get: { tuningSettings.preset.rawValue },
350-
set: { rawValue in
351-
guard let preset = LocalASRRecognitionPreset(rawValue: rawValue) else { return }
352-
tuningSettings.preset = preset
353-
}
354-
),
355-
options: LocalASRRecognitionPreset.allCases.map {
356-
SettingsMenuOption(value: $0.rawValue, title: $0.title)
357-
},
358-
selectedTitle: tuningSettings.preset.title,
359-
width: 220
360-
)
361-
Text(tuningSettings.preset.summary)
362-
.font(.caption)
363-
.foregroundStyle(.secondary)
348+
if family.supportsRecognitionPreset {
349+
VStack(alignment: .leading, spacing: 8) {
350+
Text(localized("Preset"))
351+
.font(.subheadline.weight(.medium))
352+
SettingsMenuPicker(
353+
selection: Binding(
354+
get: { tuningSettings.preset.rawValue },
355+
set: { rawValue in
356+
guard let preset = LocalASRRecognitionPreset(rawValue: rawValue) else { return }
357+
tuningSettings.preset = preset
358+
}
359+
),
360+
options: LocalASRRecognitionPreset.allCases.map {
361+
SettingsMenuOption(value: $0.rawValue, title: $0.title)
362+
},
363+
selectedTitle: tuningSettings.preset.title,
364+
width: 220
365+
)
366+
Text(tuningSettings.preset.summary)
367+
.font(.caption)
368+
.foregroundStyle(.secondary)
369+
}
364370
}
365371

366372
Toggle(localized("Follow User Main Language"), isOn: $hintSettings.followsUserMainLanguage)
@@ -373,6 +379,16 @@ private struct MLXASRConfigurationSheetView: View {
373379

374380
localInfoRow(label: localized("Other languages"), value: secondaryLanguageSummary)
375381

382+
if family == .senseVoice {
383+
localInfoRow(
384+
label: localized("Supported routes"),
385+
value: senseVoiceSupportedLanguageSummary
386+
)
387+
Text(localized("SenseVoice only accepts explicit language routing for zh, en, yue, ja, and ko here. Any other primary language falls back to Automatic."))
388+
.font(.caption)
389+
.foregroundStyle(.secondary)
390+
}
391+
376392
if family.supportsContextBias {
377393
Text(localized("Recognition Context"))
378394
.font(.subheadline.weight(.medium))
@@ -406,6 +422,10 @@ private struct MLXASRConfigurationSheetView: View {
406422
Text(localized("This model family only exposes preset and language controls."))
407423
.font(.caption)
408424
.foregroundStyle(.secondary)
425+
} else if family == .senseVoice {
426+
Text(localized("SenseVoice only exposes language routing and ITN here. Recognition presets are not used by this model path."))
427+
.font(.caption)
428+
.foregroundStyle(.secondary)
409429
}
410430
}
411431
.frame(maxWidth: .infinity, alignment: .topLeading)

Voxt/Settings/OnboardingSettingsTypes.swift

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ enum OnboardingStep: String, CaseIterable, Identifiable {
4848
case .rewrite:
4949
return AppLocalization.localizedString("Understand voice rewrite mode for selected text and prompt-style generation.")
5050
case .appEnhancement:
51-
return AppLocalization.localizedString("Optionally enable app-aware prompt switching.")
51+
return AppLocalization.localizedString("Review how app-aware prompt switching works across apps and pages.")
5252
case .finish:
5353
return AppLocalization.localizedString("Review your setup, then start using Voxt.")
5454
}
@@ -133,7 +133,7 @@ enum OnboardingStepStatusResolver {
133133
case .rewrite:
134134
return snapshot.hasRewriteIssues ? .needsSetup : .ready
135135
case .appEnhancement:
136-
return snapshot.appEnhancementEnabled ? .ready : .optional
136+
return .ready
137137
case .finish:
138138
return .done
139139
}

Voxt/Settings/OnboardingSettingsView+Data.swift

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ extension OnboardingSettingsView {
6161
hasRecordingMicrophone: !inputDevices.isEmpty,
6262
hasRecordingPermissions: recordingPermissionsSatisfied,
6363
hasRewriteIssues: !rewriteIssues.isEmpty,
64-
appEnhancementEnabled: appEnhancementEnabled
64+
appEnhancementEnabled: true
6565
)
6666
}
6767

@@ -278,7 +278,7 @@ extension OnboardingSettingsView {
278278

279279
settings.rewrite.asrSelectionID = asrSelection
280280
settings.rewrite.llmSelectionID = llmSelection
281-
settings.rewrite.appEnhancementEnabled = appEnhancementEnabled
281+
settings.rewrite.appEnhancementEnabled = true
282282
}
283283

284284
featureSettings = FeatureSettingsStore.load(defaults: .standard)

Voxt/Settings/OnboardingSettingsView+Steps.swift

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -593,7 +593,6 @@ extension OnboardingSettingsView {
593593
var appEnhancementStep: some View {
594594
VStack(alignment: .leading, spacing: 16) {
595595
GeneralSettingsCard(title: "App Enhancement") {
596-
Toggle(localized("Enable App Enhancement"), isOn: $appEnhancementEnabled)
597596
Text(localized("App Enhancement lets Voxt switch prompts based on the current app or browser tab, so translation, rewrite, and cleanup can behave differently across contexts."))
598597
.font(.caption)
599598
.foregroundStyle(.secondary)

Voxt/Settings/OnboardingSettingsView.swift

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@ struct OnboardingSettingsView: View {
2727
@AppStorage(AppPreferenceKey.userMainLanguageCodes) var userMainLanguageCodesRaw = UserMainLanguageOption.defaultStoredSelectionValue
2828
@AppStorage(AppPreferenceKey.translateSelectedTextOnTranslationHotkey) var translateSelectedTextOnTranslationHotkey = true
2929
@AppStorage(AppPreferenceKey.autoCopyWhenNoFocusedInput) var autoCopyWhenNoFocusedInput = false
30-
@AppStorage(AppPreferenceKey.appEnhancementEnabled) var appEnhancementEnabled = true
3130
@AppStorage(AppPreferenceKey.modelStorageRootPath) var modelStorageRootPath = ""
3231
@AppStorage(AppPreferenceKey.useHfMirror) var useHfMirror = false
3332
@AppStorage(AppPreferenceKey.transcriptionEngine) var engineRaw = TranscriptionEngine.mlxAudio.rawValue
@@ -292,10 +291,7 @@ struct OnboardingSettingsView: View {
292291
let translationSelectionObserved = AnyView(targetLanguageObserved.onChange(of: translateSelectedTextOnTranslationHotkey) { _, _ in
293292
syncOnboardingFeatureSelections()
294293
})
295-
let appEnhancementObserved = AnyView(translationSelectionObserved.onChange(of: appEnhancementEnabled) { _, _ in
296-
syncOnboardingFeatureSelections()
297-
})
298-
let stepObserved = AnyView(appEnhancementObserved.onChange(of: currentStep) { _, newValue in
294+
let stepObserved = AnyView(translationSelectionObserved.onChange(of: currentStep) { _, newValue in
299295
OnboardingPreferenceManager.saveLastStep(newValue)
300296
prepareDemoPlayerIfNeeded(for: newValue)
301297
})

Voxt/Settings/SettingsTypes.swift

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,7 @@ struct UserMainLanguageOption: Identifiable, Hashable {
149149
.init(code: "km", promptName: "Khmer", aliases: []),
150150
.init(code: "kn", promptName: "Kannada", aliases: []),
151151
.init(code: "ko", promptName: "Korean", aliases: []),
152+
.init(code: "yue", promptName: "Cantonese", aliases: ["Yue Chinese"]),
152153
.init(code: "la", promptName: "Latin", aliases: []),
153154
.init(code: "lb", promptName: "Luxembourgish", aliases: []),
154155
.init(code: "lo", promptName: "Lao", aliases: []),

Voxt/Support/ASRHintLocalTuning.swift

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ enum MLXModelFamily: String, CaseIterable, Codable, Identifiable {
7171
var supportsContextBias: Bool { self == .qwen3ASR }
7272
var supportsPromptBias: Bool { self == .graniteSpeech }
7373
var supportsITN: Bool { self == .senseVoice }
74+
/// `true` for every family except `.senseVoice`.
var supportsRecognitionPreset: Bool { self != .senseVoice }
7475
}
7576

7677
struct WhisperLocalTuningSettings: Codable, Equatable {

0 commit comments

Comments
 (0)