Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Sources/WhisperKit/Core/TextDecoder.swift
Original file line number Diff line number Diff line change
Expand Up @@ -850,7 +850,7 @@ open class TextDecoder: TextDecoding, WhisperMLModel {
timings.decodingSampling += samplingTime

isFirstTokenLogProbTooLow =
if isFirstToken, let firstTokenLogProbThreshold = options.firstTokenLogProbThreshold, nextTokenLogProb < firstTokenLogProbThreshold {
if isFirstToken, options.promptTokens == nil, let firstTokenLogProbThreshold = options.firstTokenLogProbThreshold, nextTokenLogProb < firstTokenLogProbThreshold {
true
} else {
Comment on lines 852 to 855
Copy link

Copilot AI Mar 12, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change makes firstTokenLogProbThreshold effectively a no-op whenever options.promptTokens is non-nil. That’s a behavior change for an existing public option, so it should be documented (e.g., in DecodingOptions docs / README) to avoid confusing callers who set a strict threshold expecting it to be enforced.

Copilot uses AI. Check for mistakes.
false
Expand Down
33 changes: 33 additions & 0 deletions Tests/WhisperKitTests/UnitTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -1702,6 +1702,39 @@ final class UnitTests: XCTestCase {
XCTAssertFalse(result.text.contains(promptText), "Prompt text should not be present in the result")
}

func testPromptTokensWithStrictFirstTokenThreshold() async throws {
    // Regression test for https://github.com/argmaxinc/WhisperKit/issues/372
    //
    // Supplying promptTokens lowers the logprob of the decoder's first
    // content token. Observed on the tiny model with jfk.wav:
    //   - no prompt:  first-token logprob ≈ -0.087
    //   - CJK prompt: first-token logprob ≈ -0.578
    //
    // Larger/turbo models with longer audio amplify this shift past the
    // default threshold (-1.5), which previously produced empty output.
    // A stricter threshold of -0.5 reproduces the failure reliably on the
    // tiny model, standing in for the bigger shift seen on turbo variants.
    let whisperKit = try await WhisperKit(
        WhisperKitConfig(model: "tiny", verbose: true, logLevel: .debug, load: true)
    )
    let tokenizer = try XCTUnwrap(whisperKit.tokenizer)

    // Encode a CJK prompt; its text must not leak into the transcription.
    let encodedPrompt = tokenizer.encode(text: "繁體中文語音轉錄。")

    let decodingOptions = DecodingOptions(
        skipSpecialTokens: true,
        promptTokens: encodedPrompt,
        firstTokenLogProbThreshold: -0.5
    )

    let result = try await XCTUnwrapAsync(
        await transcribe(with: .tiny, options: decodingOptions),
        "Failed to transcribe"
    )

    XCTAssertFalse(result.text.isEmpty, "Transcription should not be empty when promptTokens are set, even with a strict firstTokenLogProbThreshold")
}

func testPrefixTokens() async throws {
let config = WhisperKitConfig(model: "tiny", verbose: true, logLevel: .debug, load: true)
let whisperKit = try await WhisperKit(config)
Expand Down
Loading