@@ -425,7 +425,13 @@ def transcribe(
425425 if progress_callback :
426426 progress_callback (0.95 ) # Diarization done
427427
428- formatted = format_result (result , detected_language , speaker_embeddings )
428+ formatted = format_result (
429+ result ,
430+ detected_language ,
431+ speaker_embeddings ,
432+ speaker_labels = options .speaker_labels ,
433+ word_timestamps = options .word_timestamps ,
434+ )
429435
430436 if progress_callback :
431437 progress_callback (1.0 ) # Complete
@@ -449,13 +455,17 @@ def format_result(
449455 result : dict [str , Any ],
450456 language : str ,
451457 speaker_embeddings : dict [str , list [float ]] | None = None ,
458+ speaker_labels : bool = False ,
459+ word_timestamps : bool = False ,
452460) -> dict [str , Any ]:
453461 """Format result to API response format.
454462
455463 Args:
456464 result: Raw result with segments.
457465 language: Detected/specified language code.
458466 speaker_embeddings: Optional speaker embedding vectors.
467+ speaker_labels: Whether speaker diarization was requested.
468+ word_timestamps: Whether word-level timestamps were requested.
459469
460470 Returns:
461471 Formatted transcript with words and utterances.
@@ -464,51 +474,46 @@ def format_result(
464474 utterances : list [dict [str , Any ]] = []
465475
466476 for segment in result .get ("segments" , []):
467- speaker = segment .get ("speaker" , "A" )
477+ # Only include speaker if diarization was requested
478+ speaker = segment .get ("speaker" ) if speaker_labels else None
468479 utterance_words : list [dict [str , Any ]] = []
469480
470481 for word in segment .get ("words" , []):
471- word_data = {
482+ word_data : dict [ str , Any ] = {
472483 "text" : word .get ("word" , "" ),
473484 "start" : int (word .get ("start" , 0 ) * 1000 ), # Convert to ms
474485 "end" : int (word .get ("end" , 0 ) * 1000 ),
475486 "confidence" : word .get ("score" , 0.0 ),
476- "speaker" : speaker ,
477487 }
488+ # Only include speaker if diarization was requested and speaker exists
489+ if speaker :
490+ word_data ["speaker" ] = speaker
478491 words .append (word_data )
479492 utterance_words .append (word_data )
480493
481494 # Build utterance from segment
495+ utterance : dict [str , Any ] = {
496+ "text" : segment .get ("text" , "" ).strip (),
497+ "start" : int (segment .get ("start" , 0 ) * 1000 ),
498+ "end" : int (segment .get ("end" , 0 ) * 1000 ),
499+ "words" : utterance_words ,
500+ }
501+
502+ # Only include speaker if diarization was requested and speaker exists
503+ if speaker :
504+ utterance ["speaker" ] = speaker
505+
506+ # Only include confidence if we have word-level data
482507 if utterance_words :
483- avg_confidence = sum (w ["confidence" ] for w in utterance_words ) / len (utterance_words )
484- else :
485- # Without word-level alignment, use 0.85 baseline (Whisper is generally accurate)
486- avg_confidence = 0.85
508+ utterance ["confidence" ] = sum (w ["confidence" ] for w in utterance_words ) / len (
509+ utterance_words
510+ )
487511
488- utterances .append (
489- {
490- "speaker" : speaker ,
491- "text" : segment .get ("text" , "" ).strip (),
492- "start" : int (segment .get ("start" , 0 ) * 1000 ),
493- "end" : int (segment .get ("end" , 0 ) * 1000 ),
494- "confidence" : avg_confidence ,
495- "words" : utterance_words ,
496- }
497- )
512+ utterances .append (utterance )
498513
499514 # Calculate overall metrics
500515 full_text = " " .join (s .get ("text" , "" ).strip () for s in result .get ("segments" , []))
501516
502- # Confidence: use word-level if available, otherwise estimate from utterance count
503- if words :
504- total_confidence = sum (w ["confidence" ] for w in words ) / len (words )
505- elif utterances :
506- # Without word-level alignment, use 0.85 as baseline (Whisper is generally accurate)
507- # This indicates "transcription worked but no word-level confidence available"
508- total_confidence = 0.85
509- else :
510- total_confidence = 0.0
511-
512517 # Audio duration: use word-level if available, otherwise use utterance end times
513518 if words :
514519 audio_duration = max ((w ["end" ] for w in words ), default = 0 )
@@ -517,15 +522,18 @@ def format_result(
517522 else :
518523 audio_duration = 0
519524
520- formatted = {
525+ formatted : dict [ str , Any ] = {
521526 "text" : full_text ,
522527 "words" : words ,
523528 "utterances" : utterances ,
524- "confidence" : total_confidence ,
525529 "audio_duration" : audio_duration ,
526530 "language_code" : language ,
527531 }
528532
533+ # Only include confidence if we have word-level data
534+ if words :
535+ formatted ["confidence" ] = sum (w ["confidence" ] for w in words ) / len (words )
536+
529537 # Include speaker embeddings if available
530538 if speaker_embeddings :
531539 formatted ["speaker_embeddings" ] = speaker_embeddings
0 commit comments