Fix: Parakeet transcription with audio resampling and improved UI

Gunn1 · claude · Gunn1 · commit 2abf29ebaa15 · 2025-11-25T09:31:32.000-06:00
- Add audio resampling to 16kHz for Parakeet compatibility (audio was previously passed at 48kHz)
- Constrain dependency versions to compatible ranges: torch >=2.1.0,<3.0; torchaudio >=2.1.0,<3.0; pyannote.audio >=3.0.0,<4.0
- Catch AttributeError (in addition to ImportError) when importing pyannote.audio in speaker_diarization.py, for better error handling
- Improve live transcription UI with animated listening indicator
- Add smooth auto-scroll to transcript messages during live recording
- Change default ASR model to Parakeet TDT 1.1B (faster, better quality)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
diff --git a/backend/main.py b/backend/main.py
@@ -99,6 +99,36 @@ async def root():
     return {"message": "Audio Transcriber API is running"}
 
 
+def resample_audio_to_16khz(audio_path: str) -> str:
+    """
+    Resample an audio file to 16kHz mono (required by Parakeet).
+
+    Returns the path of a new "<stem>_16k.wav" file written next to the input, or
+    audio_path itself when it is already 16kHz or when resampling fails.
+    """
+    import os
+    import librosa
+    import soundfile as sf
+
+    try:
+        # sr=None keeps the file's native sample rate; mono=True downmixes channels
+        audio, sr = librosa.load(audio_path, sr=None, mono=True)
+        if sr == 16000:
+            return audio_path
+        audio_16k = librosa.resample(audio, orig_sr=sr, target_sr=16000)
+
+        # Derive the name from the stem: str.replace('.wav', ...) is a no-op for other
+        # extensions (output would overwrite the input) and rewrites '.wav' in directories
+        resampled_path = os.path.splitext(audio_path)[0] + '_16k.wav'
+        sf.write(resampled_path, audio_16k, 16000)
+
+        print(f"🔄 Resampled audio from {sr}Hz to 16kHz")
+        return resampled_path
+    except Exception as e:
+        print(f"⚠️  Resampling failed, trying original audio: {e}")
+        return audio_path
+
+
 def transcribe_with_parakeet(audio_path: str, model_name: str = "parakeet-1.1b-ctc-greedy") -> str:
     """
     Transcribe audio using Parakeet model (runs locally)
@@ -162,10 +192,13 @@ def transcribe_with_parakeet(audio_path: str, model_name: str = "parakeet-1.1b-c
         else:
             parakeet_model = parakeet_model_cache[full_model_name]
             # Don't print every time to reduce noise - only on first use
-        
+
+        # Resample audio to 16kHz (required by Parakeet)
+        audio_path_16k = resample_audio_to_16khz(audio_path)
+
         # Transcribe
         with torch.no_grad():
-            transcribed_text = parakeet_model.transcribe([audio_path])
+            transcribed_text = parakeet_model.transcribe([audio_path_16k])
         
         # Extract text from result (may be Hypothesis object or string)
         if transcribed_text and len(transcribed_text) > 0:
diff --git a/backend/requirements.txt b/backend/requirements.txt
@@ -8,8 +8,9 @@ aiofiles>=23.2.0
 websockets>=12.0
 librosa>=0.10.0
 soundfile>=0.12.1
-torch>=2.1.0
+torch>=2.1.0,<3.0
+torchaudio>=2.1.0,<3.0
 openai-whisper>=20231117
 nemo-toolkit[asr]>=1.22.0
-pyannote.audio>=2.1.1
+pyannote.audio>=3.0.0,<4.0
 huggingface-hub>=0.16.0
diff --git a/backend/speaker_diarization.py b/backend/speaker_diarization.py
@@ -14,8 +14,8 @@
 try:
     from pyannote.audio import Pipeline
     HAS_PYANNOTE = True
-except ImportError:
-    print("⚠️ pyannote.audio not installed. Speaker diarization will be disabled.")
+except (ImportError, AttributeError) as e:
+    print(f"⚠️ pyannote.audio not available ({type(e).__name__}). Speaker diarization will be disabled.")
 
 # Global cache for diarization pipeline
 diarization_pipeline = None
diff --git a/frontend/app/components/AudioRecorder.tsx b/frontend/app/components/AudioRecorder.tsx
@@ -31,7 +31,7 @@ export default function AudioRecorder({
   const [selectedAudioDevice, setSelectedAudioDevice] = useState<string>("default");
   const [captureSystemAudio, setCaptureSystemAudio] = useState(false);
   const [captureMicWithSystem, setCaptureMicWithSystem] = useState(true);
-  const [selectedModel, setSelectedModel] = useState<string>("whisper-base");
+  const [selectedModel, setSelectedModel] = useState<string>("parakeet-tdt-1.1b");
 
   const mediaRecorderRef = useRef<MediaRecorder | null>(null);
   const audioChunksRef = useRef<Blob[]>([]);
diff --git a/frontend/app/components/TranscriptMessages.tsx b/frontend/app/components/TranscriptMessages.tsx
@@ -33,11 +33,18 @@ export default function TranscriptMessages({
     return scrollHeight - scrollTop - clientHeight < 50;
   };
 
-  // Auto-scroll to bottom when new messages arrive (only if user is at bottom)
+  // Auto-scroll to bottom with smooth animation when new messages arrive
   const scrollToBottom = () => {
     if (!isUserScrolledUp && containerRef.current) {
-      // Scroll within the container, not the entire page
-      containerRef.current.scrollTop = containerRef.current.scrollHeight;
+      // Use requestAnimationFrame for smooth scrolling
+      requestAnimationFrame(() => {
+        if (containerRef.current) {
+          containerRef.current.scrollTo({
+            top: containerRef.current.scrollHeight,
+            behavior: 'smooth'
+          });
+        }
+      });
     }
   };
 
@@ -46,10 +53,11 @@ export default function TranscriptMessages({
     setIsUserScrolledUp(!isAtBottom());
   };
 
+  // Auto-scroll when new messages arrive
   useEffect(() => {
     scrollToBottom();
     // eslint-disable-next-line react-hooks/exhaustive-deps
-  }, [messages, isUserScrolledUp]);
+  }, [messages]);
 
   if (messages.length === 0) {
     return (
@@ -61,12 +69,38 @@ export default function TranscriptMessages({
 
   return (
     <div className="relative w-full h-full flex flex-col">
+      {/* Listening Indicator - only show when recording */}
+      {isRecording && (
+        <div className="flex items-center justify-center gap-3 px-4 py-4 bg-gradient-to-r from-red-500 via-red-600 to-red-700 shadow-md">
+          {/* Animated pulse circle background */}
+          <div className="relative flex items-center justify-center">
+            <div className="absolute w-8 h-8 bg-red-400 rounded-full animate-pulse opacity-75"></div>
+            <div className="relative w-4 h-4 bg-white rounded-full shadow-lg"></div>
+          </div>
+
+          {/* Animated sound waves */}
+          <div className="flex items-center gap-0.5">
+            <div className="w-1 h-6 bg-white rounded-full opacity-40 animate-pulse" style={{animationDelay: '0s'}}></div>
+            <div className="w-1 h-8 bg-white rounded-full opacity-60 animate-pulse" style={{animationDelay: '0.1s'}}></div>
+            <div className="w-1 h-10 bg-white rounded-full animate-pulse" style={{animationDelay: '0.2s'}}></div>
+            <div className="w-1 h-8 bg-white rounded-full opacity-60 animate-pulse" style={{animationDelay: '0.3s'}}></div>
+            <div className="w-1 h-6 bg-white rounded-full opacity-40 animate-pulse" style={{animationDelay: '0.4s'}}></div>
+          </div>
+
+          {/* Text */}
+          <span className="text-sm font-semibold text-white tracking-wider">LISTENING</span>
+        </div>
+      )}
+
       {/* Scroll to bottom button (appears when scrolled up) */}
       {isUserScrolledUp && (
         <button
           onClick={() => {
             if (containerRef.current) {
-              containerRef.current.scrollTop = containerRef.current.scrollHeight;
+              containerRef.current.scrollTo({
+                top: containerRef.current.scrollHeight,
+                behavior: 'smooth'
+              });
               setIsUserScrolledUp(false);
             }
           }}
@@ -79,7 +113,7 @@ export default function TranscriptMessages({
       <div
         ref={containerRef}
         onScroll={handleScroll}
-        className="w-full h-full flex flex-col overflow-y-auto bg-gray-50"
+        className="w-full flex-1 flex flex-col overflow-y-auto bg-gray-50"
       >
         {/* Messages displayed in order (oldest to newest) */}
         <div className="flex flex-col gap-3 p-4 w-full">