Skip to content

Commit 2abf29e

Browse files
Gunn1 and claude committed
Fix: Parakeet transcription with audio resampling and improved UI
- Add audio resampling to 16kHz for Parakeet compatibility (was using 48kHz)
- Pin compatible versions: torch 2.1.0, torchaudio 2.1.0, pyannote.audio 3.0.0
- Catch AttributeError in speaker_diarization import for better error handling
- Improve live transcription UI with animated listening indicator
- Add smooth auto-scroll to transcript messages during live recording
- Change default ASR model to Parakeet TDT 1.1B (faster, better quality)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <[email protected]>
1 parent a130a2b commit 2abf29e

File tree

5 files changed

+81
-13
lines changed

5 files changed

+81
-13
lines changed

backend/main.py

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,36 @@ async def root():
9999
return {"message": "Audio Transcriber API is running"}
100100

101101

102+
def resample_audio_to_16khz(audio_path: str) -> str:
    """
    Resample an audio file to 16kHz (required by Parakeet).

    Args:
        audio_path: Path to the input audio file.

    Returns:
        ``audio_path`` unchanged when the file is already 16kHz or when
        resampling fails for any reason (best-effort fallback); otherwise
        the path of a newly written ``<stem>_16k.wav`` file located next to
        the input.
    """
    import os

    import librosa
    import soundfile as sf

    try:
        # sr=None keeps the file's native sample rate so it can be inspected;
        # mono=True downmixes to a single channel.
        audio, sr = librosa.load(audio_path, sr=None, mono=True)

        # If already 16kHz, return original path (nothing is written)
        if sr == 16000:
            return audio_path

        # Resample to 16kHz
        audio_16k = librosa.resample(audio, orig_sr=sr, target_sr=16000)

        # Build the output name from the stem rather than with
        # str.replace('.wav', ...): replace() is a no-op for inputs that do
        # not contain a lowercase ".wav" (e.g. .webm, .mp3, .WAV), which
        # would make the output path equal to the input path, and it also
        # rewrites ".wav" inside directory names.
        stem, _ext = os.path.splitext(audio_path)
        resampled_path = f"{stem}_16k.wav"
        sf.write(resampled_path, audio_16k, 16000)

        print(f"🔄 Resampled audio from {sr}Hz to 16kHz")
        return resampled_path
    except Exception as e:
        # Deliberate best-effort: let the caller try the original file
        # instead of failing the whole transcription.
        print(f"⚠️ Resampling failed, trying original audio: {e}")
        return audio_path
130+
131+
102132
def transcribe_with_parakeet(audio_path: str, model_name: str = "parakeet-1.1b-ctc-greedy") -> str:
103133
"""
104134
Transcribe audio using Parakeet model (runs locally)
@@ -162,10 +192,13 @@ def transcribe_with_parakeet(audio_path: str, model_name: str = "parakeet-1.1b-c
162192
else:
163193
parakeet_model = parakeet_model_cache[full_model_name]
164194
# Don't print every time to reduce noise - only on first use
165-
195+
196+
# Resample audio to 16kHz (required by Parakeet)
197+
audio_path_16k = resample_audio_to_16khz(audio_path)
198+
166199
# Transcribe
167200
with torch.no_grad():
168-
transcribed_text = parakeet_model.transcribe([audio_path])
201+
transcribed_text = parakeet_model.transcribe([audio_path_16k])
169202

170203
# Extract text from result (may be Hypothesis object or string)
171204
if transcribed_text and len(transcribed_text) > 0:

backend/requirements.txt

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,9 @@ aiofiles>=23.2.0
88
websockets>=12.0
99
librosa>=0.10.0
1010
soundfile>=0.12.1
11-
torch>=2.1.0
11+
torch>=2.1.0,<3.0
12+
torchaudio>=2.1.0,<3.0
1213
openai-whisper>=20231117
1314
nemo-toolkit[asr]>=1.22.0
14-
pyannote.audio>=2.1.1
15+
pyannote.audio>=3.0.0,<4.0
1516
huggingface-hub>=0.16.0

backend/speaker_diarization.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,8 @@
1414
try:
1515
from pyannote.audio import Pipeline
1616
HAS_PYANNOTE = True
17-
except ImportError:
18-
print("⚠️ pyannote.audio not installed. Speaker diarization will be disabled.")
17+
except (ImportError, AttributeError) as e:
18+
print(f"⚠️ pyannote.audio not available ({type(e).__name__}). Speaker diarization will be disabled.")
1919

2020
# Global cache for diarization pipeline
2121
diarization_pipeline = None

frontend/app/components/AudioRecorder.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ export default function AudioRecorder({
3131
const [selectedAudioDevice, setSelectedAudioDevice] = useState<string>("default");
3232
const [captureSystemAudio, setCaptureSystemAudio] = useState(false);
3333
const [captureMicWithSystem, setCaptureMicWithSystem] = useState(true);
34-
const [selectedModel, setSelectedModel] = useState<string>("whisper-base");
34+
const [selectedModel, setSelectedModel] = useState<string>("parakeet-tdt-1.1b");
3535

3636
const mediaRecorderRef = useRef<MediaRecorder | null>(null);
3737
const audioChunksRef = useRef<Blob[]>([]);

frontend/app/components/TranscriptMessages.tsx

Lines changed: 40 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,18 @@ export default function TranscriptMessages({
3333
return scrollHeight - scrollTop - clientHeight < 50;
3434
};
3535

36-
// Auto-scroll to bottom when new messages arrive (only if user is at bottom)
36+
// Auto-scroll to bottom with smooth animation when new messages arrive
3737
const scrollToBottom = () => {
3838
if (!isUserScrolledUp && containerRef.current) {
39-
// Scroll within the container, not the entire page
40-
containerRef.current.scrollTop = containerRef.current.scrollHeight;
39+
// Use requestAnimationFrame for smooth scrolling
40+
requestAnimationFrame(() => {
41+
if (containerRef.current) {
42+
containerRef.current.scrollTo({
43+
top: containerRef.current.scrollHeight,
44+
behavior: 'smooth'
45+
});
46+
}
47+
});
4148
}
4249
};
4350

@@ -46,10 +53,11 @@ export default function TranscriptMessages({
4653
setIsUserScrolledUp(!isAtBottom());
4754
};
4855

56+
// Auto-scroll when new messages arrive
4957
useEffect(() => {
5058
scrollToBottom();
5159
// eslint-disable-next-line react-hooks/exhaustive-deps
52-
}, [messages, isUserScrolledUp]);
60+
}, [messages]);
5361

5462
if (messages.length === 0) {
5563
return (
@@ -61,12 +69,38 @@ export default function TranscriptMessages({
6169

6270
return (
6371
<div className="relative w-full h-full flex flex-col">
72+
{/* Listening Indicator - only show when recording */}
73+
{isRecording && (
74+
<div className="flex items-center justify-center gap-3 px-4 py-4 bg-gradient-to-r from-red-500 via-red-600 to-red-700 shadow-md">
75+
{/* Animated pulse circle background */}
76+
<div className="relative flex items-center justify-center">
77+
<div className="absolute w-8 h-8 bg-red-400 rounded-full animate-pulse opacity-75"></div>
78+
<div className="relative w-4 h-4 bg-white rounded-full shadow-lg"></div>
79+
</div>
80+
81+
{/* Animated sound waves */}
82+
<div className="flex items-center gap-0.5">
83+
<div className="w-1 h-6 bg-white rounded-full opacity-40 animate-pulse" style={{animationDelay: '0s'}}></div>
84+
<div className="w-1 h-8 bg-white rounded-full opacity-60 animate-pulse" style={{animationDelay: '0.1s'}}></div>
85+
<div className="w-1 h-10 bg-white rounded-full animate-pulse" style={{animationDelay: '0.2s'}}></div>
86+
<div className="w-1 h-8 bg-white rounded-full opacity-60 animate-pulse" style={{animationDelay: '0.3s'}}></div>
87+
<div className="w-1 h-6 bg-white rounded-full opacity-40 animate-pulse" style={{animationDelay: '0.4s'}}></div>
88+
</div>
89+
90+
{/* Text */}
91+
<span className="text-sm font-semibold text-white tracking-wider">LISTENING</span>
92+
</div>
93+
)}
94+
6495
{/* Scroll to bottom button (appears when scrolled up) */}
6596
{isUserScrolledUp && (
6697
<button
6798
onClick={() => {
6899
if (containerRef.current) {
69-
containerRef.current.scrollTop = containerRef.current.scrollHeight;
100+
containerRef.current.scrollTo({
101+
top: containerRef.current.scrollHeight,
102+
behavior: 'smooth'
103+
});
70104
setIsUserScrolledUp(false);
71105
}
72106
}}
@@ -79,7 +113,7 @@ export default function TranscriptMessages({
79113
<div
80114
ref={containerRef}
81115
onScroll={handleScroll}
82-
className="w-full h-full flex flex-col overflow-y-auto bg-gray-50"
116+
className="w-full flex-1 flex flex-col overflow-y-auto bg-gray-50"
83117
>
84118
{/* Messages displayed in order (oldest to newest) */}
85119
<div className="flex flex-col gap-3 p-4 w-full">

0 commit comments

Comments
 (0)