-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathasr.py
More file actions
117 lines (97 loc) · 3.84 KB
/
asr.py
File metadata and controls
117 lines (97 loc) · 3.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
"""
MathASR - Audio Speech Recognition for Math Mentor.
Uses Google Cloud Speech-to-Text V2 (Chirp 2) for state-of-the-art accuracy.
"""
from typing import Any, Dict, Optional

from google.cloud.speech_v2 import SpeechClient
from google.cloud.speech_v2.types import cloud_speech

from backend.config import config
class MathASR:
    """
    Handles audio input using Google STT V2 (Chirp model).

    Why Chirp?
    - Best-in-class accuracy for technical terms
    - Multilingual support
    - Robust to accents and noise
    """

    def __init__(self):
        """Initialize the Google Speech client.

        Missing configuration or client-construction failure leaves
        ``self.client`` set to None so app startup never crashes;
        transcribe() then returns an error dict instead of raising.
        """
        if not config.GCP_PROJECT_ID:
            # We allow initialization without creds to not crash app start,
            # but methods will fail if called.
            # NOTE(review): message says GOOGLE_PROJECT_ID while the check
            # reads config.GCP_PROJECT_ID — presumably the same env var
            # mapped by backend.config; confirm the variable name.
            print("WARNING: GOOGLE_PROJECT_ID not set. ASR will not work.")
            self.client = None
            return
        try:
            self.client = SpeechClient()
            self.project_id = config.GCP_PROJECT_ID
            self.location = config.STT_LOCATION
            self.recognizer_id = config.STT_RECOGNIZER
            self.recognizer_path = (
                f"projects/{self.project_id}/locations/{self.location}"
                f"/recognizers/{self.recognizer_id}"
            )
        except Exception as e:
            # Best-effort init: report and degrade rather than crash.
            print(f"Failed to init Speech Client: {e}")
            self.client = None

    def transcribe(self, audio_bytes: bytes) -> Dict[str, Any]:
        """
        Transcribe audio bytes to text.

        Args:
            audio_bytes: Raw audio content (WAV/MP3/WebM)

        Returns:
            Dict with:
            - 'text': Transcribed text ('' on failure)
            - 'confidence': Average confidence over results that produced
              an alternative (0.0 on failure)
            - 'error': Error message, or None on success
        """
        if not self.client:
            return {
                "text": "",
                "confidence": 0.0,
                "error": "ASR not configured (missing Project ID)"
            }
        try:
            # Build configuration - use auto-detect for audio format.
            # Streamlit audio_input returns WebM/Opus format.
            config_req = cloud_speech.RecognitionConfig(
                # Let the API auto-detect the audio encoding
                auto_decoding_config=cloud_speech.AutoDetectDecodingConfig(),
                language_codes=["en-US"],
                model="long",
            )
            request = cloud_speech.RecognizeRequest(
                recognizer=self.recognizer_path,
                config=config_req,
                content=audio_bytes,
            )
            # Call API
            response = self.client.recognize(request=request)

            results = response.results
            if not results:
                return {"text": "", "confidence": 0.0, "error": "No speech detected"}

            # Combine the top alternative of each result. Average confidence
            # only over results that actually produced an alternative — the
            # original divided by len(results), which diluted the score when
            # some results carried no alternatives.
            transcripts = []
            total_confidence = 0.0
            scored = 0
            for result in results:
                if result.alternatives:
                    alt = result.alternatives[0]
                    transcripts.append(alt.transcript)
                    total_confidence += alt.confidence
                    scored += 1
            avg_confidence = total_confidence / scored if scored else 0.0
            return {
                "text": " ".join(transcripts).strip(),
                "confidence": avg_confidence,
                "error": None
            }
        except Exception as e:
            # Surface API/decoding failures to the caller as data, not a raise.
            return {
                "text": "",
                "confidence": 0.0,
                "error": str(e)
            }
if __name__ == "__main__":
    # Smoke test: construct the wrapper and report whether the underlying
    # Speech client came up.
    asr = MathASR()
    if asr.client is None:
        print("ASR Initialization failed (check config)")
    else:
        print(f"ASR Initialized for project: {config.GCP_PROJECT_ID}")