-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathasr.py
More file actions
117 lines (97 loc) · 3.84 KB
/
asr.py
File metadata and controls
117 lines (97 loc) · 3.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
"""
MathASR - Audio Speech Recognition for Math Mentor.
Uses Google Cloud Speech-to-Text V2 (Chirp 2) for state-of-the-art accuracy.
"""
from typing import Any, Dict, Optional

from google.cloud.speech_v2 import SpeechClient
from google.cloud.speech_v2.types import cloud_speech

from backend.config import config
class MathASR:
    """
    Handles audio input using Google STT V2 (Chirp model).

    Why Chirp?
    - Best-in-class accuracy for technical terms
    - Multilingual support
    - Robust to accents and noise
    """

    def __init__(self):
        """Initialize the Google Speech client.

        Missing configuration or client-construction failure leaves
        ``self.client`` set to None so app startup never crashes;
        transcribe() then returns an error dict instead of raising.
        """
        if not config.GCP_PROJECT_ID:
            # We allow initialization without creds to not crash app start,
            # but methods will fail if called.
            # NOTE(review): message says GOOGLE_PROJECT_ID while the check
            # reads config.GCP_PROJECT_ID — presumably the same env var
            # mapped by backend.config; confirm the variable name.
            print("WARNING: GOOGLE_PROJECT_ID not set. ASR will not work.")
            self.client = None
            return
        try:
            self.client = SpeechClient()
            self.project_id = config.GCP_PROJECT_ID
            self.location = config.STT_LOCATION
            self.recognizer_id = config.STT_RECOGNIZER
            self.recognizer_path = (
                f"projects/{self.project_id}/locations/{self.location}"
                f"/recognizers/{self.recognizer_id}"
            )
        except Exception as e:
            # Best-effort init: report and degrade rather than crash.
            print(f"Failed to init Speech Client: {e}")
            self.client = None

    def transcribe(self, audio_bytes: bytes) -> Dict[str, Any]:
        """
        Transcribe audio bytes to text.

        Args:
            audio_bytes: Raw audio content (WAV/MP3/WebM)

        Returns:
            Dict with:
            - 'text': Transcribed text ('' on failure)
            - 'confidence': Average confidence over results that produced
              an alternative (0.0 on failure)
            - 'error': Error message, or None on success
        """
        if not self.client:
            return {
                "text": "",
                "confidence": 0.0,
                "error": "ASR not configured (missing Project ID)"
            }
        try:
            # Build configuration - use auto-detect for audio format.
            # Streamlit audio_input returns WebM/Opus format.
            config_req = cloud_speech.RecognitionConfig(
                # Let the API auto-detect the audio encoding
                auto_decoding_config=cloud_speech.AutoDetectDecodingConfig(),
                language_codes=["en-US"],
                model="long",
            )
            request = cloud_speech.RecognizeRequest(
                recognizer=self.recognizer_path,
                config=config_req,
                content=audio_bytes,
            )
            # Call API
            response = self.client.recognize(request=request)

            results = response.results
            if not results:
                return {"text": "", "confidence": 0.0, "error": "No speech detected"}

            # Combine the top alternative of each result. Average confidence
            # only over results that actually produced an alternative — the
            # original divided by len(results), which diluted the score when
            # some results carried no alternatives.
            transcripts = []
            total_confidence = 0.0
            scored = 0
            for result in results:
                if result.alternatives:
                    alt = result.alternatives[0]
                    transcripts.append(alt.transcript)
                    total_confidence += alt.confidence
                    scored += 1
            avg_confidence = total_confidence / scored if scored else 0.0
            return {
                "text": " ".join(transcripts).strip(),
                "confidence": avg_confidence,
                "error": None
            }
        except Exception as e:
            # Surface API/decoding failures to the caller as data, not a raise.
            return {
                "text": "",
                "confidence": 0.0,
                "error": str(e)
            }
if __name__ == "__main__":
    # Smoke test: construct the wrapper and report whether the underlying
    # Speech client came up.
    asr = MathASR()
    if asr.client is None:
        print("ASR Initialization failed (check config)")
    else:
        print(f"ASR Initialized for project: {config.GCP_PROJECT_ID}")