Skip to content

Commit fb1f83a

Browse files
authored
Real time audio preprocessing (#1100)
1 parent 49fc76f commit fb1f83a

File tree

7 files changed

+176
-52
lines changed

7 files changed

+176
-52
lines changed

education-ai-suite/smart-classroom/api/endpoints.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,15 @@
77
from dto.summarizer_dto import SummaryRequest
88
from pipeline import Pipeline
99
import json, os
10+
import subprocess, re
1011
from fastapi.responses import StreamingResponse
1112
from utils.runtime_config_loader import RuntimeConfig
1213
from utils.storage_manager import StorageManager
1314
from utils.platform_info import get_platform_and_model_info
1415
from dto.project_settings import ProjectSettings
1516
from monitoring.monitor import start_monitoring, stop_monitoring, get_metrics
17+
from dto.audiosource import AudioSource
18+
from components.ffmpeg import audio_preprocessing
1619
from utils.audio_util import save_audio_file
1720
from utils.locks import audio_pipeline_lock
1821
import logging
@@ -114,6 +117,31 @@ async def generate_mindmap(request: SummaryRequest):
114117
detail=f"Mindmap generation failed: {e}"
115118
)
116119

120+
@router.get("/devices")
121+
def list_audio_devices():
122+
result = subprocess.run(
123+
["ffmpeg", "-list_devices", "true", "-f", "dshow", "-i", "dummy"],
124+
stderr=subprocess.PIPE,
125+
text=True,
126+
encoding="utf-8",
127+
errors="replace"
128+
)
129+
audio_devices = re.findall(r'"(.*?)"\s*\(audio\)', result.stderr)
130+
formatted_devices = [f"audio={d}" for d in audio_devices]
131+
return {"devices": formatted_devices}
132+
133+
134+
@router.post("/stop-mic")
135+
def stop_microphone(session_id: str):
136+
process = audio_preprocessing.FFMPEG_PROCESSES.pop(session_id, None)
137+
if process:
138+
logger.info(f"Stopping microphone recording for session {session_id}...")
139+
process.terminate()
140+
process.wait(timeout=5)
141+
return {"status": "stopped", "message": f"Microphone for session {session_id} stopped successfully."}
142+
else:
143+
return {"status": "idle", "message": f"No active microphone session found for {session_id}."}
144+
117145
@router.get("/performance-metrics")
118146
def get_summary_metrics(session_id: Optional[str] = Header(None, alias="session_id")):
119147
project_config = RuntimeConfig.get_section("Project")

education-ai-suite/smart-classroom/components/ffmpeg/audio_preprocessing.py

Lines changed: 125 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -3,20 +3,25 @@
33
from uuid import uuid4
44
import atexit
55
import shutil
6-
from utils.config_loader import config
6+
import platform,time
77
import logging
8+
from utils.config_loader import config
9+
from utils.runtime_config_loader import RuntimeConfig
10+
from dto.audiosource import AudioSource
811

912
logger = logging.getLogger(__name__)
1013

11-
CHUNK_DURATION = config.audio_preprocessing.chunk_duration_sec # seconds
12-
SILENCE_THRESH = config.audio_preprocessing.silence_threshold # in dB
13-
SILENCE_DURATION = config.audio_preprocessing.silence_duration # in seconds
14+
CHUNK_DURATION = config.audio_preprocessing.chunk_duration_sec
15+
SILENCE_THRESH = config.audio_preprocessing.silence_threshold
16+
SILENCE_DURATION = config.audio_preprocessing.silence_duration
1417
SEARCH_WINDOW = config.audio_preprocessing.search_window_sec
1518
CLEAN_UP_ON_EXIT = config.app.cleanup_on_exit
1619

1720
CHUNKS_DIR = config.audio_preprocessing.chunk_output_path
1821
os.makedirs(CHUNKS_DIR, exist_ok=True)
1922

23+
FFMPEG_PROCESSES = {}
24+
2025
@atexit.register
2126
def cleanup_chunks_folder():
2227
if os.path.exists(CHUNKS_DIR) and CLEAN_UP_ON_EXIT:
@@ -70,45 +75,128 @@ def get_closest_silence(silences, target_time, window=SEARCH_WINDOW):
7075

7176
return closest # None if nothing close enough
7277

73-
def chunk_audio_by_silence(audio_path):
78+
def process_audio_segment(audio_path, start_time, end_time, chunk_index):
    """Cut ``[start_time, end_time]`` out of *audio_path* into an ASR-ready chunk.

    The chunk is written under CHUNKS_DIR as 16 kHz mono signed-16-bit PCM WAV
    with any video stream stripped.

    Args:
        audio_path: Source audio file readable by FFmpeg.
        start_time: Segment start, in seconds.
        end_time: Segment end, in seconds.
        chunk_index: Sequence number embedded in the chunk file name.

    Returns:
        dict: ``chunk_path`` / ``start_time`` / ``end_time`` / ``chunk_index``.
    """
    chunk_name = f"chunk_{chunk_index}_{uuid4().hex[:6]}.wav"
    chunk_path = os.path.join(CHUNKS_DIR, chunk_name)
    result = subprocess.run(
        [
            "ffmpeg", "-y", "-i", audio_path,
            "-ss", str(start_time), "-to", str(end_time),
            # Normalize for ASR: 16 kHz, mono, pcm_s16le, no video.
            "-ar", "16000", "-ac", "1",
            "-c:a", "pcm_s16le", "-vn",
            chunk_path
        ],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        encoding="utf-8",
        errors="replace"
    )
    # stderr is discarded above, so surface encoder failures explicitly
    # instead of letting a missing/empty chunk fail mysteriously downstream.
    if result.returncode != 0:
        logger.warning(
            f"ffmpeg exited with code {result.returncode} while cutting chunk {chunk_index} from {audio_path}"
        )
    logger.debug(f"Chunk {chunk_index} saved: {chunk_path}")
    return {
        "chunk_path": chunk_path,
        "start_time": start_time,
        "end_time": end_time,
        "chunk_index": chunk_index
    }
74101

102+
def chunk_audio_by_silence(audio_path):
    """Yield ~CHUNK_DURATION-second chunks of *audio_path*, cut at silences.

    Each chunk boundary is snapped to the closest detected silence within
    SEARCH_WINDOW seconds of the ideal boundary; when no suitable silence
    exists, the chunk is cut at the ideal boundary (clamped to end of file).
    Yields the metadata dicts produced by process_audio_segment().
    """
    if SEARCH_WINDOW > CHUNK_DURATION:
        raise ValueError(
            f"Silence search window ({SEARCH_WINDOW}s) can't be more than chunk duration ({CHUNK_DURATION}s)."
        )
    total_duration = get_audio_duration(audio_path)
    silence_points = detect_silences(audio_path)

    cursor = 0.0
    index = 0
    while cursor < total_duration:
        target = cursor + CHUNK_DURATION
        cut_at = get_closest_silence(silence_points, target)
        # Fall back to a hard cut when no silence is found or the candidate
        # would not advance past the current position.
        if not cut_at or cut_at <= cursor:
            cut_at = min(target, total_duration)
        yield process_audio_segment(audio_path, cursor, cut_at, index)
        cursor = cut_at
        index += 1
118+
119+
def chunk_audiostream_by_silence(session_id: str):
    """Record live microphone audio with FFmpeg and yield silence-aligned chunks.

    Starts an FFmpeg dshow capture (Windows-only backend) into a growing WAV
    file, then polls that file and yields ~CHUNK_DURATION-second chunks whose
    boundaries are snapped to detected silences.  The capture process is
    registered in FFMPEG_PROCESSES so the /stop-mic endpoint can terminate it;
    recording also stops after a hard 45-minute cap.

    Fixes over the previous revision: detect_silences() was invoked twice per
    chunk with the first result discarded (an extra full silencedetect pass),
    the temp segment file leaked if detection raised, and the unnecessary
    ``global`` statement is gone (the dict is only mutated, never rebound).

    Args:
        session_id: Key for the process registry and for on-disk file names.

    Yields:
        Chunk metadata dicts from process_audio_segment().

    Raises:
        ValueError: If no microphone device is configured.
    """
    mic_device = RuntimeConfig.get_section("Project").get("microphone", "").strip()
    if not mic_device:
        raise ValueError(
            "Microphone device not set in runtime_config.yaml under Project.microphone"
        )
    record_file = os.path.join(CHUNKS_DIR, f"live_input_{session_id}.wav")
    process = subprocess.Popen(
        [
            "ffmpeg", "-y",
            "-f", "dshow",
            "-i", f"audio={mic_device}",
            "-ar", "16000", "-ac", "1",
            # -rf64 auto lets the WAV grow past 4 GB during long captures.
            "-c:a", "pcm_s16le", "-rf64", "auto",
            record_file
        ],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL
    )
    FFMPEG_PROCESSES[session_id] = process
    logger.info(f"🎙️ Recording from {mic_device} (session={session_id}) ... use /stop-mic to stop.")
    current_time, chunk_index = 0.0, 0
    MAX_DURATION = 45 * 60  # hard cap on one live session, in seconds
    try:
        while True:
            if current_time >= MAX_DURATION:
                logger.info(f"Session {session_id}: reached 45 min limit, stopping.")
                break
            # Wait until FFmpeg has written at least a WAV header (44 bytes).
            if not os.path.exists(record_file) or os.path.getsize(record_file) < 44:
                time.sleep(0.02)
                continue
            duration = get_audio_duration(record_file)
            # Capture ended (e.g. /stop-mic) with a short tail left: flush it.
            if (process.poll() is not None) and (duration - current_time < CHUNK_DURATION):
                logger.info(f"Session {session_id}: FFmpeg stopped, processing final chunk...")
                yield process_audio_segment(record_file, current_time, duration, chunk_index)
                break
            if duration - current_time < CHUNK_DURATION:
                time.sleep(0.02)
                continue
            # Extract only the not-yet-chunked tail so silence detection scans
            # new audio instead of the whole recording each iteration.
            segment_file = os.path.join(CHUNKS_DIR, f"temp_segment_{uuid4().hex[:6]}.wav")
            try:
                subprocess.run(
                    [
                        "ffmpeg", "-y", "-i", record_file,
                        "-ss", str(current_time), "-to", str(duration),
                        "-ar", "16000", "-ac", "1",
                        "-c:a", "pcm_s16le", "-vn",
                        segment_file
                    ],
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.DEVNULL
                )
                # Detect once, shifting segment-relative timestamps back into
                # recording time so get_closest_silence can compare them.
                silences = [
                    {"start": s["start"] + current_time, "end": s["end"] + current_time}
                    for s in detect_silences(segment_file)
                ]
            finally:
                # Always drop the temp segment, even if detection fails.
                if os.path.exists(segment_file):
                    os.remove(segment_file)
            ideal_end = current_time + CHUNK_DURATION
            end_time = get_closest_silence(silences, ideal_end) or min(ideal_end, duration)
            if end_time <= current_time:
                end_time = min(ideal_end, duration)
            yield process_audio_segment(record_file, current_time, end_time, chunk_index)
            current_time = end_time
            chunk_index += 1
    finally:
        # Clean up the capture process and raw recording however the generator
        # exits (normal stop, consumer abandonment, or error).
        proc = FFMPEG_PROCESSES.pop(session_id, None)
        if proc:
            try:
                proc.terminate()
            except Exception as e:
                logger.warning(f"Error stopping FFmpeg for session {session_id}: {e}")
        if os.path.exists(record_file):
            try:
                os.remove(record_file)
            except Exception as e:
                logger.warning(f"Could not remove {record_file}: {e}")
        logger.info(f"🎧 Live recording stopped for session {session_id}.")
197+
198+
def chunk_by_silence(input, session_id: str):
    """Dispatch chunking to the live-microphone or file-based generator.

    *input* is a request object carrying ``source_type`` and
    ``audio_filename``; MICROPHONE sources stream chunks from the live
    capture keyed by *session_id*, anything else chunks the referenced file.
    """
    if input.source_type == AudioSource.MICROPHONE:
        generator = chunk_audiostream_by_silence(session_id)
    else:
        generator = chunk_audio_by_silence(input.audio_filename)
    yield from generator
Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
11
from .base_component import PipelineComponent
2-
from components.ffmpeg.audio_preprocessing import chunk_audio_by_silence
3-
4-
2+
from components.ffmpeg.audio_preprocessing import chunk_by_silence
3+
4+
55
class AudioStreamReader(PipelineComponent):
    """Pipeline stage that expands each incoming item into audio chunks."""

    def __init__(self, session_id):
        self.session_id = session_id

    def process(self, input_generator):
        """Yield silence-aligned chunk dicts for every item in *input_generator*.

        Each yielded dict contains chunk_path, start_time, end_time, etc.
        """
        for item in input_generator:
            yield from chunk_by_silence(item["input"], self.session_id)

education-ai-suite/smart-classroom/config.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ monitoring:
1111
models:
1212
asr:
1313
provider: openvino # openvino, funasr and openai supported
14-
name: whisper-tiny # can be (whisper-base, whisper-small etc) or paraformer-zh
14+
name: whisper-base # can be (whisper-base, whisper-small etc) or paraformer-zh
1515
device: CPU # CPU Recommended
1616
temperature: 0.0
1717
models_base_path: "models"
@@ -23,7 +23,7 @@ models:
2323
device: GPU # GPU or CPU
2424
weight_format: int8 # supports fp16, int4, int8 (Recommended)
2525
max_new_tokens: 1024
26-
temperature: 0.5 # 0.5 default
26+
temperature: 0.3 # 0.5 default
2727
use_cache: True
2828
models_base_path: "models"
2929
language: en # en or zh
@@ -39,10 +39,10 @@ mindmap:
3939
min_token: 20
4040

4141
audio_preprocessing:
42-
chunk_duration_sec: 30
42+
chunk_duration_sec: 15
4343
silence_threshold: -35 # in dB
4444
silence_duration: 0.3 # minimum silence length in seconds
45-
search_window_sec: 1.5 # how far to look for silence if no silence exactly at chunk boundary
45+
search_window_sec: 1 # how far to look for silence if no silence exactly at chunk boundary
4646
chunk_output_path: chunks/
4747

4848
audio_util:
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
from enum import Enum
2+
3+
class AudioSource(Enum):
    """Origin of the audio fed into the transcription pipeline."""

    # Pre-recorded file provided by the caller.
    AUDIO_FILE = "audio_file"
    # Live capture from the configured microphone device.
    MICROPHONE = "microphone"
Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
11
from pydantic import BaseModel
2-
2+
from dto.audiosource import AudioSource
3+
from typing import Optional
4+
35
class TranscriptionRequest(BaseModel):
    """Request body for starting a transcription."""

    # Name/path of the audio file to transcribe; required even when the
    # source is a microphone (presumably ignored then — TODO confirm).
    audio_filename: str
    # Where the audio comes from; defaults to a pre-recorded file for
    # backward compatibility with callers that omit the field.
    source_type: Optional[AudioSource] = AudioSource.AUDIO_FILE

education-ai-suite/smart-classroom/pipeline.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,11 +36,11 @@ def __init__(self, session_id=None):
3636
)
3737
]
3838

39-
def run_transcription(self, audio_path: str):
39+
def run_transcription(self, input):
4040
project_config = RuntimeConfig.get_section("Project")
4141
monitor.start_monitoring(os.path.join(project_config.get("location"), project_config.get("name"), self.session_id, "utilization_logs"))
4242

43-
input_gen = ({"audio_path": audio_path} for _ in range(1))
43+
input_gen = ({"input": input} for _ in range(1))
4444

4545
for component in self.transcription_pipeline:
4646
input_gen = component.process(input_gen)
@@ -50,7 +50,6 @@ def run_transcription(self, audio_path: str):
5050
yield chunk_trancription
5151
finally:
5252
monitor.stop_monitoring()
53-
time.sleep(3) #time for socwatch to get clean-start
5453

5554

5655
def run_summarizer(self):

0 commit comments

Comments
 (0)