diff --git a/education-ai-suite/smart-classroom/api/endpoints.py b/education-ai-suite/smart-classroom/api/endpoints.py
index a15bbb2de..dc589e089 100644
--- a/education-ai-suite/smart-classroom/api/endpoints.py
+++ b/education-ai-suite/smart-classroom/api/endpoints.py
@@ -7,12 +7,15 @@ from dto.summarizer_dto import SummaryRequest
 from pipeline import Pipeline
 import json, os
+import subprocess, re
 from fastapi.responses import StreamingResponse
 from utils.runtime_config_loader import RuntimeConfig
 from utils.storage_manager import StorageManager
 from utils.platform_info import get_platform_and_model_info
 from dto.project_settings import ProjectSettings
 from monitoring.monitor import start_monitoring, stop_monitoring, get_metrics
+from dto.audiosource import AudioSource
+from components.ffmpeg import audio_preprocessing
 from utils.audio_util import save_audio_file
 from utils.locks import audio_pipeline_lock
 import logging
@@ -114,6 +117,36 @@ async def generate_mindmap(request: SummaryRequest):
             detail=f"Mindmap generation failed: {e}"
         )
 
+@router.get("/devices")
+def list_audio_devices():
+    # Ask ffmpeg for DirectShow capture devices (Windows); the listing is
+    # parsed from stderr, where ffmpeg prints it.
+    result = subprocess.run(
+        ["ffmpeg", "-list_devices", "true", "-f", "dshow", "-i", "dummy"],
+        stderr=subprocess.PIPE,
+        text=True,
+        encoding="utf-8",
+        errors="replace"
+    )
+    audio_devices = re.findall(r'"(.*?)"\s*\(audio\)', result.stderr)
+    formatted_devices = [f"audio={d}" for d in audio_devices]
+    return {"devices": formatted_devices}
+
+
+@router.post("/stop-mic")
+def stop_microphone(session_id: str):
+    process = audio_preprocessing.FFMPEG_PROCESSES.pop(session_id, None)
+    if process:
+        logger.info(f"Stopping microphone recording for session {session_id}...")
+        process.terminate()
+        try:
+            process.wait(timeout=5)
+        except subprocess.TimeoutExpired:
+            process.kill()
+        return {"status": "stopped", "message": f"Microphone for session {session_id} stopped successfully."}
+    else:
+        return {"status": "idle", "message": f"No active microphone session found for {session_id}."}
+
 @router.get("/performance-metrics")
 def get_summary_metrics(session_id: Optional[str] = Header(None, alias="session_id")):
     project_config = RuntimeConfig.get_section("Project")
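
Note: the `/devices` route is Windows-specific (DirectShow) and relies entirely on the regex `"(.*?)"\s*\(audio\)` to scrape device names out of ffmpeg's stderr. A minimal sketch of that parsing against illustrative output (the sample text below is invented, not captured from a real machine):

```python
# Demonstrates the /devices parsing on fabricated "ffmpeg -list_devices" output.
import re

sample_stderr = '''
[dshow @ 0000015] "Integrated Webcam" (video)
[dshow @ 0000015] "Microphone Array (Realtek(R) Audio)" (audio)
[dshow @ 0000015] "Stereo Mix (Realtek(R) Audio)" (audio)
'''

# Same pattern as the endpoint: capture the quoted name followed by "(audio)".
audio_devices = re.findall(r'"(.*?)"\s*\(audio\)', sample_stderr)
print([f"audio={d}" for d in audio_devices])
# ['audio=Microphone Array (Realtek(R) Audio)', 'audio=Stereo Mix (Realtek(R) Audio)']
```
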
diff --git a/education-ai-suite/smart-classroom/components/ffmpeg/audio_preprocessing.py b/education-ai-suite/smart-classroom/components/ffmpeg/audio_preprocessing.py
index 0139d43e1..39b88f71e 100644
--- a/education-ai-suite/smart-classroom/components/ffmpeg/audio_preprocessing.py
+++ b/education-ai-suite/smart-classroom/components/ffmpeg/audio_preprocessing.py
@@ -3,20 +3,26 @@ from uuid import uuid4
 import atexit
 import shutil
-from utils.config_loader import config
+import time
 import logging
+from utils.config_loader import config
+from utils.runtime_config_loader import RuntimeConfig
+from dto.audiosource import AudioSource
 
 logger = logging.getLogger(__name__)
 
-CHUNK_DURATION = config.audio_preprocessing.chunk_duration_sec # seconds
-SILENCE_THRESH = config.audio_preprocessing.silence_threshold # in dB
-SILENCE_DURATION = config.audio_preprocessing.silence_duration # in seconds
+CHUNK_DURATION = config.audio_preprocessing.chunk_duration_sec
+SILENCE_THRESH = config.audio_preprocessing.silence_threshold
+SILENCE_DURATION = config.audio_preprocessing.silence_duration
 SEARCH_WINDOW = config.audio_preprocessing.search_window_sec
 CLEAN_UP_ON_EXIT = config.app.cleanup_on_exit
 
 CHUNKS_DIR = config.audio_preprocessing.chunk_output_path
 os.makedirs(CHUNKS_DIR, exist_ok=True)
 
+# Maps session_id -> live ffmpeg recorder process, so /stop-mic can find it.
+FFMPEG_PROCESSES = {}
+
 @atexit.register
 def cleanup_chunks_folder():
     if os.path.exists(CHUNKS_DIR) and CLEAN_UP_ON_EXIT:
@@ -70,45 +75,129 @@ def get_closest_silence(silences, target_time, window=SEARCH_WINDOW):
     return closest  # None if nothing close enough
 
-def chunk_audio_by_silence(audio_path):
+def process_audio_segment(audio_path, start_time, end_time, chunk_index):
+    chunk_name = f"chunk_{chunk_index}_{uuid4().hex[:6]}.wav"
+    chunk_path = os.path.join(CHUNKS_DIR, chunk_name)
+    subprocess.run(
+        [
+            "ffmpeg", "-y", "-i", audio_path,
+            "-ss", str(start_time), "-to", str(end_time),
+            "-ar", "16000", "-ac", "1",
+            "-c:a", "pcm_s16le", "-vn",
+            chunk_path
+        ],
+        stdout=subprocess.DEVNULL,
+        stderr=subprocess.DEVNULL
+    )
+    logger.debug(f"Chunk {chunk_index} saved: {chunk_path}")
+    return {
+        "chunk_path": chunk_path,
+        "start_time": start_time,
+        "end_time": end_time,
+        "chunk_index": chunk_index
+    }
+
+def chunk_audio_by_silence(audio_path):
     if SEARCH_WINDOW > CHUNK_DURATION:
-        raise ValueError(f"Silence search window ({SEARCH_WINDOW}s) can't be more then Chunk Duration({CHUNK_DURATION}s).")
-
+        raise ValueError(
+            f"Silence search window ({SEARCH_WINDOW}s) can't be more than chunk duration ({CHUNK_DURATION}s)."
+        )
     duration = get_audio_duration(audio_path)
     silences = detect_silences(audio_path)
-
-    current_time = 0.0
-    chunk_index = 0
-
+    current_time, chunk_index = 0.0, 0
     while current_time < duration:
         ideal_end = current_time + CHUNK_DURATION
-        end_time = get_closest_silence(silences, ideal_end)
-
-        cut_by_silence = True
-        if not end_time or end_time <= current_time or end_time > duration:
+        end_time = get_closest_silence(silences, ideal_end) or min(ideal_end, duration)
+        if end_time <= current_time:
             end_time = min(ideal_end, duration)
-            cut_by_silence = False
-
-        chunk_name = f"chunk_{chunk_index}_{uuid4().hex[:6]}.wav"
-        chunk_path = os.path.join(CHUNKS_DIR, chunk_name)
-
-        subprocess.run([
-            "ffmpeg", "-y", "-i", audio_path,
-            "-ss", str(current_time), "-to", str(end_time),
-            "-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le", "-vn",
-            chunk_path
-        ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, encoding="utf-8", errors="replace")
-
-        chunk_meta = {
-            "chunk_path": chunk_path,
-            "start_time": current_time,
-            "end_time": end_time if end_time < duration else None,
-            "chunk_index": chunk_index,
-            "cut_by_silence": cut_by_silence
-        }
-
-        yield chunk_meta
-
+        yield process_audio_segment(audio_path, current_time, end_time, chunk_index)
         current_time = end_time
         chunk_index += 1
+
+def chunk_audiostream_by_silence(session_id: str):
+    global FFMPEG_PROCESSES
+    mic_device = RuntimeConfig.get_section("Project").get("microphone", "").strip()
+    if not mic_device:
+        raise ValueError(
+            "Microphone device not set in runtime_config.yaml under Project.microphone"
+        )
+    record_file = os.path.join(CHUNKS_DIR, f"live_input_{session_id}.wav")
+    process = subprocess.Popen(
+        [
+            "ffmpeg", "-y",
+            "-f", "dshow",
+            "-i", f"audio={mic_device}",
+            "-ar", "16000", "-ac", "1",
+            "-c:a", "pcm_s16le", "-rf64", "auto",
+            record_file
+        ],
+        stdout=subprocess.DEVNULL,
+        stderr=subprocess.DEVNULL
+    )
+    FFMPEG_PROCESSES[session_id] = process
+    logger.info(f"🎙️ Recording from {mic_device} (session={session_id}) ... use /stop-mic to stop.")
+    current_time, chunk_index = 0.0, 0
+    MAX_DURATION = 45 * 60  # hard cap on a live session, in seconds
+    try:
+        while True:
+            if current_time >= MAX_DURATION:
+                logger.info(f"Session {session_id}: reached 45 min limit, stopping.")
+                break
+            # Wait until ffmpeg has written at least a WAV header (44 bytes).
+            if not os.path.exists(record_file) or os.path.getsize(record_file) < 44:
+                time.sleep(0.02)
+                continue
+            duration = get_audio_duration(record_file)
+            if (process.poll() is not None) and (duration - current_time < CHUNK_DURATION):
+                logger.info(f"Session {session_id}: FFmpeg stopped, processing final chunk...")
+                yield process_audio_segment(record_file, current_time, duration, chunk_index)
+                break
+            if duration - current_time < CHUNK_DURATION:
+                time.sleep(0.02)
+                continue
+            # Re-encode only the unprocessed tail so silence detection stays cheap.
+            segment_file = os.path.join(CHUNKS_DIR, f"temp_segment_{uuid4().hex[:6]}.wav")
+            subprocess.run(
+                [
+                    "ffmpeg", "-y", "-i", record_file,
+                    "-ss", str(current_time), "-to", str(duration),
+                    "-ar", "16000", "-ac", "1",
+                    "-c:a", "pcm_s16le", "-vn",
+                    segment_file
+                ],
+                stdout=subprocess.DEVNULL,
+                stderr=subprocess.DEVNULL
+            )
+            # Shift detected silences from segment-relative to absolute timestamps.
+            silences = [
+                {"start": s["start"] + current_time, "end": s["end"] + current_time}
+                for s in detect_silences(segment_file)
+            ]
+            os.remove(segment_file)  # temp segment only needed for silence detection
+            ideal_end = current_time + CHUNK_DURATION
+            end_time = get_closest_silence(silences, ideal_end) or min(ideal_end, duration)
+            if end_time <= current_time:
+                end_time = min(ideal_end, duration)
+            yield process_audio_segment(record_file, current_time, end_time, chunk_index)
+            current_time = end_time
+            chunk_index += 1
+    finally:
+        proc = FFMPEG_PROCESSES.pop(session_id, None)
+        if proc:
+            try:
+                proc.terminate()
+            except Exception as e:
+                logger.warning(f"Error stopping FFmpeg for session {session_id}: {e}")
+        if os.path.exists(record_file):
+            try:
+                os.remove(record_file)
+            except Exception as e:
+                logger.warning(f"Could not remove {record_file}: {e}")
+        logger.info(f"🎧 Live recording stopped for session {session_id}.")
+
+def chunk_by_silence(input, session_id: str):
+    if input.source_type == AudioSource.MICROPHONE:
+        yield from chunk_audiostream_by_silence(session_id)
+    else:
+        yield from chunk_audio_by_silence(input.audio_filename)
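
Both chunkers snap cut points to nearby silences via `get_closest_silence`, which this patch leaves untouched (only its closing `return closest` appears as context above). For orientation, a hedged sketch of how such a helper can behave, assuming silences arrive as `{"start", "end"}` dicts from `detect_silences`; the repo's actual implementation may differ in detail:

```python
# Sketch only: pick the silence closest to the ideal chunk boundary,
# or None when nothing falls inside the search window.
def get_closest_silence(silences, target_time, window=1.0):
    closest, best_gap = None, window
    for s in silences:
        midpoint = (s["start"] + s["end"]) / 2  # cut in the middle of the silence
        gap = abs(midpoint - target_time)
        if gap <= best_gap:
            closest, best_gap = midpoint, gap
    return closest  # None if nothing close enough

# With chunk_duration_sec = 15, a silence centred on 15.0s beats one at 14.2s:
print(get_closest_silence(
    [{"start": 14.0, "end": 14.4}, {"start": 14.8, "end": 15.2}], 15.0
))  # 15.0
```
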
diff --git a/education-ai-suite/smart-classroom/components/stream_reader.py b/education-ai-suite/smart-classroom/components/stream_reader.py
index 446ce9ab8..164c5fc83 100644
--- a/education-ai-suite/smart-classroom/components/stream_reader.py
+++ b/education-ai-suite/smart-classroom/components/stream_reader.py
@@ -1,14 +1,15 @@
 from .base_component import PipelineComponent
-from components.ffmpeg.audio_preprocessing import chunk_audio_by_silence
-
-
+from components.ffmpeg.audio_preprocessing import chunk_by_silence
+
+
 class AudioStreamReader(PipelineComponent):
     def __init__(self, session_id):
         self.session_id = session_id
         pass
-
     def process(self, input_generator):
         for input_data in input_generator:
-            audio_path = input_data["audio_path"]
-            for chunk in chunk_audio_by_silence(audio_path):
-                yield chunk  # contains chunk_path, start_time, end_time, etc.
+            input = input_data["input"]
+
+            for chunk in chunk_by_silence(input, self.session_id):
+                yield chunk  # contains chunk_path, start_time, end_time, etc.
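
With the dispatch moved into `chunk_by_silence`, `AudioStreamReader` no longer cares whether chunks come from a file or a live microphone. A usage sketch, assuming the `TranscriptionRequest` defined later in this patch is what arrives in `input_data["input"]`:

```python
# Usage sketch: the same generator API serves both audio sources.
from dto.audiosource import AudioSource
from dto.transcription_dto import TranscriptionRequest
from components.ffmpeg.audio_preprocessing import chunk_by_silence

file_req = TranscriptionRequest(audio_filename="lecture.wav")  # source_type defaults to AUDIO_FILE
mic_req = TranscriptionRequest(audio_filename="", source_type=AudioSource.MICROPHONE)

for chunk in chunk_by_silence(file_req, session_id="demo-session"):
    print(chunk["chunk_index"], chunk["chunk_path"], chunk["start_time"], chunk["end_time"])
```
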
\ No newline at end of file
diff --git a/education-ai-suite/smart-classroom/config.yaml b/education-ai-suite/smart-classroom/config.yaml
index 0d3feb048..89f8194f0 100644
--- a/education-ai-suite/smart-classroom/config.yaml
+++ b/education-ai-suite/smart-classroom/config.yaml
@@ -11,7 +11,7 @@ monitoring:
 models:
   asr:
     provider: openvino # openvino, funasr and openai supported
-    name: whisper-tiny # can be (whisper-base, whisper-small etc) or paraformer-zh
+    name: whisper-base # can be (whisper-base, whisper-small etc) or paraformer-zh
     device: CPU # CPU Recommended
     temperature: 0.0
     models_base_path: "models"
@@ -23,7 +23,7 @@ models:
     device: GPU # GPU or CPU
     weight_format: int8 # supports fp16, int4, int8 (Recommended)
     max_new_tokens: 1024
-    temperature: 0.5 # 0.5 default
+    temperature: 0.3 # 0.5 default
     use_cache: True
     models_base_path: "models"
     language: en # en or zh
@@ -39,10 +39,10 @@ mindmap:
   min_token: 20
 
 audio_preprocessing:
-  chunk_duration_sec: 30
+  chunk_duration_sec: 15
   silence_threshold: -35 # in dB
   silence_duration: 0.3 # minimum silence length in seconds
-  search_window_sec: 1.5 # how far to look for silence if no silence exactly at chunk boundary
+  search_window_sec: 1 # how far to look for silence if no silence exactly at chunk boundary
   chunk_output_path: chunks/
 
 audio_util:
diff --git a/education-ai-suite/smart-classroom/dto/audiosource.py b/education-ai-suite/smart-classroom/dto/audiosource.py
new file mode 100644
index 000000000..7514dfb28
--- /dev/null
+++ b/education-ai-suite/smart-classroom/dto/audiosource.py
@@ -0,0 +1,5 @@
+from enum import Enum
+
+class AudioSource(Enum):
+    AUDIO_FILE = "audio_file"
+    MICROPHONE = "microphone"
diff --git a/education-ai-suite/smart-classroom/dto/transcription_dto.py b/education-ai-suite/smart-classroom/dto/transcription_dto.py
index 68d9e724f..65437c189 100644
--- a/education-ai-suite/smart-classroom/dto/transcription_dto.py
+++ b/education-ai-suite/smart-classroom/dto/transcription_dto.py
@@ -1,4 +1,7 @@
 from pydantic import BaseModel
-
+from dto.audiosource import AudioSource
+from typing import Optional
+
 class TranscriptionRequest(BaseModel):
     audio_filename: str
+    source_type: Optional[AudioSource] = AudioSource.AUDIO_FILE
\ No newline at end of file
diff --git a/education-ai-suite/smart-classroom/pipeline.py b/education-ai-suite/smart-classroom/pipeline.py
index 8367776eb..f2be50f44 100644
--- a/education-ai-suite/smart-classroom/pipeline.py
+++ b/education-ai-suite/smart-classroom/pipeline.py
@@ -36,11 +36,11 @@ def __init__(self, session_id=None):
             )
         ]
 
-    def run_transcription(self, audio_path: str):
+    def run_transcription(self, input):
         project_config = RuntimeConfig.get_section("Project")
         monitor.start_monitoring(os.path.join(project_config.get("location"), project_config.get("name"), self.session_id, "utilization_logs"))
 
-        input_gen = ({"audio_path": audio_path} for _ in range(1))
+        input_gen = ({"input": input} for _ in range(1))
 
         for component in self.transcription_pipeline:
             input_gen = component.process(input_gen)
@@ -50,7 +50,6 @@ def run_summarizer(self):
             yield chunk_trancription
         finally:
             monitor.stop_monitoring()
-            time.sleep(3) #time for socwatch to get clean-start
 
     def run_summarizer(self):
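
Taken together, the intended live flow is: enumerate microphones via `/devices`, store the chosen device under `Project.microphone` in the runtime config, start a transcription with `source_type="microphone"`, and end the session via `/stop-mic`. Note the tightened config values preserve the invariant `search_window_sec <= chunk_duration_sec` that `chunk_audio_by_silence` enforces. A hedged client sketch; the host, port, and absence of a route prefix are assumptions about the deployment, not part of this patch:

```python
# Illustrative HTTP client for the new endpoints (paths assume no router prefix).
import requests

BASE = "http://localhost:8000"

# 1. Enumerate DirectShow microphones (Windows host, ffmpeg on PATH).
print(requests.get(f"{BASE}/devices").json()["devices"])

# 2. ...start a transcription with source_type="microphone" through the existing route...

# 3. Stop the live session; the server pops and terminates the ffmpeg recorder.
print(requests.post(f"{BASE}/stop-mic", params={"session_id": "demo-session"}).json())
```
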