28 changes: 28 additions & 0 deletions education-ai-suite/smart-classroom/api/endpoints.py
@@ -7,12 +7,15 @@
from dto.summarizer_dto import SummaryRequest
from pipeline import Pipeline
import json, os
import subprocess, re
from fastapi.responses import StreamingResponse
from utils.runtime_config_loader import RuntimeConfig
from utils.storage_manager import StorageManager
from utils.platform_info import get_platform_and_model_info
from dto.project_settings import ProjectSettings
from monitoring.monitor import start_monitoring, stop_monitoring, get_metrics
from dto.audiosource import AudioSource
from components.ffmpeg import audio_preprocessing
from utils.audio_util import save_audio_file
from utils.locks import audio_pipeline_lock
import logging
@@ -114,6 +117,31 @@ async def generate_mindmap(request: SummaryRequest):
detail=f"Mindmap generation failed: {e}"
)

@router.get("/devices")
def list_audio_devices():
result = subprocess.run(
["ffmpeg", "-list_devices", "true", "-f", "dshow", "-i", "dummy"],
stderr=subprocess.PIPE,
text=True,
encoding="utf-8",
errors="replace"
)
audio_devices = re.findall(r'"(.*?)"\s*\(audio\)', result.stderr)
formatted_devices = [f"audio={d}" for d in audio_devices]
return {"devices": formatted_devices}


@router.post("/stop-mic")
def stop_microphone(session_id: str):
process = audio_preprocessing.FFMPEG_PROCESSES.pop(session_id, None)
if process:
logger.info(f"Stopping microphone recording for session {session_id}...")
process.terminate()
process.wait(timeout=5)
return {"status": "stopped", "message": f"Microphone for session {session_id} stopped successfully."}
else:
return {"status": "idle", "message": f"No active microphone session found for {session_id}."}

@router.get("/performance-metrics")
def get_summary_metrics(session_id: Optional[str] = Header(None, alias="session_id")):
project_config = RuntimeConfig.get_section("Project")
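The two endpoints added above can be exercised with plain HTTP calls. A minimal sketch, assuming the FastAPI app is served locally and the router is mounted without a prefix (base URL, port, and session id are illustrative); note that /devices relies on ffmpeg's DirectShow enumeration, so it only reports devices on Windows:

import requests  # third-party HTTP client, used here purely for illustration

BASE = "http://localhost:8000"  # hypothetical host/port

# List capture devices as ffmpeg reports them (already prefixed with "audio=").
devices = requests.get(f"{BASE}/devices").json()["devices"]
print(devices)

# Stop a live microphone session started earlier; session_id travels as a query
# parameter because the endpoint declares it as a plain function argument.
resp = requests.post(f"{BASE}/stop-mic", params={"session_id": "demo-session"})
print(resp.json()["status"])  # "stopped" if a recorder was running, otherwise "idle"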
education-ai-suite/smart-classroom/components/ffmpeg/audio_preprocessing.py
@@ -3,20 +3,25 @@
from uuid import uuid4
import atexit
import shutil
from utils.config_loader import config
import platform, time
import logging
from utils.config_loader import config
from utils.runtime_config_loader import RuntimeConfig
from dto.audiosource import AudioSource

logger = logging.getLogger(__name__)

CHUNK_DURATION = config.audio_preprocessing.chunk_duration_sec # seconds
SILENCE_THRESH = config.audio_preprocessing.silence_threshold # in dB
SILENCE_DURATION = config.audio_preprocessing.silence_duration # in seconds
CHUNK_DURATION = config.audio_preprocessing.chunk_duration_sec
SILENCE_THRESH = config.audio_preprocessing.silence_threshold
SILENCE_DURATION = config.audio_preprocessing.silence_duration
SEARCH_WINDOW = config.audio_preprocessing.search_window_sec
CLEAN_UP_ON_EXIT = config.app.cleanup_on_exit

CHUNKS_DIR = config.audio_preprocessing.chunk_output_path
os.makedirs(CHUNKS_DIR, exist_ok=True)

FFMPEG_PROCESSES = {}

@atexit.register
def cleanup_chunks_folder():
if os.path.exists(CHUNKS_DIR) and CLEAN_UP_ON_EXIT:
@@ -70,45 +75,128 @@ def get_closest_silence(silences, target_time, window=SEARCH_WINDOW):

return closest # None if nothing close enough

def chunk_audio_by_silence(audio_path):
def process_audio_segment(audio_path, start_time, end_time, chunk_index):
chunk_name = f"chunk_{chunk_index}_{uuid4().hex[:6]}.wav"
chunk_path = os.path.join(CHUNKS_DIR, chunk_name)
subprocess.run(
[
"ffmpeg", "-y", "-i", audio_path,
"-ss", str(start_time), "-to", str(end_time),
"-ar", "16000", "-ac", "1",
"-c:a", "pcm_s16le", "-vn",
chunk_path
],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
encoding="utf-8",
errors="replace"
)
logger.debug(f"Chunk {chunk_index} saved: {chunk_path}")
return {
"chunk_path": chunk_path,
"start_time": start_time,
"end_time": end_time,
"chunk_index": chunk_index
}

def chunk_audio_by_silence(audio_path):
if SEARCH_WINDOW > CHUNK_DURATION:
raise ValueError(f"Silence search window ({SEARCH_WINDOW}s) can't be more then Chunk Duration({CHUNK_DURATION}s).")

raise ValueError(
f"Silence search window ({SEARCH_WINDOW}s) can't be more than chunk duration ({CHUNK_DURATION}s)."
)
duration = get_audio_duration(audio_path)
silences = detect_silences(audio_path)

current_time = 0.0
chunk_index = 0

current_time, chunk_index = 0.0, 0
while current_time < duration:
ideal_end = current_time + CHUNK_DURATION
end_time = get_closest_silence(silences, ideal_end)

cut_by_silence = True
if not end_time or end_time <= current_time or end_time > duration:
end_time = get_closest_silence(silences, ideal_end) or min(ideal_end, duration)
if end_time <= current_time:
end_time = min(ideal_end, duration)
cut_by_silence = False

chunk_name = f"chunk_{chunk_index}_{uuid4().hex[:6]}.wav"
chunk_path = os.path.join(CHUNKS_DIR, chunk_name)

subprocess.run([
"ffmpeg", "-y", "-i", audio_path,
"-ss", str(current_time), "-to", str(end_time),
"-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le", "-vn",
chunk_path
], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, encoding="utf-8", errors="replace")

chunk_meta = {
"chunk_path": chunk_path,
"start_time": current_time,
"end_time": end_time if end_time < duration else None,
"chunk_index": chunk_index,
"cut_by_silence": cut_by_silence
}

yield chunk_meta

yield process_audio_segment(audio_path, current_time, end_time, chunk_index)
current_time = end_time
chunk_index += 1

def chunk_audiostream_by_silence(session_id: str):
global FFMPEG_PROCESSES
mic_device = RuntimeConfig.get_section("Project").get("microphone", "").strip()
if not mic_device:
raise ValueError(
"Microphone device not set in runtime_config.yaml under Project.microphone"
)
record_file = os.path.join(CHUNKS_DIR, f"live_input_{session_id}.wav")
process = subprocess.Popen(
[
"ffmpeg", "-y",
"-f", "dshow",
"-i", f"audio={mic_device}",
"-ar", "16000", "-ac", "1",
"-c:a", "pcm_s16le", "-rf64", "auto",
record_file
],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL
)
FFMPEG_PROCESSES[session_id] = process
logger.info(f"🎙️ Recording from {mic_device} (session={session_id}) ... use /stop-mic to stop.")
current_time, chunk_index = 0.0, 0
MAX_DURATION = 45 * 60
try:
while True:
if current_time >= MAX_DURATION:
logger.info(f"Session {session_id}: reached 45 min limit, stopping.")
break
if not os.path.exists(record_file) or os.path.getsize(record_file) < 44:
time.sleep(0.02)
continue
duration = get_audio_duration(record_file)
if (process.poll() is not None) and (duration - current_time < CHUNK_DURATION):
logger.info(f"Session {session_id}: FFmpeg stopped, processing final chunk...")
yield process_audio_segment(record_file, current_time, duration, chunk_index)
break
if duration - current_time < CHUNK_DURATION:
time.sleep(0.02)
continue
segment_file = os.path.join(CHUNKS_DIR, f"temp_segment_{uuid4().hex[:6]}.wav")
subprocess.run(
[
"ffmpeg", "-y", "-i", record_file,
"-ss", str(current_time), "-to", str(duration),
"-ar", "16000", "-ac", "1",
"-c:a", "pcm_s16le", "-vn",
segment_file
],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL
)
silences = detect_silences(segment_file)
silences = [
{"start": s["start"] + current_time, "end": s["end"] + current_time}
for s in detect_silences(segment_file)
]
ideal_end = current_time + CHUNK_DURATION
end_time = get_closest_silence(silences, ideal_end) or min(ideal_end, duration)
if end_time <= current_time:
end_time = min(ideal_end, duration)
yield process_audio_segment(record_file, current_time, end_time, chunk_index)
current_time = end_time
chunk_index += 1
os.remove(segment_file)
finally:
proc = FFMPEG_PROCESSES.pop(session_id, None)
if proc:
try:
proc.terminate()
except Exception as e:
logger.warning(f"Error stopping FFmpeg for session {session_id}: {e}")
if os.path.exists(record_file):
try:
os.remove(record_file)
except Exception as e:
logger.warning(f"Could not remove {record_file}: {e}")
logger.info(f"🎧 Live recording stopped for session {session_id}.")

def chunk_by_silence(input, session_id: str):
if input.source_type == AudioSource.MICROPHONE:
yield from chunk_audiostream_by_silence(session_id)
else:
yield from chunk_audio_by_silence(input.audio_filename)
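For context, a minimal sketch of driving the dispatcher above outside the pipeline. The filename and session id are illustrative, and SimpleNamespace stands in for the real request DTO, since chunk_by_silence only reads source_type and audio_filename:

from types import SimpleNamespace
from dto.audiosource import AudioSource
from components.ffmpeg.audio_preprocessing import chunk_by_silence

request = SimpleNamespace(source_type=AudioSource.AUDIO_FILE, audio_filename="lecture.wav")
for chunk in chunk_by_silence(request, "demo-session"):
    # Each yielded dict carries chunk_path, start_time, end_time and chunk_index.
    print(chunk["chunk_index"], chunk["chunk_path"], chunk["start_time"], chunk["end_time"])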
15 changes: 8 additions & 7 deletions education-ai-suite/smart-classroom/components/stream_reader.py
@@ -1,14 +1,15 @@
from .base_component import PipelineComponent
from components.ffmpeg.audio_preprocessing import chunk_audio_by_silence


from components.ffmpeg.audio_preprocessing import chunk_by_silence
class AudioStreamReader(PipelineComponent):
def __init__(self, session_id):
self.session_id = session_id
pass

def process(self, input_generator):
for input_data in input_generator:
audio_path = input_data["audio_path"]
for chunk in chunk_audio_by_silence(audio_path):
yield chunk # contains chunk_path, start_time, end_time, etc.
input = input_data["input"]

for chunk in chunk_by_silence(input, self.session_id):
yield chunk # contains chunk_path, start_time, end_time, etc.
8 changes: 4 additions & 4 deletions education-ai-suite/smart-classroom/config.yaml
@@ -11,7 +11,7 @@ monitoring:
models:
asr:
provider: openvino # openvino, funasr and openai supported
name: whisper-tiny # can be (whisper-base, whisper-small etc) or paraformer-zh
name: whisper-base # can be (whisper-base, whisper-small etc) or paraformer-zh
device: CPU # CPU Recommended
temperature: 0.0
models_base_path: "models"
@@ -23,7 +23,7 @@ models:
device: GPU # GPU or CPU
weight_format: int8 # supports fp16, int4, int8 (Recommended)
max_new_tokens: 1024
temperature: 0.5 # 0.5 default
temperature: 0.3 # 0.5 default
use_cache: True
models_base_path: "models"
language: en # en or zh
@@ -39,10 +39,10 @@ mindmap:
min_token: 20

audio_preprocessing:
chunk_duration_sec: 30
chunk_duration_sec: 15
silence_threshold: -35 # in dB
silence_duration: 0.3 # minimum silence length in seconds
search_window_sec: 1.5 # how far to look for silence if no silence exactly at chunk boundary
search_window_sec: 1 # how far to look for silence if no silence exactly at chunk boundary
chunk_output_path: chunks/

audio_util:
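The tightened values interact with the chunking logic above: each cut targets a chunk_duration_sec boundary, and get_closest_silence may move it by at most search_window_sec. A small numeric sketch with the new settings (times are illustrative):

CHUNK_DURATION = 15   # chunk_duration_sec
SEARCH_WINDOW = 1     # search_window_sec

current_time = 30.0
ideal_end = current_time + CHUNK_DURATION   # 45.0 s
# A detected silence around 45.8 s lies within the 1 s window, so the chunk ends
# there; one at 46.5 s is too far away and the cut falls back to the hard 45.0 s boundary.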
5 changes: 5 additions & 0 deletions education-ai-suite/smart-classroom/dto/audiosource.py
@@ -0,0 +1,5 @@
from enum import Enum

class AudioSource(Enum):
AUDIO_FILE = "audio_file"
MICROPHONE = "microphone"
@@ -1,4 +1,7 @@
from pydantic import BaseModel

from dto.audiosource import AudioSource
from typing import Optional

class TranscriptionRequest(BaseModel):
audio_filename: str
source_type: Optional[AudioSource] = AudioSource.AUDIO_FILE
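With the new optional field, a request defaults to file-based input unless a source is given explicitly. A minimal sketch, continuing in the module above (field values are illustrative, and the filename is presumably ignored on the microphone path):

file_req = TranscriptionRequest(audio_filename="lecture.wav")
assert file_req.source_type is AudioSource.AUDIO_FILE  # default source

mic_req = TranscriptionRequest(audio_filename="placeholder.wav",
                               source_type=AudioSource.MICROPHONE)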
5 changes: 2 additions & 3 deletions education-ai-suite/smart-classroom/pipeline.py
@@ -36,11 +36,11 @@ def __init__(self, session_id=None):
)
]

def run_transcription(self, audio_path: str):
def run_transcription(self, input):
project_config = RuntimeConfig.get_section("Project")
monitor.start_monitoring(os.path.join(project_config.get("location"), project_config.get("name"), self.session_id, "utilization_logs"))

input_gen = ({"audio_path": audio_path} for _ in range(1))
input_gen = ({"input": input} for _ in range(1))

for component in self.transcription_pipeline:
input_gen = component.process(input_gen)
@@ -50,7 +50,6 @@ def run_transcription(self, audio_path: str):
yield chunk_trancription
finally:
monitor.stop_monitoring()
time.sleep(3) #time for socwatch to get clean-start


def run_summarizer(self):
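Because run_transcription now receives the request object instead of a path, callers hand the DTO straight through and iterate over per-chunk results. A minimal sketch (session id is illustrative, and SimpleNamespace again stands in for the request DTO):

from types import SimpleNamespace
from dto.audiosource import AudioSource
from pipeline import Pipeline

request = SimpleNamespace(source_type=AudioSource.MICROPHONE, audio_filename="")
pipeline = Pipeline(session_id="demo-session")
for chunk_transcription in pipeline.run_transcription(request):
    # One transcription per audio chunk, yielded as soon as the chunk is processed.
    print(chunk_transcription)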