Skip to content

Commit fb1f83a

Browse files
authored
Real time audio preprocessing (#1100)
1 parent 49fc76f commit fb1f83a

File tree

7 files changed

+176
-52
lines changed

7 files changed

+176
-52
lines changed

education-ai-suite/smart-classroom/api/endpoints.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,15 @@
77
from dto.summarizer_dto import SummaryRequest
88
from pipeline import Pipeline
99
import json, os
10+
import subprocess, re
1011
from fastapi.responses import StreamingResponse
1112
from utils.runtime_config_loader import RuntimeConfig
1213
from utils.storage_manager import StorageManager
1314
from utils.platform_info import get_platform_and_model_info
1415
from dto.project_settings import ProjectSettings
1516
from monitoring.monitor import start_monitoring, stop_monitoring, get_metrics
17+
from dto.audiosource import AudioSource
18+
from components.ffmpeg import audio_preprocessing
1619
from utils.audio_util import save_audio_file
1720
from utils.locks import audio_pipeline_lock
1821
import logging
@@ -114,6 +117,31 @@ async def generate_mindmap(request: SummaryRequest):
114117
detail=f"Mindmap generation failed: {e}"
115118
)
116119

120+
@router.get("/devices")
121+
def list_audio_devices():
122+
result = subprocess.run(
123+
["ffmpeg", "-list_devices", "true", "-f", "dshow", "-i", "dummy"],
124+
stderr=subprocess.PIPE,
125+
text=True,
126+
encoding="utf-8",
127+
errors="replace"
128+
)
129+
audio_devices = re.findall(r'"(.*?)"\s*\(audio\)', result.stderr)
130+
formatted_devices = [f"audio={d}" for d in audio_devices]
131+
return {"devices": formatted_devices}
132+
133+
134+
@router.post("/stop-mic")
135+
def stop_microphone(session_id: str):
136+
process = audio_preprocessing.FFMPEG_PROCESSES.pop(session_id, None)
137+
if process:
138+
logger.info(f"Stopping microphone recording for session {session_id}...")
139+
process.terminate()
140+
process.wait(timeout=5)
141+
return {"status": "stopped", "message": f"Microphone for session {session_id} stopped successfully."}
142+
else:
143+
return {"status": "idle", "message": f"No active microphone session found for {session_id}."}
144+
117145
@router.get("/performance-metrics")
118146
def get_summary_metrics(session_id: Optional[str] = Header(None, alias="session_id")):
119147
project_config = RuntimeConfig.get_section("Project")

education-ai-suite/smart-classroom/components/ffmpeg/audio_preprocessing.py

Lines changed: 125 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -3,20 +3,25 @@
33
from uuid import uuid4
44
import atexit
55
import shutil
6-
from utils.config_loader import config
6+
import platform,time
77
import logging
8+
from utils.config_loader import config
9+
from utils.runtime_config_loader import RuntimeConfig
10+
from dto.audiosource import AudioSource
811

912
logger = logging.getLogger(__name__)
1013

11-
CHUNK_DURATION = config.audio_preprocessing.chunk_duration_sec # seconds
12-
SILENCE_THRESH = config.audio_preprocessing.silence_threshold # in dB
13-
SILENCE_DURATION = config.audio_preprocessing.silence_duration # in seconds
14+
CHUNK_DURATION = config.audio_preprocessing.chunk_duration_sec
15+
SILENCE_THRESH = config.audio_preprocessing.silence_threshold
16+
SILENCE_DURATION = config.audio_preprocessing.silence_duration
1417
SEARCH_WINDOW = config.audio_preprocessing.search_window_sec
1518
CLEAN_UP_ON_EXIT = config.app.cleanup_on_exit
1619

1720
CHUNKS_DIR = config.audio_preprocessing.chunk_output_path
1821
os.makedirs(CHUNKS_DIR, exist_ok=True)
1922

23+
FFMPEG_PROCESSES = {}
24+
2025
@atexit.register
2126
def cleanup_chunks_folder():
2227
if os.path.exists(CHUNKS_DIR) and CLEAN_UP_ON_EXIT:
@@ -70,45 +75,128 @@ def get_closest_silence(silences, target_time, window=SEARCH_WINDOW):
7075

7176
return closest # None if nothing close enough
7277

73-
def chunk_audio_by_silence(audio_path):
78+
def process_audio_segment(audio_path, start_time, end_time, chunk_index):
    """Cut ``[start_time, end_time]`` out of *audio_path* into an ASR-ready chunk.

    The chunk is written under CHUNKS_DIR as 16 kHz mono signed-16-bit PCM WAV
    with any video stream stripped.

    Args:
        audio_path: Source audio file readable by FFmpeg.
        start_time: Segment start, in seconds.
        end_time: Segment end, in seconds.
        chunk_index: Sequence number embedded in the chunk file name.

    Returns:
        dict: ``chunk_path`` / ``start_time`` / ``end_time`` / ``chunk_index``.
    """
    chunk_name = f"chunk_{chunk_index}_{uuid4().hex[:6]}.wav"
    chunk_path = os.path.join(CHUNKS_DIR, chunk_name)
    result = subprocess.run(
        [
            "ffmpeg", "-y", "-i", audio_path,
            "-ss", str(start_time), "-to", str(end_time),
            # Normalize for ASR: 16 kHz, mono, pcm_s16le, no video.
            "-ar", "16000", "-ac", "1",
            "-c:a", "pcm_s16le", "-vn",
            chunk_path
        ],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        encoding="utf-8",
        errors="replace"
    )
    # stderr is discarded above, so surface encoder failures explicitly
    # instead of letting a missing/empty chunk fail mysteriously downstream.
    if result.returncode != 0:
        logger.warning(
            f"ffmpeg exited with code {result.returncode} while cutting chunk {chunk_index} from {audio_path}"
        )
    logger.debug(f"Chunk {chunk_index} saved: {chunk_path}")
    return {
        "chunk_path": chunk_path,
        "start_time": start_time,
        "end_time": end_time,
        "chunk_index": chunk_index
    }
74101

102+
def chunk_audio_by_silence(audio_path):
    """Yield ~CHUNK_DURATION-second chunks of *audio_path*, cut at silences.

    Each chunk boundary is snapped to the closest detected silence within
    SEARCH_WINDOW seconds of the ideal boundary; when no suitable silence
    exists, the chunk is cut at the ideal boundary (clamped to end of file).
    Yields the metadata dicts produced by process_audio_segment().
    """
    if SEARCH_WINDOW > CHUNK_DURATION:
        raise ValueError(
            f"Silence search window ({SEARCH_WINDOW}s) can't be more than chunk duration ({CHUNK_DURATION}s)."
        )
    total_duration = get_audio_duration(audio_path)
    silence_points = detect_silences(audio_path)

    cursor = 0.0
    index = 0
    while cursor < total_duration:
        target = cursor + CHUNK_DURATION
        cut_at = get_closest_silence(silence_points, target)
        # Fall back to a hard cut when no silence is found or the candidate
        # would not advance past the current position.
        if not cut_at or cut_at <= cursor:
            cut_at = min(target, total_duration)
        yield process_audio_segment(audio_path, cursor, cut_at, index)
        cursor = cut_at
        index += 1
118+
119+
def chunk_audiostream_by_silence(session_id: str):
    """Record live microphone audio with FFmpeg and yield silence-aligned chunks.

    Starts an FFmpeg dshow capture (Windows-only backend) into a growing WAV
    file, then polls that file and yields ~CHUNK_DURATION-second chunks whose
    boundaries are snapped to detected silences.  The capture process is
    registered in FFMPEG_PROCESSES so the /stop-mic endpoint can terminate it;
    recording also stops after a hard 45-minute cap.

    Fixes over the previous revision: detect_silences() was invoked twice per
    chunk with the first result discarded (an extra full silencedetect pass),
    the temp segment file leaked if detection raised, and the unnecessary
    ``global`` statement is gone (the dict is only mutated, never rebound).

    Args:
        session_id: Key for the process registry and for on-disk file names.

    Yields:
        Chunk metadata dicts from process_audio_segment().

    Raises:
        ValueError: If no microphone device is configured.
    """
    mic_device = RuntimeConfig.get_section("Project").get("microphone", "").strip()
    if not mic_device:
        raise ValueError(
            "Microphone device not set in runtime_config.yaml under Project.microphone"
        )
    record_file = os.path.join(CHUNKS_DIR, f"live_input_{session_id}.wav")
    process = subprocess.Popen(
        [
            "ffmpeg", "-y",
            "-f", "dshow",
            "-i", f"audio={mic_device}",
            "-ar", "16000", "-ac", "1",
            # -rf64 auto lets the WAV grow past 4 GB during long captures.
            "-c:a", "pcm_s16le", "-rf64", "auto",
            record_file
        ],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL
    )
    FFMPEG_PROCESSES[session_id] = process
    logger.info(f"🎙️ Recording from {mic_device} (session={session_id}) ... use /stop-mic to stop.")
    current_time, chunk_index = 0.0, 0
    MAX_DURATION = 45 * 60  # hard cap on one live session, in seconds
    try:
        while True:
            if current_time >= MAX_DURATION:
                logger.info(f"Session {session_id}: reached 45 min limit, stopping.")
                break
            # Wait until FFmpeg has written at least a WAV header (44 bytes).
            if not os.path.exists(record_file) or os.path.getsize(record_file) < 44:
                time.sleep(0.02)
                continue
            duration = get_audio_duration(record_file)
            # Capture ended (e.g. /stop-mic) with a short tail left: flush it.
            if (process.poll() is not None) and (duration - current_time < CHUNK_DURATION):
                logger.info(f"Session {session_id}: FFmpeg stopped, processing final chunk...")
                yield process_audio_segment(record_file, current_time, duration, chunk_index)
                break
            if duration - current_time < CHUNK_DURATION:
                time.sleep(0.02)
                continue
            # Extract only the not-yet-chunked tail so silence detection scans
            # new audio instead of the whole recording each iteration.
            segment_file = os.path.join(CHUNKS_DIR, f"temp_segment_{uuid4().hex[:6]}.wav")
            try:
                subprocess.run(
                    [
                        "ffmpeg", "-y", "-i", record_file,
                        "-ss", str(current_time), "-to", str(duration),
                        "-ar", "16000", "-ac", "1",
                        "-c:a", "pcm_s16le", "-vn",
                        segment_file
                    ],
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.DEVNULL
                )
                # Detect once, shifting segment-relative timestamps back into
                # recording time so get_closest_silence can compare them.
                silences = [
                    {"start": s["start"] + current_time, "end": s["end"] + current_time}
                    for s in detect_silences(segment_file)
                ]
            finally:
                # Always drop the temp segment, even if detection fails.
                if os.path.exists(segment_file):
                    os.remove(segment_file)
            ideal_end = current_time + CHUNK_DURATION
            end_time = get_closest_silence(silences, ideal_end) or min(ideal_end, duration)
            if end_time <= current_time:
                end_time = min(ideal_end, duration)
            yield process_audio_segment(record_file, current_time, end_time, chunk_index)
            current_time = end_time
            chunk_index += 1
    finally:
        # Clean up the capture process and raw recording however the generator
        # exits (normal stop, consumer abandonment, or error).
        proc = FFMPEG_PROCESSES.pop(session_id, None)
        if proc:
            try:
                proc.terminate()
            except Exception as e:
                logger.warning(f"Error stopping FFmpeg for session {session_id}: {e}")
        if os.path.exists(record_file):
            try:
                os.remove(record_file)
            except Exception as e:
                logger.warning(f"Could not remove {record_file}: {e}")
        logger.info(f"🎧 Live recording stopped for session {session_id}.")
197+
198+
def chunk_by_silence(input, session_id: str):
    """Dispatch chunking to the live-microphone or file-based generator.

    *input* is a request object carrying ``source_type`` and
    ``audio_filename``; MICROPHONE sources stream chunks from the live
    capture keyed by *session_id*, anything else chunks the referenced file.
    """
    if input.source_type == AudioSource.MICROPHONE:
        generator = chunk_audiostream_by_silence(session_id)
    else:
        generator = chunk_audio_by_silence(input.audio_filename)
    yield from generator
Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
11
from .base_component import PipelineComponent
2-
from components.ffmpeg.audio_preprocessing import chunk_audio_by_silence
3-
4-
2+
from components.ffmpeg.audio_preprocessing import chunk_by_silence
3+
4+
55
class AudioStreamReader(PipelineComponent):
    """Pipeline stage that expands each incoming item into audio chunks."""

    def __init__(self, session_id):
        self.session_id = session_id

    def process(self, input_generator):
        """Yield silence-aligned chunk dicts for every item in *input_generator*.

        Each yielded dict contains chunk_path, start_time, end_time, etc.
        """
        for item in input_generator:
            yield from chunk_by_silence(item["input"], self.session_id)

education-ai-suite/smart-classroom/config.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ monitoring:
1111
models:
1212
asr:
1313
provider: openvino # openvino, funasr and openai supported
14-
name: whisper-tiny # can be (whisper-base, whisper-small etc) or paraformer-zh
14+
name: whisper-base # can be (whisper-base, whisper-small etc) or paraformer-zh
1515
device: CPU # CPU Recommended
1616
temperature: 0.0
1717
models_base_path: "models"
@@ -23,7 +23,7 @@ models:
2323
device: GPU # GPU or CPU
2424
weight_format: int8 # supports fp16, int4, int8 (Recommended)
2525
max_new_tokens: 1024
26-
temperature: 0.5 # 0.5 default
26+
temperature: 0.3 # 0.5 default
2727
use_cache: True
2828
models_base_path: "models"
2929
language: en # en or zh
@@ -39,10 +39,10 @@ mindmap:
3939
min_token: 20
4040

4141
audio_preprocessing:
42-
chunk_duration_sec: 30
42+
chunk_duration_sec: 15
4343
silence_threshold: -35 # in dB
4444
silence_duration: 0.3 # minimum silence length in seconds
45-
search_window_sec: 1.5 # how far to look for silence if no silence exactly at chunk boundary
45+
search_window_sec: 1 # how far to look for silence if no silence exactly at chunk boundary
4646
chunk_output_path: chunks/
4747

4848
audio_util:
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
from enum import Enum
2+
3+
class AudioSource(Enum):
    """Origin of the audio fed into the transcription pipeline."""

    # Pre-recorded file provided by the caller.
    AUDIO_FILE = "audio_file"
    # Live capture from the configured microphone device.
    MICROPHONE = "microphone"
Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
11
from pydantic import BaseModel
2-
2+
from dto.audiosource import AudioSource
3+
from typing import Optional
4+
35
class TranscriptionRequest(BaseModel):
    """Request body for starting a transcription."""

    # Name/path of the audio file to transcribe; required even when the
    # source is a microphone (presumably ignored then — TODO confirm).
    audio_filename: str
    # Where the audio comes from; defaults to a pre-recorded file for
    # backward compatibility with callers that omit the field.
    source_type: Optional[AudioSource] = AudioSource.AUDIO_FILE

education-ai-suite/smart-classroom/pipeline.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,11 +36,11 @@ def __init__(self, session_id=None):
3636
)
3737
]
3838

39-
def run_transcription(self, audio_path: str):
39+
def run_transcription(self, input):
4040
project_config = RuntimeConfig.get_section("Project")
4141
monitor.start_monitoring(os.path.join(project_config.get("location"), project_config.get("name"), self.session_id, "utilization_logs"))
4242

43-
input_gen = ({"audio_path": audio_path} for _ in range(1))
43+
input_gen = ({"input": input} for _ in range(1))
4444

4545
for component in self.transcription_pipeline:
4646
input_gen = component.process(input_gen)
@@ -50,7 +50,6 @@ def run_transcription(self, audio_path: str):
5050
yield chunk_trancription
5151
finally:
5252
monitor.stop_monitoring()
53-
time.sleep(3) #time for socwatch to get clean-start
5453

5554

5655
def run_summarizer(self):

0 commit comments

Comments
 (0)