Skip to content
29 changes: 23 additions & 6 deletions backend/api/routers/dub_generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from core.tasks import task_manager
from schemas.requests import DubRequest
from services.model_manager import get_model, _gpu_pool
from services.audio_dsp import apply_mastering, normalize_audio
from services.audio_dsp import apply_mastering, normalize_audio, apply_effects_chain, get_effect_chain
from services.rvc import apply_rvc, is_enabled as rvc_is_enabled
from services.incremental import segment_fingerprint
from services.watermark import embed_watermark
Expand Down Expand Up @@ -159,23 +159,39 @@ def _gen(text, lang, instruct_str, dur_s, nstep, cfg, spd, profile_id=None):
duration=dur_s, num_step=nstep, guidance_scale=cfg,
speed=spd, denoise=True, postprocess_output=True,
)
audio_out = audios[0]
mastered_audio = apply_mastering(audio_out, sample_rate=_model.sampling_rate if hasattr(_model, 'sampling_rate') else 24000)
return normalize_audio(mastered_audio, target_dBFS=-2.0)
except Exception as e:
except (torch.cuda.OutOfMemoryError, torch.mps.MPSError) as e:
import gc
gc.collect()
if torch.backends.mps.is_available():
torch.mps.empty_cache()
elif torch.cuda.is_available():
torch.cuda.empty_cache()
# User-facing: what happened · why · what to do.
raise RuntimeError(
f"Ran out of GPU memory generating this segment. "
f"Try the Flush button in the header to free VRAM, or switch to CPU in Settings. "
f"Underlying error: {e}"
)
Comment thread
coderabbitai[bot] marked this conversation as resolved.
Outdated

audio_out = audios[0]
sr = _model.sampling_rate if hasattr(_model, 'sampling_rate') else 24000

# Apply per-segment DSP effect preset (default: broadcast)
seg_effect_preset = getattr(seg, "effect_preset", None) or "broadcast"
if seg_effect_preset == "raw":
# Raw: skip all DSP — return raw model output
return audio_out

mastered_audio = apply_mastering(audio_out, sample_rate=sr)
effect_chain = get_effect_chain(seg_effect_preset)
if effect_chain:
mastered_audio = apply_effects_chain(
mastered_audio,
sample_rate=sr,
chain=effect_chain,
)

return normalize_audio(mastered_audio, target_dBFS=-2.0)

seg_instruct = seg.instruct or req.instruct
seg_profile = seg.profile_id or None
seg_speed = seg.speed if hasattr(seg, 'speed') and seg.speed is not None else req.speed
Expand Down Expand Up @@ -247,6 +263,7 @@ def _gen(text, lang, instruct_str, dur_s, nstep, cfg, spd, profile_id=None):
"instruct": getattr(seg, "instruct", None),
"speed": getattr(seg, "speed", None),
"direction": getattr(seg, "direction", None),
"effect_preset": getattr(seg, "effect_preset", None),
})
except Exception as e:
logger.debug("seg fingerprint skipped for %s: %s", seg_id, e)
Expand Down
12 changes: 12 additions & 0 deletions backend/api/routers/engines.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@

from core import prefs
from services import tts_backend, asr_backend, llm_backend, translation_engines
from services.audio_dsp import list_effect_presets
from api.schemas import EffectPresetsResponse

router = APIRouter()

Expand Down Expand Up @@ -60,6 +62,16 @@ def list_llm_backends():
return {"active": llm_backend.active_backend_id(), "backends": llm_backend.list_backends()}


@router.get("/engines/effects/presets", response_model=EffectPresetsResponse)
def list_effects_presets():
    """List the DSP effect presets available to the dub pipeline.

    A preset is a named chain of audio effects (EQ, compressor, reverb, etc.)
    that can be applied to generated TTS audio on a per-segment basis.

    Returns:
        A dict with a single ``"presets"`` key holding whatever
        ``list_effect_presets()`` reports; serialised through
        ``EffectPresetsResponse``.
    """
    available_presets = list_effect_presets()
    return {"presets": available_presets}


@router.get("/engines/translation")
def list_translation_engines():
"""Translation engines with per-engine pip-package availability.
Expand Down
33 changes: 29 additions & 4 deletions backend/api/routers/generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from core.db import db_conn
from core.config import OUTPUTS_DIR, VOICES_DIR
from services.model_manager import get_model, _gpu_pool
from services.audio_dsp import apply_mastering, normalize_audio
# audio_dsp imported in _run_inference
from core import event_bus

router = APIRouter()
Expand All @@ -24,8 +24,9 @@ def _run_inference(
model, text, language, ref_audio_path, ref_text, instruct, duration,
num_step, guidance_scale, speed, t_shift, denoise,
postprocess_output, layer_penalty_factor, position_temperature,
class_temperature, used_seed,
class_temperature, used_seed, effect_preset="broadcast",
):
from services.audio_dsp import apply_mastering, normalize_audio, apply_effects_chain, get_effect_chain
import torch
try:
if used_seed is not None:
Expand All @@ -46,7 +47,30 @@ def _run_inference(
)
audio_out = audios[0]

mastered_audio = apply_mastering(audio_out, sample_rate=model.sampling_rate if hasattr(model, 'sampling_rate') else 24000)
sr = model.sampling_rate if hasattr(model, 'sampling_rate') else 24000

# Apply DSP effect preset
_effect_preset = effect_preset or "broadcast"

# Validate preset ID
from services.audio_dsp import EFFECT_PRESETS
if _effect_preset not in EFFECT_PRESETS:
raise ValueError(
f"Unknown effect preset: {_effect_preset!r}. "
f"Valid: {list(EFFECT_PRESETS.keys())}"
)

if _effect_preset == "raw":
# Raw: skip all DSP — return raw model output
return audio_out

mastered_audio = apply_mastering(audio_out, sample_rate=sr)
_chain = get_effect_chain(_effect_preset)
if _chain:
mastered_audio = apply_effects_chain(
mastered_audio, sample_rate=sr, chain=_chain,
)

return normalize_audio(mastered_audio, target_dBFS=-2.0)

except ValueError as e:
Expand Down Expand Up @@ -84,6 +108,7 @@ async def generate_speech(
class_temperature: Optional[float] = Form(None),
profile_id: Optional[str] = Form(None),
seed: Optional[int] = Form(None),
effect_preset: str = Form("broadcast"),
):
_model = await get_model()
Comment on lines +111 to 113

Expand Down Expand Up @@ -137,7 +162,7 @@ async def generate_speech(
_model, text, language, ref_audio_path, ref_text, instruct, duration,
num_step, guidance_scale, speed, t_shift, denoise,
postprocess_output, layer_penalty_factor, position_temperature,
class_temperature, used_seed,
class_temperature, used_seed, effect_preset,
)
gen_time = round(time.time() - start_time, 2)

Expand Down
17 changes: 17 additions & 0 deletions backend/api/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,3 +147,20 @@ class ModelEntry(BaseModel):
supported: bool = True
size_on_disk: int | None = None
nb_files: int | None = None


# ── Effect presets ─────────────────────────────────────────────────────

class EffectPresetEntry(BaseModel):
    """One DSP effect preset.

    Element type of ``EffectPresetsResponse.presets``.
    """
    # extra="allow": keys beyond the four declared below are accepted instead
    # of being rejected, so an entry may carry additional metadata.
    model_config = ConfigDict(extra="allow")

    # Preset identifier.
    # NOTE(review): presumably the same id a segment's `effect_preset`
    # carries — confirm against services.audio_dsp.
    id: str
    # Human-readable name (presumably shown in the UI — confirm).
    label: str
    # Icon reference; the expected format (emoji, icon name, ...) is not
    # visible from this file — TODO confirm.
    icon: str
    # Free-text description of the preset.
    description: str


class EffectPresetsResponse(BaseModel):
    """Response body for GET /engines/effects/presets."""
    # default_factory=list gives each instance its own empty list when the
    # field is omitted, rather than sharing one mutable default.
    presets: list[EffectPresetEntry] = Field(default_factory=list)
15 changes: 14 additions & 1 deletion backend/schemas/requests.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from pydantic import BaseModel
from pydantic import BaseModel, field_validator
from typing import List, Optional

from services.audio_dsp import EFFECT_PRESETS

class ExportRequest(BaseModel):
source_filename: str
destination_path: str
Expand All @@ -23,6 +25,17 @@ class DubSegment(BaseModel):
speed: Optional[float] = None
gain: Optional[float] = None # Per-segment volume (0.0 - 2.0, default 1.0)
target_lang: Optional[str] = None # Per-segment language override (ISO code)
effect_preset: str = "broadcast" # NEW: DSP preset id (default: broadcast)

@field_validator("effect_preset")
@classmethod
def validate_effect_preset(cls, v: str) -> str:
    """Accept ``v`` only when it is a key of ``EFFECT_PRESETS``.

    Returns:
        ``v`` unchanged when it names a known preset.

    Raises:
        ValueError: if ``v`` is not a known preset id; the message lists
            the valid ids.
    """
    # Happy path first: known ids pass straight through.
    if v in EFFECT_PRESETS:
        return v

    valid_ids = list(EFFECT_PRESETS.keys())
    raise ValueError(
        f"Unknown effect preset: {v!r}. "
        f"Valid: {valid_ids}"
    )

class DubRequest(BaseModel):
segments: List[DubSegment]
Expand Down
17 changes: 15 additions & 2 deletions backend/services/batched_tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ class SegmentSpec:
__slots__ = (
"index", "text", "language", "instruct", "speed", "duration",
"num_step", "guidance_scale", "profile_id",
"ref_audio", "ref_text", "start", "end",
"ref_audio", "ref_text", "start", "end", "effect_preset",
)

def __init__(self, **kwargs):
Expand Down Expand Up @@ -93,7 +93,7 @@ async def generate_segments_batched(
List of (segment_index, audio_tensor, sample_rate) tuples,
ordered by segment_index.
"""
from services.audio_dsp import apply_mastering, normalize_audio
from services.audio_dsp import apply_mastering, normalize_audio, apply_effects_chain, get_effect_chain

sr = getattr(model, "sampling_rate", 24000)
loop = asyncio.get_running_loop()
Expand Down Expand Up @@ -144,7 +144,20 @@ def _gen_one(s=seg):
postprocess_output=True,
)
audio_out = audios[0]

# Apply per-segment DSP effect preset (default: broadcast)
seg_effect_preset = getattr(s, "effect_preset", None) or "broadcast"
if seg_effect_preset == "raw":
# Raw: skip all DSP — return raw model output
return audio_out

mastered = apply_mastering(audio_out, sample_rate=sr)
effect_chain = get_effect_chain(seg_effect_preset)
if effect_chain:
mastered = apply_effects_chain(
mastered, sample_rate=sr, chain=effect_chain
)

return normalize_audio(mastered, target_dBFS=-2.0)

audio = await loop.run_in_executor(gpu_pool, _gen_one)
Expand Down
5 changes: 4 additions & 1 deletion backend/services/incremental.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
import json


_GEN_INPUT_FIELDS = ("text", "target_lang", "profile_id", "instruct", "speed", "direction")
_GEN_INPUT_FIELDS = ("text", "target_lang", "profile_id", "instruct", "speed", "direction", "effect_preset")

Comment on lines +23 to 24

def segment_fingerprint(seg: dict) -> str:
Expand All @@ -29,6 +29,9 @@ def segment_fingerprint(seg: dict) -> str:
Any change to `_GEN_INPUT_FIELDS` flips the hash and the segment becomes
a re-gen candidate. Changes to position / selection state / lip-sync
badge don't trigger regen, which is what we want.

Currently includes: text, target_lang, profile_id, instruct, speed,
direction, effect_preset.
"""
payload = {k: (seg.get(k) if seg.get(k) is not None else "") for k in _GEN_INPUT_FIELDS}
blob = json.dumps(payload, sort_keys=True, ensure_ascii=False)
Expand Down
17 changes: 17 additions & 0 deletions frontend/src/api/engines.ts
Original file line number Diff line number Diff line change
Expand Up @@ -82,3 +82,20 @@ export async function getJob(id: string): Promise<unknown> {
export async function getJobEvents(id: string, afterSeq: number = 0): Promise<unknown> {
return apiJson(`/jobs/${id}/events?after_seq=${afterSeq}`);
}

// ── Effect presets ──────────────────────────────────────────────────────

/**
 * One DSP effect preset as returned by `GET /engines/effects/presets`.
 */
export interface EffectPreset {
  /** Preset identifier. */
  id: string;
  /** Human-readable name. */
  label: string;
  /** Icon reference; format not specified here — TODO confirm with backend. */
  icon: string;
  /** Free-text description of the preset. */
  description: string;
}

/** Response body of `GET /engines/effects/presets`. */
export interface EffectPresetsResponse {
  /** Presets listed by the backend. */
  presets: EffectPreset[];
}

/**
 * Fetch the list of DSP effect presets from the backend.
 *
 * @returns The parsed `/engines/effects/presets` response.
 */
export async function fetchEffectPresets(): Promise<EffectPresetsResponse> {
  const endpoint = '/engines/effects/presets';
  return apiJson<EffectPresetsResponse>(endpoint);
}
12 changes: 12 additions & 0 deletions frontend/src/api/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,18 @@ export interface DubHistoryResponse {
jobs: DubJobMeta[];
}

/**
 * One dub segment as exchanged with the backend.
 *
 * NOTE(review): field notes marked "per backend schema" are taken from the
 * comments on the Python `DubSegment` request model — keep the two in sync.
 */
export interface DubSegment {
  /** Segment start position (presumably seconds — TODO confirm). */
  start: number;
  /** Segment end position (presumably seconds — TODO confirm). */
  end: number;
  /** Text of the segment. */
  text: string;
  /** Optional per-segment instruction string. */
  instruct?: string;
  /** Optional per-segment profile id. */
  profile_id?: string;
  /** Optional per-segment speed. */
  speed?: number;
  /** Per-segment volume; per backend schema 0.0 - 2.0, default 1.0. */
  gain?: number;
  /** Per-segment language override (ISO code), per backend schema. */
  target_lang?: string;
  /** DSP preset id; per backend schema the default is "broadcast". */
  effect_preset?: string;
}

export interface DubTranslateResponse {
segments: { id: string; text: string; text_original?: string; rate_ratio?: number; rate_error?: string }[];
}
Expand Down
17 changes: 16 additions & 1 deletion frontend/src/store/dubSlice.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
* history restore explicitly rehydrates the relevant fields.
*/
import type { StateCreator } from 'zustand';
import type { EffectPreset } from '../api/engines';

export type DubStep =
| 'idle'
Expand Down Expand Up @@ -84,6 +85,12 @@ export interface DubSlice {
source_count: number;
}>;

// ── Effect Presets ────────────────────────────────────────────────────
segmentEffectPresets: Record<string, string>;
setSegmentEffectPreset: (segId: string, presetId: string) => void;
availableEffectPresets: EffectPreset[];
setAvailableEffectPresets: (presets: EffectPreset[]) => void;

// ── Setters (React-style; accept value or updater fn) ─────────────────
setDubJobId: (v: Updater<string | null>) => void;
setDubStep: (v: Updater<DubStep>) => void;
Expand Down Expand Up @@ -115,7 +122,8 @@ const INITIAL: Omit<DubSlice,
| 'setDubProgress' | 'setDubError' | 'setIsTranslating' | 'setDubSegments'
| 'setDubTranscript' | 'setDubFilename' | 'setDubDuration' | 'setDubTracks'
| 'setDubLang' | 'setDubLangCode' | 'setDubInstruct' | 'setPreserveBg'
| 'setDefaultTrack' | 'setExportTracks' | 'setPreviewSegIds' | 'setSpeakerClones' | 'resetDubState'
| 'setDefaultTrack' | 'setExportTracks' | 'setPreviewSegIds' | 'setSpeakerClones'
| 'setSegmentEffectPreset' | 'setAvailableEffectPresets' | 'resetDubState'
> = {
dubJobId: null,
dubStep: 'idle',
Expand All @@ -137,6 +145,8 @@ const INITIAL: Omit<DubSlice,
exportTracks: { original: true },
previewSegIds: [],
speakerClones: {},
segmentEffectPresets: {},
availableEffectPresets: [],
};

export const createDubSlice: StateCreator<DubSlice, [], [], DubSlice> = (set, get) => ({
Expand All @@ -162,6 +172,11 @@ export const createDubSlice: StateCreator<DubSlice, [], [], DubSlice> = (set, ge
setExportTracks: (v) => set((s) => ({ exportTracks: resolve(v, s.exportTracks) })),
setPreviewSegIds:(v) => set((s) => ({ previewSegIds:resolve(v, s.previewSegIds) })),
setSpeakerClones:(v) => set((s) => ({ speakerClones: resolve(v, s.speakerClones) })),
setSegmentEffectPreset: (segId, presetId) =>
set((s) => ({
segmentEffectPresets: { ...s.segmentEffectPresets, [segId]: presetId },
})),
setAvailableEffectPresets: (presets) => set({ availableEffectPresets: presets }),

resetDubState: () => {
// Touch `get` so strict-mode double-invocation of the initializer doesn't
Expand Down
Loading