Skip to content
47 changes: 37 additions & 10 deletions backend/api/routers/dub_generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from core.tasks import task_manager
from schemas.requests import DubRequest
from services.model_manager import get_model, _gpu_pool
from services.audio_dsp import apply_mastering, normalize_audio
from services.audio_dsp import apply_mastering, normalize_audio, apply_effects_chain, get_effect_chain
from services.rvc import apply_rvc, is_enabled as rvc_is_enabled
from services.incremental import segment_fingerprint
from services.watermark import embed_watermark
Expand Down Expand Up @@ -107,7 +107,7 @@ async def _stream(task_id):
sync_scores.append(1.0)
continue

def _gen(text, lang, instruct_str, dur_s, nstep, cfg, spd, profile_id=None):
def _gen(text, lang, instruct_str, dur_s, nstep, cfg, spd, profile_id, effect_preset):
ref_audio = None
ref_text = None
used_seed = None
Expand Down Expand Up @@ -160,15 +160,28 @@ def _gen(text, lang, instruct_str, dur_s, nstep, cfg, spd, profile_id=None):
speed=spd, denoise=True, postprocess_output=True,
)
audio_out = audios[0]
mastered_audio = apply_mastering(audio_out, sample_rate=_model.sampling_rate if hasattr(_model, 'sampling_rate') else 24000)
sr = _model.sampling_rate if hasattr(_model, 'sampling_rate') else 24000

# Apply per-segment DSP effect preset (default: broadcast)
seg_effect_preset = effect_preset or "broadcast"
if seg_effect_preset == "raw":
return audio_out

mastered_audio = apply_mastering(audio_out, sample_rate=sr)
effect_chain = get_effect_chain(seg_effect_preset)
if effect_chain:
mastered_audio = apply_effects_chain(
mastered_audio,
sample_rate=sr,
chain=effect_chain,
)
return normalize_audio(mastered_audio, target_dBFS=-2.0)
except Exception as e:
is_oom = (
isinstance(e, torch.cuda.OutOfMemoryError)
or "out of memory" in str(e).lower()
or "CUDA error" in str(e)
)
# Always try to reclaim VRAM regardless of error type.
import gc
gc.collect()
if torch.cuda.is_available():
Expand All @@ -177,9 +190,8 @@ def _gen(text, lang, instruct_str, dur_s, nstep, cfg, spd, profile_id=None):
torch.mps.empty_cache()

if not is_oom:
raise # Non-OOM — propagate the real error, don't mask it.
raise

# OOM recovery: retry once with reduced steps (less VRAM).
retry_steps = min(nstep, 8)
logger.warning(
"OOM on segment (nstep=%d), retrying with %d steps after cache flush",
Expand All @@ -194,7 +206,20 @@ def _gen(text, lang, instruct_str, dur_s, nstep, cfg, spd, profile_id=None):
speed=spd, denoise=True, postprocess_output=True,
)
audio_out = audios[0]
mastered_audio = apply_mastering(audio_out, sample_rate=_model.sampling_rate if hasattr(_model, 'sampling_rate') else 24000)
sr = _model.sampling_rate if hasattr(_model, 'sampling_rate') else 24000

seg_effect_preset = effect_preset or "broadcast"
if seg_effect_preset == "raw":
return audio_out

mastered_audio = apply_mastering(audio_out, sample_rate=sr)
effect_chain = get_effect_chain(seg_effect_preset)
if effect_chain:
mastered_audio = apply_effects_chain(
mastered_audio,
sample_rate=sr,
chain=effect_chain,
)
return normalize_audio(mastered_audio, target_dBFS=-2.0)
except Exception as retry_err:
raise RuntimeError(
Expand All @@ -203,13 +228,13 @@ def _gen(text, lang, instruct_str, dur_s, nstep, cfg, spd, profile_id=None):
f"Try the Flush button in the header to free VRAM, "
f"or switch to CPU in Settings. "
f"Underlying error: {retry_err}"
)
) from retry_err

seg_instruct = seg.instruct or req.instruct
seg_profile = seg.profile_id or None
seg_speed = seg.speed if hasattr(seg, 'speed') and seg.speed is not None else req.speed
seg_lang = seg.target_lang if getattr(seg, 'target_lang', None) else req.language

seg_instruct = seg.instruct or req.instruct
# Phase 4.2 — if the segment carries a free-form direction, parse it
# and append the taxonomy instruct (e.g. "urgent, surprised") on top
# of whatever instruct was already set. Also apply the director's
Expand Down Expand Up @@ -239,10 +264,11 @@ def _gen(text, lang, instruct_str, dur_s, nstep, cfg, spd, profile_id=None):
# flag to restore num_step=req.num_step quality.
_num_step = 8 if req.preview else req.num_step
_t_tts_0 = time.perf_counter()
seg_effect_preset = getattr(seg, "effect_preset", None) or "broadcast"
audio_tensor = await loop.run_in_executor(
_gpu_pool, _gen,
seg.text, seg_lang, seg_instruct, seg_duration,
_num_step, req.guidance_scale, seg_speed, seg_profile,
_num_step, req.guidance_scale, seg_speed, seg_profile, seg_effect_preset,
)
_t_tts += time.perf_counter() - _t_tts_0

Expand Down Expand Up @@ -276,6 +302,7 @@ def _gen(text, lang, instruct_str, dur_s, nstep, cfg, spd, profile_id=None):
"instruct": getattr(seg, "instruct", None),
"speed": getattr(seg, "speed", None),
"direction": getattr(seg, "direction", None),
"effect_preset": getattr(seg, "effect_preset", None),
})
except Exception as e:
logger.debug("seg fingerprint skipped for %s: %s", seg_id, e)
Expand Down
12 changes: 12 additions & 0 deletions backend/api/routers/engines.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@

from core import prefs
from services import tts_backend, asr_backend, llm_backend, translation_engines
from services.audio_dsp import list_effect_presets
from api.schemas import EffectPresetsResponse

router = APIRouter()

Expand Down Expand Up @@ -60,6 +62,16 @@ def list_llm_backends():
return {"active": llm_backend.active_backend_id(), "backends": llm_backend.list_backends()}


@router.get("/engines/effects/presets", response_model=EffectPresetsResponse)
def list_effects_presets():
    """List the DSP effect presets offered to the dub pipeline.

    A preset names a chain of audio effects (EQ, compressor, reverb, etc.)
    that may be applied to generated TTS audio, segment by segment.

    Returns:
        A mapping with a single ``"presets"`` key holding whatever
        ``list_effect_presets()`` reports; the response is shaped by
        ``EffectPresetsResponse``.
    """
    available = list_effect_presets()
    return {"presets": available}


@router.get("/engines/translation")
def list_translation_engines():
"""Translation engines with per-engine pip-package availability.
Expand Down
33 changes: 29 additions & 4 deletions backend/api/routers/generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from core.db import db_conn
from core.config import OUTPUTS_DIR, VOICES_DIR
from services.model_manager import get_model, _gpu_pool
from services.audio_dsp import apply_mastering, normalize_audio
# audio_dsp imported in _run_inference
from core import event_bus

router = APIRouter()
Expand All @@ -24,8 +24,9 @@ def _run_inference(
model, text, language, ref_audio_path, ref_text, instruct, duration,
num_step, guidance_scale, speed, t_shift, denoise,
postprocess_output, layer_penalty_factor, position_temperature,
class_temperature, used_seed,
class_temperature, used_seed, effect_preset="broadcast",
):
from services.audio_dsp import apply_mastering, normalize_audio, apply_effects_chain, get_effect_chain
import torch
try:
if used_seed is not None:
Expand All @@ -46,7 +47,30 @@ def _run_inference(
)
audio_out = audios[0]

mastered_audio = apply_mastering(audio_out, sample_rate=model.sampling_rate if hasattr(model, 'sampling_rate') else 24000)
sr = model.sampling_rate if hasattr(model, 'sampling_rate') else 24000

# Apply DSP effect preset
_effect_preset = effect_preset or "broadcast"

# Validate preset ID
from services.audio_dsp import EFFECT_PRESETS
if _effect_preset not in EFFECT_PRESETS:
raise ValueError(
f"Unknown effect preset: {_effect_preset!r}. "
f"Valid: {list(EFFECT_PRESETS.keys())}"
)

if _effect_preset == "raw":
# Raw: skip all DSP — return raw model output
return audio_out

mastered_audio = apply_mastering(audio_out, sample_rate=sr)
_chain = get_effect_chain(_effect_preset)
if _chain:
mastered_audio = apply_effects_chain(
mastered_audio, sample_rate=sr, chain=_chain,
)

return normalize_audio(mastered_audio, target_dBFS=-2.0)

except ValueError as e:
Expand Down Expand Up @@ -84,6 +108,7 @@ async def generate_speech(
class_temperature: Optional[float] = Form(None),
profile_id: Optional[str] = Form(None),
seed: Optional[int] = Form(None),
effect_preset: str = Form("broadcast"),
):
_model = await get_model()
Comment on lines +111 to 113

Expand Down Expand Up @@ -137,7 +162,7 @@ async def generate_speech(
_model, text, language, ref_audio_path, ref_text, instruct, duration,
num_step, guidance_scale, speed, t_shift, denoise,
postprocess_output, layer_penalty_factor, position_temperature,
class_temperature, used_seed,
class_temperature, used_seed, effect_preset,
)
gen_time = round(time.time() - start_time, 2)

Expand Down
17 changes: 17 additions & 0 deletions backend/api/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,3 +147,20 @@ class ModelEntry(BaseModel):
supported: bool = True
size_on_disk: int | None = None
nb_files: int | None = None


# ── Effect presets ─────────────────────────────────────────────────────

class EffectPresetEntry(BaseModel):
    """A single DSP effect preset as exposed over the API.

    Keys beyond the four declared fields are kept rather than rejected
    (``extra="allow"``).
    """

    model_config = ConfigDict(extra="allow")

    id: str
    label: str
    icon: str
    description: str


class EffectPresetsResponse(BaseModel):
    """Response body of ``GET /engines/effects/presets``."""

    # Falls back to a fresh empty list when no presets are supplied.
    presets: list[EffectPresetEntry] = Field(default_factory=list)
15 changes: 14 additions & 1 deletion backend/schemas/requests.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from pydantic import BaseModel
from pydantic import BaseModel, field_validator
from typing import List, Optional

from services.audio_dsp import EFFECT_PRESETS

class ExportRequest(BaseModel):
source_filename: str
destination_path: str
Expand All @@ -23,6 +25,17 @@ class DubSegment(BaseModel):
speed: Optional[float] = None
gain: Optional[float] = None # Per-segment volume (0.0 - 2.0, default 1.0)
target_lang: Optional[str] = None # Per-segment language override (ISO code)
effect_preset: str = "broadcast" # NEW: DSP preset id (default: broadcast)

@field_validator("effect_preset")
@classmethod
def validate_effect_preset(cls, v: str) -> str:
    """Accept ``v`` only when it is a key of ``EFFECT_PRESETS``.

    Returns:
        ``v`` unchanged when it names a known preset.

    Raises:
        ValueError: if ``v`` is not a known preset id; the message lists
            the valid ids.
    """
    if v in EFFECT_PRESETS:
        return v
    raise ValueError(
        f"Unknown effect preset: {v!r}. "
        f"Valid: {list(EFFECT_PRESETS.keys())}"
    )

class DubRequest(BaseModel):
segments: List[DubSegment]
Expand Down
17 changes: 15 additions & 2 deletions backend/services/batched_tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ class SegmentSpec:
__slots__ = (
"index", "text", "language", "instruct", "speed", "duration",
"num_step", "guidance_scale", "profile_id",
"ref_audio", "ref_text", "start", "end",
"ref_audio", "ref_text", "start", "end", "effect_preset",
)

def __init__(self, **kwargs):
Expand Down Expand Up @@ -93,7 +93,7 @@ async def generate_segments_batched(
List of (segment_index, audio_tensor, sample_rate) tuples,
ordered by segment_index.
"""
from services.audio_dsp import apply_mastering, normalize_audio
from services.audio_dsp import apply_mastering, normalize_audio, apply_effects_chain, get_effect_chain

sr = getattr(model, "sampling_rate", 24000)
loop = asyncio.get_running_loop()
Expand Down Expand Up @@ -144,7 +144,20 @@ def _gen_one(s=seg):
postprocess_output=True,
)
audio_out = audios[0]

# Apply per-segment DSP effect preset (default: broadcast)
seg_effect_preset = getattr(s, "effect_preset", None) or "broadcast"
if seg_effect_preset == "raw":
# Raw: skip all DSP — return raw model output
return audio_out

mastered = apply_mastering(audio_out, sample_rate=sr)
effect_chain = get_effect_chain(seg_effect_preset)
if effect_chain:
mastered = apply_effects_chain(
mastered, sample_rate=sr, chain=effect_chain
)

return normalize_audio(mastered, target_dBFS=-2.0)

audio = await loop.run_in_executor(gpu_pool, _gen_one)
Expand Down
5 changes: 4 additions & 1 deletion backend/services/incremental.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
import json


_GEN_INPUT_FIELDS = ("text", "target_lang", "profile_id", "instruct", "speed", "direction")
_GEN_INPUT_FIELDS = ("text", "target_lang", "profile_id", "instruct", "speed", "direction", "effect_preset")

Comment on lines +23 to 24

def segment_fingerprint(seg: dict) -> str:
Expand All @@ -29,6 +29,9 @@ def segment_fingerprint(seg: dict) -> str:
Any change to `_GEN_INPUT_FIELDS` flips the hash and the segment becomes
a re-gen candidate. Changes to position / selection state / lip-sync
badge don't trigger regen, which is what we want.

Currently includes: text, target_lang, profile_id, instruct, speed,
direction, effect_preset.
"""
payload = {k: (seg.get(k) if seg.get(k) is not None else "") for k in _GEN_INPUT_FIELDS}
blob = json.dumps(payload, sort_keys=True, ensure_ascii=False)
Expand Down
17 changes: 17 additions & 0 deletions frontend/src/api/engines.ts
Original file line number Diff line number Diff line change
Expand Up @@ -82,3 +82,20 @@ export async function getJob(id: string): Promise<unknown> {
export async function getJobEvents(id: string, afterSeq: number = 0): Promise<unknown> {
return apiJson(`/jobs/${id}/events?after_seq=${afterSeq}`);
}

// ── Effect presets ──────────────────────────────────────────────────────

/** One DSP effect preset, as listed in an `EffectPresetsResponse`. */
export interface EffectPreset {
  id: string;
  label: string;
  icon: string;
  description: string;
}

/** Payload returned by `fetchEffectPresets` (`/engines/effects/presets`). */
export interface EffectPresetsResponse {
  presets: EffectPreset[];
}

/** Request the list of DSP effect presets from `/engines/effects/presets`. */
export async function fetchEffectPresets(): Promise<EffectPresetsResponse> {
  const endpoint = '/engines/effects/presets';
  return apiJson<EffectPresetsResponse>(endpoint);
}
12 changes: 12 additions & 0 deletions frontend/src/api/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,18 @@ export interface DubHistoryResponse {
jobs: DubJobMeta[];
}

/**
 * One segment of a dub request.
 *
 * NOTE(review): appears to mirror the backend `DubSegment` request model —
 * confirm the two stay in sync when fields are added.
 */
export interface DubSegment {
  start: number;
  end: number;
  text: string;
  instruct?: string;
  profile_id?: string;
  speed?: number;
  // Backend documents this as per-segment volume (0.0 - 2.0, default 1.0).
  gain?: number;
  // Backend documents this as a per-segment language override (ISO code).
  target_lang?: string;
  // DSP preset id; the backend model defaults it to "broadcast".
  effect_preset?: string;
}

export interface DubTranslateResponse {
segments: { id: string; text: string; text_original?: string; rate_ratio?: number; rate_error?: string }[];
}
Expand Down
Loading