OmniVoice-Studio/backend/schemas/requests.py at 4be409543f58c7657b5e153ca0f6efb15682b820 · debpalash/OmniVoice-Studio · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
from pydantic import BaseModel, field_validator
from typing import List, Optional

from services.audio_dsp import EFFECT_PRESETS

class ExportRequest(BaseModel):
    """Request body for an export: a source file name, a destination, and a mode.

    All three fields are plain strings; this model performs no path or
    mode validation of its own.
    """
    source_filename: str  # Required. Name of the file to export.
    destination_path: str  # Required. No default destination is supplied here.
    # Defaults to "history". The set of accepted mode strings is not
    # constrained by this model — NOTE(review): confirm against the handler.
    mode: str = "history"

class ExportRecordRequest(BaseModel):
    """Request body for an export identified by `filename`.

    Only `filename` is required; the destination and mode have defaults.
    """
    filename: str  # Required. Name of the file to export.
    # Defaults to the literal string "~/Downloads". The "~" is not expanded
    # by this model — presumably the consumer expands it; TODO confirm.
    destination_path: str = "~/Downloads"
    # Defaults to "file". Accepted values are not constrained here.
    mode: str = "file"

class RevealRequest(BaseModel):
    """Request body carrying a single path string.

    NOTE(review): presumably the path to reveal in the OS file manager —
    confirm against the route that consumes this model.
    """
    path: str  # Required. Not validated or normalised by this model.

class DubSegment(BaseModel):
    """One timed piece of text in a dub request, plus optional per-segment overrides.

    `start`, `end` and `text` are required. Every other field has a default,
    so a client only sends the overrides it actually needs.
    """

    # Segment boundaries and the text to speak.
    # NOTE(review): time units are not stated here — presumably seconds.
    start: float
    end: float
    text: str
    instruct: str = ""  # Voice override applied to this segment only
    profile_id: str = ""  # Voice profile applied to this segment only
    speed: Optional[float] = None  # None means no per-segment speed was given
    # Volume for this segment (0.0 - 2.0, default 1.0). The range is not
    # enforced by this model.
    gain: Optional[float] = None
    target_lang: Optional[str] = None  # Language override for this segment (ISO code)
    effect_preset: str = "broadcast"  # DSP preset id; checked against EFFECT_PRESETS

    @field_validator("effect_preset")
    @classmethod
    def validate_effect_preset(cls, v: str) -> str:
        """Accept `v` only if it is a key of `EFFECT_PRESETS`.

        Returns the value unchanged when it is known.

        Raises:
            ValueError: if `v` is not in `EFFECT_PRESETS`; the message lists
                the valid preset ids.
        """
        # Happy path first: a known preset passes straight through.
        if v in EFFECT_PRESETS:
            return v
        known_presets = list(EFFECT_PRESETS.keys())
        raise ValueError(f"Unknown effect preset: {v!r}. Valid: {known_presets}")

class DubRequest(BaseModel):
    """Request body for a dub run: the segments plus request-wide settings.

    Only `segments` is required. The remaining fields carry defaults and
    control language, generation parameters, partial regeneration,
    preview mode, and how over-long segment audio is fitted to its slot.
    """
    segments: List[DubSegment]  # Required. Each item is validated as a DubSegment.
    language: str = "Auto"
    language_code: str = "und"  # ISO 639-1 for ffmpeg metadata (e.g. "es", "fr", "de")
    # Request-level values; DubSegment carries its own `instruct` and
    # optional `speed` for per-segment overrides.
    instruct: str = ""
    # NOTE(review): presumably the number of TTS steps (the preview note
    # below refers to num_step=8) — confirm against the TTS call site.
    num_step: int = 16
    guidance_scale: float = 2.0
    speed: float = 1.0
    # Phase 4.1 — partial regen. Parallel lists by index with `segments`.
    # When `regen_only` is set, only listed segment ids re-run TTS; others
    # reuse their on-disk seg_N.wav. `segment_ids` lets the client bind
    # each segment to a stable id across regen cycles.
    # Neither list's length is checked against `segments` by this model.
    segment_ids: Optional[List[str]] = None
    regen_only: Optional[List[str]] = None
    # Fast-preview mode for interactive edits. When true, TTS runs at
    # num_step=8 (~2× faster, ~10-20% quality drop). Client is responsible
    # for re-rendering preview segs at full quality before final export.
    preview: Optional[bool] = False
    # How to handle segs whose TTS audio is longer than its slot (the
    # "ghost lang" overlap bug otherwise). Options:
    #   "time_stretch" — phase-vocoder stretch to fit, preserves pitch (default).
    #   "trim"         — hard-clip to slot length + fade out (cheap, may cut mid-word).
    #   "off"          — no fit; mix layers with += (legacy behaviour, may overlap).
    # The field is a free-form optional string: values outside the three
    # options above (and None) are accepted by this model.
    slot_fit: Optional[str] = "time_stretch"

class TranslateSegment(BaseModel):
    """One piece of text to translate, tagged with a client-supplied id."""
    id: str  # Required. Identifier for this segment.
    text: str  # Required. Text to translate.
    # Optional per-segment target language.
    # NOTE(review): presumably overrides TranslateRequest.target_lang when
    # set — confirm against the translation handler.
    target_lang: Optional[str] = None

class TranslateRequest(BaseModel):
    """Request body for translating a batch of segments into one target language.

    `segments` and `target_lang` are required; every other field is optional.
    """
    segments: List[TranslateSegment]  # Required. Each item is validated as a TranslateSegment.
    target_lang: str  # ISO 639-1 code like "es", "fr"
    provider: Optional[str] = None  # None means no provider was specified by the client
    source_lang: Optional[str] = None  # ISO 639-1; overrides job detection
    job_id: Optional[str] = None  # Dub job id, used to resolve detected source_lang
    # The two documented values are listed below; the field itself is an
    # unconstrained optional string.
    quality: Optional[str] = "fast"  # "fast" (one-shot) | "cinematic" (reflect → adapt)
    # Entries are untyped dicts; the key names shown are documentation only
    # and are not enforced by this model.
    glossary: Optional[List[dict]] = None  # [{"source": "...", "target": "...", "note": "..."}]

class DubIngestUrlRequest(BaseModel):
    """Request body for ingesting media from a URL, optionally with subtitles.

    Only `url` is required.
    """
    url: str  # Required. Not validated as a URL by this model (plain str).
    job_id: Optional[str] = None  # Optional job id; None when the client supplies none
    # When true and the URL is a caption-bearing host (YouTube, Vimeo, TED…),
    # ask yt-dlp to also download the original-language + any additional
    # sub_langs as VTT. The UI uses this to seed a transcript without running
    # Whisper, and optionally to skip the Translate step for languages that
    # YouTube auto-translates for us.
    fetch_subs: Optional[bool] = False
    sub_langs: Optional[List[str]] = None  # Additional subtitle languages to request (see above)

class ProjectSaveRequest(BaseModel):
    """Request body for saving a project: a name, optional media info, and a state blob.

    `name` and `state` are required; the media fields default to None.
    """
    name: str  # Required. Project name.
    video_path: Optional[str] = None
    audio_path: Optional[str] = None
    duration: Optional[float] = None  # NOTE(review): units not stated — presumably seconds
    # Accepted as an arbitrary dict; its contents are not validated here.
    state: dict  # Full JSON blob: segments, settings, tracks, etc.