whisper-transcriber/main.py at master · Samk13/whisper-transcriber · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
import subprocess
from dataclasses import dataclass
from enum import Enum
from pathlib import Path
from time import perf_counter

import typer
import whisper
from rich.console import Console
from rich.markup import escape
from rich.panel import Panel
from rich.table import Table


class WhisperModel(str, Enum):
    """Whisper model names accepted by the ``--model`` option.

    Each member's value is the string handed to ``whisper.load_model``.
    """

    TINY_EN = "tiny.en"
    TINY = "tiny"
    BASE_EN = "base.en"
    BASE = "base"
    SMALL_EN = "small.en"
    SMALL = "small"
    MEDIUM_EN = "medium.en"
    MEDIUM = "medium"
    LARGE = "large"
    TURBO = "turbo"


class WhisperTask(str, Enum):
    """Task names accepted by the ``--task`` option.

    The member value is passed as the ``task`` keyword to the model's
    ``transcribe`` call.
    """

    TRANSCRIBE = "transcribe"
    TRANSLATE = "translate"


@dataclass(frozen=True)
class RunConfig:
    """Immutable settings for one CLI run, built from the command-line options."""

    # Directory that receives one "<stem>.txt" transcript per audio file.
    output_dir: Path
    # Directory scanned for .mov and .mp3 files in batch mode.
    input_dir: Path
    # Script run through bash, once per MOV file, to produce the MP3.
    extractor_script: Path
    # Spoken language passed to Whisper; None is shown as "auto" in the header.
    language: str | None
    # Whether to transcribe or translate.
    task: WhisperTask
    # Passed as Whisper's `verbose` flag (stream segments while processing).
    realtime: bool
    # When set, decode with beam_size=1, best_of=1, temperature=0.0.
    fast_decode: bool
    # When set, saved output lines are prefixed with "[start --> end]".
    timestamps: bool
    # Which Whisper model to load.
    model: WhisperModel


@dataclass(frozen=True)
class TranscriptionResult:
    """Summary of one transcribed file, used for the batch results table."""

    # Name (without directory) of the audio file that was transcribed.
    file_name: str
    # Language reported in the Whisper result, or "unknown" if absent.
    language: str
    # Path of the text file the transcript was written to.
    output_file: Path
    # Wall-clock time spent on this file, measured with perf_counter.
    elapsed_seconds: float


# Typer application; `main` is registered on it with @app.command() and it is
# invoked from the `__main__` guard at the bottom of the file.
app = typer.Typer(add_completion=False)
# Single Rich console shared by the services and by `main` for all output.
console = Console()


def list_files_by_suffix(directory: Path, suffix: str) -> list[Path]:
    """Return the regular files directly inside *directory* with the given suffix.

    The suffix comparison ignores case, sub-directories are not descended
    into, and the result is sorted by path.
    """
    wanted = suffix.lower()
    matches = [
        entry
        for entry in directory.iterdir()
        if entry.is_file() and entry.suffix.lower() == wanted
    ]
    matches.sort()
    return matches


def format_timestamp(seconds: float) -> str:
    """Format a duration in seconds as ``MM:SS.mmm`` or ``HH:MM:SS.mmm``.

    The hours field is only emitted when it is non-zero.

    Args:
        seconds: Offset in seconds. Negative values are clamped to zero.

    Returns:
        The zero-padded timestamp string with millisecond precision.
    """
    # Round instead of truncating: 1.001 * 1000 evaluates to
    # 1000.9999999999999 in binary floating point, and int() would report
    # 1000 ms rather than 1001 ms.
    # Clamp at zero because divmod on a negative total yields meaningless
    # positive minute/second fields.
    total_ms = max(0, round(seconds * 1000))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, ms = divmod(remainder, 1000)
    if hours > 0:
        return f"{hours:02d}:{minutes:02d}:{secs:02d}.{ms:03d}"
    return f"{minutes:02d}:{secs:02d}.{ms:03d}"


def render_output_text(raw_result: dict, include_timestamps: bool) -> str:
    """Build the transcript text that is saved and displayed.

    Without timestamps this is the stripped ``"text"`` entry of *raw_result*.
    With timestamps, every entry of ``"segments"`` that has non-blank text
    becomes one ``[start --> end] text`` line; if that yields no lines, the
    plain stripped text is returned instead.
    """

    def plain_text() -> str:
        # Evaluated lazily so it is only read on the paths that return it.
        return raw_result.get("text", "").strip()

    if not include_timestamps:
        return plain_text()

    stamped: list[str] = []
    for seg in raw_result.get("segments") or []:
        begin = format_timestamp(float(seg.get("start", 0.0)))
        finish = format_timestamp(float(seg.get("end", 0.0)))
        body = str(seg.get("text", "")).strip()
        if not body:
            continue
        stamped.append(f"[{begin} --> {finish}] {body}")

    return "\n".join(stamped) if stamped else plain_text()


class TranscriptionService:
    """Load one Whisper model and transcribe audio files with it.

    All dynamic text (paths, file names, transcript content) is escaped
    before being embedded in Rich markup, so square brackets in that text are
    shown literally instead of being parsed as style tags.
    """

    def __init__(self, cfg: RunConfig, ui: Console) -> None:
        """Store the configuration and console, then load the model.

        Args:
            cfg: Settings for this run.
            ui: Console used for all status and result output.
        """
        self.cfg = cfg
        self.ui = ui
        self.model = self._load_model()

    def _load_model(self) -> whisper.Whisper:
        """Load the configured Whisper model while showing a spinner."""
        with self.ui.status("[bold yellow]Loading model...[/bold yellow]", spinner="dots"):
            return whisper.load_model(self.cfg.model.value)

    def print_header(self, mode: str, source: Path | None = None) -> None:
        """Print a panel summarising the run settings.

        Args:
            mode: Label shown on the "Mode" line (e.g. "single" or "batch").
            source: Input file to show; when omitted the configured input
                directory is shown instead.
        """
        # Paths come from the command line; escape them so a name such as
        # "talk [final].mp3" is not interpreted as markup.
        source_line = (
            f"Input: [green]{escape(str(source))}[/green]"
            if source
            else f"Input dir: [green]{escape(str(self.cfg.input_dir))}[/green]"
        )
        self.ui.print(
            Panel.fit(
                f"[bold cyan]Whisper Transcriber[/bold cyan]\n"
                f"Mode: [magenta]{mode}[/magenta]\n"
                f"{source_line}\n"
                f"Model: [magenta]{self.cfg.model.value}[/magenta]\n"
                f"Task: [magenta]{self.cfg.task.value}[/magenta]\n"
                f"Language: [magenta]{self.cfg.language or 'auto'}[/magenta]\n"
                f"Realtime: [magenta]{'on' if self.cfg.realtime else 'off'}[/magenta]\n"
                f"Fast decode: [magenta]{'on' if self.cfg.fast_decode else 'off'}[/magenta]\n"
                f"Timestamps: [magenta]{'on' if self.cfg.timestamps else 'off'}[/magenta]",
                border_style="cyan",
            )
        )

    def print_translation_warning_if_needed(self) -> None:
        """Warn when the translate task is combined with the 'turbo' model."""
        if self.cfg.task == WhisperTask.TRANSLATE and self.cfg.model == WhisperModel.TURBO:
            self.ui.print(
                "[yellow]Warning:[/yellow] model 'turbo' is not ideal for translation. "
                "Prefer 'medium' or 'large'."
            )

    def transcribe_file(self, audio_file: Path) -> TranscriptionResult:
        """Transcribe one audio file, save the transcript and print a summary.

        The transcript is written as UTF-8 to ``<output_dir>/<stem>.txt``
        with a trailing newline, then echoed to the console.

        Args:
            audio_file: Path of the audio file to process.

        Returns:
            A summary holding the file name, the language reported by the
            model, the output path and the elapsed wall-clock seconds.
        """
        start_time = perf_counter()
        safe_name = escape(audio_file.name)
        if self.cfg.realtime:
            self.ui.print(
                f"[bold yellow]Streaming transcript for {safe_name}...[/bold yellow]"
            )
        else:
            self.ui.print(
                f"[bold yellow]Transcribing full audio: {safe_name}...[/bold yellow]"
            )

        transcribe_kwargs: dict[str, object] = {
            "language": self.cfg.language,
            "task": self.cfg.task.value,
            "verbose": self.cfg.realtime,
            # Avoid CPU FP16 warning/fallback path.
            "fp16": str(self.model.device) != "cpu",
        }
        if self.cfg.fast_decode:
            transcribe_kwargs.update(
                {
                    "beam_size": 1,
                    "best_of": 1,
                    "temperature": 0.0,
                }
            )

        raw_result = self.model.transcribe(str(audio_file), **transcribe_kwargs)
        transcript_text = render_output_text(
            raw_result, include_timestamps=self.cfg.timestamps
        )
        detected_language = raw_result.get("language", "unknown")

        output_file = self.cfg.output_dir / f"{audio_file.stem}.txt"
        output_file.write_text(f"{transcript_text}\n", encoding="utf-8")
        elapsed = perf_counter() - start_time

        details = Table.grid(padding=(0, 2))
        details.add_row("[bold]File[/bold]", f"[green]{safe_name}[/green]")
        details.add_row("[bold]Language[/bold]", f"[green]{detected_language}[/green]")
        details.add_row("[bold]Output[/bold]", f"[cyan]{escape(str(output_file))}[/cyan]")
        details.add_row("[bold]Elapsed[/bold]", f"[yellow]{elapsed:.2f}s[/yellow]")
        self.ui.print(Panel(details, title="Done", border_style="green"))
        self.ui.print(
            Panel(
                # A plain str renderable is parsed as console markup. Without
                # escaping, spoken-text annotations like "[music]" would
                # vanish and a stray "[/x]" would raise MarkupError.
                escape(transcript_text),
                title=f"Transcript - {safe_name}",
                border_style="blue",
            )
        )

        return TranscriptionResult(
            file_name=audio_file.name,
            language=detected_language,
            output_file=output_file,
            elapsed_seconds=elapsed,
        )


class BatchTranscriber:
    """Convert MOV files to MP3, then transcribe every MP3 in the input directory.

    File names, paths and subprocess output are never interpreted as Rich
    markup: names are escaped and the extractor's stderr is printed with
    markup disabled.
    """

    def __init__(self, cfg: RunConfig, service: TranscriptionService, ui: Console) -> None:
        """Store the run settings, the transcription service and the console."""
        self.cfg = cfg
        self.service = service
        self.ui = ui

    def _convert_mov_files(self) -> None:
        """Run the extractor script on each MOV file that has no sibling MP3.

        Raises:
            typer.Exit: With code 1 when the extractor script is missing or
                when it exits with a non-zero status for a file.
        """
        mov_files = list_files_by_suffix(self.cfg.input_dir, ".mov")
        if not mov_files:
            self.ui.print("[yellow]No MOV files found for conversion.[/yellow]")
            return

        script = self.cfg.extractor_script
        safe_script = escape(str(script))
        if not script.exists():
            self.ui.print(f"[red]Extractor script not found:[/red] {safe_script}")
            raise typer.Exit(code=1)

        self.ui.print(
            f"[cyan]Found {len(mov_files)} MOV file(s); converting to MP3 with {safe_script}[/cyan]"
        )
        for mov_file in mov_files:
            mp3_file = mov_file.with_suffix(".mp3")
            safe_mov = escape(mov_file.name)
            safe_mp3 = escape(mp3_file.name)
            if mp3_file.exists():
                self.ui.print(
                    f"[yellow]Skipping conversion:[/yellow] {safe_mov} -> {safe_mp3} (already exists)"
                )
                continue

            with self.ui.status(
                f"[bold yellow]Converting {safe_mov} to MP3...[/bold yellow]",
                spinner="dots",
            ):
                try:
                    subprocess.run(
                        ["bash", str(script), str(mov_file)],
                        check=True,
                        capture_output=True,
                        text=True,
                    )
                except subprocess.CalledProcessError as exc:
                    self.ui.print(f"[red]Failed to convert {escape(str(mov_file))}[/red]")
                    if exc.stderr:
                        # Tool output often contains "[name @ 0x...]" style
                        # prefixes; print it verbatim rather than as markup.
                        self.ui.print(exc.stderr.strip(), markup=False)
                    raise typer.Exit(code=1) from exc
            self.ui.print(f"[green]Converted:[/green] {safe_mov} -> {safe_mp3}")

    def run(self) -> None:
        """Convert, transcribe every MP3 in name order and print a results table.

        Raises:
            typer.Exit: With code 1 when no MP3 file exists after conversion
                (or when conversion itself fails).
        """
        self._convert_mov_files()
        mp3_files = list_files_by_suffix(self.cfg.input_dir, ".mp3")
        if not mp3_files:
            self.ui.print(
                f"[red]No MP3 files found in {escape(str(self.cfg.input_dir))}.[/red]"
            )
            raise typer.Exit(code=1)

        results: list[TranscriptionResult] = []
        for mp3_file in mp3_files:
            self.ui.rule(f"[bold white]Transcribing {escape(mp3_file.name)}")
            results.append(self.service.transcribe_file(mp3_file))

        table = Table(title="Batch Results", border_style="green")
        table.add_column("File", style="cyan")
        table.add_column("Language", style="green")
        table.add_column("Output", style="blue")
        table.add_column("Elapsed", style="yellow", justify="right")
        for result in results:
            # String cells are rendered as markup, so escape the path-derived ones.
            table.add_row(
                escape(result.file_name),
                result.language,
                escape(str(result.output_file)),
                f"{result.elapsed_seconds:.2f}s",
            )
        self.ui.print(table)


@app.command()
def main(
    audio_file: Path | None = typer.Argument(
        None,
        help="Path to one MP3 file. If omitted, batch mode runs over data/input.",
    ),
    output_dir: Path = typer.Option(
        Path("data/output"),
        "--output-dir",
        help="Directory where transcript files are saved (default: data/output)",
    ),
    input_dir: Path = typer.Option(
        Path("data/input"),
        "--input-dir",
        help="Input directory used for batch mode (default: data/input)",
    ),
    extractor_script: Path = typer.Option(
        Path("scripts/extract_audio.sh"),
        "--extractor-script",
        help="Path to MOV-to-MP3 extractor script.",
    ),
    language: str | None = typer.Option(
        None,
        "--language",
        help="Optional spoken language (e.g. English, Japanese, sv).",
    ),
    task: WhisperTask = typer.Option(
        WhisperTask.TRANSCRIBE,
        "--task",
        case_sensitive=False,
        help="Use transcribe (default) or translate.",
    ),
    realtime: bool = typer.Option(
        False,
        "--realtime/--no-realtime",
        help="Stream transcript segments while processing.",
    ),
    fast_decode: bool = typer.Option(
        True,
        "--fast-decode/--no-fast-decode",
        help="Use faster decoding settings (beam_size=1, best_of=1, temperature=0).",
    ),
    timestamps: bool = typer.Option(
        False,
        "--timestamps/--no-timestamps",
        help="Include [start --> end] timestamps in saved output files.",
    ),
    model: WhisperModel = typer.Option(
        WhisperModel.TURBO,
        "--model",
        case_sensitive=True,
        help="Whisper model to use (default: turbo)",
    ),
) -> None:
    """Transcribe one audio file, or every MOV/MP3 file in the input directory."""
    # NOTE: the docstring above is kept to one line because Typer uses a
    # command's docstring as its --help description.
    run_start = perf_counter()

    # Validate the single-file argument before loading the model, so a
    # mistyped path is reported immediately instead of after the model load.
    # is_file() is already False for a path that does not exist.
    if audio_file is not None and not audio_file.is_file():
        console.print(f"[red]Audio file not found:[/red] {escape(str(audio_file))}")
        raise typer.Exit(code=1)

    output_dir.mkdir(parents=True, exist_ok=True)
    input_dir.mkdir(parents=True, exist_ok=True)

    cfg = RunConfig(
        output_dir=output_dir,
        input_dir=input_dir,
        extractor_script=extractor_script,
        language=language,
        task=task,
        realtime=realtime,
        fast_decode=fast_decode,
        timestamps=timestamps,
        model=model,
    )

    # Constructing the service loads the Whisper model.
    service = TranscriptionService(cfg, console)
    service.print_translation_warning_if_needed()

    if audio_file is not None:
        service.print_header("single", source=audio_file)
        service.transcribe_file(audio_file)
    else:
        service.print_header("batch")
        BatchTranscriber(cfg, service, console).run()

    total_seconds = perf_counter() - run_start
    console.print(
        Panel.fit(
            f"[bold green]Run complete[/bold green]\nTotal elapsed: [yellow]{total_seconds:.2f}s[/yellow]",
            border_style="green",
        )
    )


if __name__ == "__main__":
    app()