-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
282 lines (250 loc) · 8.38 KB
/
main.py
File metadata and controls
282 lines (250 loc) · 8.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
#!/usr/bin/env python3
# main.py
"""
whispr - CLI Entry Point
Description:
Command-line interface for the modular transcription system.
Supports local files, YouTube URLs, and generic HTTP URLs.
Multiple backends: whisper.cpp, faster-whisper, OpenAI API.
Created By : Franck FERMAN
Version : 2.0.0
Usage examples:
python main.py --config config.json
python main.py --url https://youtube.com/watch?v=... --backend whisper_cpp --chunks 4 --language fr
python main.py --file video.mp4 --backend faster_whisper --workers 2
python main.py --file audio.wav --backend openai
python main.py --dry-run --url https://... --backend whisper_cpp
"""
import argparse
import sys
from transcriber.config import TranscriptionConfig
from transcriber.logger import setup_logging
from transcriber.managers.transcription import TranscriptionManager
def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the whispr transcription CLI.

    Returns:
        argparse.ArgumentParser: a parser exposing input selection
        (--file / --url, mutually exclusive), backend choice, processing,
        output, retry and logging options.
    """
    parser = argparse.ArgumentParser(
        prog="main.py",
        description="Modular video/audio transcription system with multi-backend support.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
examples:
Load full config from file:
python main.py --config config.json
Transcribe a YouTube video with whisper.cpp, 4 workers, French:
python main.py --url https://youtube.com/watch?v=... --backend whisper_cpp --workers 4 --language fr
Transcribe a local video with faster-whisper, 2 parallel workers:
python main.py --file video.mp4 --backend faster_whisper --workers 2
Transcribe a local audio file via OpenAI API:
python main.py --file audio.wav --backend openai --openai-key sk-...
Dry run (no output written):
python main.py --dry-run --file audio.mp3 --backend whisper_cpp
""",
    )

    # --file and --url are alternative input sources; argparse enforces
    # that at most one of them is given.
    source = parser.add_mutually_exclusive_group()
    source.add_argument(
        "--file", "-f",
        metavar="PATH",
        help="Path to a local audio/video file.",
    )
    source.add_argument(
        "--url", "-u",
        metavar="URL",
        help="URL to a YouTube video or direct audio/video URL.",
    )

    backend_choices = ["whisper_cpp", "faster_whisper", "openai"]

    # Remaining options are declared as (flags, kwargs) specs and
    # registered in one pass, grouped in the same order as the help output.
    specs = [
        # -- Config file --
        (("--config", "-c"),
         dict(metavar="FILE",
              help="Path to a JSON configuration file. CLI arguments override config file values.")),
        # -- Backend --
        (("--backend", "-b"),
         dict(choices=backend_choices, metavar="BACKEND",
              help="Transcription backend: whisper_cpp | faster_whisper | openai (default: whisper_cpp).")),
        (("--fallback-backend",),
         dict(choices=backend_choices, metavar="BACKEND",
              help="Fallback backend if the primary backend fails all retries.")),
        # -- Backend-specific --
        (("--whisper-binary",),
         dict(metavar="PATH",
              help="Path to the whisper.cpp binary (default: 'whisper').")),
        (("--whisper-model",),
         dict(metavar="PATH",
              help="Path to the GGML model file for whisper.cpp.")),
        (("--fw-model",),
         dict(metavar="NAME",
              help="faster-whisper model size (tiny/base/small/medium/large-v2, default: base).")),
        (("--fw-device",),
         dict(choices=["cpu", "cuda"], metavar="DEVICE",
              help="faster-whisper inference device (cpu or cuda, default: cpu).")),
        (("--openai-key",),
         dict(metavar="KEY",
              help="OpenAI API key (can also be set via OPENAI_API_KEY env var).")),
        (("--openai-model",),
         dict(metavar="MODEL",
              help="OpenAI model name (default: whisper-1).")),
        # -- Processing --
        (("--language", "-l"),
         dict(metavar="LANG",
              help="ISO 639-1 language code (e.g. 'fr', 'en'). Auto-detect if omitted.")),
        (("--chunk-duration",),
         dict(type=int, metavar="SECONDS",
              help="Duration of each audio chunk in seconds (default: 600).")),
        (("--workers", "-w"),
         dict(type=int, metavar="N",
              help="Number of parallel transcription workers (default: 2).")),
        (("--temp-dir",),
         dict(metavar="DIR",
              help="Directory for temporary files (default: OS temp dir).")),
        # -- Output --
        (("--format", "-F"),
         dict(dest="output_format", metavar="FMT",
              help=(
                  "Output format(s), comma-separated: txt,json,srt,vtt "
                  "(default: txt). Example: --format txt,srt"
              ))),
        (("--output-dir", "-o"),
         dict(metavar="DIR",
              help="Directory where output files are written (default: current directory).")),
        (("--output-prefix",),
         dict(metavar="PREFIX",
              help="Base filename prefix for output files (default: transcript).")),
        # -- Retry --
        (("--max-retries",),
         dict(type=int, metavar="N",
              help="Maximum retry attempts per chunk on backend failure (default: 3).")),
        # -- Misc --
        (("--dry-run",),
         dict(action="store_true",
              help="Print what would be done without performing transcription or writing files.")),
        (("--debug",),
         dict(action="store_true",
              help="Enable DEBUG-level logging.")),
        (("--log-file",),
         dict(metavar="FILE",
              help="Write logs to this file. Use 'auto' for a timestamped filename.")),
    ]
    for flags, kwargs in specs:
        parser.add_argument(*flags, **kwargs)

    return parser
# String-valued CLI attributes mapped to their TranscriptionConfig keys.
# Empty strings count as "not provided" (same truthiness test as before).
_STRING_OVERRIDES = [
    ("file", "input_file"),
    ("url", "input_url"),
    ("backend", "backend"),
    ("fallback_backend", "fallback_backend"),
    ("language", "language"),
    ("temp_dir", "temp_dir"),
    ("output_dir", "output_dir"),
    ("output_prefix", "output_prefix"),
    ("log_file", "log_file"),
    # Backend-specific
    ("whisper_binary", "whisper_cpp_binary"),
    ("whisper_model", "whisper_cpp_model"),
    ("fw_model", "faster_whisper_model"),
    ("fw_device", "faster_whisper_device"),
    ("openai_key", "openai_api_key"),
    ("openai_model", "openai_model"),
]

# Integer-valued CLI attributes. Presence is tested with `is not None` so an
# explicit 0 is forwarded to config validation instead of silently dropped.
# (The original used truthiness for chunk_duration/workers but `is not None`
# for max_retries — `--workers 0` was ignored while `--max-retries 0` was
# honored; this makes all three consistent.)
_INT_OVERRIDES = [
    ("chunk_duration", "chunk_duration_seconds"),
    ("workers", "workers"),
    ("max_retries", "max_retries"),
]


def _cli_overrides(args: argparse.Namespace) -> dict:
    """Collect explicitly-supplied CLI values into a config-override dict.

    Only values the user actually provided are included, so config-file
    settings survive for everything left at its argparse default.
    """
    overrides: dict = {}
    for attr, key in _STRING_OVERRIDES:
        value = getattr(args, attr)
        if value:
            overrides[key] = value
    for attr, key in _INT_OVERRIDES:
        value = getattr(args, attr)
        if value is not None:
            overrides[key] = value
    # store_true flags: propagate only True so an absent flag never clobbers
    # a True value coming from the config file.
    for flag in ("dry_run", "debug"):
        if getattr(args, flag):
            overrides[flag] = True
    # --format is a comma-separated list; blank entries are discarded.
    if args.output_format:
        formats = [f.strip() for f in args.output_format.split(",") if f.strip()]
        if formats:
            overrides["output_formats"] = formats
    return overrides


def main() -> int:
    """CLI entry point.

    Returns:
        int: process exit code — 0 on success, 2 for configuration/input
        errors (ValueError, FileNotFoundError), 1 for runtime failures,
        130 when interrupted by Ctrl-C.
    """
    parser = build_parser()
    args = parser.parse_args()

    # Configure logging before any other output so early errors are captured.
    setup_logging(
        debug=args.debug,
        log_file=args.log_file,
    )

    # Base config comes from the JSON file when given, defaults otherwise;
    # explicit CLI arguments always win over config-file values.
    if args.config:
        config = TranscriptionConfig.from_json_file(args.config)
    else:
        config = TranscriptionConfig()
    config.apply_overrides(_cli_overrides(args))

    try:
        manager = TranscriptionManager(config)
        manager.run()
        return 0
    except (ValueError, FileNotFoundError) as exc:
        print(f"[ERROR] {exc}", file=sys.stderr)
        return 2
    except RuntimeError as exc:
        print(f"[ERROR] {exc}", file=sys.stderr)
        return 1
    except KeyboardInterrupt:
        # KeyboardInterrupt is not an Exception subclass, so it must be
        # caught explicitly; 130 is the conventional SIGINT exit code.
        print("\n[INFO] Interrupted by user.", file=sys.stderr)
        return 130


if __name__ == "__main__":
    sys.exit(main())