|
16 | 16 | from .common import ApiKeyPool, start_daemon_thread, is_url, WARNING, ERROR, INFO |
17 | 17 | from .audio_getter import StreamAudioGetter, LocalFileAudioGetter, DeviceAudioGetter |
18 | 18 | from .audio_slicer import AudioSlicer |
19 | | -from .audio_transcriber import OpenaiWhisper, FasterWhisper, SimulStreaming, RemoteOpenaiTranscriber |
| 19 | +from .audio_transcriber import OpenaiWhisper, FasterWhisper, SimulStreaming, RemoteOpenaiTranscriber, HFTranscriber |
20 | 20 | from .llm_translator import LLMClient, ParallelTranslator, SerialTranslator |
21 | 21 | from .result_exporter import ResultExporter |
22 | 22 | from . import __version__ |
|
25 | 25 | def main(url, openai_api_key, google_api_key, openai_base_url, google_base_url, proxy, format, cookies, input_proxy, |
26 | 26 | device_index, device_recording_interval, mic, min_audio_length, max_audio_length, target_audio_length, |
27 | 27 | continuous_no_speech_threshold, disable_dynamic_no_speech_threshold, prefix_retention_length, vad_threshold, |
28 | | - disable_dynamic_vad_threshold, model, language, use_faster_whisper, use_simul_streaming, |
| 28 | + disable_dynamic_vad_threshold, model, language, use_faster_whisper, use_simul_streaming, use_hf_asr, |
29 | 29 | use_openai_transcription_api, openai_transcription_model, transcription_filters, disable_transcription_context, |
30 | 30 | transcription_initial_prompt, gpt_model, gemini_model, translation_prompt, translation_history_size, |
31 | 31 | translation_timeout, use_json_result, retry_if_translation_fails, temperature, top_p, top_k, prompt_cache_key, |
@@ -97,6 +97,8 @@ def init_transcriber(): |
97 | 97 | language=language, |
98 | 98 | proxy=processing_proxy, |
99 | 99 | **common_args) |
| 100 | + elif use_hf_asr: |
| 101 | + return HFTranscriber(model=model, language=language, proxy=processing_proxy, **common_args) |
100 | 102 | else: |
101 | 103 | return OpenaiWhisper(model=model, language=language, **common_args) |
102 | 104 |
|
@@ -334,6 +336,10 @@ def cli(): |
334 | 336 | type=str, |
335 | 337 | default='gpt-4o-mini-transcribe', |
336 | 338 | help='OpenAI\'s transcription model name, whisper-1 / gpt-4o-mini-transcribe / gpt-4o-transcribe') |
| 339 | + parser.add_argument( |
| 340 | + '--use_hf_asr', |
| 341 | + action='store_true', |
| 342 | + help='Set this flag to use a HuggingFace ASR model (via transformers pipeline) specified by --model.') |
337 | 343 | parser.add_argument( |
338 | 344 | '--transcription_filters', |
339 | 345 | type=str, |
@@ -541,11 +547,14 @@ def cli(): |
541 | 547 | if args['use_openai_transcription_api']: |
542 | 548 | transcription_encoder_flag_num += 1 |
543 | 549 | transcription_decoder_flag_num += 1 |
| 550 | + if args['use_hf_asr']: |
| 551 | + transcription_encoder_flag_num += 1 |
| 552 | + transcription_decoder_flag_num += 1 |
544 | 553 | if transcription_encoder_flag_num > 1: |
545 | | - print(f'{ERROR}Cannot use Faster Whisper or OpenAI Transcription API at the same time') |
| 554 | + print(f'{ERROR}Cannot use Faster Whisper, OpenAI Transcription API or HuggingFace ASR at the same time') |
546 | 555 | sys.exit(0) |
547 | 556 | if transcription_decoder_flag_num > 1: |
548 | | - print(f'{ERROR}Cannot use Simul Streaming or OpenAI Transcription API at the same time') |
| 557 | + print(f'{ERROR}Cannot use Simul Streaming, OpenAI Transcription API or HuggingFace ASR at the same time') |
549 | 558 | sys.exit(0) |
550 | 559 |
|
551 | 560 | if args['use_openai_transcription_api'] and not args['openai_api_key']: |
|
0 commit comments