Commit ee6f7f2

🐞 fix(transcribe): fix a bunch of issues with funasr (#22)

* 🐞 fix(transcribe): fix a bunch of issues with funasr
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  (for more information, see https://pre-commit.ci)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

1 parent 4c2acf1 · commit ee6f7f2

File tree: 4 files changed, +60 / -15 lines


README.md (+2 -2)

@@ -13,6 +13,7 @@ This repo contains some scripts for audio processing. Main features include:
 - [x] Audio data statistics (supports determining audio length)
 - [x] Audio resampling
 - [x] Audio transcribe (.lab)
+- [x] Audio transcribe via FunASR (use `--model-type funasr` to enable, detailed usage can be found at code)
 - [ ] Audio transcribe via WhisperX
 
 ([ ] indicates not completed, [x] indicates completed)
@@ -22,11 +23,10 @@ This repo contains some scripts for audio processing. Main features include:
 ## Getting Started:
 
 ```
-pip install -e .
+pip install -e .
 fap --help
 ```
 
 ## Reference
 
 - [Batch Whisper](https://github.com/Blair-Johnson/batch-whisper)
-
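
The new README entry points at `--model-type funasr`. For reference, here is one way to exercise the flag in-process with Click's test runner. This is a sketch, not part of the commit: it assumes `transcribe` is exposed as a Click command that takes the input directory as a positional argument (only `--model-size` and `--model-type` are confirmed by this diff) and that a local `dataset/` directory with audio files exists.

```python
# Minimal sketch (not part of this commit): drive the updated CLI in-process
# with Click's test runner instead of invoking `fap transcribe` from a shell.
# Assumption: the input directory is a positional argument; only --model-size
# and --model-type are confirmed by this diff.
from click.testing import CliRunner

from fish_audio_preprocess.cli.transcribe import transcribe

runner = CliRunner()

# FunASR path: --model-size is omitted on purpose, so the command should fall
# back to "paraformer-zh" (see the ParameterSource check in cli/transcribe.py).
result = runner.invoke(transcribe, ["dataset", "--model-type", "funasr"])
print(result.exit_code)
print(result.output)
```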

README.zh.md (+3 -2)

@@ -13,6 +13,8 @@
 - [x] 音频数据统计(支持判断音频长度)
 - [x] 音频重采样
 - [x] 音频打标 (.lab)
+- [x] 音频打标 FunASR(使用 `--model-type funasr` 开启, 详细使用方法可查看代码)
+- [ ] 音频打标 WhisperX
 
 ([ ] 表示未完成, [x] 表示已完成)
 
@@ -21,11 +23,10 @@
 ## 上手指南:
 
 ```
-pip install -e .
+pip install -e .
 fap --help
 ```
 
 ## 引用
 
 - [Batch Whisper](https://github.com/Blair-Johnson/batch-whisper)
-

fish_audio_preprocess/cli/transcribe.py (+25 -4)

@@ -7,7 +7,7 @@
 from tqdm import tqdm
 
 from fish_audio_preprocess.utils.file import AUDIO_EXTENSIONS, list_files, split_list
-from fish_audio_preprocess.utils.transcribe import batch_transcribe
+from fish_audio_preprocess.utils.transcribe import ASRModelType, batch_transcribe
 
 
 def replace_lastest(string, old, new):
@@ -32,8 +32,9 @@ def replace_lastest(string, old, new):
 )
 @click.option(
     "--model-size",
-    help="whisper model size or funasr",
-    default="tiny",
+    # whisper defaults to medium, funasr defaults to paraformer-zh
+    help="asr model size(default medium for whisper, paraformer-zh for funasr)",
+    default="medium",
     show_default=True,
     type=str,
 )
@@ -42,22 +43,41 @@ def replace_lastest(string, old, new):
     default=False,
     help="Search recursively",
 )
+@click.option(
+    "--model-type",
+    help="ASR model type (funasr or whisper)",
+    default="whisper",
+    show_default=True,
+)
 def transcribe(
     input_dir: str,
     num_workers: int,
     lang: str,
     model_size: str,
     recursive: bool,
+    model_type: ASRModelType,
 ):
     """
     Transcribe audio files in a directory.
     """
+    ctx = click.get_current_context()
+    provided_options = {
+        key: value
+        for key, value in ctx.params.items()
+        if ctx.get_parameter_source(key) == click.core.ParameterSource.COMMANDLINE
+    }
+
+    # If model_type is funasr and no model_size was provided, default to paraformer-zh
+    if model_type == "funasr" and "model_size" not in provided_options:
+        logger.info("Using paraformer-zh model for funasr as default")
+        model_size = "paraformer-zh"
+
     if not torch.cuda.is_available():
         logger.warning(
             "CUDA is not available, using CPU. This will be slow and even this script can not work. "
             "To speed up, use a GPU enabled machine or install torch with cuda builtin."
         )
-
+    logger.info(f"Using {num_workers} workers for processing")
     logger.info(f"Transcribing audio files in {input_dir}")
     # Scan all audio files
     audio_files = list_files(input_dir, recursive=recursive)
@@ -78,6 +98,7 @@ def transcribe(
                 batch_transcribe,
                 files=chunk,
                 model_size=model_size,
+                model_type=model_type,
                 lang=lang,
                 pos=len(tasks),
             )
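
The funasr default above works by asking Click where each parameter's value came from: `ctx.get_parameter_source()` returns `ParameterSource.COMMANDLINE` only for options the user actually typed, so a declared `default=` no longer hides "not provided". Below is a self-contained sketch of the same pattern with a hypothetical `greet` command (not from this repo), just to illustrate the mechanism.

```python
# Standalone illustration of the ParameterSource pattern used above
# (hypothetical command, not part of fish-audio-preprocess).
import click


@click.command()
@click.option("--name", default="world", show_default=True)
@click.option("--greeting", default="hello", show_default=True)
def greet(name: str, greeting: str):
    ctx = click.get_current_context()
    # Keep only the options the user explicitly passed on the command line;
    # values that merely fell back to `default=` are filtered out.
    provided = {
        key: value
        for key, value in ctx.params.items()
        if ctx.get_parameter_source(key) == click.core.ParameterSource.COMMANDLINE
    }
    # Pick a different effective default only when --greeting was not overridden.
    if "greeting" not in provided and name != "world":
        greeting = "hi"
    click.echo(f"{greeting}, {name}")


if __name__ == "__main__":
    greet()
```
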
fish_audio_preprocess/utils/transcribe.py (+30 -7)

@@ -1,5 +1,7 @@
 from pathlib import Path
+from typing import Literal
 
+from loguru import logger
 from tqdm import tqdm
 
 PROMPT = {
@@ -8,12 +10,21 @@
     "jp": "先進技術の領域において、人工知能の進化は画期的な成果として立っています。常に機械ができることの限界を押し広げているこのダイナミックな分野は、急速な成長と革新を見せています。複雑なデータパターンの解読から自動運転車の操縦まで、AIの応用は広範囲に及びます。",
 }
 
+ASRModelType = Literal["funasr", "whisper"]
 
-def batch_transcribe(files: list[Path], model_size: str, lang: str, pos: int):
+
+def batch_transcribe(
+    files: list[Path],
+    model_size: str,
+    model_type: ASRModelType,
+    lang: str,
+    pos: int,
+):
     results = {}
-    if "funasr" not in model_size:
+    if model_type == "whisper":
         import whisper
 
+        logger.info(f"Loading {model_size} model for {lang} transcription")
         model = whisper.load_model(model_size)
         for file in tqdm(files, position=pos):
             if lang in PROMPT:
@@ -23,17 +34,29 @@ def batch_transcribe(files: list[Path], model_size: str, lang: str, pos: int):
             else:
                 result = model.transcribe(file, language=lang)
             results[str(file)] = result["text"]
-    else:
+    elif model_type == "funasr":
         from funasr import AutoModel
 
+        logger.info(f"Loading {model_size} model for {lang} transcription")
         model = AutoModel(
-            model="paraformer-zh",
+            model=model_size,
+            vad_model="fsmn-vad",
             punc_model="ct-punc",
             log_level="ERROR",
            disable_pbar=True,
        )
         for file in tqdm(files, position=pos):
-            result = model.generate(input=file, batch_size_s=300)
-            results[str(file)] = result[0]["text"]
-
+            if lang in PROMPT:
+                result = model.generate(
+                    input=file, batch_size_s=300, hotword=PROMPT[lang]
+                )
+            else:
+                result = model.generate(input=file, batch_size_s=300)
+            # print(result)
+            if isinstance(result, list):
+                results[str(file)] = "".join([item["text"] for item in result])
+            else:
+                results[str(file)] = result["text"]
+    else:
+        raise ValueError(f"Unsupported model type: {model_type}")
     return results
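
The updated helper can also be called directly, without going through the CLI. A minimal sketch follows; it assumes a local `wavs/` directory with `.wav` files and that `funasr` (or `openai-whisper`) and `torch` are installed. The argument names match the new signature in this diff.

```python
# Minimal sketch (not part of this commit): call the updated helper directly
# instead of going through `fap transcribe`. Assumes a local `wavs/` folder
# with .wav files and that funasr plus torch are installed.
from pathlib import Path

from fish_audio_preprocess.utils.transcribe import batch_transcribe

files = sorted(Path("wavs").glob("*.wav"))

# FunASR branch: model_size is the FunASR model name, e.g. "paraformer-zh".
# If "zh" has an entry in PROMPT (not shown in this hunk), it is also
# forwarded to model.generate() as a hotword.
results = batch_transcribe(
    files=files,
    model_size="paraformer-zh",
    model_type="funasr",
    lang="zh",
    pos=0,
)

for path, text in results.items():
    # Downstream, the CLI turns these transcripts into .lab files; here we just print.
    print(path, text[:50])
```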
