open-mmlab · yuantuo666 · Mar 3, 2025 · Jan 13, 2025 · Feb 21, 2025 · Mar 3, 2025
diff --git a/models/tta/picoaudio/picoaudio/audioldm/clap/open_clip/model.py b/models/tta/picoaudio/picoaudio/audioldm/clap/open_clip/model.py
@@ -1,4 +1,4 @@
-""" CLAP Model
+"""CLAP Model
 
 Adapted from CLIP: https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI.
 Adapted to the Audio Task.

diff --git a/models/tta/picoaudio/picoaudio/audioldm/clap/open_clip/openai.py b/models/tta/picoaudio/picoaudio/audioldm/clap/open_clip/openai.py
@@ -1,4 +1,4 @@
-""" OpenAI pretrained model functions
+"""OpenAI pretrained model functions
 
 Adapted from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI.
 """

diff --git a/models/tta/picoaudio/picoaudio/audioldm/clap/open_clip/timm_model.py b/models/tta/picoaudio/picoaudio/audioldm/clap/open_clip/timm_model.py
@@ -1,4 +1,4 @@
-""" timm model adapter
+"""timm model adapter
 
 Wraps timm (https://github.com/rwightman/pytorch-image-models) models for use as a vision tower in CLIP model.
 """

diff --git a/models/tta/picoaudio/picoaudio/audioldm/clap/open_clip/tokenizer.py b/models/tta/picoaudio/picoaudio/audioldm/clap/open_clip/tokenizer.py
@@ -1,4 +1,4 @@
-""" CLIP tokenizer
+"""CLIP tokenizer
 
 Copied from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI.
 """

diff --git a/models/tts/debatts/utils/g2p/english.py b/models/tts/debatts/utils/g2p/english.py
@@ -1,4 +1,4 @@
-""" from https://github.com/keithito/tacotron """
+"""from https://github.com/keithito/tacotron"""
 
 import re
 from unidecode import unidecode

diff --git a/models/tts/valle_v2/modeling_llama.py b/models/tts/valle_v2/modeling_llama.py
@@ -23,7 +23,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" PyTorch LLaMA model."""
+"""PyTorch LLaMA model."""
 import math
 from typing import List, Optional, Tuple, Union
 

diff --git a/preprocessors/Emilia/main.py b/preprocessors/Emilia/main.py
@@ -509,6 +509,9 @@ def main_process(audio_path, save_path=None, audio_name=None):
         logger.info("Using CPU")
         device_name = "cpu"
         device = torch.device(device_name)
+        # On CPU, whisperX expects the int8 compute type
+        logger.info("Overriding the compute type to int8")
+        args.compute_type = "int8"
 
     check_env(logger)
 

diff --git a/processors/audio_features_extractor.py b/processors/audio_features_extractor.py
@@ -11,7 +11,7 @@
 1. Acoustic features such as Mel Spectrogram, F0, Energy, etc.
 2. Content features such as phonetic posteriorgrams (PPG) and bottleneck features (BNF) from pretrained models
 
-Note: 
+Note:
 All the features extraction are designed to utilize GPU to the maximum extent, which can ease the on-the-fly extraction for large-scale dataset.
 
 """

diff --git a/processors/descriptive_text_features_extractor.py b/processors/descriptive_text_features_extractor.py
@@ -11,7 +11,7 @@
 The common descriptive text features include:
 1. Global semantic guidance features that extracted some pretrained text models like T5. It can be adopted to TTA, TTM, etc.
 
-Note: 
+Note:
 All the features extraction are designed to utilize GPU to the maximum extent, which can ease the on-the-fly extraction for large-scale dataset.
 
 """
diff --git a/processors/text_features_extractor.py b/processors/text_features_extractor.py
@@ -11,7 +11,7 @@
 The common text features include:
 1. phone features that are used for TTS, SVS, etc.
 
-Note: 
+Note:
 All the features extraction are designed to utilize GPU to the maximum extent, which can ease the on-the-fly extraction for large-scale dataset.
 
 """
diff --git a/text/__init__.py b/text/__init__.py
@@ -3,7 +3,7 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
-""" This code is modified from https://github.com/keithito/tacotron """
+"""This code is modified from https://github.com/keithito/tacotron"""
 import re
 from text import cleaners
 from text.symbols import symbols

diff --git a/text/cleaners.py b/text/cleaners.py
@@ -3,7 +3,7 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
-""" This code is modified from https://github.com/keithito/tacotron """
+"""This code is modified from https://github.com/keithito/tacotron"""
 
 """
 Cleaners are transformations that run over the input text at both training and eval time.

diff --git a/text/cmudict.py b/text/cmudict.py
@@ -3,7 +3,7 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
-"""  This code is modified from https://github.com/keithito/tacotron """
+"""This code is modified from https://github.com/keithito/tacotron"""
 
 import re
 

diff --git a/text/numbers.py b/text/numbers.py
@@ -3,7 +3,7 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
-""" This code is modified from https://github.com/keithito/tacotron """
+"""This code is modified from https://github.com/keithito/tacotron"""
 
 import inflect
 import re

diff --git a/text/symbols.py b/text/symbols.py
@@ -3,7 +3,7 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
-""" This code is modified from https://github.com/keithito/tacotron """
+"""This code is modified from https://github.com/keithito/tacotron"""
 
 """
 Defines the set of symbols used in text input to the model.

diff --git a/utils/cut_by_vad.py b/utils/cut_by_vad.py
@@ -3,7 +3,7 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
-""" This code is modified from https://github.com/facebookresearch/libri-light/blob/main/data_preparation/cut_by_vad.py"""
+"""This code is modified from https://github.com/facebookresearch/libri-light/blob/main/data_preparation/cut_by_vad.py"""
 import pathlib
 import soundfile as sf
 import numpy as np

diff --git a/utils/mfa_prepare.py b/utils/mfa_prepare.py
@@ -3,7 +3,7 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
-""" This code is modified from https://montreal-forced-aligner.readthedocs.io/en/latest/user_guide/performance.html"""
+"""This code is modified from https://montreal-forced-aligner.readthedocs.io/en/latest/user_guide/performance.html"""
 
 import os
 import subprocess
-Original file line number
+Diff line change
@@ Expand Up / @@ -11,7 +11,7 @@ @@
 . Acoustic features such as Mel Spectrogram, F0, Energy, etc.
 . Content features such as phonetic posteriorgrams (PPG) and bottleneck features (BNF) from pretrained models
-    Note:
+    Note:
     All the features extraction are designed to utilize GPU to the maximum extent, which can ease the on-the-fly extraction for large-scale dataset.
     """
@@ Expand Down @@