From ba4368b35eee3601a5030a667dfe3dfd2a2b31c7 Mon Sep 17 00:00:00 2001 From: fabiocat93 Date: Mon, 9 Sep 2024 18:18:40 +0200 Subject: [PATCH 1/7] adding speech_to_visemes --- TTS/STV/speech_to_visemes.py | 341 ++++++++++++++++++++++ TTS/chatTTS_handler.py | 66 ++++- TTS/melo_handler.py | 32 +- TTS/parler_handler.py | 29 +- arguments_classes/parler_tts_arguments.py | 2 +- connections/local_audio_streamer.py | 13 +- connections/socket_sender.py | 33 ++- listen_and_play.py | 37 ++- 8 files changed, 520 insertions(+), 33 deletions(-) create mode 100644 TTS/STV/speech_to_visemes.py diff --git a/TTS/STV/speech_to_visemes.py b/TTS/STV/speech_to_visemes.py new file mode 100644 index 0000000..11d16dd --- /dev/null +++ b/TTS/STV/speech_to_visemes.py @@ -0,0 +1,341 @@ +"""This module contains the SpeechToVisemes class, which handles the conversion of speech to visemes.""" +from transformers import pipeline +import logging +import numpy as np + +logger = logging.getLogger(__name__) +from typing import List, Dict, Any, Optional + +class SpeechToVisemes(): + """ + Handles the conversion of speech to visemes using a phoneme-to-viseme mapping. + + Attributes: + model_name (str): The name of the model to use for speech recognition. + device (str): The device to run the model on (e.g., "cpu", "mps", "cuda"). + gen_kwargs (dict): Additional generation parameters for the speech recognition pipeline. + asr_pipeline (transformers.Pipeline): The automatic speech recognition pipeline. + """ + + def __init__( + self, + model_name="bookbot/wav2vec2-ljspeech-gruut", + device="mps", + gen_kwargs={}, + ): + """ + Initializes the SpeechToVisemes class with the specified parameters. + + Args: + model_name (str, optional): The name of the model to use for speech recognition. + device (str, optional): The device to run the model on. + gen_kwargs (dict, optional): Additional generation parameters for the speech recognition pipeline. 
+ """ + self.device = device + self.gen_kwargs = gen_kwargs + + # Initialize the automatic speech recognition pipeline + self.asr_pipeline = pipeline( + "automatic-speech-recognition", model=model_name, device=device + ) + + def _map_phonemes_to_visemes( + self, + data: Dict[str, Any], + ) -> List[Dict[str, Any]]: + """ + Maps phonemes to corresponding visemes with timestamps. + + Refer to the following references for more information on the phoneme-to-viseme mapping: + - https://learn.microsoft.com/en-us/azure/ai-services/speech-service/how-to-speech-synthesis-viseme?tabs=visemeid&pivots=programming-language-python#map-phonemes-to-visemes + - https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-ssml-phonetic-sets + + Args: + data (Dict[str, Any]): A dictionary containing phoneme data, where `data['chunks']` + holds a list of phonemes and their timestamps. + + Returns: + List[Dict[str, Any]]: A list of dictionaries where each dictionary contains the viseme + ID and the corresponding timestamp. + """ + + def _phoneme_to_viseme(phoneme: str) -> List[int]: + """ + Converts a phoneme to its corresponding viseme(s). + + Args: + phoneme (str): The phoneme to map to viseme. + + Returns: + List[int]: A list of viseme IDs corresponding to the phoneme. 
+ """ + # inspired by https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-ssml-phonetic-sets + phoneme_viseme_map = { + # basic + 'æ': [1], 'ə': [1], 'ʌ': [1], 'ɑ': [2], 'ɔ': [3], 'ɛ': [4], 'ʊ': [4], 'ɝ': [5], 'j': [6], 'i': [6], 'ɪ': [6], + 'w': [7], 'u': [7], 'o': [8], 'aʊ': [9], 'ɔɪ': [10], 'aɪ': [11], 'h': [12], 'ɹ': [13], 'l': [14], 's': [15], + 'z': [15], 'ʃ': [16], 'tʃ': [16], 'dʒ': [16], 'ʒ': [16], 'ð': [17], 'f': [18], 'v': [18], 'd': [19], 't': [19], + 'n': [19], 'θ': [19], 'k': [20], 'g': [20], 'ŋ': [20], 'p': [21], 'b': [21], 'm': [21], ' ': [0], + + # ar-EG + "a": [2], "aː": [2], "i": [6], "iː": [6], "u": [7], "uː": [7], "b": [21], "d": [19], "g": [20], "k": [20], + "t": [19], "dˤ": [19], "q": [20], "tˤ": [19], "ʔ": [19], "f": [18], "h": [12], "ħ": [12], "s": [15], "θ": [19], + "z": [15], "ðˤ": [17], "ð": [17], "ɣ": [20], "x": [12], "ʃ": [16], "sˤ": [15], "j": [6], "w": [7], "l": [14], + "m": [21], "n": [19], "r": [13], "ʕ": [12], + + # bg-BG + "i": [6], "ɛ": [4], "ɔ": [3], "a": [2], "u": [7], "j͡a": [6, 2], "ɤ": [1], "j͡u": [6, 7], "n": [19], "ʒ": [16], + "k": [20], "t͡s": [19, 15], "t": [19], "p": [21], "r": [13], "s": [15], "d": [19], "x": [12], "zʲ": [15], + "lʲ": [14], "l": [14], "nʲ": [19], "v": [18], "m": [21], "b": [21], "g": [20], "d͡ʒ": [19, 16], "f": [18], + "mʲ": [21], "tʲ": [19], "rʲ": [13], "pʲ": [21], "dʲ": [19], "j": [6], "vʲ": [18], "sʲ": [15], "bʲ": [21], + "kʲ": [20], "gʲ": [20], "fʲ": [18], "z": [15], "ʃ": [16], "t͡ʃ": [19, 16], "d͡z": [19, 15], + + # ca-ES + "a": [2], "ɔ": [3], "ə": [1], "e": [4], "ɛ": [4], "i": [6], "o": [8], "u": [7], "b": [21], "β": [21], "t͡ʃ": [19, 16], + "d": [19], "ð": [17], "f": [18], "g": [20], "ɣ": [20], "j": [6], "d͡ʒ": [19, 16], "k": [20], "l": [14], "ʎ": [14], + "m": [21], "n": [19], "ŋ": [20], "ɲ": [19], "p": [21], "ɾ": [19], "r": [13], "s": [15], "ʃ": [16], "t": [19], + "θ": [19], "w": [7], "x": [12], "z": [15], "ʒ": [16], + + # cs-CZ + "ɪ": [6], "ɛ": [4], "a": [2], "o": [8], 
"u": [7], "iː": [6], "ɛː": [4], "aː": [2], "oː": [8], "uː": [7], + "o͡ʊ̯": [8, 4], "a͡ʊ": [2, 4], "ɛ͡ʊ̯": [4, 4], "ə": [1], "p": [21], "b": [21], "t": [19], "d": [19], "c": [16], + "ɟ": [16], "k": [20], "g": [20], "t͡s": [19, 15], "d͡z": [19, 15], "t͡ʃ": [19, 16], "d͡ʒ": [19, 16], "f": [18], + "v": [18], "s": [15], "z": [15], "r̝": [13], "ʃ": [16], "ʒ": [16], "j": [6], "x": [12], "ɦ": [12], "r": [13], + "l": [14], "m": [21], "n": [19], "ŋ": [20], "ɲ": [19], "ɱ": [21], "r̝̊": [13], + + # da-DK + "a": [2], "ɑ": [2], "ɑː": [2], "ɛ": [4], "ɛː": [4], "ɔ": [3], "ɒ": [2], "ɒː": [2], "ɔː": [3], "ɐ": [4], + "æː": [1], "e": [4], "ø": [1], "øː": [1], "ə": [1], "eː": [4], "i": [6], "iː": [6], "o": [8], "œ": [4], + "œː": [4], "oː": [8], "u": [7], "uː": [7], "y": [4], "yː": [4], "b": [21], "d": [19], "ð": [17], "f": [18], + "g": [20], "h": [12], "j": [6], "kʰ": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "pʰ": [21], "ʔ": [19], + "ʁ": [13], "ɐ̯": [4], "s": [15], "ɕ": [16], "t": [19], "v": [18], "w": [7], + + # de-DE/de-CH/de-AT + "aː": [2], "a": [2], "ɔ": [3], "ɛː": [4], "ɛ": [4], "ə": [1], "iː": [6], "ɪ": [6], "øː": [1], "o": [8], + "oː": [8], "œ": [4], "e": [4], "eː": [4], "uː": [7], "ʊ": [4], "yː": [4], "ʏ": [7], "ai": [2, 6], "au": [2, 7], + "ɔy": [3, 4], "ɔʏ̯": [3, 4], "ɐ": [4], "b": [21], "d": [19], "ʤ": [16], "f": [18], "g": [20], "h": [12], + "j": [6], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "pf": [21, 18], "ʀ": [13], "r": [13], + "ʁ": [13], "s": [15], "ʃ": [16], "t": [19], "ts": [19, 15], "tʃ": [19, 16], "v": [18], "x": [12], "z": [15], + "ʒ": [16], "ʔ": [19], + + # el-GR + "a": [2], "e": [4], "i": [6], "o": [8], "u": [7], "b": [21], "c": [16], "ç": [12], "d": [19], "ð": [17], + "d͡z": [19, 15], "f": [18], "g": [20], "ɣ": [20], "ɟ": [16], "j": [6], "ʝ": [12], "k": [20], "l": [14], + "m": [21], "n": [19], "p": [21], "ɾ": [19], "s": [15], "t": [19], "θ": [19], "t͡s": [19, 15], "v": [18], + "x": [12], "z": [15], + + # en-GB/en-IE/en-AU + 
"ɑː": [2], "æ": [1], "ʌ": [1], "ɛə": [4, 1], "aʊ": [2, 4], "ə": [1], "aɪ": [2, 6], "ɛ": [4], "ɜː": [5], + "eɪ": [4, 6], "ɪ": [6], "ɪə": [6, 1], "iː": [6], "ɒ": [2], "ɔː": [3], "əʊ": [1, 4], "ɔɪ": [3, 6], "ʊ": [4], + "ʊə": [4, 1], "uː": [7], "b": [21], "tʃ": [19, 16], "d": [19], "ð": [17], "f": [18], "g": [20], "h": [12], + "j": [6], "dʒ": [19, 16], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "ɹ": [13], + "s": [15], "ʃ": [16], "t": [19], "θ": [19], "v": [18], "w": [7], "z": [15], "ʒ": [16], + + # en-US/en-CA + "iy": [6], "ɪ": [6], "eɪ": [4, 6], "ɛ": [4], "æ": [1], "ɑ": [2], "ɔ": [3], "ʊ": [4], "oʊ": [8, 4], "u": [7], + "ʌ": [1], "aɪ": [11], "aʊ": [9], "ɔɪ": [10], "ju": [6, 7], "ə": [1], "ɪɹ": [6, 13], "ɛɹ": [4, 13], "ʊɹ": [4, 13], + "aɪɹ": [11, 13], "aʊɹ": [9, 13], "ɔɹ": [3, 13], "ɑɹ": [2, 13], "ɝ": [5], "ɚ": [1], "w": [7], "j": [6], + "p": [21], "b": [21], "t": [19], "d": [19], "k": [20], "g": [20], "m": [21], "n": [19], "ŋ": [20], "f": [18], + "v": [18], "θ": [19], "ð": [17], "s": [15], "z": [15], "ʃ": [16], "ʒ": [16], "h": [12], "tʃ": [19, 16], + "dʒ": [19, 16], "l": [14], "ɹ": [13], + + # es-ES + "a": [2], "i": [6], "e": [4], "o": [8], "u": [7], "b": [21], "β": [21], "t͡ʃ": [19, 16], "d": [19], "ð": [17], + "f": [18], "g": [20], "ɣ": [20], "j": [6], "j͡j": [6, 6], "k": [20], "l": [14], "ʎ": [14], "m": [21], "n": [19], + "ɲ": [19], "p": [21], "ɾ": [19], "r": [13], "s": [15], "t": [19], "θ": [19], "w": [7], "x": [12], "z": [15], "ʒ": [16], + + # es-MX + "ɑ": [2], "e": [4], "i": [6], "o": [8], "u": [7], "b": [21], "β": [21], "t͡ʃ": [19, 16], "d": [19], "ð": [17], + "f": [18], "g": [20], "ɣ": [20], "j": [6], "j͡j": [6, 6], "k": [20], "l": [14], "ʎ": [14], "m": [21], "n": [19], + "ɲ": [19], "p": [21], "ɾ": [19], "r": [13], "s": [15], "t": [19], "θ": [19], "w": [7], "x": [12], + + # fi-FI + "ɑ": [2], "ɑ͡i": [2, 6], "ɑ͡u": [2, 7], "ɑː": [2], "æ": [1], "æ͡i": [1, 6], "æ͡y": [1, 4], "æː": [1], "e": [4], + "e͡i": [4, 6], "ø": [1], "ø͡i": [1, 6], 
"ø͡y": [1, 4], "øː": [1], "e͡u": [4, 7], "e͡y": [4, 4], "eː": [4], "i": [6], + "i͡e": [6, 4], "i͡u": [6, 7], "i͡y": [6, 4], "iː": [6], "o": [8], "o͡i": [8, 6], "o͡u": [8, 7], "oː": [8], "u": [7], + "u͡i": [7, 6], "u͡o": [7, 8], "uː": [7], "y": [4], "y͡ø": [4, 1], "y͡i": [4, 6], "yː": [4], "b": [21], "d": [19], + "f": [18], "g": [20], "h": [12], "j": [6], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "r": [13], + "s": [15], "ʃ": [16], "t": [19], "ʋ": [18], + + # fr-FR/fr-CA/fr-CH + "a": [2], "ɑ": [2], "ɑ̃": [2], "ə": [1], "ɛ": [4], "ø": [1], "e": [4], "ɛ̃": [4], "i": [6], "œ": [4], "ɔ": [3], + "ɔ̃": [3], "o": [8], "œ̃": [4], "u": [7], "y": [4], "b": [21], "d": [19], "f": [18], "g": [20], "ɲ": [19], + "ɥ": [7], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "ʁ": [13], "s": [15], "ʃ": [16], + "t": [19], "v": [18], "w": [7], "j": [6], "z": [15], "n‿": [19], "t‿": [19], "z‿": [15], + + # he-IL + "i": [6], "e": [4], "a": [2], "o": [8], "u": [7], "p": [21], "b": [21], "t": [19], "d": [19], "k": [20], "g": [20], + "ʔ": [19], "f": [18], "v": [18], "s": [15], "z": [15], "ʃ": [16], "x": [12], "h": [12], "t͡s": [19, 15], "m": [21], + "n": [19], "l": [14], "ʁ": [13], "j": [6], "ʒ": [16], "tʃ": [19, 16], "dʒ": [19, 16], + + # hr-HR + "e": [4], "eː": [4], "i": [6], "iː": [6], "u": [7], "uː": [7], "a": [2], "aː": [2], "o": [8], "oː": [8], + "d": [19], "v": [18], "s": [15], "t": [19], "n": [19], "l": [14], "ʎ": [14], "t͡s": [19, 15], "t͡ʃ": [19, 16], + "j": [6], "x": [12], "z": [15], "ʒ": [16], "r": [13], "k": [20], "m": [21], "p": [21], "g": [20], "ʨ": [16], + "f": [18], "b": [21], "d͡ʒ": [19, 16], "ɲ": [19], "ʥ": [16], "ʃ": [16], + + # hu-HU + "ø": [1], "øː": [1], "a": [2], "aː": [2], "ɛ": [4], "eː": [4], "i": [6], "iː": [6], "o": [8], "ɒ": [2], + "oː": [8], "u": [7], "uː": [7], "y": [4], "yː": [4], "b": [21], "bː": [21], "d": [19], "ɟ": [16], "dː": [19], + "ɟː": [16], "d͡ʒ": [19, 16], "d͡ʒː": [19, 16], "dz": [19, 15], "dzː": [19, 15], "f": 
[18], "fː": [18], + "g": [20], "gː": [20], "h": [12], "hː": [12], "j": [6], "ɲ": [19], "jː": [6], "ɲː": [19], "k": [20], + "kː": [20], "l": [14], "lː": [14], "m": [21], "mː": [21], "n": [19], "ŋ": [20], "nː": [19], "p": [21], + "pː": [21], "r": [13], "rː": [13], "s": [15], "ʃ": [16], "sː": [15], "ʃː": [16], "t": [19], "c": [16], + "tː": [19], "cː": [16], "t͡s": [19, 15], "t͡ʃ": [19, 16], "t͡sː": [19, 15], "t͡ʃː": [19, 16], "v": [18], + "vː": [18], "x": [12], "ɰ": [20], "z": [15], "ʒ": [16], "zː": [15], "ʒː": [16], + + # id-ID + "ə": [1], "a": [2], "a͡i": [2, 6], "a͡ʊ": [2, 4], "e": [4], "ɛ": [4], "ɪ": [6], "i": [6], "ɔ": [3], "o": [8], + "ɔ͡i": [3, 6], "u": [7], "ʊ": [4], "ʔ": [19], "b": [21], "d": [19], "d͡ʒ": [19, 16], "f": [18], "g": [20], + "h": [12], "ɲ": [19], "j": [6], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "r": [13], + "s": [15], "ʃ": [16], "t": [19], "t͡ʃ": [19, 16], "w": [7], "x": [12], "z": [15], + + # it-IT + "a": [2], "ai": [2, 6], "au": [2, 7], "e": [4], "ɛ": [4], "ɛj": [4, 6], "ɛu": [4, 7], "ei": [4, 6], "eu": [4, 7], + "i": [6], "u": [7], "o": [8], "ɔ": [3], "ɔj": [3, 6], "oi": [8, 6], "ou": [8, 7], "b": [21], "bː": [21], + "ʧ": [16], "tʃː": [19, 16], "kː": [20], "d": [19], "dː": [19], "ʣ": [15], "ʣː": [15], "f": [18], "fː": [18], + "ʤ": [16], "ʤː": [16], "g": [20], "gː": [20], "ʎ": [14], "ʎː": [14], "ɲː": [19], "ɲ": [19], "j": [6], "k": [20], + "l": [14], "lː": [14], "m": [21], "mː": [21], "n": [19], "nː": [19], "p": [21], "pː": [21], "ɾ": [19], + "rː": [13], "s": [15], "sː": [15], "ʃ": [16], "ʃː": [16], "t": [19], "tː": [19], "ʦ": [15], "ʦː": [15], + "v": [18], "vː": [18], "w": [7], "z": [15], + + # ko-KR + "a": [2], "ɛ": [4], "e": [4], "ɯ": [6], "i": [6], "ʌ": [1], "o": [8], "u": [7], "ɰ͡i": [20, 6], "ø": [1], + "w͡a": [7, 2], "w͡ɛ": [7, 4], "w͡e": [7, 4], "w͡i": [7, 6], "w͡ʌ": [7, 1], "j͡a": [6, 2], "j͡ɛ": [6, 4], + "j͡e": [6, 4], "j͡ʌ": [6, 1], "j͡o": [6, 8], "j͡u": [6, 7], "b̥": [21], "p": [21], "b": [21], "t͡ɕʰ": 
[19, 16], + "d̥": [19], "t": [19], "d": [19], "g̥": [20], "k": [20], "g": [20], "h": [12], "ɦ": [12], "d͡ʑ": [19, 16], + "d͡ʑ̥": [19, 16], "t͡ɕ": [19, 16], "kʰ": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "pʰ": [21], + "ɾ": [19], "sʰ": [15], "s": [15], "tʰ": [19], + + # ms-MY + "i": [6], "u": [7], "ə": [1], "e": [4], "o": [8], "a": [2], "a͡i": [2, 6], "au": [2, 7], "oi": [8, 6], + "p": [21], "b": [21], "t": [19], "d": [19], "k": [20], "g": [20], "ʔ": [19], "t͡ʃ": [19, 16], "d͡ʒ": [19, 16], + "m": [21], "n": [19], "ɲ": [19], "ŋ": [20], "f": [18], "v": [18], "s": [15], "z": [15], "ʃ": [16], "x": [12], + "r": [13], "h": [12], "j": [6], "w": [7], "l": [14], + + # nb-NO + "ɑ": [2], "æ": [1], "æː": [1], "ɑː": [2], "ɛ": [4], "øː": [1], "eː": [4], "ɪ": [6], "iː": [6], "ɔ": [3], + "œ": [4], "oː": [8], "u": [7], "uː": [7], "ʏ": [7], "ʉ": [6], "ʉː": [6], "yː": [4], "æɪ": [1, 6], + "æʉ": [1, 6], "ɑɪ": [2, 6], "œʏ": [4, 7], "ɔʏ": [3, 7], "ʉɪ": [6, 6], "p": [21], "t": [19], "k": [20], + "b": [21], "d": [19], "g": [20], "f": [18], "h": [12], "s": [15], "ʂ": [15], "ç": [12], "v": [18], "m": [21], + "n": [19], "ŋ": [20], "l": [14], "r": [13], "j": [6], "ɖ": [19], "ɭ": [14], "ɳ": [19], "ʈ": [19], + + # nl-NL/nl-BE + "ɑ": [2], "aː": [2], "ɑ̃": [2], "ɑ͡u": [2, 7], "ɛ": [4], "eː": [4], "ɛː": [4], "ɛ͡i": [4, 6], "ɛ̃": [4], + "øː": [1], "ɪ": [6], "i": [6], "ɔ": [3], "u": [7], "ɔː": [3], "ɔ̃": [3], "oː": [8], "ʏ": [7], "ə": [1], + "œ͡y": [4, 4], "œ": [4], "y": [4], "b": [21], "d": [19], "f": [18], "χ": [12], "ʔ": [19], "ɦ": [12], + "g": [20], "j": [6], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "ʀ": [13], "s": [15], + "ʃ": [16], "t": [19], "w": [7], "v": [18], "ʋ": [18], "z": [15], "ʒ": [16], + + # pl-PL + "a": [2], "ɛ": [4], "ɛ̃": [4], "i": [6], "ɨ": [6], "ɔ": [3], "ɔ̃": [3], "u": [7], "b": [21], "bʲ": [21], + "t͡ɕ": [19, 16], "t͡ʂ": [19, 15], "c": [16], "d": [19], "d̪ʲ": [19], "d͡z": [19, 15], "d͡ʑ": [19, 16], + "f": [18], "fʲ": [18], "ɡ": [20], "ɟ": [16], 
"d͡ʐ": [19, 15], "k": [20], "l": [14], "l̪ʲ": [14], "m": [21], + "mʲ": [21], "n": [19], "ŋ": [20], "ɲ": [19], "p": [21], "pʲ": [21], "r": [13], "rʲ": [13], "s": [15], + "ɕ": [16], "ʃ": [16], "t": [19], "t̪ʲ": [19], "t͡s": [19, 15], "v": [18], "vʲ": [18], "w": [7], "x": [12], + "xʲ": [12], "j": [6], "z": [15], "ʑ": [16], "ʒ": [16], + + # pt-BR + "i": [6], "ĩ": [6], "a": [2], "ɔ": [3], "u": [7], "ũ": [7], "o": [8], "e": [4], "ɐ̃": [4], "ə": [1], + "ɛ": [4], "ẽ": [4], "õ": [8], "w̃": [7], "w": [7], "p": [21], "b": [21], "t": [19], "d": [19], "g": [20], + "m": [21], "n": [19], "ɲ": [19], "f": [18], "v": [18], "ɾ": [19], "s": [15], "z": [15], "ʃ": [16], "ʒ": [16], + "x": [12], "tʃ": [19, 16], "dʒ": [19, 16], "l": [14], "ʎ": [14], "j̃": [6], "j": [6], "k": [20], + + # pt-PT + "a": [2], "ɐ": [4], "ɐj": [4, 6], "ɐ̃": [4], "ɐ̃j̃": [4, 6], "ɐ̃w̃": [4, 7], "ɐ͡w": [4, 7], "a͡j": [2, 6], + "ɔ": [3], "ɔ͡j": [3, 6], "a͡w": [2, 7], "ə": [1], "e": [4], "ɛ": [4], "ɛ͡w": [4, 7], "ẽ": [4], "e͡w": [4, 7], + "i": [6], "ĩ": [6], "i͡w": [6, 7], "o": [8], "o͡j": [8, 6], "õ": [8], "õj̃": [8, 6], "u": [7], "u͡j": [7, 6], + "ũ": [7], "ũj̃": [7, 6], "b": [21], "d": [19], "ɾ": [19], "f": [18], "g": [20], "j": [6], "k": [20], "l": [14], + "ɫ": [14], "ʎ": [14], "m": [21], "n": [19], "ɲ": [19], "p": [21], "ʀ": [13], "s": [15], "ʃ": [16], "t": [19], + "v": [18], "w": [7], "z": [15], "ʒ": [16], + + # ro-RO + "ə": [1], "ɨ": [6], "a": [2], "e": [4], "e̯a": [4, 2], "e̯o": [4, 8], "i": [6], "o": [8], "o̯a": [8, 2], + "u": [7], "b": [21], "bʲ": [21], "d": [19], "d͡ʒ": [19, 16], "d͡ʒʲ": [19, 16], "f": [18], "fʲ": [18], "g": [20], + "gʲ": [20], "h": [12], "j": [6], "k": [20], "kʲ": [20], "l": [14], "lʲ": [14], "m": [21], "mʲ": [21], "n": [19], + "ŋ": [20], "nʲ": [19], "p": [21], "pʲ": [21], "r": [13], "rʲ": [13], "s": [15], "ʃ": [16], "ʃʲ": [16], "t": [19], + "tʲ": [19], "t͡s": [19, 15], "t͡ʃ": [19, 16], "t͡sʲ": [19, 15], "t͡ʃʲ": [19, 16], "v": [18], "vʲ": [18], "w": [7], + "z": [15], "ʒ": [16], "zʲ": 
[15], "ʒʲ": [16], + + # ru-RU + "a": [2], "ʌ": [1], "ə": [1], "ɛ": [4], "i": [6], "ɪ": [6], "ɨ": [6], "ɔ": [3], "u": [7], "p": [21], "pʲ": [21], + "b": [21], "bʲ": [21], "t": [19], "tʲ": [19], "d": [19], "dʲ": [19], "k": [20], "kʲ": [20], "g": [20], + "gʲ": [20], "x": [12], "xʲ": [12], "f": [18], "fʲ": [18], "v": [18], "vʲ": [18], "s": [15], "sʲ": [15], + "z": [15], "zʲ": [15], "ʂ": [15], "ʐ": [15], "t͡s": [19, 15], "t͡ɕ": [19, 16], "ɕː": [16], "m": [21], + "mʲ": [21], "n": [19], "nʲ": [19], "l": [14], "lʲ": [14], "r": [13], "rʲ": [13], "j": [6], + + # sk-SK + "i": [6], "e": [4], "a": [2], "o": [8], "u": [7], "ʉ": [6], "iː": [6], "eː": [4], "aː": [2], "oː": [8], + "uː": [7], "i͡a": [6, 2], "i͡e": [6, 4], "i͡u": [6, 7], "u͡o": [7, 8], "au": [2, 7], "ou": [8, 7], "ə": [1], + "p": [21], "b": [21], "t": [19], "d": [19], "c": [16], "ɟ": [16], "k": [20], "g": [20], "t͡s": [19, 15], + "d͡z": [19, 15], "t͡ʃ": [19, 16], "d͡ʒ": [19, 16], "f": [18], "v": [18], "s": [15], "z": [15], "ʃ": [16], + "ʒ": [16], "x": [12], "ɦ": [12], "r": [13], "r̩": [13], "r̩ː": [13], "l": [14], "l̩": [14], "l̩ː": [14], + "ʎ": [14], "m": [21], "ɱ": [21], "n": [19], "ɴ": [19], "ŋ": [20], "ɲ": [19], "u̯": [7], "i̯": [6], "j": [6], + "w": [7], + + # sl-SI + "ə": [1], "a": [2], "aː": [2], "ɛ": [4], "eː": [4], "ɛː": [4], "i": [6], "iː": [6], "ɔ": [3], "ɔː": [3], + "oː": [8], "u": [7], "uː": [7], "b": [21], "d": [19], "dˡ": [19], "dn": [19, 19], "d͡ʒ": [19, 16], + "d͡z": [19, 15], "f": [18], "ɱ": [21], "ɣ": [20], "g": [20], "ɪ": [6], "j": [6], "k": [20], "l": [14], "lʲ": [14], + "m": [21], "ŋ": [20], "n": [19], "nʲ": [19], "p": [21], "r": [13], "s": [15], "ʃ": [16], "t": [19], "tˡ": [19], + "tn": [19, 19], "t͡ʃ": [19, 16], "t͡s": [19, 15], "u̯": [7], "v": [18], "w": [7], "ʍ": [7], "x": [12], "ʒ": [16], + "z": [15], + + # sv-SE + "a": [2], "æ": [1], "æː": [1], "ɑː": [2], "ɔ": [3], "a‿u": [2, 7], "ə": [1], "e": [4], "ɛ": [4], "ɛː": [4], + "eː": [4], "ɶ": [8], "œː": [4], "œ": [4], "øː": [1], "ɪ": [6], "iː": 
[6], "ʊ": [4], "uː": [7], "oː": [8], + "ɵ": [1], "ʉː": [6], "y": [4], "yː": [4], "p": [21], "t": [19], "k": [20], "b": [21], "d": [19], "g": [20], + "f": [18], "h": [12], "s": [15], "ɧ": [16], "ɕ": [16], "v": [18], "m": [21], "n": [19], "ŋ": [20], "l": [14], + "r": [13], "j": [6], "ɖ": [19], "ɭ": [14], "ɳ": [19], "ʂ": [15], "ʈ": [19], + + # th-TH + "a": [2], "aː": [2], "e": [4], "eː": [4], "i": [6], "iː": [6], "ia": [6, 2], "o": [8], "oː": [8], "ə": [1], + "əː": [1], "u": [7], "uː": [7], "ua": [7, 2], "ɯ": [6], "ɯː": [6], "ɯa": [6, 2], "ɛ": [4], "ɛː": [4], + "ɔ": [3], "ɔː": [3], "b": [21], "t͡ɕ": [19, 16], "tɕʰ": [19, 16], "d": [19], "f": [18], "h": [12], "j": [6], + "k": [20], "kʰ": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "pʰ": [21], "r": [13], + "s": [15], "t": [19], "tʰ": [19], "w": [7], "ʔ": [19], + + # tr-TR + "a": [2], "ɑː": [2], "e": [4], "eː": [4], "œ": [4], "œ͡ɟ": [4, 16], "i": [6], "i͡ɟ": [6, 16], "o": [8], + "o͡ɟ": [8, 16], "u": [7], "u͡ɟ": [7, 16], "ɯ": [6], "ɯ͡ɟ": [6, 16], "y": [4], "y͡ɟ": [4, 16], "b": [21], + "c": [16], "t͡ʃ": [19, 16], "d": [19], "f": [18], "ɡ": [20], "ɣ": [20], "ɟ": [16], "h": [12], "j": [6], + "d͡ʒ": [19, 16], "k": [20], "l": [14], "ɮ": [6], "m": [21], "n": [19], "ŋ": [20], "p": [21], "ɾ": [19], + "s": [15], "ʃ": [16], "t": [19], "v": [18], "w": [7], "z": [15], "ʒ": [16], + + # vi-VN + "a": [2], "ɛ": [4], "i": [6], "ɔ": [3], "u": [7], "u͡a": [7, 2], "a͡j": [2, 6], "ɛ̆j": [4, 6], "ə͡j": [1, 6], + "o": [8], "i͡e͡w": [6, 4, 7], "ɨ͡ə": [6, 1], "ɔ͡i": [3, 6], "ə": [1], "ie": [6, 4], "u͡j": [7, 6], "a͡w": [2, 7], + "ɨ": [6], "ɐ": [4], "ăw": [2, 7], "ăj": [2, 6], "ɨ͡ə͡j": [6, 1, 6], "o͡j": [8, 6], "əː": [1], "e": [4], + "ɔ̆w": [3, 7], "ɛ͡w": [4, 7], "i͡w": [6, 7], "ɨ͡w": [6, 7], "e͡j": [4, 6], "ɨ͡ʌ͡w": [6, 1, 7], "ɨ͡j": [6, 6], + "ɪ": [6], "iə": [6, 1], "a͡ʲ": [2], "ɓ": [21], "k": [20], "z": [15], "j": [6], "ɹ": [13], "f": [18], "ɣ": [20], + "h": [12], "l": [14], "m": [21], "n": [19], "p": [21], "s": [15], "ʂ": [15], 
"t": [19], "v": [18], "ɗ": [19], + "ŋ": [20], "x": [12], "ɲ": [19], "tʰ": [19], "ʈ": [19], "t͡ʃ": [19, 16], "w": [7] + } + return phoneme_viseme_map.get(phoneme, []) + + viseme_list = [] + chunks = data.get('chunks', []) + + for i, chunk in enumerate(chunks): + phoneme = chunk.get('text', None) + timestamp = chunk.get('timestamp', None) + visemes = _phoneme_to_viseme(phoneme) + + for viseme in visemes: + viseme_list.append({ + 'viseme': viseme, + 'timestamp': timestamp + }) + + return viseme_list + + + def process(self, audio_file: str) -> List[Dict[str, Any]]: + """Process an audio file and convert speech to visemes.""" + # Perform ASR to get phoneme data + asr_result = self.asr_pipeline(audio_file, return_timestamps='char') + # Map phonemes to visemes + viseme_data = self._map_phonemes_to_visemes(asr_result) + + return viseme_data + \ No newline at end of file diff --git a/TTS/chatTTS_handler.py b/TTS/chatTTS_handler.py index 6bdc6bf..1cee897 100644 --- a/TTS/chatTTS_handler.py +++ b/TTS/chatTTS_handler.py @@ -5,6 +5,7 @@ import numpy as np from rich.console import Console import torch +from .STV.speech_to_visemes import SpeechToVisemes logging.basicConfig( format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", @@ -22,6 +23,7 @@ def setup( gen_kwargs={}, # Unused stream=True, chunk_size=512, + viseme_flag = True ): self.should_listen = should_listen self.device = device @@ -33,6 +35,9 @@ def setup( self.params_infer_code = ChatTTS.Chat.InferCodeParams( spk_emb=rnd_spk_emb, ) + self.viseme_flag = viseme_flag + if self.viseme_flag: + self.speech_to_visemes = SpeechToVisemes() self.warmup() def warmup(self): @@ -61,22 +66,65 @@ def process(self, llm_sentence): if gen[0] is None or len(gen[0]) == 0: self.should_listen.set() return + + # Resample the audio to 16000 Hz audio_chunk = librosa.resample(gen[0], orig_sr=24000, target_sr=16000) - audio_chunk = (audio_chunk * 32768).astype(np.int16)[0] - while len(audio_chunk) > self.chunk_size: - yield audio_chunk[: 
self.chunk_size] # 返回前 chunk_size 字节的数据 - audio_chunk = audio_chunk[self.chunk_size :] # 移除已返回的数据 - yield np.pad(audio_chunk, (0, self.chunk_size - len(audio_chunk))) + # Ensure the audio is converted to mono (single channel) + if len(audio_chunk.shape) > 1: + audio_chunk = librosa.to_mono(audio_chunk) + audio_chunk = (audio_chunk * 32768).astype(np.int16) + + # Process visemes if viseme_flag is set + if self.viseme_flag: + visemes = self.speech_to_visemes.process(audio_chunk) + for viseme in visemes: + console.print(f"[blue]ASSISTANT_MOUTH_SHAPE: {viseme['viseme']} -- {viseme['timestamp']}") + else: + visemes = None + + # Loop through audio chunks, yielding dict for each chunk + for i in range(0, len(audio_chunk), self.chunk_size): + chunk_data = { + "audio": np.pad( + audio_chunk[i : i + self.chunk_size], + (0, self.chunk_size - len(audio_chunk[i : i + self.chunk_size])), + ) + } + # Include text and visemes for the first chunk + if i == 0: + chunk_data["text"] = llm_sentence # Assuming llm_sentence is defined elsewhere + chunk_data["visemes"] = visemes + + yield chunk_data else: wavs = wavs_gen if len(wavs[0]) == 0: self.should_listen.set() return audio_chunk = librosa.resample(wavs[0], orig_sr=24000, target_sr=16000) + # Ensure the audio is converted to mono (single channel) + if len(audio_chunk.shape) > 1: + audio_chunk = librosa.to_mono(audio_chunk) audio_chunk = (audio_chunk * 32768).astype(np.int16) + + if self.viseme_flag: + visemes = self.speech_to_visemes.process(audio_chunk) + for viseme in visemes: + console.print(f"[blue]ASSISTANT_MOUTH_SHAPE: {viseme['viseme']} -- {viseme['timestamp']}") + else: + visemes = None + for i in range(0, len(audio_chunk), self.chunk_size): - yield np.pad( - audio_chunk[i : i + self.chunk_size], - (0, self.chunk_size - len(audio_chunk[i : i + self.chunk_size])), - ) + chunk_data = { + "audio": np.pad( + audio_chunk[i : i + self.chunk_size], + (0, self.chunk_size - len(audio_chunk[i : i + self.chunk_size])), + ) + } + # For 
the first chunk, include text and visemes + if i == 0: + chunk_data["text"] = llm_sentence + chunk_data["visemes"] = visemes + yield chunk_data + self.should_listen.set() diff --git a/TTS/melo_handler.py b/TTS/melo_handler.py index b1b2226..fc33730 100644 --- a/TTS/melo_handler.py +++ b/TTS/melo_handler.py @@ -6,6 +6,8 @@ from rich.console import Console import torch +from .STV.speech_to_visemes import SpeechToVisemes + logger = logging.getLogger(__name__) console = Console() @@ -28,7 +30,6 @@ "ko": "KR", } - class MeloTTSHandler(BaseHandler): def setup( self, @@ -38,6 +39,7 @@ def setup( speaker_to_id="en", gen_kwargs={}, # Unused blocksize=512, + viseme_flag = True # To obtain timestamped visemes ): self.should_listen = should_listen self.device = device @@ -49,6 +51,11 @@ def setup( WHISPER_LANGUAGE_TO_MELO_SPEAKER[speaker_to_id] ] self.blocksize = blocksize + + self.viseme_flag = viseme_flag + if self.viseme_flag: + self.speech_to_visemes = SpeechToVisemes() + self.warmup() def warmup(self): @@ -100,10 +107,25 @@ def process(self, llm_sentence): return audio_chunk = librosa.resample(audio_chunk, orig_sr=44100, target_sr=16000) audio_chunk = (audio_chunk * 32768).astype(np.int16) + + if self.viseme_flag: + visemes = self.speech_to_visemes.process(audio_chunk) + for viseme in visemes: + console.print(f"[blue]ASSISTANT_MOUTH_SHAPE: {viseme['viseme']} -- {viseme['timestamp']}") + else: + visemes = None + for i in range(0, len(audio_chunk), self.blocksize): - yield np.pad( - audio_chunk[i : i + self.blocksize], - (0, self.blocksize - len(audio_chunk[i : i + self.blocksize])), - ) + chunk_data = { + "audio": np.pad( + audio_chunk[i : i + self.blocksize], + (0, self.blocksize - len(audio_chunk[i : i + self.blocksize])) + ) + } + # For the first chunk, include text and visemes + if i == 0: + chunk_data["text"] = llm_sentence + chunk_data["visemes"] = visemes + yield chunk_data self.should_listen.set() diff --git a/TTS/parler_handler.py b/TTS/parler_handler.py index 
5cc0ce9..4c52d5b 100644 --- a/TTS/parler_handler.py +++ b/TTS/parler_handler.py @@ -14,6 +14,7 @@ from transformers.utils.import_utils import ( is_flash_attn_2_available, ) +from .STV.speech_to_visemes import SpeechToVisemes torch._inductor.config.fx_graph_cache = True # mind about this parameter ! should be >= 2 * number of padded prompt sizes for TTS @@ -47,6 +48,7 @@ def setup( ), play_steps_s=1, blocksize=512, + viseme_flag = True ): self.should_listen = should_listen self.device = device @@ -78,6 +80,10 @@ def setup( self.model.forward, mode=self.compile_mode, fullgraph=True ) + self.viseme_flag = viseme_flag + if self.viseme_flag: + self.speech_to_visemes = SpeechToVisemes() + self.warmup() def prepare_model_inputs( @@ -179,10 +185,25 @@ def process(self, llm_sentence): ) audio_chunk = librosa.resample(audio_chunk, orig_sr=44100, target_sr=16000) audio_chunk = (audio_chunk * 32768).astype(np.int16) + + if self.viseme_flag: + visemes = self.speech_to_visemes.process(audio_chunk) + for viseme in visemes: + console.print(f"[blue]ASSISTANT_MOUTH_SHAPE: {viseme['viseme']} -- {viseme['timestamp']}") + else: + visemes = None + for i in range(0, len(audio_chunk), self.blocksize): - yield np.pad( - audio_chunk[i : i + self.blocksize], - (0, self.blocksize - len(audio_chunk[i : i + self.blocksize])), - ) + chunk_data = { + "audio": np.pad( + audio_chunk[i : i + self.blocksize], + (0, self.blocksize - len(audio_chunk[i : i + self.blocksize])) + ) + } + # For the first chunk, include text and visemes + if i == 0: + chunk_data["text"] = llm_sentence + chunk_data["visemes"] = visemes + yield chunk_data self.should_listen.set() diff --git a/arguments_classes/parler_tts_arguments.py b/arguments_classes/parler_tts_arguments.py index 5159432..1bb0f21 100644 --- a/arguments_classes/parler_tts_arguments.py +++ b/arguments_classes/parler_tts_arguments.py @@ -36,7 +36,7 @@ class ParlerTTSHandlerArguments: tts_gen_max_new_tokens: int = field( default=512, metadata={ - "help": 
"Maximum number of new tokens to generate in a single completion. Default is 256, which corresponds to ~6 secs" + "help": "Maximum number of new tokens to generate in a single completion. Default is 512, which corresponds to ~6 secs" }, ) description: str = field( diff --git a/connections/local_audio_streamer.py b/connections/local_audio_streamer.py index 389dcb8..d42fbe7 100644 --- a/connections/local_audio_streamer.py +++ b/connections/local_audio_streamer.py @@ -27,7 +27,18 @@ def callback(indata, outdata, frames, time, status): self.input_queue.put(indata.copy()) outdata[:] = 0 * outdata else: - outdata[:] = self.output_queue.get()[:, np.newaxis] + data = self.output_queue.get() + """ + # Check if text data is present and log it + if data.get('text') is not None: + text = data['text'] + logger.info(f"Text: {text}") + # Check if viseme data is present and log it + if data.get('visemes') is not None: + visemes = data['visemes'] + logger.info(f"Visemes: {visemes}") + """ + outdata[:] = data['audio'][:, np.newaxis] logger.debug("Available devices:") logger.debug(sd.query_devices()) diff --git a/connections/socket_sender.py b/connections/socket_sender.py index 11ed210..fb5c7cb 100644 --- a/connections/socket_sender.py +++ b/connections/socket_sender.py @@ -1,6 +1,8 @@ import socket from rich.console import Console import logging +import pickle +import struct logger = logging.getLogger(__name__) @@ -11,7 +13,6 @@ class SocketSender: """ Handles sending generated audio packets to the clients. 
""" - def __init__(self, stop_event, queue_in, host="0.0.0.0", port=12346): self.stop_event = stop_event self.queue_in = queue_in @@ -28,9 +29,31 @@ def run(self): logger.info("sender connected") while not self.stop_event.is_set(): - audio_chunk = self.queue_in.get() - self.conn.sendall(audio_chunk) - if isinstance(audio_chunk, bytes) and audio_chunk == b"END": - break + data = self.queue_in.get() + packet = {} + if 'audio' in data and data['audio'] is not None: + audio_chunk = data['audio'] + packet['audio'] = data['audio'] + if 'text' in data and data['text'] is not None: + packet['text'] = data['text'] + if 'visemes' in data and data['visemes'] is not None: + packet['visemes'] = data['visemes'] + + # Serialize the packet using pickle + serialized_packet = pickle.dumps(packet) + + # Compute the length of the serialized packet + packet_length = len(serialized_packet) + + # Send the packet length as a 4-byte integer using struct + self.conn.sendall(struct.pack('!I', packet_length)) + + # Send the serialized packet + self.conn.sendall(serialized_packet) + + if 'audio' in data and data['audio'] is not None: + if isinstance(audio_chunk, bytes) and audio_chunk == b"END": + break + self.conn.close() logger.info("Sender closed") diff --git a/listen_and_play.py b/listen_and_play.py index 35eabd6..2082a5e 100644 --- a/listen_and_play.py +++ b/listen_and_play.py @@ -4,15 +4,16 @@ from dataclasses import dataclass, field import sounddevice as sd from transformers import HfArgumentParser - +import struct +import pickle @dataclass class ListenAndPlayArguments: send_rate: int = field(default=16000, metadata={"help": "In Hz. Default is 16000."}) recv_rate: int = field(default=16000, metadata={"help": "In Hz. Default is 16000."}) list_play_chunk_size: int = field( - default=1024, - metadata={"help": "The size of data chunks (in bytes). Default is 1024."}, + default=512, + metadata={"help": "The size of data chunks (in bytes). 
Default is 512."}, ) host: str = field( default="localhost", @@ -33,7 +34,7 @@ class ListenAndPlayArguments: def listen_and_play( send_rate=16000, recv_rate=44100, - list_play_chunk_size=1024, + list_play_chunk_size=512, host="localhost", send_port=12345, recv_port=12346, @@ -79,9 +80,29 @@ def receive_full_chunk(conn, chunk_size): return data while not stop_event.is_set(): - data = receive_full_chunk(recv_socket, list_play_chunk_size * 2) - if data: - recv_queue.put(data) + # Step 1: Receive the first 4 bytes to get the packet length + length_data = receive_full_chunk(recv_socket, 4) + if not length_data: + continue # Handle disconnection or data not available + + # Step 2: Unpack the length (4 bytes) + packet_length = struct.unpack('!I', length_data)[0] + + # Step 3: Receive the full packet based on the length + serialized_packet = receive_full_chunk(recv_socket, packet_length) + if serialized_packet: + # Step 4: Deserialize the packet using pickle + packet = pickle.loads(serialized_packet) + # Step 5: Extract the packet contents + if 'text' in packet: + pass + # print(packet['text']) + if 'visemes' in packet: + pass + # print(packet['visemes']) + + # Step 6: Put the packet audio data into the queue for sending + recv_queue.put(packet['audio'].tobytes()) try: send_stream = sd.RawInputStream( @@ -123,4 +144,4 @@ def receive_full_chunk(conn, chunk_size): if __name__ == "__main__": parser = HfArgumentParser((ListenAndPlayArguments,)) (listen_and_play_kwargs,) = parser.parse_args_into_dataclasses() - listen_and_play(**vars(listen_and_play_kwargs)) + listen_and_play(**vars(listen_and_play_kwargs)) \ No newline at end of file From 7ff873cf43816e1b6434657d06328fd4cae09cf3 Mon Sep 17 00:00:00 2001 From: fabiocat93 Date: Sun, 6 Oct 2024 10:41:14 -0400 Subject: [PATCH 2/7] picking phoneme_viseme_map from json file --- .gitignore | 3 +- TTS/STV/phoneme_viseme_map.json | 1 + TTS/STV/phoneme_viseme_map_readable.json.txt | 241 ++++++++++++++++ TTS/STV/speech_to_visemes.py | 
288 ++----------------- 4 files changed, 268 insertions(+), 265 deletions(-) create mode 100644 TTS/STV/phoneme_viseme_map.json create mode 100644 TTS/STV/phoneme_viseme_map_readable.json.txt diff --git a/.gitignore b/.gitignore index 33b7875..95dc6c6 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ __pycache__ tmp -cache \ No newline at end of file +cache +mlx_models/ \ No newline at end of file diff --git a/TTS/STV/phoneme_viseme_map.json b/TTS/STV/phoneme_viseme_map.json new file mode 100644 index 0000000..8c91531 --- /dev/null +++ b/TTS/STV/phoneme_viseme_map.json @@ -0,0 +1 @@ +{"æ":[1],"ə":[1],"ʌ":[1],"ɑ":[2],"ɔ":[3],"ɛ":[4],"ʊ":[4],"ɝ":[5],"j":[6],"i":[6],"ɪ":[6],"w":[7],"u":[7],"o":[8],"aʊ":[9],"ɔɪ":[10],"aɪ":[11],"h":[12],"ɹ":[13],"l":[14],"s":[15],"z":[15],"ʃ":[16],"tʃ":[19,16],"dʒ":[19,16],"ʒ":[16],"ð":[17],"f":[18],"v":[18],"d":[19],"t":[19],"n":[19],"θ":[19],"k":[20],"g":[20],"ŋ":[20],"p":[21],"b":[21],"m":[21]," ":[0],"a":[2],"aː":[2],"iː":[6],"uː":[7],"dˤ":[19],"q":[20],"tˤ":[19],"ʔ":[19],"ħ":[12],"ðˤ":[17],"ɣ":[20],"x":[12],"sˤ":[15],"r":[13],"ʕ":[12],"j͡a":[6,2],"ɤ":[1],"j͡u":[6,7],"t͡s":[19,15],"zʲ":[15],"lʲ":[14],"nʲ":[19],"d͡ʒ":[19,16],"mʲ":[21],"tʲ":[19],"rʲ":[13],"pʲ":[21],"dʲ":[19],"vʲ":[18],"sʲ":[15],"bʲ":[21],"kʲ":[20],"gʲ":[20],"fʲ":[18],"t͡ʃ":[19,16],"d͡z":[19,15],"e":[4],"β":[21],"ʎ":[14],"ɲ":[19],"ɾ":[19],"ɛː":[4],"oː":[8],"o͡ʊ̯":[8,4],"a͡ʊ":[2,4],"ɛ͡ʊ̯":[4,4],"c":[16],"ɟ":[16],"r̝":[13],"ɦ":[12],"ɱ":[21],"r̝̊":[13],"ɑː":[2],"ɒ":[2],"ɒː":[2],"ɔː":[3],"ɐ":[4],"æː":[1],"ø":[1],"øː":[1],"eː":[4],"œ":[4],"œː":[4],"y":[4],"yː":[4],"kʰ":[20],"pʰ":[21],"ʁ":[13],"ɐ̯":[4],"ɕ":[16],"ʏ":[7],"ai":[2,6],"au":[2,7],"ɔy":[3,4],"ɔʏ̯":[3,4],"ʤ":[16],"pf":[21,18],"ʀ":[13],"ts":[19,15],"ç":[12],"ʝ":[12],"ɛə":[4,1],"ɜː":[5],"eɪ":[4,6],"ɪə":[6,1],"əʊ":[1,4],"ʊə":[4,1],"iy":[6],"oʊ":[8,4],"ju":[6,7],"ɪɹ":[6,13],"ɛɹ":[4,13],"ʊɹ":[4,13],"aɪɹ":[11,13],"aʊɹ":[9,13],"ɔɹ":[3,13],"ɑɹ":[2,13],"ɚ":[1],"j͡j":[6,6],"ɑ͡i":[2,6],"ɑ͡u":[2,7],"æ͡i":[1,6],"æ͡y":[1,4]
,"e͡i":[4,6],"ø͡i":[1,6],"ø͡y":[1,4],"e͡u":[4,7],"e͡y":[4,4],"i͡e":[6,4],"i͡u":[6,7],"i͡y":[6,4],"o͡i":[8,6],"o͡u":[8,7],"u͡i":[7,6],"u͡o":[7,8],"y͡ø":[4,1],"y͡i":[4,6],"ʋ":[18],"ɑ̃":[2],"ɛ̃":[4],"ɔ̃":[3],"œ̃":[4],"ɥ":[7],"n‿":[19],"t‿":[19],"z‿":[15],"ʨ":[16],"ʥ":[16],"bː":[21],"dː":[19],"ɟː":[16],"d͡ʒː":[19,16],"dz":[19,15],"dzː":[19,15],"fː":[18],"gː":[20],"hː":[12],"jː":[6],"ɲː":[19],"kː":[20],"lː":[14],"mː":[21],"nː":[19],"pː":[21],"rː":[13],"sː":[15],"ʃː":[16],"tː":[19],"cː":[16],"t͡sː":[19,15],"t͡ʃː":[19,16],"vː":[18],"ɰ":[20],"zː":[15],"ʒː":[16],"a͡i":[2,6],"ɔ͡i":[3,6],"ɛj":[4,6],"ɛu":[4,7],"ei":[4,6],"eu":[4,7],"ɔj":[3,6],"oi":[8,6],"ou":[8,7],"ʧ":[16],"tʃː":[19,16],"ʣ":[15],"ʣː":[15],"ʤː":[16],"ʎː":[14],"ʦ":[15],"ʦː":[15],"ɯ":[6],"ɰ͡i":[20,6],"w͡a":[7,2],"w͡ɛ":[7,4],"w͡e":[7,4],"w͡i":[7,6],"w͡ʌ":[7,1],"j͡ɛ":[6,4],"j͡e":[6,4],"j͡ʌ":[6,1],"j͡o":[6,8],"b̥":[21],"t͡ɕʰ":[19,16],"d̥":[19],"g̥":[20],"d͡ʑ":[19,16],"d͡ʑ̥":[19,16],"t͡ɕ":[19,16],"sʰ":[15],"tʰ":[19],"ʉ":[6],"ʉː":[6],"æɪ":[1,6],"æʉ":[1,6],"ɑɪ":[2,6],"œʏ":[4,7],"ɔʏ":[3,7],"ʉɪ":[6,6],"ʂ":[15],"ɖ":[19],"ɭ":[14],"ɳ":[19],"ʈ":[19],"ɛ͡i":[4,6],"œ͡y":[4,4],"χ":[12],"ɨ":[6],"t͡ʂ":[19,15],"d̪ʲ":[19],"ɡ":[20],"d͡ʐ":[19,15],"l̪ʲ":[14],"t̪ʲ":[19],"xʲ":[12],"ʑ":[16],"ĩ":[6],"ũ":[7],"ɐ̃":[4],"ẽ":[4],"õ":[8],"w̃":[7],"j̃":[6],"ɐj":[4,6],"ɐ̃j̃":[4,6],"ɐ̃w̃":[4,7],"ɐ͡w":[4,7],"a͡j":[2,6],"ɔ͡j":[3,6],"a͡w":[2,7],"ɛ͡w":[4,7],"e͡w":[4,7],"i͡w":[6,7],"o͡j":[8,6],"õj̃":[8,6],"u͡j":[7,6],"ũj̃":[7,6],"ɫ":[14],"e̯a":[4,2],"e̯o":[4,8],"o̯a":[8,2],"d͡ʒʲ":[19,16],"ʃʲ":[16],"t͡sʲ":[19,15],"t͡ʃʲ":[19,16],"ʒʲ":[16],"ʐ":[15],"ɕː":[16],"i͡a":[6,2],"r̩":[13],"r̩ː":[13],"l̩":[14],"l̩ː":[14],"ɴ":[19],"u̯":[7],"i̯":[6],"dˡ":[19],"dn":[19,19],"tˡ":[19],"tn":[19,19],"ʍ":[7],"a‿u":[2,7],"ɶ":[8],"ɵ":[1],"ɧ":[16],"ia":[6,2],"əː":[1],"ua":[7,2],"ɯː":[6],"ɯa":[6,2],"tɕʰ":[19,16],"œ͡ɟ":[4,16],"i͡ɟ":[6,16],"o͡ɟ":[8,16],"u͡ɟ":[7,16],"ɯ͡ɟ":[6,16],"y͡ɟ":[4,16],"ɮ":[6],"u͡a":[7,2],"ɛ̆j":[4,6],"ə͡j":[1,6],"i͡e͡w":[6,4,7],"ɨ͡ə":[6,1],"ie":[6,4],"ăw":[2,
7],"ăj":[2,6],"ɨ͡ə͡j":[6,1,6],"ɔ̆w":[3,7],"ɨ͡w":[6,7],"e͡j":[4,6],"ɨ͡ʌ͡w":[6,1,7],"ɨ͡j":[6,6],"iə":[6,1],"a͡ʲ":[2],"ɓ":[21],"ɗ":[19]} \ No newline at end of file diff --git a/TTS/STV/phoneme_viseme_map_readable.json.txt b/TTS/STV/phoneme_viseme_map_readable.json.txt new file mode 100644 index 0000000..911f8c7 --- /dev/null +++ b/TTS/STV/phoneme_viseme_map_readable.json.txt @@ -0,0 +1,241 @@ +{ + # basic + 'æ': [1], 'ə': [1], 'ʌ': [1], 'ɑ': [2], 'ɔ': [3], 'ɛ': [4], 'ʊ': [4], 'ɝ': [5], 'j': [6], 'i': [6], 'ɪ': [6], + 'w': [7], 'u': [7], 'o': [8], 'aʊ': [9], 'ɔɪ': [10], 'aɪ': [11], 'h': [12], 'ɹ': [13], 'l': [14], 's': [15], + 'z': [15], 'ʃ': [16], 'tʃ': [16], 'dʒ': [16], 'ʒ': [16], 'ð': [17], 'f': [18], 'v': [18], 'd': [19], 't': [19], + 'n': [19], 'θ': [19], 'k': [20], 'g': [20], 'ŋ': [20], 'p': [21], 'b': [21], 'm': [21], ' ': [0], + + # ar-EG + "a": [2], "aː": [2], "i": [6], "iː": [6], "u": [7], "uː": [7], "b": [21], "d": [19], "g": [20], "k": [20], + "t": [19], "dˤ": [19], "q": [20], "tˤ": [19], "ʔ": [19], "f": [18], "h": [12], "ħ": [12], "s": [15], "θ": [19], + "z": [15], "ðˤ": [17], "ð": [17], "ɣ": [20], "x": [12], "ʃ": [16], "sˤ": [15], "j": [6], "w": [7], "l": [14], + "m": [21], "n": [19], "r": [13], "ʕ": [12], + + # bg-BG + "i": [6], "ɛ": [4], "ɔ": [3], "a": [2], "u": [7], "j͡a": [6, 2], "ɤ": [1], "j͡u": [6, 7], "n": [19], "ʒ": [16], + "k": [20], "t͡s": [19, 15], "t": [19], "p": [21], "r": [13], "s": [15], "d": [19], "x": [12], "zʲ": [15], + "lʲ": [14], "l": [14], "nʲ": [19], "v": [18], "m": [21], "b": [21], "g": [20], "d͡ʒ": [19, 16], "f": [18], + "mʲ": [21], "tʲ": [19], "rʲ": [13], "pʲ": [21], "dʲ": [19], "j": [6], "vʲ": [18], "sʲ": [15], "bʲ": [21], + "kʲ": [20], "gʲ": [20], "fʲ": [18], "z": [15], "ʃ": [16], "t͡ʃ": [19, 16], "d͡z": [19, 15], + + # ca-ES + "a": [2], "ɔ": [3], "ə": [1], "e": [4], "ɛ": [4], "i": [6], "o": [8], "u": [7], "b": [21], "β": [21], "t͡ʃ": [19, 16], + "d": [19], "ð": [17], "f": [18], "g": [20], "ɣ": [20], "j": [6], "d͡ʒ": [19, 16], 
"k": [20], "l": [14], "ʎ": [14], + "m": [21], "n": [19], "ŋ": [20], "ɲ": [19], "p": [21], "ɾ": [19], "r": [13], "s": [15], "ʃ": [16], "t": [19], + "θ": [19], "w": [7], "x": [12], "z": [15], "ʒ": [16], + + # cs-CZ + "ɪ": [6], "ɛ": [4], "a": [2], "o": [8], "u": [7], "iː": [6], "ɛː": [4], "aː": [2], "oː": [8], "uː": [7], + "o͡ʊ̯": [8, 4], "a͡ʊ": [2, 4], "ɛ͡ʊ̯": [4, 4], "ə": [1], "p": [21], "b": [21], "t": [19], "d": [19], "c": [16], + "ɟ": [16], "k": [20], "g": [20], "t͡s": [19, 15], "d͡z": [19, 15], "t͡ʃ": [19, 16], "d͡ʒ": [19, 16], "f": [18], + "v": [18], "s": [15], "z": [15], "r̝": [13], "ʃ": [16], "ʒ": [16], "j": [6], "x": [12], "ɦ": [12], "r": [13], + "l": [14], "m": [21], "n": [19], "ŋ": [20], "ɲ": [19], "ɱ": [21], "r̝̊": [13], + + # da-DK + "a": [2], "ɑ": [2], "ɑː": [2], "ɛ": [4], "ɛː": [4], "ɔ": [3], "ɒ": [2], "ɒː": [2], "ɔː": [3], "ɐ": [4], + "æː": [1], "e": [4], "ø": [1], "øː": [1], "ə": [1], "eː": [4], "i": [6], "iː": [6], "o": [8], "œ": [4], + "œː": [4], "oː": [8], "u": [7], "uː": [7], "y": [4], "yː": [4], "b": [21], "d": [19], "ð": [17], "f": [18], + "g": [20], "h": [12], "j": [6], "kʰ": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "pʰ": [21], "ʔ": [19], + "ʁ": [13], "ɐ̯": [4], "s": [15], "ɕ": [16], "t": [19], "v": [18], "w": [7], + + # de-DE/de-CH/de-AT + "aː": [2], "a": [2], "ɔ": [3], "ɛː": [4], "ɛ": [4], "ə": [1], "iː": [6], "ɪ": [6], "øː": [1], "o": [8], + "oː": [8], "œ": [4], "e": [4], "eː": [4], "uː": [7], "ʊ": [4], "yː": [4], "ʏ": [7], "ai": [2, 6], "au": [2, 7], + "ɔy": [3, 4], "ɔʏ̯": [3, 4], "ɐ": [4], "b": [21], "d": [19], "ʤ": [16], "f": [18], "g": [20], "h": [12], + "j": [6], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "pf": [21, 18], "ʀ": [13], "r": [13], + "ʁ": [13], "s": [15], "ʃ": [16], "t": [19], "ts": [19, 15], "tʃ": [19, 16], "v": [18], "x": [12], "z": [15], + "ʒ": [16], "ʔ": [19], + + # el-GR + "a": [2], "e": [4], "i": [6], "o": [8], "u": [7], "b": [21], "c": [16], "ç": [12], "d": [19], "ð": [17], + "d͡z": 
[19, 15], "f": [18], "g": [20], "ɣ": [20], "ɟ": [16], "j": [6], "ʝ": [12], "k": [20], "l": [14], + "m": [21], "n": [19], "p": [21], "ɾ": [19], "s": [15], "t": [19], "θ": [19], "t͡s": [19, 15], "v": [18], + "x": [12], "z": [15], + + # en-GB/en-IE/en-AU + "ɑː": [2], "æ": [1], "ʌ": [1], "ɛə": [4, 1], "aʊ": [2, 4], "ə": [1], "aɪ": [2, 6], "ɛ": [4], "ɜː": [5], + "eɪ": [4, 6], "ɪ": [6], "ɪə": [6, 1], "iː": [6], "ɒ": [2], "ɔː": [3], "əʊ": [1, 4], "ɔɪ": [3, 6], "ʊ": [4], + "ʊə": [4, 1], "uː": [7], "b": [21], "tʃ": [19, 16], "d": [19], "ð": [17], "f": [18], "g": [20], "h": [12], + "j": [6], "dʒ": [19, 16], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "ɹ": [13], + "s": [15], "ʃ": [16], "t": [19], "θ": [19], "v": [18], "w": [7], "z": [15], "ʒ": [16], + + # en-US/en-CA + "iy": [6], "ɪ": [6], "eɪ": [4, 6], "ɛ": [4], "æ": [1], "ɑ": [2], "ɔ": [3], "ʊ": [4], "oʊ": [8, 4], "u": [7], + "ʌ": [1], "aɪ": [11], "aʊ": [9], "ɔɪ": [10], "ju": [6, 7], "ə": [1], "ɪɹ": [6, 13], "ɛɹ": [4, 13], "ʊɹ": [4, 13], + "aɪɹ": [11, 13], "aʊɹ": [9, 13], "ɔɹ": [3, 13], "ɑɹ": [2, 13], "ɝ": [5], "ɚ": [1], "w": [7], "j": [6], + "p": [21], "b": [21], "t": [19], "d": [19], "k": [20], "g": [20], "m": [21], "n": [19], "ŋ": [20], "f": [18], + "v": [18], "θ": [19], "ð": [17], "s": [15], "z": [15], "ʃ": [16], "ʒ": [16], "h": [12], "tʃ": [19, 16], + "dʒ": [19, 16], "l": [14], "ɹ": [13], + + # es-ES + "a": [2], "i": [6], "e": [4], "o": [8], "u": [7], "b": [21], "β": [21], "t͡ʃ": [19, 16], "d": [19], "ð": [17], + "f": [18], "g": [20], "ɣ": [20], "j": [6], "j͡j": [6, 6], "k": [20], "l": [14], "ʎ": [14], "m": [21], "n": [19], + "ɲ": [19], "p": [21], "ɾ": [19], "r": [13], "s": [15], "t": [19], "θ": [19], "w": [7], "x": [12], "z": [15], "ʒ": [16], + + # es-MX + "ɑ": [2], "e": [4], "i": [6], "o": [8], "u": [7], "b": [21], "β": [21], "t͡ʃ": [19, 16], "d": [19], "ð": [17], + "f": [18], "g": [20], "ɣ": [20], "j": [6], "j͡j": [6, 6], "k": [20], "l": [14], "ʎ": [14], "m": [21], "n": [19], + "ɲ": [19], "p": 
[21], "ɾ": [19], "r": [13], "s": [15], "t": [19], "θ": [19], "w": [7], "x": [12], + + # fi-FI + "ɑ": [2], "ɑ͡i": [2, 6], "ɑ͡u": [2, 7], "ɑː": [2], "æ": [1], "æ͡i": [1, 6], "æ͡y": [1, 4], "æː": [1], "e": [4], + "e͡i": [4, 6], "ø": [1], "ø͡i": [1, 6], "ø͡y": [1, 4], "øː": [1], "e͡u": [4, 7], "e͡y": [4, 4], "eː": [4], "i": [6], + "i͡e": [6, 4], "i͡u": [6, 7], "i͡y": [6, 4], "iː": [6], "o": [8], "o͡i": [8, 6], "o͡u": [8, 7], "oː": [8], "u": [7], + "u͡i": [7, 6], "u͡o": [7, 8], "uː": [7], "y": [4], "y͡ø": [4, 1], "y͡i": [4, 6], "yː": [4], "b": [21], "d": [19], + "f": [18], "g": [20], "h": [12], "j": [6], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "r": [13], + "s": [15], "ʃ": [16], "t": [19], "ʋ": [18], + + # fr-FR/fr-CA/fr-CH + "a": [2], "ɑ": [2], "ɑ̃": [2], "ə": [1], "ɛ": [4], "ø": [1], "e": [4], "ɛ̃": [4], "i": [6], "œ": [4], "ɔ": [3], + "ɔ̃": [3], "o": [8], "œ̃": [4], "u": [7], "y": [4], "b": [21], "d": [19], "f": [18], "g": [20], "ɲ": [19], + "ɥ": [7], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "ʁ": [13], "s": [15], "ʃ": [16], + "t": [19], "v": [18], "w": [7], "j": [6], "z": [15], "n‿": [19], "t‿": [19], "z‿": [15], + + # he-IL + "i": [6], "e": [4], "a": [2], "o": [8], "u": [7], "p": [21], "b": [21], "t": [19], "d": [19], "k": [20], "g": [20], + "ʔ": [19], "f": [18], "v": [18], "s": [15], "z": [15], "ʃ": [16], "x": [12], "h": [12], "t͡s": [19, 15], "m": [21], + "n": [19], "l": [14], "ʁ": [13], "j": [6], "ʒ": [16], "tʃ": [19, 16], "dʒ": [19, 16], + + # hr-HR + "e": [4], "eː": [4], "i": [6], "iː": [6], "u": [7], "uː": [7], "a": [2], "aː": [2], "o": [8], "oː": [8], + "d": [19], "v": [18], "s": [15], "t": [19], "n": [19], "l": [14], "ʎ": [14], "t͡s": [19, 15], "t͡ʃ": [19, 16], + "j": [6], "x": [12], "z": [15], "ʒ": [16], "r": [13], "k": [20], "m": [21], "p": [21], "g": [20], "ʨ": [16], + "f": [18], "b": [21], "d͡ʒ": [19, 16], "ɲ": [19], "ʥ": [16], "ʃ": [16], + + # hu-HU + "ø": [1], "øː": [1], "a": [2], "aː": [2], "ɛ": [4], 
"eː": [4], "i": [6], "iː": [6], "o": [8], "ɒ": [2], + "oː": [8], "u": [7], "uː": [7], "y": [4], "yː": [4], "b": [21], "bː": [21], "d": [19], "ɟ": [16], "dː": [19], + "ɟː": [16], "d͡ʒ": [19, 16], "d͡ʒː": [19, 16], "dz": [19, 15], "dzː": [19, 15], "f": [18], "fː": [18], + "g": [20], "gː": [20], "h": [12], "hː": [12], "j": [6], "ɲ": [19], "jː": [6], "ɲː": [19], "k": [20], + "kː": [20], "l": [14], "lː": [14], "m": [21], "mː": [21], "n": [19], "ŋ": [20], "nː": [19], "p": [21], + "pː": [21], "r": [13], "rː": [13], "s": [15], "ʃ": [16], "sː": [15], "ʃː": [16], "t": [19], "c": [16], + "tː": [19], "cː": [16], "t͡s": [19, 15], "t͡ʃ": [19, 16], "t͡sː": [19, 15], "t͡ʃː": [19, 16], "v": [18], + "vː": [18], "x": [12], "ɰ": [20], "z": [15], "ʒ": [16], "zː": [15], "ʒː": [16], + + # id-ID + "ə": [1], "a": [2], "a͡i": [2, 6], "a͡ʊ": [2, 4], "e": [4], "ɛ": [4], "ɪ": [6], "i": [6], "ɔ": [3], "o": [8], + "ɔ͡i": [3, 6], "u": [7], "ʊ": [4], "ʔ": [19], "b": [21], "d": [19], "d͡ʒ": [19, 16], "f": [18], "g": [20], + "h": [12], "ɲ": [19], "j": [6], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "r": [13], + "s": [15], "ʃ": [16], "t": [19], "t͡ʃ": [19, 16], "w": [7], "x": [12], "z": [15], + + # it-IT + "a": [2], "ai": [2, 6], "au": [2, 7], "e": [4], "ɛ": [4], "ɛj": [4, 6], "ɛu": [4, 7], "ei": [4, 6], "eu": [4, 7], + "i": [6], "u": [7], "o": [8], "ɔ": [3], "ɔj": [3, 6], "oi": [8, 6], "ou": [8, 7], "b": [21], "bː": [21], + "ʧ": [16], "tʃː": [19, 16], "kː": [20], "d": [19], "dː": [19], "ʣ": [15], "ʣː": [15], "f": [18], "fː": [18], + "ʤ": [16], "ʤː": [16], "g": [20], "gː": [20], "ʎ": [14], "ʎː": [14], "ɲː": [19], "ɲ": [19], "j": [6], "k": [20], + "l": [14], "lː": [14], "m": [21], "mː": [21], "n": [19], "nː": [19], "p": [21], "pː": [21], "ɾ": [19], + "rː": [13], "s": [15], "sː": [15], "ʃ": [16], "ʃː": [16], "t": [19], "tː": [19], "ʦ": [15], "ʦː": [15], + "v": [18], "vː": [18], "w": [7], "z": [15], + + # ko-KR + "a": [2], "ɛ": [4], "e": [4], "ɯ": [6], "i": [6], "ʌ": [1], "o": 
[8], "u": [7], "ɰ͡i": [20, 6], "ø": [1], + "w͡a": [7, 2], "w͡ɛ": [7, 4], "w͡e": [7, 4], "w͡i": [7, 6], "w͡ʌ": [7, 1], "j͡a": [6, 2], "j͡ɛ": [6, 4], + "j͡e": [6, 4], "j͡ʌ": [6, 1], "j͡o": [6, 8], "j͡u": [6, 7], "b̥": [21], "p": [21], "b": [21], "t͡ɕʰ": [19, 16], + "d̥": [19], "t": [19], "d": [19], "g̥": [20], "k": [20], "g": [20], "h": [12], "ɦ": [12], "d͡ʑ": [19, 16], + "d͡ʑ̥": [19, 16], "t͡ɕ": [19, 16], "kʰ": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "pʰ": [21], + "ɾ": [19], "sʰ": [15], "s": [15], "tʰ": [19], + + # ms-MY + "i": [6], "u": [7], "ə": [1], "e": [4], "o": [8], "a": [2], "a͡i": [2, 6], "au": [2, 7], "oi": [8, 6], + "p": [21], "b": [21], "t": [19], "d": [19], "k": [20], "g": [20], "ʔ": [19], "t͡ʃ": [19, 16], "d͡ʒ": [19, 16], + "m": [21], "n": [19], "ɲ": [19], "ŋ": [20], "f": [18], "v": [18], "s": [15], "z": [15], "ʃ": [16], "x": [12], + "r": [13], "h": [12], "j": [6], "w": [7], "l": [14], + + # nb-NO + "ɑ": [2], "æ": [1], "æː": [1], "ɑː": [2], "ɛ": [4], "øː": [1], "eː": [4], "ɪ": [6], "iː": [6], "ɔ": [3], + "œ": [4], "oː": [8], "u": [7], "uː": [7], "ʏ": [7], "ʉ": [6], "ʉː": [6], "yː": [4], "æɪ": [1, 6], + "æʉ": [1, 6], "ɑɪ": [2, 6], "œʏ": [4, 7], "ɔʏ": [3, 7], "ʉɪ": [6, 6], "p": [21], "t": [19], "k": [20], + "b": [21], "d": [19], "g": [20], "f": [18], "h": [12], "s": [15], "ʂ": [15], "ç": [12], "v": [18], "m": [21], + "n": [19], "ŋ": [20], "l": [14], "r": [13], "j": [6], "ɖ": [19], "ɭ": [14], "ɳ": [19], "ʈ": [19], + + # nl-NL/nl-BE + "ɑ": [2], "aː": [2], "ɑ̃": [2], "ɑ͡u": [2, 7], "ɛ": [4], "eː": [4], "ɛː": [4], "ɛ͡i": [4, 6], "ɛ̃": [4], + "øː": [1], "ɪ": [6], "i": [6], "ɔ": [3], "u": [7], "ɔː": [3], "ɔ̃": [3], "oː": [8], "ʏ": [7], "ə": [1], + "œ͡y": [4, 4], "œ": [4], "y": [4], "b": [21], "d": [19], "f": [18], "χ": [12], "ʔ": [19], "ɦ": [12], + "g": [20], "j": [6], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "ʀ": [13], "s": [15], + "ʃ": [16], "t": [19], "w": [7], "v": [18], "ʋ": [18], "z": [15], "ʒ": [16], + + # pl-PL + "a": 
[2], "ɛ": [4], "ɛ̃": [4], "i": [6], "ɨ": [6], "ɔ": [3], "ɔ̃": [3], "u": [7], "b": [21], "bʲ": [21], + "t͡ɕ": [19, 16], "t͡ʂ": [19, 15], "c": [16], "d": [19], "d̪ʲ": [19], "d͡z": [19, 15], "d͡ʑ": [19, 16], + "f": [18], "fʲ": [18], "ɡ": [20], "ɟ": [16], "d͡ʐ": [19, 15], "k": [20], "l": [14], "l̪ʲ": [14], "m": [21], + "mʲ": [21], "n": [19], "ŋ": [20], "ɲ": [19], "p": [21], "pʲ": [21], "r": [13], "rʲ": [13], "s": [15], + "ɕ": [16], "ʃ": [16], "t": [19], "t̪ʲ": [19], "t͡s": [19, 15], "v": [18], "vʲ": [18], "w": [7], "x": [12], + "xʲ": [12], "j": [6], "z": [15], "ʑ": [16], "ʒ": [16], + + # pt-BR + "i": [6], "ĩ": [6], "a": [2], "ɔ": [3], "u": [7], "ũ": [7], "o": [8], "e": [4], "ɐ̃": [4], "ə": [1], + "ɛ": [4], "ẽ": [4], "õ": [8], "w̃": [7], "w": [7], "p": [21], "b": [21], "t": [19], "d": [19], "g": [20], + "m": [21], "n": [19], "ɲ": [19], "f": [18], "v": [18], "ɾ": [19], "s": [15], "z": [15], "ʃ": [16], "ʒ": [16], + "x": [12], "tʃ": [19, 16], "dʒ": [19, 16], "l": [14], "ʎ": [14], "j̃": [6], "j": [6], "k": [20], + + # pt-PT + "a": [2], "ɐ": [4], "ɐj": [4, 6], "ɐ̃": [4], "ɐ̃j̃": [4, 6], "ɐ̃w̃": [4, 7], "ɐ͡w": [4, 7], "a͡j": [2, 6], + "ɔ": [3], "ɔ͡j": [3, 6], "a͡w": [2, 7], "ə": [1], "e": [4], "ɛ": [4], "ɛ͡w": [4, 7], "ẽ": [4], "e͡w": [4, 7], + "i": [6], "ĩ": [6], "i͡w": [6, 7], "o": [8], "o͡j": [8, 6], "õ": [8], "õj̃": [8, 6], "u": [7], "u͡j": [7, 6], + "ũ": [7], "ũj̃": [7, 6], "b": [21], "d": [19], "ɾ": [19], "f": [18], "g": [20], "j": [6], "k": [20], "l": [14], + "ɫ": [14], "ʎ": [14], "m": [21], "n": [19], "ɲ": [19], "p": [21], "ʀ": [13], "s": [15], "ʃ": [16], "t": [19], + "v": [18], "w": [7], "z": [15], "ʒ": [16], + + # ro-RO + "ə": [1], "ɨ": [6], "a": [2], "e": [4], "e̯a": [4, 2], "e̯o": [4, 8], "i": [6], "o": [8], "o̯a": [8, 2], + "u": [7], "b": [21], "bʲ": [21], "d": [19], "d͡ʒ": [19, 16], "d͡ʒʲ": [19, 16], "f": [18], "fʲ": [18], "g": [20], + "gʲ": [20], "h": [12], "j": [6], "k": [20], "kʲ": [20], "l": [14], "lʲ": [14], "m": [21], "mʲ": [21], "n": [19], + "ŋ": [20], 
"nʲ": [19], "p": [21], "pʲ": [21], "r": [13], "rʲ": [13], "s": [15], "ʃ": [16], "ʃʲ": [16], "t": [19], + "tʲ": [19], "t͡s": [19, 15], "t͡ʃ": [19, 16], "t͡sʲ": [19, 15], "t͡ʃʲ": [19, 16], "v": [18], "vʲ": [18], "w": [7], + "z": [15], "ʒ": [16], "zʲ": [15], "ʒʲ": [16], + + # ru-RU + "a": [2], "ʌ": [1], "ə": [1], "ɛ": [4], "i": [6], "ɪ": [6], "ɨ": [6], "ɔ": [3], "u": [7], "p": [21], "pʲ": [21], + "b": [21], "bʲ": [21], "t": [19], "tʲ": [19], "d": [19], "dʲ": [19], "k": [20], "kʲ": [20], "g": [20], + "gʲ": [20], "x": [12], "xʲ": [12], "f": [18], "fʲ": [18], "v": [18], "vʲ": [18], "s": [15], "sʲ": [15], + "z": [15], "zʲ": [15], "ʂ": [15], "ʐ": [15], "t͡s": [19, 15], "t͡ɕ": [19, 16], "ɕː": [16], "m": [21], + "mʲ": [21], "n": [19], "nʲ": [19], "l": [14], "lʲ": [14], "r": [13], "rʲ": [13], "j": [6], + + # sk-SK + "i": [6], "e": [4], "a": [2], "o": [8], "u": [7], "ʉ": [6], "iː": [6], "eː": [4], "aː": [2], "oː": [8], + "uː": [7], "i͡a": [6, 2], "i͡e": [6, 4], "i͡u": [6, 7], "u͡o": [7, 8], "au": [2, 7], "ou": [8, 7], "ə": [1], + "p": [21], "b": [21], "t": [19], "d": [19], "c": [16], "ɟ": [16], "k": [20], "g": [20], "t͡s": [19, 15], + "d͡z": [19, 15], "t͡ʃ": [19, 16], "d͡ʒ": [19, 16], "f": [18], "v": [18], "s": [15], "z": [15], "ʃ": [16], + "ʒ": [16], "x": [12], "ɦ": [12], "r": [13], "r̩": [13], "r̩ː": [13], "l": [14], "l̩": [14], "l̩ː": [14], + "ʎ": [14], "m": [21], "ɱ": [21], "n": [19], "ɴ": [19], "ŋ": [20], "ɲ": [19], "u̯": [7], "i̯": [6], "j": [6], + "w": [7], + + # sl-SI + "ə": [1], "a": [2], "aː": [2], "ɛ": [4], "eː": [4], "ɛː": [4], "i": [6], "iː": [6], "ɔ": [3], "ɔː": [3], + "oː": [8], "u": [7], "uː": [7], "b": [21], "d": [19], "dˡ": [19], "dn": [19, 19], "d͡ʒ": [19, 16], + "d͡z": [19, 15], "f": [18], "ɱ": [21], "ɣ": [20], "g": [20], "ɪ": [6], "j": [6], "k": [20], "l": [14], "lʲ": [14], + "m": [21], "ŋ": [20], "n": [19], "nʲ": [19], "p": [21], "r": [13], "s": [15], "ʃ": [16], "t": [19], "tˡ": [19], + "tn": [19, 19], "t͡ʃ": [19, 16], "t͡s": [19, 15], "u̯": [7], "v": 
[18], "w": [7], "ʍ": [7], "x": [12], "ʒ": [16], + "z": [15], + + # sv-SE + "a": [2], "æ": [1], "æː": [1], "ɑː": [2], "ɔ": [3], "a‿u": [2, 7], "ə": [1], "e": [4], "ɛ": [4], "ɛː": [4], + "eː": [4], "ɶ": [8], "œː": [4], "œ": [4], "øː": [1], "ɪ": [6], "iː": [6], "ʊ": [4], "uː": [7], "oː": [8], + "ɵ": [1], "ʉː": [6], "y": [4], "yː": [4], "p": [21], "t": [19], "k": [20], "b": [21], "d": [19], "g": [20], + "f": [18], "h": [12], "s": [15], "ɧ": [16], "ɕ": [16], "v": [18], "m": [21], "n": [19], "ŋ": [20], "l": [14], + "r": [13], "j": [6], "ɖ": [19], "ɭ": [14], "ɳ": [19], "ʂ": [15], "ʈ": [19], + + # th-TH + "a": [2], "aː": [2], "e": [4], "eː": [4], "i": [6], "iː": [6], "ia": [6, 2], "o": [8], "oː": [8], "ə": [1], + "əː": [1], "u": [7], "uː": [7], "ua": [7, 2], "ɯ": [6], "ɯː": [6], "ɯa": [6, 2], "ɛ": [4], "ɛː": [4], + "ɔ": [3], "ɔː": [3], "b": [21], "t͡ɕ": [19, 16], "tɕʰ": [19, 16], "d": [19], "f": [18], "h": [12], "j": [6], + "k": [20], "kʰ": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "pʰ": [21], "r": [13], + "s": [15], "t": [19], "tʰ": [19], "w": [7], "ʔ": [19], + + # tr-TR + "a": [2], "ɑː": [2], "e": [4], "eː": [4], "œ": [4], "œ͡ɟ": [4, 16], "i": [6], "i͡ɟ": [6, 16], "o": [8], + "o͡ɟ": [8, 16], "u": [7], "u͡ɟ": [7, 16], "ɯ": [6], "ɯ͡ɟ": [6, 16], "y": [4], "y͡ɟ": [4, 16], "b": [21], + "c": [16], "t͡ʃ": [19, 16], "d": [19], "f": [18], "ɡ": [20], "ɣ": [20], "ɟ": [16], "h": [12], "j": [6], + "d͡ʒ": [19, 16], "k": [20], "l": [14], "ɮ": [6], "m": [21], "n": [19], "ŋ": [20], "p": [21], "ɾ": [19], + "s": [15], "ʃ": [16], "t": [19], "v": [18], "w": [7], "z": [15], "ʒ": [16], + + # vi-VN + "a": [2], "ɛ": [4], "i": [6], "ɔ": [3], "u": [7], "u͡a": [7, 2], "a͡j": [2, 6], "ɛ̆j": [4, 6], "ə͡j": [1, 6], + "o": [8], "i͡e͡w": [6, 4, 7], "ɨ͡ə": [6, 1], "ɔ͡i": [3, 6], "ə": [1], "ie": [6, 4], "u͡j": [7, 6], "a͡w": [2, 7], + "ɨ": [6], "ɐ": [4], "ăw": [2, 7], "ăj": [2, 6], "ɨ͡ə͡j": [6, 1, 6], "o͡j": [8, 6], "əː": [1], "e": [4], + "ɔ̆w": [3, 7], "ɛ͡w": [4, 7], "i͡w": [6, 7], 
"ɨ͡w": [6, 7], "e͡j": [4, 6], "ɨ͡ʌ͡w": [6, 1, 7], "ɨ͡j": [6, 6], + "ɪ": [6], "iə": [6, 1], "a͡ʲ": [2], "ɓ": [21], "k": [20], "z": [15], "j": [6], "ɹ": [13], "f": [18], "ɣ": [20], + "h": [12], "l": [14], "m": [21], "n": [19], "p": [21], "s": [15], "ʂ": [15], "t": [19], "v": [18], "ɗ": [19], + "ŋ": [20], "x": [12], "ɲ": [19], "tʰ": [19], "ʈ": [19], "t͡ʃ": [19, 16], "w": [7] +} \ No newline at end of file diff --git a/TTS/STV/speech_to_visemes.py b/TTS/STV/speech_to_visemes.py index 11d16dd..861a7d9 100644 --- a/TTS/STV/speech_to_visemes.py +++ b/TTS/STV/speech_to_visemes.py @@ -1,10 +1,10 @@ """This module contains the SpeechToVisemes class, which handles the conversion of speech to visemes.""" from transformers import pipeline import logging -import numpy as np +import json logger = logging.getLogger(__name__) -from typing import List, Dict, Any, Optional +from typing import List, Dict, Any class SpeechToVisemes(): """ @@ -34,6 +34,11 @@ def __init__( self.device = device self.gen_kwargs = gen_kwargs + # This dictionary is inspired by https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-ssml-phonetic-sets + phoneme_viseme_map_file="TTS/STV/phoneme_viseme_map.json" + with open(phoneme_viseme_map_file, 'r') as f: + self.phoneme_viseme_map = json.load(f) + # Initialize the automatic speech recognition pipeline self.asr_pipeline = pipeline( "automatic-speech-recognition", model=model_name, device=device @@ -51,275 +56,20 @@ def _map_phonemes_to_visemes( - https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-ssml-phonetic-sets Args: - data (Dict[str, Any]): A dictionary containing phoneme data, where `data['chunks']` + data (Dict[str, Any]): A dictionary containing phoneme data, where data['chunks'] holds a list of phonemes and their timestamps. Returns: - List[Dict[str, Any]]: A list of dictionaries where each dictionary contains the viseme - ID and the corresponding timestamp. 
+ List[Dict[str, Any]]: A list of dictionaries where each dictionary contains the viseme ID + and the corresponding timestamp. """ - - def _phoneme_to_viseme(phoneme: str) -> List[int]: - """ - Converts a phoneme to its corresponding viseme(s). - - Args: - phoneme (str): The phoneme to map to viseme. - - Returns: - List[int]: A list of viseme IDs corresponding to the phoneme. - """ - # inspired by https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-ssml-phonetic-sets - phoneme_viseme_map = { - # basic - 'æ': [1], 'ə': [1], 'ʌ': [1], 'ɑ': [2], 'ɔ': [3], 'ɛ': [4], 'ʊ': [4], 'ɝ': [5], 'j': [6], 'i': [6], 'ɪ': [6], - 'w': [7], 'u': [7], 'o': [8], 'aʊ': [9], 'ɔɪ': [10], 'aɪ': [11], 'h': [12], 'ɹ': [13], 'l': [14], 's': [15], - 'z': [15], 'ʃ': [16], 'tʃ': [16], 'dʒ': [16], 'ʒ': [16], 'ð': [17], 'f': [18], 'v': [18], 'd': [19], 't': [19], - 'n': [19], 'θ': [19], 'k': [20], 'g': [20], 'ŋ': [20], 'p': [21], 'b': [21], 'm': [21], ' ': [0], - - # ar-EG - "a": [2], "aː": [2], "i": [6], "iː": [6], "u": [7], "uː": [7], "b": [21], "d": [19], "g": [20], "k": [20], - "t": [19], "dˤ": [19], "q": [20], "tˤ": [19], "ʔ": [19], "f": [18], "h": [12], "ħ": [12], "s": [15], "θ": [19], - "z": [15], "ðˤ": [17], "ð": [17], "ɣ": [20], "x": [12], "ʃ": [16], "sˤ": [15], "j": [6], "w": [7], "l": [14], - "m": [21], "n": [19], "r": [13], "ʕ": [12], - - # bg-BG - "i": [6], "ɛ": [4], "ɔ": [3], "a": [2], "u": [7], "j͡a": [6, 2], "ɤ": [1], "j͡u": [6, 7], "n": [19], "ʒ": [16], - "k": [20], "t͡s": [19, 15], "t": [19], "p": [21], "r": [13], "s": [15], "d": [19], "x": [12], "zʲ": [15], - "lʲ": [14], "l": [14], "nʲ": [19], "v": [18], "m": [21], "b": [21], "g": [20], "d͡ʒ": [19, 16], "f": [18], - "mʲ": [21], "tʲ": [19], "rʲ": [13], "pʲ": [21], "dʲ": [19], "j": [6], "vʲ": [18], "sʲ": [15], "bʲ": [21], - "kʲ": [20], "gʲ": [20], "fʲ": [18], "z": [15], "ʃ": [16], "t͡ʃ": [19, 16], "d͡z": [19, 15], - - # ca-ES - "a": [2], "ɔ": [3], "ə": [1], "e": [4], "ɛ": [4], "i": [6], "o": [8], "u": [7], 
"b": [21], "β": [21], "t͡ʃ": [19, 16], - "d": [19], "ð": [17], "f": [18], "g": [20], "ɣ": [20], "j": [6], "d͡ʒ": [19, 16], "k": [20], "l": [14], "ʎ": [14], - "m": [21], "n": [19], "ŋ": [20], "ɲ": [19], "p": [21], "ɾ": [19], "r": [13], "s": [15], "ʃ": [16], "t": [19], - "θ": [19], "w": [7], "x": [12], "z": [15], "ʒ": [16], - - # cs-CZ - "ɪ": [6], "ɛ": [4], "a": [2], "o": [8], "u": [7], "iː": [6], "ɛː": [4], "aː": [2], "oː": [8], "uː": [7], - "o͡ʊ̯": [8, 4], "a͡ʊ": [2, 4], "ɛ͡ʊ̯": [4, 4], "ə": [1], "p": [21], "b": [21], "t": [19], "d": [19], "c": [16], - "ɟ": [16], "k": [20], "g": [20], "t͡s": [19, 15], "d͡z": [19, 15], "t͡ʃ": [19, 16], "d͡ʒ": [19, 16], "f": [18], - "v": [18], "s": [15], "z": [15], "r̝": [13], "ʃ": [16], "ʒ": [16], "j": [6], "x": [12], "ɦ": [12], "r": [13], - "l": [14], "m": [21], "n": [19], "ŋ": [20], "ɲ": [19], "ɱ": [21], "r̝̊": [13], - - # da-DK - "a": [2], "ɑ": [2], "ɑː": [2], "ɛ": [4], "ɛː": [4], "ɔ": [3], "ɒ": [2], "ɒː": [2], "ɔː": [3], "ɐ": [4], - "æː": [1], "e": [4], "ø": [1], "øː": [1], "ə": [1], "eː": [4], "i": [6], "iː": [6], "o": [8], "œ": [4], - "œː": [4], "oː": [8], "u": [7], "uː": [7], "y": [4], "yː": [4], "b": [21], "d": [19], "ð": [17], "f": [18], - "g": [20], "h": [12], "j": [6], "kʰ": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "pʰ": [21], "ʔ": [19], - "ʁ": [13], "ɐ̯": [4], "s": [15], "ɕ": [16], "t": [19], "v": [18], "w": [7], - - # de-DE/de-CH/de-AT - "aː": [2], "a": [2], "ɔ": [3], "ɛː": [4], "ɛ": [4], "ə": [1], "iː": [6], "ɪ": [6], "øː": [1], "o": [8], - "oː": [8], "œ": [4], "e": [4], "eː": [4], "uː": [7], "ʊ": [4], "yː": [4], "ʏ": [7], "ai": [2, 6], "au": [2, 7], - "ɔy": [3, 4], "ɔʏ̯": [3, 4], "ɐ": [4], "b": [21], "d": [19], "ʤ": [16], "f": [18], "g": [20], "h": [12], - "j": [6], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "pf": [21, 18], "ʀ": [13], "r": [13], - "ʁ": [13], "s": [15], "ʃ": [16], "t": [19], "ts": [19, 15], "tʃ": [19, 16], "v": [18], "x": [12], "z": [15], - "ʒ": [16], "ʔ": [19], - - # 
el-GR - "a": [2], "e": [4], "i": [6], "o": [8], "u": [7], "b": [21], "c": [16], "ç": [12], "d": [19], "ð": [17], - "d͡z": [19, 15], "f": [18], "g": [20], "ɣ": [20], "ɟ": [16], "j": [6], "ʝ": [12], "k": [20], "l": [14], - "m": [21], "n": [19], "p": [21], "ɾ": [19], "s": [15], "t": [19], "θ": [19], "t͡s": [19, 15], "v": [18], - "x": [12], "z": [15], - - # en-GB/en-IE/en-AU - "ɑː": [2], "æ": [1], "ʌ": [1], "ɛə": [4, 1], "aʊ": [2, 4], "ə": [1], "aɪ": [2, 6], "ɛ": [4], "ɜː": [5], - "eɪ": [4, 6], "ɪ": [6], "ɪə": [6, 1], "iː": [6], "ɒ": [2], "ɔː": [3], "əʊ": [1, 4], "ɔɪ": [3, 6], "ʊ": [4], - "ʊə": [4, 1], "uː": [7], "b": [21], "tʃ": [19, 16], "d": [19], "ð": [17], "f": [18], "g": [20], "h": [12], - "j": [6], "dʒ": [19, 16], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "ɹ": [13], - "s": [15], "ʃ": [16], "t": [19], "θ": [19], "v": [18], "w": [7], "z": [15], "ʒ": [16], - - # en-US/en-CA - "iy": [6], "ɪ": [6], "eɪ": [4, 6], "ɛ": [4], "æ": [1], "ɑ": [2], "ɔ": [3], "ʊ": [4], "oʊ": [8, 4], "u": [7], - "ʌ": [1], "aɪ": [11], "aʊ": [9], "ɔɪ": [10], "ju": [6, 7], "ə": [1], "ɪɹ": [6, 13], "ɛɹ": [4, 13], "ʊɹ": [4, 13], - "aɪɹ": [11, 13], "aʊɹ": [9, 13], "ɔɹ": [3, 13], "ɑɹ": [2, 13], "ɝ": [5], "ɚ": [1], "w": [7], "j": [6], - "p": [21], "b": [21], "t": [19], "d": [19], "k": [20], "g": [20], "m": [21], "n": [19], "ŋ": [20], "f": [18], - "v": [18], "θ": [19], "ð": [17], "s": [15], "z": [15], "ʃ": [16], "ʒ": [16], "h": [12], "tʃ": [19, 16], - "dʒ": [19, 16], "l": [14], "ɹ": [13], - - # es-ES - "a": [2], "i": [6], "e": [4], "o": [8], "u": [7], "b": [21], "β": [21], "t͡ʃ": [19, 16], "d": [19], "ð": [17], - "f": [18], "g": [20], "ɣ": [20], "j": [6], "j͡j": [6, 6], "k": [20], "l": [14], "ʎ": [14], "m": [21], "n": [19], - "ɲ": [19], "p": [21], "ɾ": [19], "r": [13], "s": [15], "t": [19], "θ": [19], "w": [7], "x": [12], "z": [15], "ʒ": [16], - - # es-MX - "ɑ": [2], "e": [4], "i": [6], "o": [8], "u": [7], "b": [21], "β": [21], "t͡ʃ": [19, 16], "d": [19], "ð": [17], - "f": 
[18], "g": [20], "ɣ": [20], "j": [6], "j͡j": [6, 6], "k": [20], "l": [14], "ʎ": [14], "m": [21], "n": [19], - "ɲ": [19], "p": [21], "ɾ": [19], "r": [13], "s": [15], "t": [19], "θ": [19], "w": [7], "x": [12], - - # fi-FI - "ɑ": [2], "ɑ͡i": [2, 6], "ɑ͡u": [2, 7], "ɑː": [2], "æ": [1], "æ͡i": [1, 6], "æ͡y": [1, 4], "æː": [1], "e": [4], - "e͡i": [4, 6], "ø": [1], "ø͡i": [1, 6], "ø͡y": [1, 4], "øː": [1], "e͡u": [4, 7], "e͡y": [4, 4], "eː": [4], "i": [6], - "i͡e": [6, 4], "i͡u": [6, 7], "i͡y": [6, 4], "iː": [6], "o": [8], "o͡i": [8, 6], "o͡u": [8, 7], "oː": [8], "u": [7], - "u͡i": [7, 6], "u͡o": [7, 8], "uː": [7], "y": [4], "y͡ø": [4, 1], "y͡i": [4, 6], "yː": [4], "b": [21], "d": [19], - "f": [18], "g": [20], "h": [12], "j": [6], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "r": [13], - "s": [15], "ʃ": [16], "t": [19], "ʋ": [18], - - # fr-FR/fr-CA/fr-CH - "a": [2], "ɑ": [2], "ɑ̃": [2], "ə": [1], "ɛ": [4], "ø": [1], "e": [4], "ɛ̃": [4], "i": [6], "œ": [4], "ɔ": [3], - "ɔ̃": [3], "o": [8], "œ̃": [4], "u": [7], "y": [4], "b": [21], "d": [19], "f": [18], "g": [20], "ɲ": [19], - "ɥ": [7], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "ʁ": [13], "s": [15], "ʃ": [16], - "t": [19], "v": [18], "w": [7], "j": [6], "z": [15], "n‿": [19], "t‿": [19], "z‿": [15], - - # he-IL - "i": [6], "e": [4], "a": [2], "o": [8], "u": [7], "p": [21], "b": [21], "t": [19], "d": [19], "k": [20], "g": [20], - "ʔ": [19], "f": [18], "v": [18], "s": [15], "z": [15], "ʃ": [16], "x": [12], "h": [12], "t͡s": [19, 15], "m": [21], - "n": [19], "l": [14], "ʁ": [13], "j": [6], "ʒ": [16], "tʃ": [19, 16], "dʒ": [19, 16], - - # hr-HR - "e": [4], "eː": [4], "i": [6], "iː": [6], "u": [7], "uː": [7], "a": [2], "aː": [2], "o": [8], "oː": [8], - "d": [19], "v": [18], "s": [15], "t": [19], "n": [19], "l": [14], "ʎ": [14], "t͡s": [19, 15], "t͡ʃ": [19, 16], - "j": [6], "x": [12], "z": [15], "ʒ": [16], "r": [13], "k": [20], "m": [21], "p": [21], "g": [20], "ʨ": [16], - "f": [18], 
"b": [21], "d͡ʒ": [19, 16], "ɲ": [19], "ʥ": [16], "ʃ": [16], - - # hu-HU - "ø": [1], "øː": [1], "a": [2], "aː": [2], "ɛ": [4], "eː": [4], "i": [6], "iː": [6], "o": [8], "ɒ": [2], - "oː": [8], "u": [7], "uː": [7], "y": [4], "yː": [4], "b": [21], "bː": [21], "d": [19], "ɟ": [16], "dː": [19], - "ɟː": [16], "d͡ʒ": [19, 16], "d͡ʒː": [19, 16], "dz": [19, 15], "dzː": [19, 15], "f": [18], "fː": [18], - "g": [20], "gː": [20], "h": [12], "hː": [12], "j": [6], "ɲ": [19], "jː": [6], "ɲː": [19], "k": [20], - "kː": [20], "l": [14], "lː": [14], "m": [21], "mː": [21], "n": [19], "ŋ": [20], "nː": [19], "p": [21], - "pː": [21], "r": [13], "rː": [13], "s": [15], "ʃ": [16], "sː": [15], "ʃː": [16], "t": [19], "c": [16], - "tː": [19], "cː": [16], "t͡s": [19, 15], "t͡ʃ": [19, 16], "t͡sː": [19, 15], "t͡ʃː": [19, 16], "v": [18], - "vː": [18], "x": [12], "ɰ": [20], "z": [15], "ʒ": [16], "zː": [15], "ʒː": [16], - - # id-ID - "ə": [1], "a": [2], "a͡i": [2, 6], "a͡ʊ": [2, 4], "e": [4], "ɛ": [4], "ɪ": [6], "i": [6], "ɔ": [3], "o": [8], - "ɔ͡i": [3, 6], "u": [7], "ʊ": [4], "ʔ": [19], "b": [21], "d": [19], "d͡ʒ": [19, 16], "f": [18], "g": [20], - "h": [12], "ɲ": [19], "j": [6], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "r": [13], - "s": [15], "ʃ": [16], "t": [19], "t͡ʃ": [19, 16], "w": [7], "x": [12], "z": [15], - - # it-IT - "a": [2], "ai": [2, 6], "au": [2, 7], "e": [4], "ɛ": [4], "ɛj": [4, 6], "ɛu": [4, 7], "ei": [4, 6], "eu": [4, 7], - "i": [6], "u": [7], "o": [8], "ɔ": [3], "ɔj": [3, 6], "oi": [8, 6], "ou": [8, 7], "b": [21], "bː": [21], - "ʧ": [16], "tʃː": [19, 16], "kː": [20], "d": [19], "dː": [19], "ʣ": [15], "ʣː": [15], "f": [18], "fː": [18], - "ʤ": [16], "ʤː": [16], "g": [20], "gː": [20], "ʎ": [14], "ʎː": [14], "ɲː": [19], "ɲ": [19], "j": [6], "k": [20], - "l": [14], "lː": [14], "m": [21], "mː": [21], "n": [19], "nː": [19], "p": [21], "pː": [21], "ɾ": [19], - "rː": [13], "s": [15], "sː": [15], "ʃ": [16], "ʃː": [16], "t": [19], "tː": [19], "ʦ": [15], "ʦː": [15], - 
"v": [18], "vː": [18], "w": [7], "z": [15], - - # ko-KR - "a": [2], "ɛ": [4], "e": [4], "ɯ": [6], "i": [6], "ʌ": [1], "o": [8], "u": [7], "ɰ͡i": [20, 6], "ø": [1], - "w͡a": [7, 2], "w͡ɛ": [7, 4], "w͡e": [7, 4], "w͡i": [7, 6], "w͡ʌ": [7, 1], "j͡a": [6, 2], "j͡ɛ": [6, 4], - "j͡e": [6, 4], "j͡ʌ": [6, 1], "j͡o": [6, 8], "j͡u": [6, 7], "b̥": [21], "p": [21], "b": [21], "t͡ɕʰ": [19, 16], - "d̥": [19], "t": [19], "d": [19], "g̥": [20], "k": [20], "g": [20], "h": [12], "ɦ": [12], "d͡ʑ": [19, 16], - "d͡ʑ̥": [19, 16], "t͡ɕ": [19, 16], "kʰ": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "pʰ": [21], - "ɾ": [19], "sʰ": [15], "s": [15], "tʰ": [19], - - # ms-MY - "i": [6], "u": [7], "ə": [1], "e": [4], "o": [8], "a": [2], "a͡i": [2, 6], "au": [2, 7], "oi": [8, 6], - "p": [21], "b": [21], "t": [19], "d": [19], "k": [20], "g": [20], "ʔ": [19], "t͡ʃ": [19, 16], "d͡ʒ": [19, 16], - "m": [21], "n": [19], "ɲ": [19], "ŋ": [20], "f": [18], "v": [18], "s": [15], "z": [15], "ʃ": [16], "x": [12], - "r": [13], "h": [12], "j": [6], "w": [7], "l": [14], - - # nb-NO - "ɑ": [2], "æ": [1], "æː": [1], "ɑː": [2], "ɛ": [4], "øː": [1], "eː": [4], "ɪ": [6], "iː": [6], "ɔ": [3], - "œ": [4], "oː": [8], "u": [7], "uː": [7], "ʏ": [7], "ʉ": [6], "ʉː": [6], "yː": [4], "æɪ": [1, 6], - "æʉ": [1, 6], "ɑɪ": [2, 6], "œʏ": [4, 7], "ɔʏ": [3, 7], "ʉɪ": [6, 6], "p": [21], "t": [19], "k": [20], - "b": [21], "d": [19], "g": [20], "f": [18], "h": [12], "s": [15], "ʂ": [15], "ç": [12], "v": [18], "m": [21], - "n": [19], "ŋ": [20], "l": [14], "r": [13], "j": [6], "ɖ": [19], "ɭ": [14], "ɳ": [19], "ʈ": [19], - - # nl-NL/nl-BE - "ɑ": [2], "aː": [2], "ɑ̃": [2], "ɑ͡u": [2, 7], "ɛ": [4], "eː": [4], "ɛː": [4], "ɛ͡i": [4, 6], "ɛ̃": [4], - "øː": [1], "ɪ": [6], "i": [6], "ɔ": [3], "u": [7], "ɔː": [3], "ɔ̃": [3], "oː": [8], "ʏ": [7], "ə": [1], - "œ͡y": [4, 4], "œ": [4], "y": [4], "b": [21], "d": [19], "f": [18], "χ": [12], "ʔ": [19], "ɦ": [12], - "g": [20], "j": [6], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": 
[21], "ʀ": [13], "s": [15], - "ʃ": [16], "t": [19], "w": [7], "v": [18], "ʋ": [18], "z": [15], "ʒ": [16], - - # pl-PL - "a": [2], "ɛ": [4], "ɛ̃": [4], "i": [6], "ɨ": [6], "ɔ": [3], "ɔ̃": [3], "u": [7], "b": [21], "bʲ": [21], - "t͡ɕ": [19, 16], "t͡ʂ": [19, 15], "c": [16], "d": [19], "d̪ʲ": [19], "d͡z": [19, 15], "d͡ʑ": [19, 16], - "f": [18], "fʲ": [18], "ɡ": [20], "ɟ": [16], "d͡ʐ": [19, 15], "k": [20], "l": [14], "l̪ʲ": [14], "m": [21], - "mʲ": [21], "n": [19], "ŋ": [20], "ɲ": [19], "p": [21], "pʲ": [21], "r": [13], "rʲ": [13], "s": [15], - "ɕ": [16], "ʃ": [16], "t": [19], "t̪ʲ": [19], "t͡s": [19, 15], "v": [18], "vʲ": [18], "w": [7], "x": [12], - "xʲ": [12], "j": [6], "z": [15], "ʑ": [16], "ʒ": [16], - - # pt-BR - "i": [6], "ĩ": [6], "a": [2], "ɔ": [3], "u": [7], "ũ": [7], "o": [8], "e": [4], "ɐ̃": [4], "ə": [1], - "ɛ": [4], "ẽ": [4], "õ": [8], "w̃": [7], "w": [7], "p": [21], "b": [21], "t": [19], "d": [19], "g": [20], - "m": [21], "n": [19], "ɲ": [19], "f": [18], "v": [18], "ɾ": [19], "s": [15], "z": [15], "ʃ": [16], "ʒ": [16], - "x": [12], "tʃ": [19, 16], "dʒ": [19, 16], "l": [14], "ʎ": [14], "j̃": [6], "j": [6], "k": [20], - - # pt-PT - "a": [2], "ɐ": [4], "ɐj": [4, 6], "ɐ̃": [4], "ɐ̃j̃": [4, 6], "ɐ̃w̃": [4, 7], "ɐ͡w": [4, 7], "a͡j": [2, 6], - "ɔ": [3], "ɔ͡j": [3, 6], "a͡w": [2, 7], "ə": [1], "e": [4], "ɛ": [4], "ɛ͡w": [4, 7], "ẽ": [4], "e͡w": [4, 7], - "i": [6], "ĩ": [6], "i͡w": [6, 7], "o": [8], "o͡j": [8, 6], "õ": [8], "õj̃": [8, 6], "u": [7], "u͡j": [7, 6], - "ũ": [7], "ũj̃": [7, 6], "b": [21], "d": [19], "ɾ": [19], "f": [18], "g": [20], "j": [6], "k": [20], "l": [14], - "ɫ": [14], "ʎ": [14], "m": [21], "n": [19], "ɲ": [19], "p": [21], "ʀ": [13], "s": [15], "ʃ": [16], "t": [19], - "v": [18], "w": [7], "z": [15], "ʒ": [16], - - # ro-RO - "ə": [1], "ɨ": [6], "a": [2], "e": [4], "e̯a": [4, 2], "e̯o": [4, 8], "i": [6], "o": [8], "o̯a": [8, 2], - "u": [7], "b": [21], "bʲ": [21], "d": [19], "d͡ʒ": [19, 16], "d͡ʒʲ": [19, 16], "f": [18], "fʲ": [18], "g": [20], - 
"gʲ": [20], "h": [12], "j": [6], "k": [20], "kʲ": [20], "l": [14], "lʲ": [14], "m": [21], "mʲ": [21], "n": [19], - "ŋ": [20], "nʲ": [19], "p": [21], "pʲ": [21], "r": [13], "rʲ": [13], "s": [15], "ʃ": [16], "ʃʲ": [16], "t": [19], - "tʲ": [19], "t͡s": [19, 15], "t͡ʃ": [19, 16], "t͡sʲ": [19, 15], "t͡ʃʲ": [19, 16], "v": [18], "vʲ": [18], "w": [7], - "z": [15], "ʒ": [16], "zʲ": [15], "ʒʲ": [16], - - # ru-RU - "a": [2], "ʌ": [1], "ə": [1], "ɛ": [4], "i": [6], "ɪ": [6], "ɨ": [6], "ɔ": [3], "u": [7], "p": [21], "pʲ": [21], - "b": [21], "bʲ": [21], "t": [19], "tʲ": [19], "d": [19], "dʲ": [19], "k": [20], "kʲ": [20], "g": [20], - "gʲ": [20], "x": [12], "xʲ": [12], "f": [18], "fʲ": [18], "v": [18], "vʲ": [18], "s": [15], "sʲ": [15], - "z": [15], "zʲ": [15], "ʂ": [15], "ʐ": [15], "t͡s": [19, 15], "t͡ɕ": [19, 16], "ɕː": [16], "m": [21], - "mʲ": [21], "n": [19], "nʲ": [19], "l": [14], "lʲ": [14], "r": [13], "rʲ": [13], "j": [6], - - # sk-SK - "i": [6], "e": [4], "a": [2], "o": [8], "u": [7], "ʉ": [6], "iː": [6], "eː": [4], "aː": [2], "oː": [8], - "uː": [7], "i͡a": [6, 2], "i͡e": [6, 4], "i͡u": [6, 7], "u͡o": [7, 8], "au": [2, 7], "ou": [8, 7], "ə": [1], - "p": [21], "b": [21], "t": [19], "d": [19], "c": [16], "ɟ": [16], "k": [20], "g": [20], "t͡s": [19, 15], - "d͡z": [19, 15], "t͡ʃ": [19, 16], "d͡ʒ": [19, 16], "f": [18], "v": [18], "s": [15], "z": [15], "ʃ": [16], - "ʒ": [16], "x": [12], "ɦ": [12], "r": [13], "r̩": [13], "r̩ː": [13], "l": [14], "l̩": [14], "l̩ː": [14], - "ʎ": [14], "m": [21], "ɱ": [21], "n": [19], "ɴ": [19], "ŋ": [20], "ɲ": [19], "u̯": [7], "i̯": [6], "j": [6], - "w": [7], - - # sl-SI - "ə": [1], "a": [2], "aː": [2], "ɛ": [4], "eː": [4], "ɛː": [4], "i": [6], "iː": [6], "ɔ": [3], "ɔː": [3], - "oː": [8], "u": [7], "uː": [7], "b": [21], "d": [19], "dˡ": [19], "dn": [19, 19], "d͡ʒ": [19, 16], - "d͡z": [19, 15], "f": [18], "ɱ": [21], "ɣ": [20], "g": [20], "ɪ": [6], "j": [6], "k": [20], "l": [14], "lʲ": [14], - "m": [21], "ŋ": [20], "n": [19], "nʲ": [19], "p": [21], 
"r": [13], "s": [15], "ʃ": [16], "t": [19], "tˡ": [19], - "tn": [19, 19], "t͡ʃ": [19, 16], "t͡s": [19, 15], "u̯": [7], "v": [18], "w": [7], "ʍ": [7], "x": [12], "ʒ": [16], - "z": [15], - - # sv-SE - "a": [2], "æ": [1], "æː": [1], "ɑː": [2], "ɔ": [3], "a‿u": [2, 7], "ə": [1], "e": [4], "ɛ": [4], "ɛː": [4], - "eː": [4], "ɶ": [8], "œː": [4], "œ": [4], "øː": [1], "ɪ": [6], "iː": [6], "ʊ": [4], "uː": [7], "oː": [8], - "ɵ": [1], "ʉː": [6], "y": [4], "yː": [4], "p": [21], "t": [19], "k": [20], "b": [21], "d": [19], "g": [20], - "f": [18], "h": [12], "s": [15], "ɧ": [16], "ɕ": [16], "v": [18], "m": [21], "n": [19], "ŋ": [20], "l": [14], - "r": [13], "j": [6], "ɖ": [19], "ɭ": [14], "ɳ": [19], "ʂ": [15], "ʈ": [19], - - # th-TH - "a": [2], "aː": [2], "e": [4], "eː": [4], "i": [6], "iː": [6], "ia": [6, 2], "o": [8], "oː": [8], "ə": [1], - "əː": [1], "u": [7], "uː": [7], "ua": [7, 2], "ɯ": [6], "ɯː": [6], "ɯa": [6, 2], "ɛ": [4], "ɛː": [4], - "ɔ": [3], "ɔː": [3], "b": [21], "t͡ɕ": [19, 16], "tɕʰ": [19, 16], "d": [19], "f": [18], "h": [12], "j": [6], - "k": [20], "kʰ": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "pʰ": [21], "r": [13], - "s": [15], "t": [19], "tʰ": [19], "w": [7], "ʔ": [19], - - # tr-TR - "a": [2], "ɑː": [2], "e": [4], "eː": [4], "œ": [4], "œ͡ɟ": [4, 16], "i": [6], "i͡ɟ": [6, 16], "o": [8], - "o͡ɟ": [8, 16], "u": [7], "u͡ɟ": [7, 16], "ɯ": [6], "ɯ͡ɟ": [6, 16], "y": [4], "y͡ɟ": [4, 16], "b": [21], - "c": [16], "t͡ʃ": [19, 16], "d": [19], "f": [18], "ɡ": [20], "ɣ": [20], "ɟ": [16], "h": [12], "j": [6], - "d͡ʒ": [19, 16], "k": [20], "l": [14], "ɮ": [6], "m": [21], "n": [19], "ŋ": [20], "p": [21], "ɾ": [19], - "s": [15], "ʃ": [16], "t": [19], "v": [18], "w": [7], "z": [15], "ʒ": [16], - - # vi-VN - "a": [2], "ɛ": [4], "i": [6], "ɔ": [3], "u": [7], "u͡a": [7, 2], "a͡j": [2, 6], "ɛ̆j": [4, 6], "ə͡j": [1, 6], - "o": [8], "i͡e͡w": [6, 4, 7], "ɨ͡ə": [6, 1], "ɔ͡i": [3, 6], "ə": [1], "ie": [6, 4], "u͡j": [7, 6], "a͡w": [2, 7], - "ɨ": [6], "ɐ": [4], "ăw": [2, 
7], "ăj": [2, 6], "ɨ͡ə͡j": [6, 1, 6], "o͡j": [8, 6], "əː": [1], "e": [4], - "ɔ̆w": [3, 7], "ɛ͡w": [4, 7], "i͡w": [6, 7], "ɨ͡w": [6, 7], "e͡j": [4, 6], "ɨ͡ʌ͡w": [6, 1, 7], "ɨ͡j": [6, 6], - "ɪ": [6], "iə": [6, 1], "a͡ʲ": [2], "ɓ": [21], "k": [20], "z": [15], "j": [6], "ɹ": [13], "f": [18], "ɣ": [20], - "h": [12], "l": [14], "m": [21], "n": [19], "p": [21], "s": [15], "ʂ": [15], "t": [19], "v": [18], "ɗ": [19], - "ŋ": [20], "x": [12], "ɲ": [19], "tʰ": [19], "ʈ": [19], "t͡ʃ": [19, 16], "w": [7] - } - return phoneme_viseme_map.get(phoneme, []) - viseme_list = [] chunks = data.get('chunks', []) - for i, chunk in enumerate(chunks): + for _, chunk in enumerate(chunks): phoneme = chunk.get('text', None) timestamp = chunk.get('timestamp', None) - visemes = _phoneme_to_viseme(phoneme) + visemes = self.phoneme_viseme_map.get(phoneme, []) for viseme in visemes: viseme_list.append({ @@ -331,11 +81,21 @@ def _phoneme_to_viseme(phoneme: str) -> List[int]: def process(self, audio_file: str) -> List[Dict[str, Any]]: - """Process an audio file and convert speech to visemes.""" + """Process an audio file and convert speech to visemes. + + Heuristically, we found that the model requires at least 0.5 seconds of audio to run phoneme recognition. + This value may be also depended on the model, the language, and other factors. + + Args: + audio_file (str): The path to the audio file. + + Returns: + List[Dict[str, Any]]: A list of dictionaries where each dictionary contains the viseme + ID and the corresponding timestamp. 
+ """ # Perform ASR to get phoneme data asr_result = self.asr_pipeline(audio_file, return_timestamps='char') # Map phonemes to visemes viseme_data = self._map_phonemes_to_visemes(asr_result) - return viseme_data \ No newline at end of file From 20ec10bba6614125a19f994987c2ddf8409f6de0 Mon Sep 17 00:00:00 2001 From: fabiocat93 Date: Sun, 6 Oct 2024 10:45:48 -0400 Subject: [PATCH 3/7] adding pre-commit hooks for codespell and ruff style check --- .pre-commit-config.yaml | 12 ++++++++++++ LLM/chat.py | 2 +- LLM/language_model.py | 4 ++-- LLM/mlx_language_model.py | 4 ++-- LLM/openai_api_language_model.py | 4 ++-- README.md | 4 ++-- STT/lightning_whisper_mlx_handler.py | 3 +-- STT/paraformer_handler.py | 2 +- STT/whisper_stt_handler.py | 2 +- TTS/STV/speech_to_visemes.py | 2 +- arguments_classes/parler_tts_arguments.py | 2 +- 11 files changed, 26 insertions(+), 15 deletions(-) create mode 100644 .pre-commit-config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..185a1e0 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,12 @@ +repos: + - repo: https://github.com/codespell-project/codespell + rev: v2.2.5 # Specify the latest stable version + hooks: + - id: codespell + args: ["-w"] # The -w flag tells codespell to automatically apply fixes + + - repo: https://github.com/charliermarsh/ruff-pre-commit + rev: v0.1.1 # Replace with the latest stable version of ruff-pre-commit + hooks: + - id: ruff + args: ["--fix"] # This will automatically fix linting issues diff --git a/LLM/chat.py b/LLM/chat.py index bc8ac4f..6f5569d 100644 --- a/LLM/chat.py +++ b/LLM/chat.py @@ -6,7 +6,7 @@ class Chat: def __init__(self, size): self.size = size self.init_chat_message = None - # maxlen is necessary pair, since a each new step we add an prompt and assitant answer + # maxlen is necessary pair, since a each new step we add an prompt and assistant answer self.buffer = [] def append(self, item): diff --git a/LLM/language_model.py 
b/LLM/language_model.py index ddeb34b..202e007 100644 --- a/LLM/language_model.py +++ b/LLM/language_model.py @@ -68,7 +68,7 @@ def setup( if init_chat_role: if not init_chat_prompt: raise ValueError( - "An initial promt needs to be specified when setting init_chat_role." + "An initial prompt needs to be specified when setting init_chat_role." ) self.chat.init_chat({"role": init_chat_role, "content": init_chat_prompt}) self.user_role = user_role @@ -111,7 +111,7 @@ def warmup(self): ) def process(self, prompt): - logger.debug("infering language model...") + logger.debug("inferring language model...") language_code = None if isinstance(prompt, tuple): prompt, language_code = prompt diff --git a/LLM/mlx_language_model.py b/LLM/mlx_language_model.py index 87812c5..8269b3b 100644 --- a/LLM/mlx_language_model.py +++ b/LLM/mlx_language_model.py @@ -42,7 +42,7 @@ def setup( if init_chat_role: if not init_chat_prompt: raise ValueError( - "An initial promt needs to be specified when setting init_chat_role." + "An initial prompt needs to be specified when setting init_chat_role." ) self.chat.init_chat({"role": init_chat_role, "content": init_chat_prompt}) self.user_role = user_role @@ -68,7 +68,7 @@ def warmup(self): ) def process(self, prompt): - logger.debug("infering language model...") + logger.debug("inferring language model...") language_code = None if isinstance(prompt, tuple): diff --git a/LLM/openai_api_language_model.py b/LLM/openai_api_language_model.py index dcbabe0..2866867 100644 --- a/LLM/openai_api_language_model.py +++ b/LLM/openai_api_language_model.py @@ -44,7 +44,7 @@ def setup( if init_chat_role: if not init_chat_prompt: raise ValueError( - "An initial promt needs to be specified when setting init_chat_role." + "An initial prompt needs to be specified when setting init_chat_role." 
) self.chat.init_chat({"role": init_chat_role, "content": init_chat_prompt}) self.user_role = user_role @@ -54,7 +54,7 @@ def setup( def warmup(self): logger.info(f"Warming up {self.__class__.__name__}") start = time.time() - response = self.client.chat.completions.create( + _ = self.client.chat.completions.create( model=self.model_name, messages=[ {"role": "system", "content": "You are a helpful assistant"}, diff --git a/README.md b/README.md index 02c1676..9f0765c 100644 --- a/README.md +++ b/README.md @@ -80,7 +80,7 @@ The pipeline can be run in two ways: - **Server/Client approach**: Models run on a server, and audio input/output are streamed from a client. - **Local approach**: Runs locally. -### Recommanded setup +### Recommended setup ### Server/Client Approach @@ -120,7 +120,7 @@ https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install ### Recommended usage with Cuda -Leverage Torch Compile for Whisper and Parler-TTS. **The usage of Parler-TTS allows for audio output streaming, futher reducing the overeall latency** 🚀: +Leverage Torch Compile for Whisper and Parler-TTS. 
**The usage of Parler-TTS allows for audio output streaming, further reducing the overeall latency** 🚀: ```bash python s2s_pipeline.py \ diff --git a/STT/lightning_whisper_mlx_handler.py b/STT/lightning_whisper_mlx_handler.py index 53b6b5a..2f2d657 100644 --- a/STT/lightning_whisper_mlx_handler.py +++ b/STT/lightning_whisper_mlx_handler.py @@ -4,7 +4,6 @@ from lightning_whisper_mlx import LightningWhisperMLX import numpy as np from rich.console import Console -from copy import copy import torch logger = logging.getLogger(__name__) @@ -55,7 +54,7 @@ def warmup(self): _ = self.model.transcribe(dummy_input)["text"].strip() def process(self, spoken_prompt): - logger.debug("infering whisper...") + logger.debug("inferring whisper...") global pipeline_start pipeline_start = perf_counter() diff --git a/STT/paraformer_handler.py b/STT/paraformer_handler.py index 99fd6ac..dcadc02 100644 --- a/STT/paraformer_handler.py +++ b/STT/paraformer_handler.py @@ -45,7 +45,7 @@ def warmup(self): _ = self.model.generate(dummy_input)[0]["text"].strip().replace(" ", "") def process(self, spoken_prompt): - logger.debug("infering paraformer...") + logger.debug("inferring paraformer...") global pipeline_start pipeline_start = perf_counter() diff --git a/STT/whisper_stt_handler.py b/STT/whisper_stt_handler.py index 0930087..88c578f 100644 --- a/STT/whisper_stt_handler.py +++ b/STT/whisper_stt_handler.py @@ -109,7 +109,7 @@ def warmup(self): ) def process(self, spoken_prompt): - logger.debug("infering whisper...") + logger.debug("inferring whisper...") global pipeline_start pipeline_start = perf_counter() diff --git a/TTS/STV/speech_to_visemes.py b/TTS/STV/speech_to_visemes.py index 861a7d9..16ad95c 100644 --- a/TTS/STV/speech_to_visemes.py +++ b/TTS/STV/speech_to_visemes.py @@ -1,10 +1,10 @@ """This module contains the SpeechToVisemes class, which handles the conversion of speech to visemes.""" +from typing import List, Dict, Any from transformers import pipeline import logging import json 
logger = logging.getLogger(__name__) -from typing import List, Dict, Any class SpeechToVisemes(): """ diff --git a/arguments_classes/parler_tts_arguments.py b/arguments_classes/parler_tts_arguments.py index 1bb0f21..b519751 100644 --- a/arguments_classes/parler_tts_arguments.py +++ b/arguments_classes/parler_tts_arguments.py @@ -57,6 +57,6 @@ class ParlerTTSHandlerArguments: max_prompt_pad_length: int = field( default=8, metadata={ - "help": "When using compilation, the prompt as to be padded to closest power of 2. This parameters sets the maximun power of 2 possible." + "help": "When using compilation, the prompt as to be padded to closest power of 2. This parameters sets the maximum power of 2 possible." }, ) From 522f716383a2fe8c6339c24992694ceddc8a6c67 Mon Sep 17 00:00:00 2001 From: fabiocat93 Date: Sun, 6 Oct 2024 10:52:23 -0400 Subject: [PATCH 4/7] removing TTS/STV/phoneme_viseme_map_readable.json.txt --- TTS/STV/phoneme_viseme_map_readable.json.txt | 241 ------------------- 1 file changed, 241 deletions(-) delete mode 100644 TTS/STV/phoneme_viseme_map_readable.json.txt diff --git a/TTS/STV/phoneme_viseme_map_readable.json.txt b/TTS/STV/phoneme_viseme_map_readable.json.txt deleted file mode 100644 index 911f8c7..0000000 --- a/TTS/STV/phoneme_viseme_map_readable.json.txt +++ /dev/null @@ -1,241 +0,0 @@ -{ - # basic - 'æ': [1], 'ə': [1], 'ʌ': [1], 'ɑ': [2], 'ɔ': [3], 'ɛ': [4], 'ʊ': [4], 'ɝ': [5], 'j': [6], 'i': [6], 'ɪ': [6], - 'w': [7], 'u': [7], 'o': [8], 'aʊ': [9], 'ɔɪ': [10], 'aɪ': [11], 'h': [12], 'ɹ': [13], 'l': [14], 's': [15], - 'z': [15], 'ʃ': [16], 'tʃ': [16], 'dʒ': [16], 'ʒ': [16], 'ð': [17], 'f': [18], 'v': [18], 'd': [19], 't': [19], - 'n': [19], 'θ': [19], 'k': [20], 'g': [20], 'ŋ': [20], 'p': [21], 'b': [21], 'm': [21], ' ': [0], - - # ar-EG - "a": [2], "aː": [2], "i": [6], "iː": [6], "u": [7], "uː": [7], "b": [21], "d": [19], "g": [20], "k": [20], - "t": [19], "dˤ": [19], "q": [20], "tˤ": [19], "ʔ": [19], "f": [18], "h": [12], "ħ": [12], "s": 
[15], "θ": [19], - "z": [15], "ðˤ": [17], "ð": [17], "ɣ": [20], "x": [12], "ʃ": [16], "sˤ": [15], "j": [6], "w": [7], "l": [14], - "m": [21], "n": [19], "r": [13], "ʕ": [12], - - # bg-BG - "i": [6], "ɛ": [4], "ɔ": [3], "a": [2], "u": [7], "j͡a": [6, 2], "ɤ": [1], "j͡u": [6, 7], "n": [19], "ʒ": [16], - "k": [20], "t͡s": [19, 15], "t": [19], "p": [21], "r": [13], "s": [15], "d": [19], "x": [12], "zʲ": [15], - "lʲ": [14], "l": [14], "nʲ": [19], "v": [18], "m": [21], "b": [21], "g": [20], "d͡ʒ": [19, 16], "f": [18], - "mʲ": [21], "tʲ": [19], "rʲ": [13], "pʲ": [21], "dʲ": [19], "j": [6], "vʲ": [18], "sʲ": [15], "bʲ": [21], - "kʲ": [20], "gʲ": [20], "fʲ": [18], "z": [15], "ʃ": [16], "t͡ʃ": [19, 16], "d͡z": [19, 15], - - # ca-ES - "a": [2], "ɔ": [3], "ə": [1], "e": [4], "ɛ": [4], "i": [6], "o": [8], "u": [7], "b": [21], "β": [21], "t͡ʃ": [19, 16], - "d": [19], "ð": [17], "f": [18], "g": [20], "ɣ": [20], "j": [6], "d͡ʒ": [19, 16], "k": [20], "l": [14], "ʎ": [14], - "m": [21], "n": [19], "ŋ": [20], "ɲ": [19], "p": [21], "ɾ": [19], "r": [13], "s": [15], "ʃ": [16], "t": [19], - "θ": [19], "w": [7], "x": [12], "z": [15], "ʒ": [16], - - # cs-CZ - "ɪ": [6], "ɛ": [4], "a": [2], "o": [8], "u": [7], "iː": [6], "ɛː": [4], "aː": [2], "oː": [8], "uː": [7], - "o͡ʊ̯": [8, 4], "a͡ʊ": [2, 4], "ɛ͡ʊ̯": [4, 4], "ə": [1], "p": [21], "b": [21], "t": [19], "d": [19], "c": [16], - "ɟ": [16], "k": [20], "g": [20], "t͡s": [19, 15], "d͡z": [19, 15], "t͡ʃ": [19, 16], "d͡ʒ": [19, 16], "f": [18], - "v": [18], "s": [15], "z": [15], "r̝": [13], "ʃ": [16], "ʒ": [16], "j": [6], "x": [12], "ɦ": [12], "r": [13], - "l": [14], "m": [21], "n": [19], "ŋ": [20], "ɲ": [19], "ɱ": [21], "r̝̊": [13], - - # da-DK - "a": [2], "ɑ": [2], "ɑː": [2], "ɛ": [4], "ɛː": [4], "ɔ": [3], "ɒ": [2], "ɒː": [2], "ɔː": [3], "ɐ": [4], - "æː": [1], "e": [4], "ø": [1], "øː": [1], "ə": [1], "eː": [4], "i": [6], "iː": [6], "o": [8], "œ": [4], - "œː": [4], "oː": [8], "u": [7], "uː": [7], "y": [4], "yː": [4], "b": [21], "d": [19], "ð": 
[17], "f": [18], - "g": [20], "h": [12], "j": [6], "kʰ": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "pʰ": [21], "ʔ": [19], - "ʁ": [13], "ɐ̯": [4], "s": [15], "ɕ": [16], "t": [19], "v": [18], "w": [7], - - # de-DE/de-CH/de-AT - "aː": [2], "a": [2], "ɔ": [3], "ɛː": [4], "ɛ": [4], "ə": [1], "iː": [6], "ɪ": [6], "øː": [1], "o": [8], - "oː": [8], "œ": [4], "e": [4], "eː": [4], "uː": [7], "ʊ": [4], "yː": [4], "ʏ": [7], "ai": [2, 6], "au": [2, 7], - "ɔy": [3, 4], "ɔʏ̯": [3, 4], "ɐ": [4], "b": [21], "d": [19], "ʤ": [16], "f": [18], "g": [20], "h": [12], - "j": [6], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "pf": [21, 18], "ʀ": [13], "r": [13], - "ʁ": [13], "s": [15], "ʃ": [16], "t": [19], "ts": [19, 15], "tʃ": [19, 16], "v": [18], "x": [12], "z": [15], - "ʒ": [16], "ʔ": [19], - - # el-GR - "a": [2], "e": [4], "i": [6], "o": [8], "u": [7], "b": [21], "c": [16], "ç": [12], "d": [19], "ð": [17], - "d͡z": [19, 15], "f": [18], "g": [20], "ɣ": [20], "ɟ": [16], "j": [6], "ʝ": [12], "k": [20], "l": [14], - "m": [21], "n": [19], "p": [21], "ɾ": [19], "s": [15], "t": [19], "θ": [19], "t͡s": [19, 15], "v": [18], - "x": [12], "z": [15], - - # en-GB/en-IE/en-AU - "ɑː": [2], "æ": [1], "ʌ": [1], "ɛə": [4, 1], "aʊ": [2, 4], "ə": [1], "aɪ": [2, 6], "ɛ": [4], "ɜː": [5], - "eɪ": [4, 6], "ɪ": [6], "ɪə": [6, 1], "iː": [6], "ɒ": [2], "ɔː": [3], "əʊ": [1, 4], "ɔɪ": [3, 6], "ʊ": [4], - "ʊə": [4, 1], "uː": [7], "b": [21], "tʃ": [19, 16], "d": [19], "ð": [17], "f": [18], "g": [20], "h": [12], - "j": [6], "dʒ": [19, 16], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "ɹ": [13], - "s": [15], "ʃ": [16], "t": [19], "θ": [19], "v": [18], "w": [7], "z": [15], "ʒ": [16], - - # en-US/en-CA - "iy": [6], "ɪ": [6], "eɪ": [4, 6], "ɛ": [4], "æ": [1], "ɑ": [2], "ɔ": [3], "ʊ": [4], "oʊ": [8, 4], "u": [7], - "ʌ": [1], "aɪ": [11], "aʊ": [9], "ɔɪ": [10], "ju": [6, 7], "ə": [1], "ɪɹ": [6, 13], "ɛɹ": [4, 13], "ʊɹ": [4, 13], - "aɪɹ": [11, 13], "aʊɹ": [9, 13], "ɔɹ": [3, 
13], "ɑɹ": [2, 13], "ɝ": [5], "ɚ": [1], "w": [7], "j": [6], - "p": [21], "b": [21], "t": [19], "d": [19], "k": [20], "g": [20], "m": [21], "n": [19], "ŋ": [20], "f": [18], - "v": [18], "θ": [19], "ð": [17], "s": [15], "z": [15], "ʃ": [16], "ʒ": [16], "h": [12], "tʃ": [19, 16], - "dʒ": [19, 16], "l": [14], "ɹ": [13], - - # es-ES - "a": [2], "i": [6], "e": [4], "o": [8], "u": [7], "b": [21], "β": [21], "t͡ʃ": [19, 16], "d": [19], "ð": [17], - "f": [18], "g": [20], "ɣ": [20], "j": [6], "j͡j": [6, 6], "k": [20], "l": [14], "ʎ": [14], "m": [21], "n": [19], - "ɲ": [19], "p": [21], "ɾ": [19], "r": [13], "s": [15], "t": [19], "θ": [19], "w": [7], "x": [12], "z": [15], "ʒ": [16], - - # es-MX - "ɑ": [2], "e": [4], "i": [6], "o": [8], "u": [7], "b": [21], "β": [21], "t͡ʃ": [19, 16], "d": [19], "ð": [17], - "f": [18], "g": [20], "ɣ": [20], "j": [6], "j͡j": [6, 6], "k": [20], "l": [14], "ʎ": [14], "m": [21], "n": [19], - "ɲ": [19], "p": [21], "ɾ": [19], "r": [13], "s": [15], "t": [19], "θ": [19], "w": [7], "x": [12], - - # fi-FI - "ɑ": [2], "ɑ͡i": [2, 6], "ɑ͡u": [2, 7], "ɑː": [2], "æ": [1], "æ͡i": [1, 6], "æ͡y": [1, 4], "æː": [1], "e": [4], - "e͡i": [4, 6], "ø": [1], "ø͡i": [1, 6], "ø͡y": [1, 4], "øː": [1], "e͡u": [4, 7], "e͡y": [4, 4], "eː": [4], "i": [6], - "i͡e": [6, 4], "i͡u": [6, 7], "i͡y": [6, 4], "iː": [6], "o": [8], "o͡i": [8, 6], "o͡u": [8, 7], "oː": [8], "u": [7], - "u͡i": [7, 6], "u͡o": [7, 8], "uː": [7], "y": [4], "y͡ø": [4, 1], "y͡i": [4, 6], "yː": [4], "b": [21], "d": [19], - "f": [18], "g": [20], "h": [12], "j": [6], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "r": [13], - "s": [15], "ʃ": [16], "t": [19], "ʋ": [18], - - # fr-FR/fr-CA/fr-CH - "a": [2], "ɑ": [2], "ɑ̃": [2], "ə": [1], "ɛ": [4], "ø": [1], "e": [4], "ɛ̃": [4], "i": [6], "œ": [4], "ɔ": [3], - "ɔ̃": [3], "o": [8], "œ̃": [4], "u": [7], "y": [4], "b": [21], "d": [19], "f": [18], "g": [20], "ɲ": [19], - "ɥ": [7], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "ʁ": 
[13], "s": [15], "ʃ": [16], - "t": [19], "v": [18], "w": [7], "j": [6], "z": [15], "n‿": [19], "t‿": [19], "z‿": [15], - - # he-IL - "i": [6], "e": [4], "a": [2], "o": [8], "u": [7], "p": [21], "b": [21], "t": [19], "d": [19], "k": [20], "g": [20], - "ʔ": [19], "f": [18], "v": [18], "s": [15], "z": [15], "ʃ": [16], "x": [12], "h": [12], "t͡s": [19, 15], "m": [21], - "n": [19], "l": [14], "ʁ": [13], "j": [6], "ʒ": [16], "tʃ": [19, 16], "dʒ": [19, 16], - - # hr-HR - "e": [4], "eː": [4], "i": [6], "iː": [6], "u": [7], "uː": [7], "a": [2], "aː": [2], "o": [8], "oː": [8], - "d": [19], "v": [18], "s": [15], "t": [19], "n": [19], "l": [14], "ʎ": [14], "t͡s": [19, 15], "t͡ʃ": [19, 16], - "j": [6], "x": [12], "z": [15], "ʒ": [16], "r": [13], "k": [20], "m": [21], "p": [21], "g": [20], "ʨ": [16], - "f": [18], "b": [21], "d͡ʒ": [19, 16], "ɲ": [19], "ʥ": [16], "ʃ": [16], - - # hu-HU - "ø": [1], "øː": [1], "a": [2], "aː": [2], "ɛ": [4], "eː": [4], "i": [6], "iː": [6], "o": [8], "ɒ": [2], - "oː": [8], "u": [7], "uː": [7], "y": [4], "yː": [4], "b": [21], "bː": [21], "d": [19], "ɟ": [16], "dː": [19], - "ɟː": [16], "d͡ʒ": [19, 16], "d͡ʒː": [19, 16], "dz": [19, 15], "dzː": [19, 15], "f": [18], "fː": [18], - "g": [20], "gː": [20], "h": [12], "hː": [12], "j": [6], "ɲ": [19], "jː": [6], "ɲː": [19], "k": [20], - "kː": [20], "l": [14], "lː": [14], "m": [21], "mː": [21], "n": [19], "ŋ": [20], "nː": [19], "p": [21], - "pː": [21], "r": [13], "rː": [13], "s": [15], "ʃ": [16], "sː": [15], "ʃː": [16], "t": [19], "c": [16], - "tː": [19], "cː": [16], "t͡s": [19, 15], "t͡ʃ": [19, 16], "t͡sː": [19, 15], "t͡ʃː": [19, 16], "v": [18], - "vː": [18], "x": [12], "ɰ": [20], "z": [15], "ʒ": [16], "zː": [15], "ʒː": [16], - - # id-ID - "ə": [1], "a": [2], "a͡i": [2, 6], "a͡ʊ": [2, 4], "e": [4], "ɛ": [4], "ɪ": [6], "i": [6], "ɔ": [3], "o": [8], - "ɔ͡i": [3, 6], "u": [7], "ʊ": [4], "ʔ": [19], "b": [21], "d": [19], "d͡ʒ": [19, 16], "f": [18], "g": [20], - "h": [12], "ɲ": [19], "j": [6], "k": [20], "l": [14], 
"m": [21], "n": [19], "ŋ": [20], "p": [21], "r": [13], - "s": [15], "ʃ": [16], "t": [19], "t͡ʃ": [19, 16], "w": [7], "x": [12], "z": [15], - - # it-IT - "a": [2], "ai": [2, 6], "au": [2, 7], "e": [4], "ɛ": [4], "ɛj": [4, 6], "ɛu": [4, 7], "ei": [4, 6], "eu": [4, 7], - "i": [6], "u": [7], "o": [8], "ɔ": [3], "ɔj": [3, 6], "oi": [8, 6], "ou": [8, 7], "b": [21], "bː": [21], - "ʧ": [16], "tʃː": [19, 16], "kː": [20], "d": [19], "dː": [19], "ʣ": [15], "ʣː": [15], "f": [18], "fː": [18], - "ʤ": [16], "ʤː": [16], "g": [20], "gː": [20], "ʎ": [14], "ʎː": [14], "ɲː": [19], "ɲ": [19], "j": [6], "k": [20], - "l": [14], "lː": [14], "m": [21], "mː": [21], "n": [19], "nː": [19], "p": [21], "pː": [21], "ɾ": [19], - "rː": [13], "s": [15], "sː": [15], "ʃ": [16], "ʃː": [16], "t": [19], "tː": [19], "ʦ": [15], "ʦː": [15], - "v": [18], "vː": [18], "w": [7], "z": [15], - - # ko-KR - "a": [2], "ɛ": [4], "e": [4], "ɯ": [6], "i": [6], "ʌ": [1], "o": [8], "u": [7], "ɰ͡i": [20, 6], "ø": [1], - "w͡a": [7, 2], "w͡ɛ": [7, 4], "w͡e": [7, 4], "w͡i": [7, 6], "w͡ʌ": [7, 1], "j͡a": [6, 2], "j͡ɛ": [6, 4], - "j͡e": [6, 4], "j͡ʌ": [6, 1], "j͡o": [6, 8], "j͡u": [6, 7], "b̥": [21], "p": [21], "b": [21], "t͡ɕʰ": [19, 16], - "d̥": [19], "t": [19], "d": [19], "g̥": [20], "k": [20], "g": [20], "h": [12], "ɦ": [12], "d͡ʑ": [19, 16], - "d͡ʑ̥": [19, 16], "t͡ɕ": [19, 16], "kʰ": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "pʰ": [21], - "ɾ": [19], "sʰ": [15], "s": [15], "tʰ": [19], - - # ms-MY - "i": [6], "u": [7], "ə": [1], "e": [4], "o": [8], "a": [2], "a͡i": [2, 6], "au": [2, 7], "oi": [8, 6], - "p": [21], "b": [21], "t": [19], "d": [19], "k": [20], "g": [20], "ʔ": [19], "t͡ʃ": [19, 16], "d͡ʒ": [19, 16], - "m": [21], "n": [19], "ɲ": [19], "ŋ": [20], "f": [18], "v": [18], "s": [15], "z": [15], "ʃ": [16], "x": [12], - "r": [13], "h": [12], "j": [6], "w": [7], "l": [14], - - # nb-NO - "ɑ": [2], "æ": [1], "æː": [1], "ɑː": [2], "ɛ": [4], "øː": [1], "eː": [4], "ɪ": [6], "iː": [6], "ɔ": [3], - "œ": [4], "oː": [8], 
"u": [7], "uː": [7], "ʏ": [7], "ʉ": [6], "ʉː": [6], "yː": [4], "æɪ": [1, 6], - "æʉ": [1, 6], "ɑɪ": [2, 6], "œʏ": [4, 7], "ɔʏ": [3, 7], "ʉɪ": [6, 6], "p": [21], "t": [19], "k": [20], - "b": [21], "d": [19], "g": [20], "f": [18], "h": [12], "s": [15], "ʂ": [15], "ç": [12], "v": [18], "m": [21], - "n": [19], "ŋ": [20], "l": [14], "r": [13], "j": [6], "ɖ": [19], "ɭ": [14], "ɳ": [19], "ʈ": [19], - - # nl-NL/nl-BE - "ɑ": [2], "aː": [2], "ɑ̃": [2], "ɑ͡u": [2, 7], "ɛ": [4], "eː": [4], "ɛː": [4], "ɛ͡i": [4, 6], "ɛ̃": [4], - "øː": [1], "ɪ": [6], "i": [6], "ɔ": [3], "u": [7], "ɔː": [3], "ɔ̃": [3], "oː": [8], "ʏ": [7], "ə": [1], - "œ͡y": [4, 4], "œ": [4], "y": [4], "b": [21], "d": [19], "f": [18], "χ": [12], "ʔ": [19], "ɦ": [12], - "g": [20], "j": [6], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "ʀ": [13], "s": [15], - "ʃ": [16], "t": [19], "w": [7], "v": [18], "ʋ": [18], "z": [15], "ʒ": [16], - - # pl-PL - "a": [2], "ɛ": [4], "ɛ̃": [4], "i": [6], "ɨ": [6], "ɔ": [3], "ɔ̃": [3], "u": [7], "b": [21], "bʲ": [21], - "t͡ɕ": [19, 16], "t͡ʂ": [19, 15], "c": [16], "d": [19], "d̪ʲ": [19], "d͡z": [19, 15], "d͡ʑ": [19, 16], - "f": [18], "fʲ": [18], "ɡ": [20], "ɟ": [16], "d͡ʐ": [19, 15], "k": [20], "l": [14], "l̪ʲ": [14], "m": [21], - "mʲ": [21], "n": [19], "ŋ": [20], "ɲ": [19], "p": [21], "pʲ": [21], "r": [13], "rʲ": [13], "s": [15], - "ɕ": [16], "ʃ": [16], "t": [19], "t̪ʲ": [19], "t͡s": [19, 15], "v": [18], "vʲ": [18], "w": [7], "x": [12], - "xʲ": [12], "j": [6], "z": [15], "ʑ": [16], "ʒ": [16], - - # pt-BR - "i": [6], "ĩ": [6], "a": [2], "ɔ": [3], "u": [7], "ũ": [7], "o": [8], "e": [4], "ɐ̃": [4], "ə": [1], - "ɛ": [4], "ẽ": [4], "õ": [8], "w̃": [7], "w": [7], "p": [21], "b": [21], "t": [19], "d": [19], "g": [20], - "m": [21], "n": [19], "ɲ": [19], "f": [18], "v": [18], "ɾ": [19], "s": [15], "z": [15], "ʃ": [16], "ʒ": [16], - "x": [12], "tʃ": [19, 16], "dʒ": [19, 16], "l": [14], "ʎ": [14], "j̃": [6], "j": [6], "k": [20], - - # pt-PT - "a": [2], "ɐ": [4], "ɐj": [4, 
6], "ɐ̃": [4], "ɐ̃j̃": [4, 6], "ɐ̃w̃": [4, 7], "ɐ͡w": [4, 7], "a͡j": [2, 6], - "ɔ": [3], "ɔ͡j": [3, 6], "a͡w": [2, 7], "ə": [1], "e": [4], "ɛ": [4], "ɛ͡w": [4, 7], "ẽ": [4], "e͡w": [4, 7], - "i": [6], "ĩ": [6], "i͡w": [6, 7], "o": [8], "o͡j": [8, 6], "õ": [8], "õj̃": [8, 6], "u": [7], "u͡j": [7, 6], - "ũ": [7], "ũj̃": [7, 6], "b": [21], "d": [19], "ɾ": [19], "f": [18], "g": [20], "j": [6], "k": [20], "l": [14], - "ɫ": [14], "ʎ": [14], "m": [21], "n": [19], "ɲ": [19], "p": [21], "ʀ": [13], "s": [15], "ʃ": [16], "t": [19], - "v": [18], "w": [7], "z": [15], "ʒ": [16], - - # ro-RO - "ə": [1], "ɨ": [6], "a": [2], "e": [4], "e̯a": [4, 2], "e̯o": [4, 8], "i": [6], "o": [8], "o̯a": [8, 2], - "u": [7], "b": [21], "bʲ": [21], "d": [19], "d͡ʒ": [19, 16], "d͡ʒʲ": [19, 16], "f": [18], "fʲ": [18], "g": [20], - "gʲ": [20], "h": [12], "j": [6], "k": [20], "kʲ": [20], "l": [14], "lʲ": [14], "m": [21], "mʲ": [21], "n": [19], - "ŋ": [20], "nʲ": [19], "p": [21], "pʲ": [21], "r": [13], "rʲ": [13], "s": [15], "ʃ": [16], "ʃʲ": [16], "t": [19], - "tʲ": [19], "t͡s": [19, 15], "t͡ʃ": [19, 16], "t͡sʲ": [19, 15], "t͡ʃʲ": [19, 16], "v": [18], "vʲ": [18], "w": [7], - "z": [15], "ʒ": [16], "zʲ": [15], "ʒʲ": [16], - - # ru-RU - "a": [2], "ʌ": [1], "ə": [1], "ɛ": [4], "i": [6], "ɪ": [6], "ɨ": [6], "ɔ": [3], "u": [7], "p": [21], "pʲ": [21], - "b": [21], "bʲ": [21], "t": [19], "tʲ": [19], "d": [19], "dʲ": [19], "k": [20], "kʲ": [20], "g": [20], - "gʲ": [20], "x": [12], "xʲ": [12], "f": [18], "fʲ": [18], "v": [18], "vʲ": [18], "s": [15], "sʲ": [15], - "z": [15], "zʲ": [15], "ʂ": [15], "ʐ": [15], "t͡s": [19, 15], "t͡ɕ": [19, 16], "ɕː": [16], "m": [21], - "mʲ": [21], "n": [19], "nʲ": [19], "l": [14], "lʲ": [14], "r": [13], "rʲ": [13], "j": [6], - - # sk-SK - "i": [6], "e": [4], "a": [2], "o": [8], "u": [7], "ʉ": [6], "iː": [6], "eː": [4], "aː": [2], "oː": [8], - "uː": [7], "i͡a": [6, 2], "i͡e": [6, 4], "i͡u": [6, 7], "u͡o": [7, 8], "au": [2, 7], "ou": [8, 7], "ə": [1], - "p": [21], "b": [21], "t": 
[19], "d": [19], "c": [16], "ɟ": [16], "k": [20], "g": [20], "t͡s": [19, 15], - "d͡z": [19, 15], "t͡ʃ": [19, 16], "d͡ʒ": [19, 16], "f": [18], "v": [18], "s": [15], "z": [15], "ʃ": [16], - "ʒ": [16], "x": [12], "ɦ": [12], "r": [13], "r̩": [13], "r̩ː": [13], "l": [14], "l̩": [14], "l̩ː": [14], - "ʎ": [14], "m": [21], "ɱ": [21], "n": [19], "ɴ": [19], "ŋ": [20], "ɲ": [19], "u̯": [7], "i̯": [6], "j": [6], - "w": [7], - - # sl-SI - "ə": [1], "a": [2], "aː": [2], "ɛ": [4], "eː": [4], "ɛː": [4], "i": [6], "iː": [6], "ɔ": [3], "ɔː": [3], - "oː": [8], "u": [7], "uː": [7], "b": [21], "d": [19], "dˡ": [19], "dn": [19, 19], "d͡ʒ": [19, 16], - "d͡z": [19, 15], "f": [18], "ɱ": [21], "ɣ": [20], "g": [20], "ɪ": [6], "j": [6], "k": [20], "l": [14], "lʲ": [14], - "m": [21], "ŋ": [20], "n": [19], "nʲ": [19], "p": [21], "r": [13], "s": [15], "ʃ": [16], "t": [19], "tˡ": [19], - "tn": [19, 19], "t͡ʃ": [19, 16], "t͡s": [19, 15], "u̯": [7], "v": [18], "w": [7], "ʍ": [7], "x": [12], "ʒ": [16], - "z": [15], - - # sv-SE - "a": [2], "æ": [1], "æː": [1], "ɑː": [2], "ɔ": [3], "a‿u": [2, 7], "ə": [1], "e": [4], "ɛ": [4], "ɛː": [4], - "eː": [4], "ɶ": [8], "œː": [4], "œ": [4], "øː": [1], "ɪ": [6], "iː": [6], "ʊ": [4], "uː": [7], "oː": [8], - "ɵ": [1], "ʉː": [6], "y": [4], "yː": [4], "p": [21], "t": [19], "k": [20], "b": [21], "d": [19], "g": [20], - "f": [18], "h": [12], "s": [15], "ɧ": [16], "ɕ": [16], "v": [18], "m": [21], "n": [19], "ŋ": [20], "l": [14], - "r": [13], "j": [6], "ɖ": [19], "ɭ": [14], "ɳ": [19], "ʂ": [15], "ʈ": [19], - - # th-TH - "a": [2], "aː": [2], "e": [4], "eː": [4], "i": [6], "iː": [6], "ia": [6, 2], "o": [8], "oː": [8], "ə": [1], - "əː": [1], "u": [7], "uː": [7], "ua": [7, 2], "ɯ": [6], "ɯː": [6], "ɯa": [6, 2], "ɛ": [4], "ɛː": [4], - "ɔ": [3], "ɔː": [3], "b": [21], "t͡ɕ": [19, 16], "tɕʰ": [19, 16], "d": [19], "f": [18], "h": [12], "j": [6], - "k": [20], "kʰ": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "pʰ": [21], "r": [13], - "s": [15], "t": [19], "tʰ": 
[19], "w": [7], "ʔ": [19], - - # tr-TR - "a": [2], "ɑː": [2], "e": [4], "eː": [4], "œ": [4], "œ͡ɟ": [4, 16], "i": [6], "i͡ɟ": [6, 16], "o": [8], - "o͡ɟ": [8, 16], "u": [7], "u͡ɟ": [7, 16], "ɯ": [6], "ɯ͡ɟ": [6, 16], "y": [4], "y͡ɟ": [4, 16], "b": [21], - "c": [16], "t͡ʃ": [19, 16], "d": [19], "f": [18], "ɡ": [20], "ɣ": [20], "ɟ": [16], "h": [12], "j": [6], - "d͡ʒ": [19, 16], "k": [20], "l": [14], "ɮ": [6], "m": [21], "n": [19], "ŋ": [20], "p": [21], "ɾ": [19], - "s": [15], "ʃ": [16], "t": [19], "v": [18], "w": [7], "z": [15], "ʒ": [16], - - # vi-VN - "a": [2], "ɛ": [4], "i": [6], "ɔ": [3], "u": [7], "u͡a": [7, 2], "a͡j": [2, 6], "ɛ̆j": [4, 6], "ə͡j": [1, 6], - "o": [8], "i͡e͡w": [6, 4, 7], "ɨ͡ə": [6, 1], "ɔ͡i": [3, 6], "ə": [1], "ie": [6, 4], "u͡j": [7, 6], "a͡w": [2, 7], - "ɨ": [6], "ɐ": [4], "ăw": [2, 7], "ăj": [2, 6], "ɨ͡ə͡j": [6, 1, 6], "o͡j": [8, 6], "əː": [1], "e": [4], - "ɔ̆w": [3, 7], "ɛ͡w": [4, 7], "i͡w": [6, 7], "ɨ͡w": [6, 7], "e͡j": [4, 6], "ɨ͡ʌ͡w": [6, 1, 7], "ɨ͡j": [6, 6], - "ɪ": [6], "iə": [6, 1], "a͡ʲ": [2], "ɓ": [21], "k": [20], "z": [15], "j": [6], "ɹ": [13], "f": [18], "ɣ": [20], - "h": [12], "l": [14], "m": [21], "n": [19], "p": [21], "s": [15], "ʂ": [15], "t": [19], "v": [18], "ɗ": [19], - "ŋ": [20], "x": [12], "ɲ": [19], "tʰ": [19], "ʈ": [19], "t͡ʃ": [19, 16], "w": [7] -} \ No newline at end of file From 62cd4e13996616af147eb70d57ef70736949ecae Mon Sep 17 00:00:00 2001 From: fabiocat93 Date: Mon, 7 Oct 2024 18:45:22 -0400 Subject: [PATCH 5/7] fixing style issues --- {TTS/STV => STV}/phoneme_viseme_map.json | 0 {TTS/STV => STV}/speech_to_visemes.py | 0 STV/w2v_stv_handler.py | 257 +++++++++++++++++++++++ TTS/chatTTS_handler.py | 79 +++---- TTS/melo_handler.py | 46 ++-- TTS/parler_handler.py | 39 ++-- arguments_classes/w2v_stv_arguments.py | 29 +++ connections/local_audio_streamer.py | 2 +- connections/socket_sender.py | 2 +- listen_and_play.py | 15 +- s2s_pipeline.py | 40 +++- 11 files changed, 391 insertions(+), 118 deletions(-) rename {TTS/STV 
=> STV}/phoneme_viseme_map.json (100%) rename {TTS/STV => STV}/speech_to_visemes.py (100%) create mode 100644 STV/w2v_stv_handler.py create mode 100644 arguments_classes/w2v_stv_arguments.py diff --git a/TTS/STV/phoneme_viseme_map.json b/STV/phoneme_viseme_map.json similarity index 100% rename from TTS/STV/phoneme_viseme_map.json rename to STV/phoneme_viseme_map.json diff --git a/TTS/STV/speech_to_visemes.py b/STV/speech_to_visemes.py similarity index 100% rename from TTS/STV/speech_to_visemes.py rename to STV/speech_to_visemes.py diff --git a/STV/w2v_stv_handler.py b/STV/w2v_stv_handler.py new file mode 100644 index 0000000..7e65403 --- /dev/null +++ b/STV/w2v_stv_handler.py @@ -0,0 +1,257 @@ +import json +import logging +import time +from typing import Any, Dict, Generator, List + +import numpy as np +from rich.console import Console +from transformers import pipeline + +from baseHandler import BaseHandler + +logger = logging.getLogger(__name__) +console = Console() + + +class Wav2Vec2STVHandler(BaseHandler): + """ + Handles the Speech-To-Viseme generation using a Wav2Vec2 model for automatic + speech recognition (ASR) and phoneme mapping to visemes. + + Attributes: + MIN_AUDIO_LENGTH (float): Minimum length of audio (in seconds) required + for phoneme extraction. + """ + + MIN_AUDIO_LENGTH = 0.5 # Minimum audio length in seconds for phoneme extraction + + def setup( + self, + should_listen: bool, + model_name: str = "bookbot/wav2vec2-ljspeech-gruut", + blocksize: int = 512, + device: str = "cuda", + skip: bool = False, + gen_kwargs: Dict[str, Any] = {}, # Not used + ) -> None: + """ + Initializes the handler by loading the ASR model and phoneme-to-viseme map. + + Args: + should_listen (bool): Flag indicating whether the speech-to-speech pipeline should start + listening to the user or not. + model_name (str): Name of the ASR model to use. + Defaults to "bookbot/wav2vec2-ljspeech-gruut". + blocksize (int): Size of each audio block when processing audio. 
+ Defaults to 512. + device (str): Device to run the model on ("cuda", "mps", or "cpu"). + Defaults to "cuda". + skip (bool): If True, the speech-to-viseme process is skipped. + Defaults to False. + gen_kwargs (dict): Additional parameters for speech generation. + + Returns: + None + """ + self.device = device + self.gen_kwargs = gen_kwargs + self.blocksize = blocksize + self.should_listen = should_listen + self.skip = skip + + # Load phoneme-to-viseme map from the JSON file + phoneme_viseme_map_file = "STV/phoneme_viseme_map.json" + with open(phoneme_viseme_map_file, "r") as f: + self.phoneme_viseme_map = json.load(f) + + # Initialize the ASR pipeline using the specified model and device + self.asr_pipeline = pipeline( + "automatic-speech-recognition", + model=model_name, + device=device, + torch_dtype="auto", + ) + self.expected_sampling_rate = self.asr_pipeline.feature_extractor.sampling_rate + + # Initialize an empty dictionary to store audio batch data + self.audio_batch = { + "waveform": np.array([]), + "sampling_rate": self.expected_sampling_rate, + } + self.text_batch = None + self.should_listen_flag = False + + self.warmup() # Perform model warmup + + def warmup(self) -> None: + """Warms up the model with dummy input to prepare it for inference. + + Returns: + None + """ + logger.info(f"Warming up {self.__class__.__name__}") + start_time = time.time() + + # Create dummy input for warmup inference + dummy_input = np.random.randn(self.blocksize).astype(np.int16) + _ = self.speech_to_visemes(dummy_input) + + warmup_time = time.time() - start_time + logger.info( + f"{self.__class__.__name__}: warmed up in {warmup_time:.4f} seconds!" + ) + + def speech_to_visemes(self, audio: Any) -> List[Dict[str, Any]]: + """ + Converts speech audio to visemes by performing Automatic Speech Recognition (ASR) + and mapping phonemes to visemes. + + Args: + audio (Any): The input audio data. 
+ + Returns: + List[Dict[str, Any]]: A list of dictionaries containing mapped visemes + and their corresponding timestamps. + + Note: + Heuristically, the input audio should be at least 0.5 seconds long for proper phoneme extraction. + """ + + def _map_phonemes_to_visemes( + data: Dict[str, Any], + ) -> List[Dict[str, Any]]: + """ + Maps extracted phonemes to their corresponding visemes based on a predefined map. + + Args: + data (Dict[str, Any]): Dictionary containing phoneme data where data['chunks'] + holds a list of phonemes and their timestamps. + + Returns: + List[Dict[str, Any]]: A list of dictionaries with viseme IDs and their corresponding timestamps. + """ + viseme_list = [] + chunks = data.get("chunks", []) + + # Map each phoneme to corresponding visemes + for chunk in chunks: + phoneme = chunk.get("text", None) + timestamp = chunk.get("timestamp", None) + visemes = self.phoneme_viseme_map.get(phoneme, []) + + for viseme in visemes: + viseme_list.append({"viseme": viseme, "timestamp": timestamp}) + + return viseme_list + + # Perform ASR to extract phoneme data, including timestamps + try: + asr_result = self.asr_pipeline(audio, return_timestamps="char") + except Exception as e: + logger.error(f"ASR error: {e}") + return [] + # Map the phonemes obtained from ASR to visemes + return _map_phonemes_to_visemes(asr_result) + + def process(self, data: Dict[str, Any]) -> Generator[Dict[str, Any], None, None]: + """ + Processes an audio file to generate visemes and output blocks of audio data + along with corresponding viseme data. + + Args: + data (Dict[str, Any]): Dictionary containing audio, text, and potentially additional information. + + Yields: + Dict: A dictionary containing audio waveform, and optionally viseme data, text, and potentially additional information. 
+ """ + + if "sentence_end" in data and data["sentence_end"]: + self.should_listen_flag = True + if self.skip: # Skip viseme extraction if the flag is set + yield { + "audio": { + "waveform": data["audio"]["waveform"], + "sampling_rate": data["audio"]["sampling_rate"], + }, + "text": data["text"] if "text" in data else None, + } + else: + # Check if text data is present and save it for later + if "text" in data and data["text"] is not None: + self.text_batch = data["text"] + # Concatenate new audio data into the buffer if available and valid + if "audio" in data and data["audio"] is not None: + audio_data = data["audio"] + # Check if the sampling rate is valid and matches the expected one + if audio_data.get("sampling_rate", None) != self.expected_sampling_rate: + logger.error( + f"Expected sampling rate {self.expected_sampling_rate}, " + f"but got {audio_data['sampling_rate']}." + ) + return + # Append the waveform to the audio buffer + self.audio_batch["waveform"] = np.concatenate( + (self.audio_batch["waveform"], audio_data["waveform"]), axis=0 + ) + + # Ensure the total audio length is sufficient for phoneme extraction + if ( + len(self.audio_batch["waveform"]) / self.audio_batch["sampling_rate"] + < self.MIN_AUDIO_LENGTH + ): + return + else: + logger.debug("Starting viseme inference...") + + # Perform viseme inference using the accumulated audio batch + viseme_data = self.speech_to_visemes(self.audio_batch["waveform"]) + logger.debug("Viseme inference completed.") + + # Print the visemes and timestamps to the console + for viseme in viseme_data: + console.print( + f"[blue]ASSISTANT_MOUTH_SHAPE: {viseme['viseme']} -- {viseme['timestamp']}" + ) + + # Process the audio in chunks of the defined blocksize + self.audio_batch["waveform"] = self.audio_batch["waveform"].astype( + np.int16 + ) + for i in range(0, len(self.audio_batch["waveform"]), self.blocksize): + chunk_waveform = self.audio_batch["waveform"][ + i : i + self.blocksize + ] + padded_waveform = np.pad( 
+ chunk_waveform, (0, self.blocksize - len(chunk_waveform)) + ) + + chunk_data = { + "audio": { + "waveform": padded_waveform, + "sample_rate": self.audio_batch["sampling_rate"], + } + } + + # Add text and viseme data only in the first chunk + if i == 0: + if self.text_batch: + chunk_data["text"] = self.text_batch + if viseme_data and len(viseme_data) > 0: + chunk_data["visemes"] = viseme_data + yield chunk_data + + # Reset the audio and text buffer after processing + self.audio_batch = { + "waveform": np.array([]), + "sampling_rate": self.expected_sampling_rate, + } + self.text_batch = "" + + if self.should_listen_flag: + self.should_listen.set() + self.should_listen_flag = False + + +# TODO: Test in all modalities and TTS models**: Ensure compatibility with the different models. This requires integration testing with your models and modalities. +# in s2s_pipeline change some names +# remove some prints \ No newline at end of file diff --git a/TTS/chatTTS_handler.py b/TTS/chatTTS_handler.py index 1cee897..6c177c4 100644 --- a/TTS/chatTTS_handler.py +++ b/TTS/chatTTS_handler.py @@ -5,7 +5,6 @@ import numpy as np from rich.console import Console import torch -from .STV.speech_to_visemes import SpeechToVisemes logging.basicConfig( format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", @@ -18,14 +17,11 @@ class ChatTTSHandler(BaseHandler): def setup( self, - should_listen, device="cuda", gen_kwargs={}, # Unused stream=True, chunk_size=512, - viseme_flag = True ): - self.should_listen = should_listen self.device = device self.model = ChatTTS.Chat() self.model.load(compile=False) # Doesn't work for me with True @@ -35,9 +31,7 @@ def setup( self.params_infer_code = ChatTTS.Chat.InferCodeParams( spk_emb=rnd_spk_emb, ) - self.viseme_flag = viseme_flag - if self.viseme_flag: - self.speech_to_visemes = SpeechToVisemes() + self.output_sampling_rate = 16000 self.warmup() def warmup(self): @@ -45,6 +39,8 @@ def warmup(self): _ = self.model.infer("text") def 
process(self, llm_sentence): + if isinstance(llm_sentence, tuple): + llm_sentence, _ = llm_sentence # Ignore language console.print(f"[green]ASSISTANT: {llm_sentence}") if self.device == "mps": import time @@ -64,67 +60,62 @@ def process(self, llm_sentence): wavs = [np.array([])] for gen in wavs_gen: if gen[0] is None or len(gen[0]) == 0: - self.should_listen.set() - return + return { + "text": llm_sentence, + "sentence_end": True + } # Resample the audio to 16000 Hz - audio_chunk = librosa.resample(gen[0], orig_sr=24000, target_sr=16000) + audio_chunk = librosa.resample(gen[0], orig_sr=24000, target_sr=self.output_sampling_rate) # Ensure the audio is converted to mono (single channel) if len(audio_chunk.shape) > 1: audio_chunk = librosa.to_mono(audio_chunk) audio_chunk = (audio_chunk * 32768).astype(np.int16) - - # Process visemes if viseme_flag is set - if self.viseme_flag: - visemes = self.speech_to_visemes.process(audio_chunk) - for viseme in visemes: - console.print(f"[blue]ASSISTANT_MOUTH_SHAPE: {viseme['viseme']} -- {viseme['timestamp']}") - else: - visemes = None - + # Loop through audio chunks, yielding dict for each chunk for i in range(0, len(audio_chunk), self.chunk_size): chunk_data = { - "audio": np.pad( - audio_chunk[i : i + self.chunk_size], - (0, self.chunk_size - len(audio_chunk[i : i + self.chunk_size])), - ) + "audio": { + "waveform": np.pad( + audio_chunk[i : i + self.chunk_size], + (0, self.chunk_size - len(audio_chunk[i : i + self.chunk_size])), + ), + "sampling_rate": self.output_sampling_rate, + } } - # Include text and visemes for the first chunk + # Include text for the first chunk if i == 0: chunk_data["text"] = llm_sentence # Assuming llm_sentence is defined elsewhere - chunk_data["visemes"] = visemes - + if i >= len(audio_chunk) - self.chunk_size: + # This is the last round + chunk_data["sentence_end"] = True yield chunk_data else: wavs = wavs_gen if len(wavs[0]) == 0: - self.should_listen.set() - return - audio_chunk = 
librosa.resample(wavs[0], orig_sr=24000, target_sr=16000) + return { + "sentence_end": True + } + audio_chunk = librosa.resample(wavs[0], orig_sr=24000, target_sr=self.output_sampling_rate) # Ensure the audio is converted to mono (single channel) if len(audio_chunk.shape) > 1: audio_chunk = librosa.to_mono(audio_chunk) audio_chunk = (audio_chunk * 32768).astype(np.int16) - if self.viseme_flag: - visemes = self.speech_to_visemes.process(audio_chunk) - for viseme in visemes: - console.print(f"[blue]ASSISTANT_MOUTH_SHAPE: {viseme['viseme']} -- {viseme['timestamp']}") - else: - visemes = None - for i in range(0, len(audio_chunk), self.chunk_size): chunk_data = { - "audio": np.pad( - audio_chunk[i : i + self.chunk_size], - (0, self.chunk_size - len(audio_chunk[i : i + self.chunk_size])), - ) + "audio": { + "waveform": np.pad( + audio_chunk[i : i + self.chunk_size], + (0, self.chunk_size - len(audio_chunk[i : i + self.chunk_size])), + ), + "sampling_rate": self.output_sampling_rate, + } } - # For the first chunk, include text and visemes + # For the first chunk, include text if i == 0: chunk_data["text"] = llm_sentence - chunk_data["visemes"] = visemes + if i >= len(audio_chunk) - self.chunk_size: + # This is the last round + chunk_data["sentence_end"] = True yield chunk_data - - self.should_listen.set() diff --git a/TTS/melo_handler.py b/TTS/melo_handler.py index 64fbbbc..be25007 100644 --- a/TTS/melo_handler.py +++ b/TTS/melo_handler.py @@ -6,8 +6,6 @@ from rich.console import Console import torch -from .STV.speech_to_visemes import SpeechToVisemes - logger = logging.getLogger(__name__) console = Console() @@ -33,15 +31,12 @@ class MeloTTSHandler(BaseHandler): def setup( self, - should_listen, - device="mps", + device="auto", language="en", speaker_to_id="en", gen_kwargs={}, # Unused blocksize=512, - viseme_flag = True # To obtain timestamped visemes ): - self.should_listen = should_listen self.device = device self.language = language self.model = TTS( @@ -51,10 +46,7 
@@ def setup( WHISPER_LANGUAGE_TO_MELO_SPEAKER[speaker_to_id] ] self.blocksize = blocksize - - self.viseme_flag = viseme_flag - if self.viseme_flag: - self.speech_to_visemes = SpeechToVisemes() + self.output_sampling_rate = 16000 self.warmup() @@ -103,29 +95,27 @@ def process(self, llm_sentence): logger.error(f"Error in MeloTTSHandler: {e}") audio_chunk = np.array([]) if len(audio_chunk) == 0: - self.should_listen.set() - return - audio_chunk = librosa.resample(audio_chunk, orig_sr=44100, target_sr=16000) + return { + "text": llm_sentence, + "sentence_end": True + } + audio_chunk = librosa.resample(audio_chunk, orig_sr=44100, target_sr=self.output_sampling_rate) audio_chunk = (audio_chunk * 32768).astype(np.int16) - if self.viseme_flag: - visemes = self.speech_to_visemes.process(audio_chunk) - for viseme in visemes: - console.print(f"[blue]ASSISTANT_MOUTH_SHAPE: {viseme['viseme']} -- {viseme['timestamp']}") - else: - visemes = None - for i in range(0, len(audio_chunk), self.blocksize): chunk_data = { - "audio": np.pad( - audio_chunk[i : i + self.blocksize], - (0, self.blocksize - len(audio_chunk[i : i + self.blocksize])) - ) + "audio": { + "waveform": np.pad( + audio_chunk[i : i + self.blocksize], + (0, self.blocksize - len(audio_chunk[i : i + self.blocksize])) + ), + "sampling_rate": self.output_sampling_rate + } } - # For the first chunk, include text and visemes + # For the first chunk, include text if i == 0: chunk_data["text"] = llm_sentence - chunk_data["visemes"] = visemes + if i >= len(audio_chunk) - self.blocksize: + # This is the last round + chunk_data["sentence_end"] = True yield chunk_data - - self.should_listen.set() diff --git a/TTS/parler_handler.py b/TTS/parler_handler.py index 0703180..2b84e8d 100644 --- a/TTS/parler_handler.py +++ b/TTS/parler_handler.py @@ -14,8 +14,6 @@ from transformers.utils.import_utils import ( is_flash_attn_2_available, ) -from .STV.speech_to_visemes import SpeechToVisemes - torch._inductor.config.fx_graph_cache = True # 
mind about this parameter ! should be >= 2 * number of padded prompt sizes for TTS torch._dynamo.config.cache_size_limit = 15 @@ -35,7 +33,6 @@ class ParlerTTSHandler(BaseHandler): def setup( self, - should_listen, model_name="ylacombe/parler-tts-mini-jenny-30H", device="cuda", torch_dtype="float16", @@ -48,9 +45,7 @@ def setup( ), play_steps_s=1, blocksize=512, - viseme_flag = True ): - self.should_listen = should_listen self.device = device self.torch_dtype = getattr(torch, torch_dtype) self.gen_kwargs = gen_kwargs @@ -79,10 +74,7 @@ def setup( self.model.forward = torch.compile( self.model.forward, mode=self.compile_mode, fullgraph=True ) - - self.viseme_flag = viseme_flag - if self.viseme_flag: - self.speech_to_visemes = SpeechToVisemes() + self.output_sampling_rate = 16000 self.warmup() @@ -186,27 +178,24 @@ def process(self, llm_sentence): logger.info( f"Time to first audio: {perf_counter() - pipeline_start:.3f}" ) - audio_chunk = librosa.resample(audio_chunk, orig_sr=44100, target_sr=16000) + audio_chunk = librosa.resample(audio_chunk, orig_sr=44100, target_sr=self.output_sampling_rate) audio_chunk = (audio_chunk * 32768).astype(np.int16) - if self.viseme_flag: - visemes = self.speech_to_visemes.process(audio_chunk) - for viseme in visemes: - console.print(f"[blue]ASSISTANT_MOUTH_SHAPE: {viseme['viseme']} -- {viseme['timestamp']}") - else: - visemes = None - for i in range(0, len(audio_chunk), self.blocksize): chunk_data = { - "audio": np.pad( - audio_chunk[i : i + self.blocksize], - (0, self.blocksize - len(audio_chunk[i : i + self.blocksize])) - ) + "audio": { + "waveform": np.pad( + audio_chunk[i : i + self.blocksize], + (0, self.blocksize - len(audio_chunk[i : i + self.blocksize])) + ), + "sampling_rate": self.output_sampling_rate + } } - # For the first chunk, include text and visemes + # For the first chunk, include text if i == 0: chunk_data["text"] = llm_sentence - chunk_data["visemes"] = visemes - yield chunk_data + if i >= len(audio_chunk) - 
self.blocksize: + # This is the last round + chunk_data["sentence_end"] = True - self.should_listen.set() + yield chunk_data diff --git a/arguments_classes/w2v_stv_arguments.py b/arguments_classes/w2v_stv_arguments.py new file mode 100644 index 0000000..229610a --- /dev/null +++ b/arguments_classes/w2v_stv_arguments.py @@ -0,0 +1,29 @@ +"""This file contains the arguments for the Wav2Vec2STVHandler.""" +from dataclasses import dataclass, field + +@dataclass +class Wav2Vec2STVHandlerArguments: + stv_model_name: str = field( + default="bookbot/wav2vec2-ljspeech-gruut", + metadata={ + "help": "The pretrained language model to use. Default is 'bookbot/wav2vec2-ljspeech-gruut'." + }, + ) + stv_device: str = field( + default="cuda", + metadata={ + "help": "The device type on which the model will run. Default is 'cuda' for GPU acceleration." + }, + ) + stv_blocksize: int = field( + default=512, + metadata={ + "help": "The blocksize of the model. Default is 512." + }, + ) + stv_skip: bool = field( + default=False, + metadata={ + "help": "If True, skips the STV generation. Default is False." 
+ }, + ) diff --git a/connections/local_audio_streamer.py b/connections/local_audio_streamer.py index d42fbe7..5faef4a 100644 --- a/connections/local_audio_streamer.py +++ b/connections/local_audio_streamer.py @@ -38,7 +38,7 @@ def callback(indata, outdata, frames, time, status): visemes = data['visemes'] logger.info(f"Visemes: {visemes}") """ - outdata[:] = data['audio'][:, np.newaxis] + outdata[:] = data['audio']['waveform'][:, np.newaxis] logger.debug("Available devices:") logger.debug(sd.query_devices()) diff --git a/connections/socket_sender.py b/connections/socket_sender.py index fb5c7cb..f849bf3 100644 --- a/connections/socket_sender.py +++ b/connections/socket_sender.py @@ -33,7 +33,7 @@ def run(self): packet = {} if 'audio' in data and data['audio'] is not None: audio_chunk = data['audio'] - packet['audio'] = data['audio'] + packet['audio'] = audio_chunk if 'text' in data and data['text'] is not None: packet['text'] = data['text'] if 'visemes' in data and data['visemes'] is not None: diff --git a/listen_and_play.py b/listen_and_play.py index 2082a5e..b1f282f 100644 --- a/listen_and_play.py +++ b/listen_and_play.py @@ -92,17 +92,10 @@ def receive_full_chunk(conn, chunk_size): serialized_packet = receive_full_chunk(recv_socket, packet_length) if serialized_packet: # Step 4: Deserialize the packet using pickle - packet = pickle.loads(serialized_packet) - # Step 5: Extract the packet contents - if 'text' in packet: - pass - # print(packet['text']) - if 'visemes' in packet: - pass - # print(packet['visemes']) - - # Step 6: Put the packet audio data into the queue for sending - recv_queue.put(packet['audio'].tobytes()) + packet = pickle.loads(serialized_packet) + # Step 5: Put the packet audio data into the queue for sending, if any + if 'audio' in packet and packet['audio'] is not None and 'waveform' in packet['audio'] and packet['audio']['waveform'] is not None: + recv_queue.put(packet['audio']['waveform'].tobytes()) try: send_stream = sd.RawInputStream( diff 
--git a/s2s_pipeline.py b/s2s_pipeline.py index 1da202e..4c86bf5 100644 --- a/s2s_pipeline.py +++ b/s2s_pipeline.py @@ -8,6 +8,7 @@ from typing import Optional from sys import platform from VAD.vad_handler import VADHandler +from STV.w2v_stv_handler import Wav2Vec2STVHandler from arguments_classes.chat_tts_arguments import ChatTTSHandlerArguments from arguments_classes.language_model_arguments import LanguageModelHandlerArguments from arguments_classes.mlx_language_model_arguments import ( @@ -22,6 +23,7 @@ from arguments_classes.whisper_stt_arguments import WhisperSTTHandlerArguments from arguments_classes.melo_tts_arguments import MeloTTSHandlerArguments from arguments_classes.open_api_language_model_arguments import OpenApiLanguageModelHandlerArguments +from arguments_classes.w2v_stv_arguments import Wav2Vec2STVHandlerArguments import torch import nltk from rich.console import Console @@ -82,6 +84,7 @@ def parse_arguments(): ParlerTTSHandlerArguments, MeloTTSHandlerArguments, ChatTTSHandlerArguments, + Wav2Vec2STVHandlerArguments, ) ) @@ -148,6 +151,8 @@ def overwrite_device_argument(common_device: Optional[str], *handler_kwargs): kwargs.stt_device = common_device if hasattr(kwargs, "paraformer_stt_device"): kwargs.paraformer_stt_device = common_device + if hasattr(kwargs, "stv_device"): + kwargs.stv_device = common_device def prepare_module_args(module_kwargs, *handler_kwargs): @@ -167,6 +172,7 @@ def prepare_all_args( parler_tts_handler_kwargs, melo_tts_handler_kwargs, chat_tts_handler_kwargs, + stv_handler_kwargs, ): prepare_module_args( module_kwargs, @@ -178,6 +184,7 @@ def prepare_all_args( parler_tts_handler_kwargs, melo_tts_handler_kwargs, chat_tts_handler_kwargs, + stv_handler_kwargs ) @@ -189,6 +196,7 @@ def prepare_all_args( rename_args(parler_tts_handler_kwargs, "tts") rename_args(melo_tts_handler_kwargs, "melo") rename_args(chat_tts_handler_kwargs, "chat_tts") + rename_args(stv_handler_kwargs, "stv") def initialize_queues_and_events(): @@ -200,6 
+208,7 @@ def initialize_queues_and_events(): "spoken_prompt_queue": Queue(), "text_prompt_queue": Queue(), "lm_response_queue": Queue(), + "send_viseme_queue": Queue(), } @@ -216,6 +225,7 @@ def build_pipeline( parler_tts_handler_kwargs, melo_tts_handler_kwargs, chat_tts_handler_kwargs, + stv_handler_kwargs, queues_and_events, ): stop_event = queues_and_events["stop_event"] @@ -225,11 +235,13 @@ def build_pipeline( spoken_prompt_queue = queues_and_events["spoken_prompt_queue"] text_prompt_queue = queues_and_events["text_prompt_queue"] lm_response_queue = queues_and_events["lm_response_queue"] + send_viseme_queue = queues_and_events["send_viseme_queue"] + if module_kwargs.mode == "local": from connections.local_audio_streamer import LocalAudioStreamer local_audio_streamer = LocalAudioStreamer( - input_queue=recv_audio_chunks_queue, output_queue=send_audio_chunks_queue + input_queue=recv_audio_chunks_queue, output_queue=send_viseme_queue ) comms_handlers = [local_audio_streamer] should_listen.set() @@ -248,7 +260,7 @@ def build_pipeline( ), SocketSender( stop_event, - send_audio_chunks_queue, + send_viseme_queue, host=socket_sender_kwargs.send_host, port=socket_sender_kwargs.send_port, ), @@ -264,9 +276,17 @@ def build_pipeline( stt = get_stt_handler(module_kwargs, stop_event, spoken_prompt_queue, text_prompt_queue, whisper_stt_handler_kwargs, paraformer_stt_handler_kwargs) lm = get_llm_handler(module_kwargs, stop_event, text_prompt_queue, lm_response_queue, language_model_handler_kwargs, open_api_language_model_handler_kwargs, mlx_language_model_handler_kwargs) - tts = get_tts_handler(module_kwargs, stop_event, lm_response_queue, send_audio_chunks_queue, should_listen, parler_tts_handler_kwargs, melo_tts_handler_kwargs, chat_tts_handler_kwargs) + tts = get_tts_handler(module_kwargs, stop_event, lm_response_queue, send_audio_chunks_queue, parler_tts_handler_kwargs, melo_tts_handler_kwargs, chat_tts_handler_kwargs) + + stv = Wav2Vec2STVHandler( + stop_event, + 
queue_in=send_audio_chunks_queue, + queue_out=send_viseme_queue, + setup_args=(should_listen,), + setup_kwargs=vars(stv_handler_kwargs), + ) - return ThreadManager([*comms_handlers, vad, stt, lm, tts]) + return ThreadManager([*comms_handlers, vad, stt, lm, tts, stv]) def get_stt_handler(module_kwargs, stop_event, spoken_prompt_queue, text_prompt_queue, whisper_stt_handler_kwargs, paraformer_stt_handler_kwargs): @@ -337,14 +357,14 @@ def get_llm_handler( raise ValueError("The LLM should be either transformers or mlx-lm") -def get_tts_handler(module_kwargs, stop_event, lm_response_queue, send_audio_chunks_queue, should_listen, parler_tts_handler_kwargs, melo_tts_handler_kwargs, chat_tts_handler_kwargs): +def get_tts_handler(module_kwargs, stop_event, lm_response_queue, send_audio_chunks_queue, parler_tts_handler_kwargs, melo_tts_handler_kwargs, chat_tts_handler_kwargs): if module_kwargs.tts == "parler": from TTS.parler_handler import ParlerTTSHandler return ParlerTTSHandler( stop_event, queue_in=lm_response_queue, queue_out=send_audio_chunks_queue, - setup_args=(should_listen,), + setup_args=(), setup_kwargs=vars(parler_tts_handler_kwargs), ) elif module_kwargs.tts == "melo": @@ -355,11 +375,12 @@ def get_tts_handler(module_kwargs, stop_event, lm_response_queue, send_audio_chu "Error importing MeloTTSHandler. 
You might need to run: python -m unidic download" ) raise e + return MeloTTSHandler( stop_event, queue_in=lm_response_queue, queue_out=send_audio_chunks_queue, - setup_args=(should_listen,), + setup_args=(), setup_kwargs=vars(melo_tts_handler_kwargs), ) elif module_kwargs.tts == "chatTTS": @@ -372,7 +393,7 @@ def get_tts_handler(module_kwargs, stop_event, lm_response_queue, send_audio_chu stop_event, queue_in=lm_response_queue, queue_out=send_audio_chunks_queue, - setup_args=(should_listen,), + setup_args=(), setup_kwargs=vars(chat_tts_handler_kwargs), ) else: @@ -393,6 +414,7 @@ def main(): parler_tts_handler_kwargs, melo_tts_handler_kwargs, chat_tts_handler_kwargs, + stv_handler_kwargs, ) = parse_arguments() setup_logger(module_kwargs.log_level) @@ -407,6 +429,7 @@ def main(): parler_tts_handler_kwargs, melo_tts_handler_kwargs, chat_tts_handler_kwargs, + stv_handler_kwargs ) queues_and_events = initialize_queues_and_events() @@ -424,6 +447,7 @@ def main(): parler_tts_handler_kwargs, melo_tts_handler_kwargs, chat_tts_handler_kwargs, + stv_handler_kwargs, queues_and_events, ) From 66c4f78b632d4c5b68ee720fea4505a63faa1a7e Mon Sep 17 00:00:00 2001 From: fabiocat93 Date: Tue, 8 Oct 2024 18:46:58 -0400 Subject: [PATCH 6/7] integrating speech to visemes as part of the s2s flow --- .gitignore | 4 +++- README.md | 11 +++++++++++ STT/paraformer_handler.py | 1 - STV/w2v_stv_handler.py | 6 +----- connections/local_audio_streamer.py | 10 ---------- 5 files changed, 15 insertions(+), 17 deletions(-) diff --git a/.gitignore b/.gitignore index 95dc6c6..344d83d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,6 @@ __pycache__ tmp cache -mlx_models/ \ No newline at end of file +mlx_models/ +asset/ +config/ \ No newline at end of file diff --git a/README.md b/README.md index 9f0765c..1d517ba 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,7 @@ This repository implements a speech-to-speech cascaded pipeline consisting of th 2. **Speech to Text (STT)** 3. 
**Language Model (LM)** 4. **Text to Speech (TTS)** +5. **Speech to Visemes (STV)** ### Modularity The pipeline provides a fully open and modular approach, with a focus on leveraging models available through the Transformers library on the Hugging Face hub. The code is designed for easy modification, and we already support device-specific and external library implementations: @@ -50,6 +51,9 @@ The pipeline provides a fully open and modular approach, with a focus on leverag - [MeloTTS](https://github.com/myshell-ai/MeloTTS) - [ChatTTS](https://github.com/2noise/ChatTTS?tab=readme-ov-file) +**STV** +- [Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/en/model_doc/wav2vec2_phoneme) + [Phoneme to viseme mapping](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/how-to-speech-synthesis-viseme?tabs=visemeid&pivots=programming-language-python#map-phonemes-to-visemes) + ## Setup Clone the repository: @@ -216,6 +220,13 @@ For example: --lm_model_name google/gemma-2b-it ``` + +### STV parameters +See the [Wav2Vec2STVHandlerArguments](arguments_classes/w2v_stv_arguments.py) class. Notably: +- `stv_model_name` defaults to `bookbot/wav2vec2-ljspeech-gruut`, which was chosen because it is accurate and fast enough +- `stv_skip`: set it to `True` if you don't need visemes + + ### Generation parameters Other generation parameters of the model's generate method can be set using the part's prefix + `_gen_`, e.g., `--stt_gen_max_new_tokens 128`. These parameters can be added to the pipeline part's arguments class if not already exposed. 
diff --git a/STT/paraformer_handler.py b/STT/paraformer_handler.py index dcadc02..09d481b 100644 --- a/STT/paraformer_handler.py +++ b/STT/paraformer_handler.py @@ -28,7 +28,6 @@ def setup( device="cuda", gen_kwargs={}, ): - print(model_name) if len(model_name.split("/")) > 1: model_name = model_name.split("/")[-1] self.device = device diff --git a/STV/w2v_stv_handler.py b/STV/w2v_stv_handler.py index 7e65403..20c2cef 100644 --- a/STV/w2v_stv_handler.py +++ b/STV/w2v_stv_handler.py @@ -60,6 +60,7 @@ def setup( self.skip = skip # Load phoneme-to-viseme map from the JSON file + # inspired by https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-ssml-phonetic-sets phoneme_viseme_map_file = "STV/phoneme_viseme_map.json" with open(phoneme_viseme_map_file, "r") as f: self.phoneme_viseme_map = json.load(f) @@ -250,8 +251,3 @@ def process(self, data: Dict[str, Any]) -> Generator[Dict[str, Any], None, None] if self.should_listen_flag: self.should_listen.set() self.should_listen_flag = False - - -# TODO: Test in all modalities and TTS models**: Ensure compatibility with the different models. This requires integration testing with your models and modalities. 
-# in s2s_pipeline change some names -# remove some prints \ No newline at end of file diff --git a/connections/local_audio_streamer.py b/connections/local_audio_streamer.py index 5faef4a..99b9d83 100644 --- a/connections/local_audio_streamer.py +++ b/connections/local_audio_streamer.py @@ -28,16 +28,6 @@ def callback(indata, outdata, frames, time, status): outdata[:] = 0 * outdata else: data = self.output_queue.get() - """ - # Check if text data is present and log it - if data.get('text') is not None: - text = data['text'] - logger.info(f"Text: {text}") - # Check if viseme data is present and log it - if data.get('visemes') is not None: - visemes = data['visemes'] - logger.info(f"Visemes: {visemes}") - """ outdata[:] = data['audio']['waveform'][:, np.newaxis] logger.debug("Available devices:") From c7b85e18e978b988e84e5b2db368b7599aacb101 Mon Sep 17 00:00:00 2001 From: Fabio Catania Date: Tue, 8 Oct 2024 18:53:31 -0400 Subject: [PATCH 7/7] Delete STV/speech_to_visemes.py --- STV/speech_to_visemes.py | 101 --------------------------------------- 1 file changed, 101 deletions(-) delete mode 100644 STV/speech_to_visemes.py diff --git a/STV/speech_to_visemes.py b/STV/speech_to_visemes.py deleted file mode 100644 index 16ad95c..0000000 --- a/STV/speech_to_visemes.py +++ /dev/null @@ -1,101 +0,0 @@ -"""This module contains the SpeechToVisemes class, which handles the conversion of speech to visemes.""" -from typing import List, Dict, Any -from transformers import pipeline -import logging -import json - -logger = logging.getLogger(__name__) - -class SpeechToVisemes(): - """ - Handles the conversion of speech to visemes using a phoneme-to-viseme mapping. - - Attributes: - model_name (str): The name of the model to use for speech recognition. - device (str): The device to run the model on (e.g., "cpu", "mps", "cuda"). - gen_kwargs (dict): Additional generation parameters for the speech recognition pipeline. 
- asr_pipeline (transformers.Pipeline): The automatic speech recognition pipeline. - """ - - def __init__( - self, - model_name="bookbot/wav2vec2-ljspeech-gruut", - device="mps", - gen_kwargs={}, - ): - """ - Initializes the SpeechToVisemes class with the specified parameters. - - Args: - model_name (str, optional): The name of the model to use for speech recognition. - device (str, optional): The device to run the model on. - gen_kwargs (dict, optional): Additional generation parameters for the speech recognition pipeline. - """ - self.device = device - self.gen_kwargs = gen_kwargs - - # This dictionary is inspired by https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-ssml-phonetic-sets - phoneme_viseme_map_file="TTS/STV/phoneme_viseme_map.json" - with open(phoneme_viseme_map_file, 'r') as f: - self.phoneme_viseme_map = json.load(f) - - # Initialize the automatic speech recognition pipeline - self.asr_pipeline = pipeline( - "automatic-speech-recognition", model=model_name, device=device - ) - - def _map_phonemes_to_visemes( - self, - data: Dict[str, Any], - ) -> List[Dict[str, Any]]: - """ - Maps phonemes to corresponding visemes with timestamps. - - Refer to the following references for more information on the phoneme-to-viseme mapping: - - https://learn.microsoft.com/en-us/azure/ai-services/speech-service/how-to-speech-synthesis-viseme?tabs=visemeid&pivots=programming-language-python#map-phonemes-to-visemes - - https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-ssml-phonetic-sets - - Args: - data (Dict[str, Any]): A dictionary containing phoneme data, where data['chunks'] - holds a list of phonemes and their timestamps. - - Returns: - List[Dict[str, Any]]: A list of dictionaries where each dictionary contains the viseme ID - and the corresponding timestamp. 
- """ - viseme_list = [] - chunks = data.get('chunks', []) - - for _, chunk in enumerate(chunks): - phoneme = chunk.get('text', None) - timestamp = chunk.get('timestamp', None) - visemes = self.phoneme_viseme_map.get(phoneme, []) - - for viseme in visemes: - viseme_list.append({ - 'viseme': viseme, - 'timestamp': timestamp - }) - - return viseme_list - - - def process(self, audio_file: str) -> List[Dict[str, Any]]: - """Process an audio file and convert speech to visemes. - - Heuristically, we found that the model requires at least 0.5 seconds of audio to run phoneme recognition. - This value may be also depended on the model, the language, and other factors. - - Args: - audio_file (str): The path to the audio file. - - Returns: - List[Dict[str, Any]]: A list of dictionaries where each dictionary contains the viseme - ID and the corresponding timestamp. - """ - # Perform ASR to get phoneme data - asr_result = self.asr_pipeline(audio_file, return_timestamps='char') - # Map phonemes to visemes - viseme_data = self._map_phonemes_to_visemes(asr_result) - return viseme_data - \ No newline at end of file