From ba4368b35eee3601a5030a667dfe3dfd2a2b31c7 Mon Sep 17 00:00:00 2001
From: fabiocat93 <fabio-cat93@hotmail.it>
Date: Mon, 9 Sep 2024 18:18:40 +0200
Subject: [PATCH 1/7] adding speech_to_visemes

---
 TTS/STV/speech_to_visemes.py              | 341 ++++++++++++++++++++++
 TTS/chatTTS_handler.py                    |  66 ++++-
 TTS/melo_handler.py                       |  32 +-
 TTS/parler_handler.py                     |  29 +-
 arguments_classes/parler_tts_arguments.py |   2 +-
 connections/local_audio_streamer.py       |  13 +-
 connections/socket_sender.py              |  33 ++-
 listen_and_play.py                        |  37 ++-
 8 files changed, 520 insertions(+), 33 deletions(-)
 create mode 100644 TTS/STV/speech_to_visemes.py

diff --git a/TTS/STV/speech_to_visemes.py b/TTS/STV/speech_to_visemes.py
new file mode 100644
index 0000000..11d16dd
--- /dev/null
+++ b/TTS/STV/speech_to_visemes.py
@@ -0,0 +1,341 @@
+"""This module contains the SpeechToVisemes class, which handles the conversion of speech to visemes."""
+from transformers import pipeline
+import logging
+import numpy as np
+
+logger = logging.getLogger(__name__)
+from typing import List, Dict, Any, Optional
+
+class SpeechToVisemes():
+    """
+    Handles the conversion of speech to visemes using a phoneme-to-viseme mapping.
+
+    Attributes:
+        model_name (str): The name of the model to use for speech recognition.
+        device (str): The device to run the model on (e.g., "cpu", "mps", "cuda").
+        gen_kwargs (dict): Additional generation parameters for the speech recognition pipeline.
+        asr_pipeline (transformers.Pipeline): The automatic speech recognition pipeline.
+    """
+
+    def __init__(
+        self,
+        model_name="bookbot/wav2vec2-ljspeech-gruut",
+        device="mps",
+        gen_kwargs=None,
+    ):
+        """
+        Initializes the SpeechToVisemes class with the specified parameters.
+
+        Args:
+            model_name (str, optional): The name of the model to use for speech recognition.
+            device (str, optional): The device to run the model on.
+            gen_kwargs (dict, optional): Extra generation parameters; None means a fresh empty dict per instance.
+        """
+        self.model_name = model_name
+        self.device = device
+        self.gen_kwargs = {} if gen_kwargs is None else gen_kwargs
+        # Load the automatic speech recognition pipeline once, at construction time
+        self.asr_pipeline = pipeline(
+            "automatic-speech-recognition", model=model_name, device=device
+        )
+
+    def _map_phonemes_to_visemes(
+        self, 
+        data: Dict[str, Any], 
+    ) -> List[Dict[str, Any]]:
+        """
+        Maps phonemes to corresponding visemes with timestamps.
+
+        Refer to the following references for more information on the phoneme-to-viseme mapping:
+            - https://learn.microsoft.com/en-us/azure/ai-services/speech-service/how-to-speech-synthesis-viseme?tabs=visemeid&pivots=programming-language-python#map-phonemes-to-visemes
+            - https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-ssml-phonetic-sets
+
+        Args:
+            data (Dict[str, Any]): A dictionary containing phoneme data, where `data['chunks']` 
+                holds a list of phonemes and their timestamps.
+
+        Returns:
+            List[Dict[str, Any]]: One dictionary per viseme ID, with keys 'viseme' and 'timestamp'. A phoneme that maps
+            to several visemes yields several entries sharing its timestamp; an unmapped phoneme yields none.
+        """
+
+        def _phoneme_to_viseme(phoneme: str) -> List[int]:
+            """
+            Converts a phoneme to its corresponding viseme(s).
+
+            Args:
+                phoneme (str): The phoneme to map to viseme.
+
+            Returns:
+                List[int]: Viseme IDs for the phoneme, or an empty list if the phoneme is not in the map.
+            """
+            # inspired by https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-ssml-phonetic-sets -- keys repeated across locale sections keep the LAST value listed
+            phoneme_viseme_map = {
+                # basic
+                'æ': [1], 'ə': [1], 'ʌ': [1], 'ɑ': [2], 'ɔ': [3], 'ɛ': [4], 'ʊ': [4], 'ɝ': [5], 'j': [6], 'i': [6], 'ɪ': [6], 
+                'w': [7], 'u': [7], 'o': [8], 'aʊ': [9], 'ɔɪ': [10], 'aɪ': [11], 'h': [12], 'ɹ': [13], 'l': [14], 's': [15], 
+                'z': [15], 'ʃ': [16], 'tʃ': [16], 'dʒ': [16], 'ʒ': [16], 'ð': [17], 'f': [18], 'v': [18], 'd': [19], 't': [19], 
+                'n': [19], 'θ': [19], 'k': [20], 'g': [20], 'ŋ': [20], 'p': [21], 'b': [21], 'm': [21], ' ': [0],
+
+                # ar-EG
+                "a": [2], "aː": [2], "i": [6], "iː": [6], "u": [7], "uː": [7], "b": [21], "d": [19], "g": [20], "k": [20], 
+                "t": [19], "dˤ": [19], "q": [20], "tˤ": [19], "ʔ": [19], "f": [18], "h": [12], "ħ": [12], "s": [15], "θ": [19], 
+                "z": [15], "ðˤ": [17], "ð": [17], "ɣ": [20], "x": [12], "ʃ": [16], "sˤ": [15], "j": [6], "w": [7], "l": [14], 
+                "m": [21], "n": [19], "r": [13], "ʕ": [12],
+
+                # bg-BG
+                "i": [6], "ɛ": [4], "ɔ": [3], "a": [2], "u": [7], "j͡a": [6, 2], "ɤ": [1], "j͡u": [6, 7], "n": [19], "ʒ": [16], 
+                "k": [20], "t͡s": [19, 15], "t": [19], "p": [21], "r": [13], "s": [15], "d": [19], "x": [12], "zʲ": [15], 
+                "lʲ": [14], "l": [14], "nʲ": [19], "v": [18], "m": [21], "b": [21], "g": [20], "d͡ʒ": [19, 16], "f": [18], 
+                "mʲ": [21], "tʲ": [19], "rʲ": [13], "pʲ": [21], "dʲ": [19], "j": [6], "vʲ": [18], "sʲ": [15], "bʲ": [21], 
+                "kʲ": [20], "gʲ": [20], "fʲ": [18], "z": [15], "ʃ": [16], "t͡ʃ": [19, 16], "d͡z": [19, 15],
+
+                # ca-ES
+                "a": [2], "ɔ": [3], "ə": [1], "e": [4], "ɛ": [4], "i": [6], "o": [8], "u": [7], "b": [21], "β": [21], "t͡ʃ": [19, 16], 
+                "d": [19], "ð": [17], "f": [18], "g": [20], "ɣ": [20], "j": [6], "d͡ʒ": [19, 16], "k": [20], "l": [14], "ʎ": [14], 
+                "m": [21], "n": [19], "ŋ": [20], "ɲ": [19], "p": [21], "ɾ": [19], "r": [13], "s": [15], "ʃ": [16], "t": [19], 
+                "θ": [19], "w": [7], "x": [12], "z": [15], "ʒ": [16],
+
+                # cs-CZ
+                "ɪ": [6], "ɛ": [4], "a": [2], "o": [8], "u": [7], "iː": [6], "ɛː": [4], "aː": [2], "oː": [8], "uː": [7], 
+                "o͡ʊ̯": [8, 4], "a͡ʊ": [2, 4], "ɛ͡ʊ̯": [4, 4], "ə": [1], "p": [21], "b": [21], "t": [19], "d": [19], "c": [16], 
+                "ɟ": [16], "k": [20], "g": [20], "t͡s": [19, 15], "d͡z": [19, 15], "t͡ʃ": [19, 16], "d͡ʒ": [19, 16], "f": [18], 
+                "v": [18], "s": [15], "z": [15], "r̝": [13], "ʃ": [16], "ʒ": [16], "j": [6], "x": [12], "ɦ": [12], "r": [13], 
+                "l": [14], "m": [21], "n": [19], "ŋ": [20], "ɲ": [19], "ɱ": [21], "r̝̊": [13],
+
+                # da-DK
+                "a": [2], "ɑ": [2], "ɑː": [2], "ɛ": [4], "ɛː": [4], "ɔ": [3], "ɒ": [2], "ɒː": [2], "ɔː": [3], "ɐ": [4], 
+                "æː": [1], "e": [4], "ø": [1], "øː": [1], "ə": [1], "eː": [4], "i": [6], "iː": [6], "o": [8], "œ": [4], 
+                "œː": [4], "oː": [8], "u": [7], "uː": [7], "y": [4], "yː": [4], "b": [21], "d": [19], "ð": [17], "f": [18], 
+                "g": [20], "h": [12], "j": [6], "kʰ": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "pʰ": [21], "ʔ": [19], 
+                "ʁ": [13], "ɐ̯": [4], "s": [15], "ɕ": [16], "t": [19], "v": [18], "w": [7],
+
+                # de-DE/de-CH/de-AT
+                "aː": [2], "a": [2], "ɔ": [3], "ɛː": [4], "ɛ": [4], "ə": [1], "iː": [6], "ɪ": [6], "øː": [1], "o": [8], 
+                "oː": [8], "œ": [4], "e": [4], "eː": [4], "uː": [7], "ʊ": [4], "yː": [4], "ʏ": [7], "ai": [2, 6], "au": [2, 7], 
+                "ɔy": [3, 4], "ɔʏ̯": [3, 4], "ɐ": [4], "b": [21], "d": [19], "ʤ": [16], "f": [18], "g": [20], "h": [12], 
+                "j": [6], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "pf": [21, 18], "ʀ": [13], "r": [13], 
+                "ʁ": [13], "s": [15], "ʃ": [16], "t": [19], "ts": [19, 15], "tʃ": [19, 16], "v": [18], "x": [12], "z": [15], 
+                "ʒ": [16], "ʔ": [19],
+
+                # el-GR
+                "a": [2], "e": [4], "i": [6], "o": [8], "u": [7], "b": [21], "c": [16], "ç": [12], "d": [19], "ð": [17], 
+                "d͡z": [19, 15], "f": [18], "g": [20], "ɣ": [20], "ɟ": [16], "j": [6], "ʝ": [12], "k": [20], "l": [14], 
+                "m": [21], "n": [19], "p": [21], "ɾ": [19], "s": [15], "t": [19], "θ": [19], "t͡s": [19, 15], "v": [18], 
+                "x": [12], "z": [15],
+
+                # en-GB/en-IE/en-AU
+                "ɑː": [2], "æ": [1], "ʌ": [1], "ɛə": [4, 1], "aʊ": [2, 4], "ə": [1], "aɪ": [2, 6], "ɛ": [4], "ɜː": [5], 
+                "eɪ": [4, 6], "ɪ": [6], "ɪə": [6, 1], "iː": [6], "ɒ": [2], "ɔː": [3], "əʊ": [1, 4], "ɔɪ": [3, 6], "ʊ": [4], 
+                "ʊə": [4, 1], "uː": [7], "b": [21], "tʃ": [19, 16], "d": [19], "ð": [17], "f": [18], "g": [20], "h": [12], 
+                "j": [6], "dʒ": [19, 16], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "ɹ": [13], 
+                "s": [15], "ʃ": [16], "t": [19], "θ": [19], "v": [18], "w": [7], "z": [15], "ʒ": [16],
+
+                # en-US/en-CA
+                "iy": [6], "ɪ": [6], "eɪ": [4, 6], "ɛ": [4], "æ": [1], "ɑ": [2], "ɔ": [3], "ʊ": [4], "oʊ": [8, 4], "u": [7], 
+                "ʌ": [1], "aɪ": [11], "aʊ": [9], "ɔɪ": [10], "ju": [6, 7], "ə": [1], "ɪɹ": [6, 13], "ɛɹ": [4, 13], "ʊɹ": [4, 13], 
+                "aɪɹ": [11, 13], "aʊɹ": [9, 13], "ɔɹ": [3, 13], "ɑɹ": [2, 13], "ɝ": [5], "ɚ": [1], "w": [7], "j": [6], 
+                "p": [21], "b": [21], "t": [19], "d": [19], "k": [20], "g": [20], "m": [21], "n": [19], "ŋ": [20], "f": [18], 
+                "v": [18], "θ": [19], "ð": [17], "s": [15], "z": [15], "ʃ": [16], "ʒ": [16], "h": [12], "tʃ": [19, 16], 
+                "dʒ": [19, 16], "l": [14], "ɹ": [13],
+
+                # es-ES
+                "a": [2], "i": [6], "e": [4], "o": [8], "u": [7], "b": [21], "β": [21], "t͡ʃ": [19, 16], "d": [19], "ð": [17], 
+                "f": [18], "g": [20], "ɣ": [20], "j": [6], "j͡j": [6, 6], "k": [20], "l": [14], "ʎ": [14], "m": [21], "n": [19], 
+                "ɲ": [19], "p": [21], "ɾ": [19], "r": [13], "s": [15], "t": [19], "θ": [19], "w": [7], "x": [12], "z": [15], "ʒ": [16],
+
+                # es-MX
+                "ɑ": [2], "e": [4], "i": [6], "o": [8], "u": [7], "b": [21], "β": [21], "t͡ʃ": [19, 16], "d": [19], "ð": [17], 
+                "f": [18], "g": [20], "ɣ": [20], "j": [6], "j͡j": [6, 6], "k": [20], "l": [14], "ʎ": [14], "m": [21], "n": [19], 
+                "ɲ": [19], "p": [21], "ɾ": [19], "r": [13], "s": [15], "t": [19], "θ": [19], "w": [7], "x": [12],
+
+                # fi-FI
+                "ɑ": [2], "ɑ͡i": [2, 6], "ɑ͡u": [2, 7], "ɑː": [2], "æ": [1], "æ͡i": [1, 6], "æ͡y": [1, 4], "æː": [1], "e": [4], 
+                "e͡i": [4, 6], "ø": [1], "ø͡i": [1, 6], "ø͡y": [1, 4], "øː": [1], "e͡u": [4, 7], "e͡y": [4, 4], "eː": [4], "i": [6], 
+                "i͡e": [6, 4], "i͡u": [6, 7], "i͡y": [6, 4], "iː": [6], "o": [8], "o͡i": [8, 6], "o͡u": [8, 7], "oː": [8], "u": [7], 
+                "u͡i": [7, 6], "u͡o": [7, 8], "uː": [7], "y": [4], "y͡ø": [4, 1], "y͡i": [4, 6], "yː": [4], "b": [21], "d": [19], 
+                "f": [18], "g": [20], "h": [12], "j": [6], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "r": [13], 
+                "s": [15], "ʃ": [16], "t": [19], "ʋ": [18],
+
+                # fr-FR/fr-CA/fr-CH
+                "a": [2], "ɑ": [2], "ɑ̃": [2], "ə": [1], "ɛ": [4], "ø": [1], "e": [4], "ɛ̃": [4], "i": [6], "œ": [4], "ɔ": [3], 
+                "ɔ̃": [3], "o": [8], "œ̃": [4], "u": [7], "y": [4], "b": [21], "d": [19], "f": [18], "g": [20], "ɲ": [19], 
+                "ɥ": [7], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "ʁ": [13], "s": [15], "ʃ": [16], 
+                "t": [19], "v": [18], "w": [7], "j": [6], "z": [15], "n‿": [19], "t‿": [19], "z‿": [15],
+
+                # he-IL
+                "i": [6], "e": [4], "a": [2], "o": [8], "u": [7], "p": [21], "b": [21], "t": [19], "d": [19], "k": [20], "g": [20], 
+                "ʔ": [19], "f": [18], "v": [18], "s": [15], "z": [15], "ʃ": [16], "x": [12], "h": [12], "t͡s": [19, 15], "m": [21], 
+                "n": [19], "l": [14], "ʁ": [13], "j": [6], "ʒ": [16], "tʃ": [19, 16], "dʒ": [19, 16],
+
+                # hr-HR
+                "e": [4], "eː": [4], "i": [6], "iː": [6], "u": [7], "uː": [7], "a": [2], "aː": [2], "o": [8], "oː": [8], 
+                "d": [19], "v": [18], "s": [15], "t": [19], "n": [19], "l": [14], "ʎ": [14], "t͡s": [19, 15], "t͡ʃ": [19, 16], 
+                "j": [6], "x": [12], "z": [15], "ʒ": [16], "r": [13], "k": [20], "m": [21], "p": [21], "g": [20], "ʨ": [16], 
+                "f": [18], "b": [21], "d͡ʒ": [19, 16], "ɲ": [19], "ʥ": [16], "ʃ": [16],
+
+                # hu-HU
+                "ø": [1], "øː": [1], "a": [2], "aː": [2], "ɛ": [4], "eː": [4], "i": [6], "iː": [6], "o": [8], "ɒ": [2], 
+                "oː": [8], "u": [7], "uː": [7], "y": [4], "yː": [4], "b": [21], "bː": [21], "d": [19], "ɟ": [16], "dː": [19], 
+                "ɟː": [16], "d͡ʒ": [19, 16], "d͡ʒː": [19, 16], "dz": [19, 15], "dzː": [19, 15], "f": [18], "fː": [18], 
+                "g": [20], "gː": [20], "h": [12], "hː": [12], "j": [6], "ɲ": [19], "jː": [6], "ɲː": [19], "k": [20], 
+                "kː": [20], "l": [14], "lː": [14], "m": [21], "mː": [21], "n": [19], "ŋ": [20], "nː": [19], "p": [21], 
+                "pː": [21], "r": [13], "rː": [13], "s": [15], "ʃ": [16], "sː": [15], "ʃː": [16], "t": [19], "c": [16], 
+                "tː": [19], "cː": [16], "t͡s": [19, 15], "t͡ʃ": [19, 16], "t͡sː": [19, 15], "t͡ʃː": [19, 16], "v": [18], 
+                "vː": [18], "x": [12], "ɰ": [20], "z": [15], "ʒ": [16], "zː": [15], "ʒː": [16],
+
+                # id-ID
+                "ə": [1], "a": [2], "a͡i": [2, 6], "a͡ʊ": [2, 4], "e": [4], "ɛ": [4], "ɪ": [6], "i": [6], "ɔ": [3], "o": [8], 
+                "ɔ͡i": [3, 6], "u": [7], "ʊ": [4], "ʔ": [19], "b": [21], "d": [19], "d͡ʒ": [19, 16], "f": [18], "g": [20], 
+                "h": [12], "ɲ": [19], "j": [6], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "r": [13], 
+                "s": [15], "ʃ": [16], "t": [19], "t͡ʃ": [19, 16], "w": [7], "x": [12], "z": [15],
+
+                # it-IT
+                "a": [2], "ai": [2, 6], "au": [2, 7], "e": [4], "ɛ": [4], "ɛj": [4, 6], "ɛu": [4, 7], "ei": [4, 6], "eu": [4, 7], 
+                "i": [6], "u": [7], "o": [8], "ɔ": [3], "ɔj": [3, 6], "oi": [8, 6], "ou": [8, 7], "b": [21], "bː": [21], 
+                "ʧ": [16], "tʃː": [19, 16], "kː": [20], "d": [19], "dː": [19], "ʣ": [15], "ʣː": [15], "f": [18], "fː": [18], 
+                "ʤ": [16], "ʤː": [16], "g": [20], "gː": [20], "ʎ": [14], "ʎː": [14], "ɲː": [19], "ɲ": [19], "j": [6], "k": [20], 
+                "l": [14], "lː": [14], "m": [21], "mː": [21], "n": [19], "nː": [19], "p": [21], "pː": [21], "ɾ": [19], 
+                "rː": [13], "s": [15], "sː": [15], "ʃ": [16], "ʃː": [16], "t": [19], "tː": [19], "ʦ": [15], "ʦː": [15], 
+                "v": [18], "vː": [18], "w": [7], "z": [15],
+
+                # ko-KR
+                "a": [2], "ɛ": [4], "e": [4], "ɯ": [6], "i": [6], "ʌ": [1], "o": [8], "u": [7], "ɰ͡i": [20, 6], "ø": [1], 
+                "w͡a": [7, 2], "w͡ɛ": [7, 4], "w͡e": [7, 4], "w͡i": [7, 6], "w͡ʌ": [7, 1], "j͡a": [6, 2], "j͡ɛ": [6, 4], 
+                "j͡e": [6, 4], "j͡ʌ": [6, 1], "j͡o": [6, 8], "j͡u": [6, 7], "b̥": [21], "p": [21], "b": [21], "t͡ɕʰ": [19, 16], 
+                "d̥": [19], "t": [19], "d": [19], "g̥": [20], "k": [20], "g": [20], "h": [12], "ɦ": [12], "d͡ʑ": [19, 16], 
+                "d͡ʑ̥": [19, 16], "t͡ɕ": [19, 16], "kʰ": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "pʰ": [21], 
+                "ɾ": [19], "sʰ": [15], "s": [15], "tʰ": [19],
+
+                # ms-MY
+                "i": [6], "u": [7], "ə": [1], "e": [4], "o": [8], "a": [2], "a͡i": [2, 6], "au": [2, 7], "oi": [8, 6], 
+                "p": [21], "b": [21], "t": [19], "d": [19], "k": [20], "g": [20], "ʔ": [19], "t͡ʃ": [19, 16], "d͡ʒ": [19, 16], 
+                "m": [21], "n": [19], "ɲ": [19], "ŋ": [20], "f": [18], "v": [18], "s": [15], "z": [15], "ʃ": [16], "x": [12], 
+                "r": [13], "h": [12], "j": [6], "w": [7], "l": [14],
+
+                # nb-NO
+                "ɑ": [2], "æ": [1], "æː": [1], "ɑː": [2], "ɛ": [4], "øː": [1], "eː": [4], "ɪ": [6], "iː": [6], "ɔ": [3], 
+                "œ": [4], "oː": [8], "u": [7], "uː": [7], "ʏ": [7], "ʉ": [6], "ʉː": [6], "yː": [4], "æɪ": [1, 6], 
+                "æʉ": [1, 6], "ɑɪ": [2, 6], "œʏ": [4, 7], "ɔʏ": [3, 7], "ʉɪ": [6, 6], "p": [21], "t": [19], "k": [20], 
+                "b": [21], "d": [19], "g": [20], "f": [18], "h": [12], "s": [15], "ʂ": [15], "ç": [12], "v": [18], "m": [21], 
+                "n": [19], "ŋ": [20], "l": [14], "r": [13], "j": [6], "ɖ": [19], "ɭ": [14], "ɳ": [19], "ʈ": [19],
+
+                # nl-NL/nl-BE
+                "ɑ": [2], "aː": [2], "ɑ̃": [2], "ɑ͡u": [2, 7], "ɛ": [4], "eː": [4], "ɛː": [4], "ɛ͡i": [4, 6], "ɛ̃": [4], 
+                "øː": [1], "ɪ": [6], "i": [6], "ɔ": [3], "u": [7], "ɔː": [3], "ɔ̃": [3], "oː": [8], "ʏ": [7], "ə": [1], 
+                "œ͡y": [4, 4], "œ": [4], "y": [4], "b": [21], "d": [19], "f": [18], "χ": [12], "ʔ": [19], "ɦ": [12], 
+                "g": [20], "j": [6], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "ʀ": [13], "s": [15], 
+                "ʃ": [16], "t": [19], "w": [7], "v": [18], "ʋ": [18], "z": [15], "ʒ": [16],
+
+                # pl-PL
+                "a": [2], "ɛ": [4], "ɛ̃": [4], "i": [6], "ɨ": [6], "ɔ": [3], "ɔ̃": [3], "u": [7], "b": [21], "bʲ": [21], 
+                "t͡ɕ": [19, 16], "t͡ʂ": [19, 15], "c": [16], "d": [19], "d̪ʲ": [19], "d͡z": [19, 15], "d͡ʑ": [19, 16], 
+                "f": [18], "fʲ": [18], "ɡ": [20], "ɟ": [16], "d͡ʐ": [19, 15], "k": [20], "l": [14], "l̪ʲ": [14], "m": [21], 
+                "mʲ": [21], "n": [19], "ŋ": [20], "ɲ": [19], "p": [21], "pʲ": [21], "r": [13], "rʲ": [13], "s": [15], 
+                "ɕ": [16], "ʃ": [16], "t": [19], "t̪ʲ": [19], "t͡s": [19, 15], "v": [18], "vʲ": [18], "w": [7], "x": [12], 
+                "xʲ": [12], "j": [6], "z": [15], "ʑ": [16], "ʒ": [16],
+
+                # pt-BR
+                "i": [6], "ĩ": [6], "a": [2], "ɔ": [3], "u": [7], "ũ": [7], "o": [8], "e": [4], "ɐ̃": [4], "ə": [1], 
+                "ɛ": [4], "ẽ": [4], "õ": [8], "w̃": [7], "w": [7], "p": [21], "b": [21], "t": [19], "d": [19], "g": [20], 
+                "m": [21], "n": [19], "ɲ": [19], "f": [18], "v": [18], "ɾ": [19], "s": [15], "z": [15], "ʃ": [16], "ʒ": [16], 
+                "x": [12], "tʃ": [19, 16], "dʒ": [19, 16], "l": [14], "ʎ": [14], "j̃": [6], "j": [6], "k": [20],
+
+                # pt-PT
+                "a": [2], "ɐ": [4], "ɐj": [4, 6], "ɐ̃": [4], "ɐ̃j̃": [4, 6], "ɐ̃w̃": [4, 7], "ɐ͡w": [4, 7], "a͡j": [2, 6], 
+                "ɔ": [3], "ɔ͡j": [3, 6], "a͡w": [2, 7], "ə": [1], "e": [4], "ɛ": [4], "ɛ͡w": [4, 7], "ẽ": [4], "e͡w": [4, 7], 
+                "i": [6], "ĩ": [6], "i͡w": [6, 7], "o": [8], "o͡j": [8, 6], "õ": [8], "õj̃": [8, 6], "u": [7], "u͡j": [7, 6], 
+                "ũ": [7], "ũj̃": [7, 6], "b": [21], "d": [19], "ɾ": [19], "f": [18], "g": [20], "j": [6], "k": [20], "l": [14], 
+                "ɫ": [14], "ʎ": [14], "m": [21], "n": [19], "ɲ": [19], "p": [21], "ʀ": [13], "s": [15], "ʃ": [16], "t": [19], 
+                "v": [18], "w": [7], "z": [15], "ʒ": [16],
+
+                # ro-RO
+                "ə": [1], "ɨ": [6], "a": [2], "e": [4], "e̯a": [4, 2], "e̯o": [4, 8], "i": [6], "o": [8], "o̯a": [8, 2], 
+                "u": [7], "b": [21], "bʲ": [21], "d": [19], "d͡ʒ": [19, 16], "d͡ʒʲ": [19, 16], "f": [18], "fʲ": [18], "g": [20], 
+                "gʲ": [20], "h": [12], "j": [6], "k": [20], "kʲ": [20], "l": [14], "lʲ": [14], "m": [21], "mʲ": [21], "n": [19], 
+                "ŋ": [20], "nʲ": [19], "p": [21], "pʲ": [21], "r": [13], "rʲ": [13], "s": [15], "ʃ": [16], "ʃʲ": [16], "t": [19], 
+                "tʲ": [19], "t͡s": [19, 15], "t͡ʃ": [19, 16], "t͡sʲ": [19, 15], "t͡ʃʲ": [19, 16], "v": [18], "vʲ": [18], "w": [7], 
+                "z": [15], "ʒ": [16], "zʲ": [15], "ʒʲ": [16],
+
+                # ru-RU
+                "a": [2], "ʌ": [1], "ə": [1], "ɛ": [4], "i": [6], "ɪ": [6], "ɨ": [6], "ɔ": [3], "u": [7], "p": [21], "pʲ": [21], 
+                "b": [21], "bʲ": [21], "t": [19], "tʲ": [19], "d": [19], "dʲ": [19], "k": [20], "kʲ": [20], "g": [20], 
+                "gʲ": [20], "x": [12], "xʲ": [12], "f": [18], "fʲ": [18], "v": [18], "vʲ": [18], "s": [15], "sʲ": [15], 
+                "z": [15], "zʲ": [15], "ʂ": [15], "ʐ": [15], "t͡s": [19, 15], "t͡ɕ": [19, 16], "ɕː": [16], "m": [21], 
+                "mʲ": [21], "n": [19], "nʲ": [19], "l": [14], "lʲ": [14], "r": [13], "rʲ": [13], "j": [6],
+
+                # sk-SK
+                "i": [6], "e": [4], "a": [2], "o": [8], "u": [7], "ʉ": [6], "iː": [6], "eː": [4], "aː": [2], "oː": [8], 
+                "uː": [7], "i͡a": [6, 2], "i͡e": [6, 4], "i͡u": [6, 7], "u͡o": [7, 8], "au": [2, 7], "ou": [8, 7], "ə": [1], 
+                "p": [21], "b": [21], "t": [19], "d": [19], "c": [16], "ɟ": [16], "k": [20], "g": [20], "t͡s": [19, 15], 
+                "d͡z": [19, 15], "t͡ʃ": [19, 16], "d͡ʒ": [19, 16], "f": [18], "v": [18], "s": [15], "z": [15], "ʃ": [16], 
+                "ʒ": [16], "x": [12], "ɦ": [12], "r": [13], "r̩": [13], "r̩ː": [13], "l": [14], "l̩": [14], "l̩ː": [14], 
+                "ʎ": [14], "m": [21], "ɱ": [21], "n": [19], "ɴ": [19], "ŋ": [20], "ɲ": [19], "u̯": [7], "i̯": [6], "j": [6], 
+                "w": [7],
+
+                # sl-SI
+                "ə": [1], "a": [2], "aː": [2], "ɛ": [4], "eː": [4], "ɛː": [4], "i": [6], "iː": [6], "ɔ": [3], "ɔː": [3], 
+                "oː": [8], "u": [7], "uː": [7], "b": [21], "d": [19], "dˡ": [19], "dn": [19, 19], "d͡ʒ": [19, 16], 
+                "d͡z": [19, 15], "f": [18], "ɱ": [21], "ɣ": [20], "g": [20], "ɪ": [6], "j": [6], "k": [20], "l": [14], "lʲ": [14], 
+                "m": [21], "ŋ": [20], "n": [19], "nʲ": [19], "p": [21], "r": [13], "s": [15], "ʃ": [16], "t": [19], "tˡ": [19], 
+                "tn": [19, 19], "t͡ʃ": [19, 16], "t͡s": [19, 15], "u̯": [7], "v": [18], "w": [7], "ʍ": [7], "x": [12], "ʒ": [16], 
+                "z": [15],
+
+                # sv-SE
+                "a": [2], "æ": [1], "æː": [1], "ɑː": [2], "ɔ": [3], "a‿u": [2, 7], "ə": [1], "e": [4], "ɛ": [4], "ɛː": [4], 
+                "eː": [4], "ɶ": [8], "œː": [4], "œ": [4], "øː": [1], "ɪ": [6], "iː": [6], "ʊ": [4], "uː": [7], "oː": [8], 
+                "ɵ": [1], "ʉː": [6], "y": [4], "yː": [4], "p": [21], "t": [19], "k": [20], "b": [21], "d": [19], "g": [20], 
+                "f": [18], "h": [12], "s": [15], "ɧ": [16], "ɕ": [16], "v": [18], "m": [21], "n": [19], "ŋ": [20], "l": [14], 
+                "r": [13], "j": [6], "ɖ": [19], "ɭ": [14], "ɳ": [19], "ʂ": [15], "ʈ": [19],
+
+                # th-TH
+                "a": [2], "aː": [2], "e": [4], "eː": [4], "i": [6], "iː": [6], "ia": [6, 2], "o": [8], "oː": [8], "ə": [1], 
+                "əː": [1], "u": [7], "uː": [7], "ua": [7, 2], "ɯ": [6], "ɯː": [6], "ɯa": [6, 2], "ɛ": [4], "ɛː": [4], 
+                "ɔ": [3], "ɔː": [3], "b": [21], "t͡ɕ": [19, 16], "tɕʰ": [19, 16], "d": [19], "f": [18], "h": [12], "j": [6], 
+                "k": [20], "kʰ": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "pʰ": [21], "r": [13], 
+                "s": [15], "t": [19], "tʰ": [19], "w": [7], "ʔ": [19],
+
+                # tr-TR
+                "a": [2], "ɑː": [2], "e": [4], "eː": [4], "œ": [4], "œ͡ɟ": [4, 16], "i": [6], "i͡ɟ": [6, 16], "o": [8], 
+                "o͡ɟ": [8, 16], "u": [7], "u͡ɟ": [7, 16], "ɯ": [6], "ɯ͡ɟ": [6, 16], "y": [4], "y͡ɟ": [4, 16], "b": [21], 
+                "c": [16], "t͡ʃ": [19, 16], "d": [19], "f": [18], "ɡ": [20], "ɣ": [20], "ɟ": [16], "h": [12], "j": [6], 
+                "d͡ʒ": [19, 16], "k": [20], "l": [14], "ɮ": [6], "m": [21], "n": [19], "ŋ": [20], "p": [21], "ɾ": [19], 
+                "s": [15], "ʃ": [16], "t": [19], "v": [18], "w": [7], "z": [15], "ʒ": [16],
+
+                # vi-VN
+                "a": [2], "ɛ": [4], "i": [6], "ɔ": [3], "u": [7], "u͡a": [7, 2], "a͡j": [2, 6], "ɛ̆j": [4, 6], "ə͡j": [1, 6], 
+                "o": [8], "i͡e͡w": [6, 4, 7], "ɨ͡ə": [6, 1], "ɔ͡i": [3, 6], "ə": [1], "ie": [6, 4], "u͡j": [7, 6], "a͡w": [2, 7], 
+                "ɨ": [6], "ɐ": [4], "ăw": [2, 7], "ăj": [2, 6], "ɨ͡ə͡j": [6, 1, 6], "o͡j": [8, 6], "əː": [1], "e": [4], 
+                "ɔ̆w": [3, 7], "ɛ͡w": [4, 7], "i͡w": [6, 7], "ɨ͡w": [6, 7], "e͡j": [4, 6], "ɨ͡ʌ͡w": [6, 1, 7], "ɨ͡j": [6, 6], 
+                "ɪ": [6], "iə": [6, 1], "a͡ʲ": [2], "ɓ": [21], "k": [20], "z": [15], "j": [6], "ɹ": [13], "f": [18], "ɣ": [20], 
+                "h": [12], "l": [14], "m": [21], "n": [19], "p": [21], "s": [15], "ʂ": [15], "t": [19], "v": [18], "ɗ": [19], 
+                "ŋ": [20], "x": [12], "ɲ": [19], "tʰ": [19], "ʈ": [19], "t͡ʃ": [19, 16], "w": [7]
+            }
+            return phoneme_viseme_map.get(phoneme, [])
+
+        # Flatten the chunks: each viseme ID of a chunk becomes its own entry and
+        # carries that chunk's timestamp; chunks whose text is not in the map
+        # contribute nothing.
+        mapped = []
+
+        for entry in data.get('chunks', []):
+            symbol = entry.get('text', None)
+            stamp = entry.get('timestamp', None)
+
+            mapped.extend(
+                {'viseme': viseme_id, 'timestamp': stamp}
+                for viseme_id in _phoneme_to_viseme(symbol)
+            )
+
+        return mapped
+
+
+    def process(self, audio_file: str) -> List[Dict[str, Any]]:
+        """
+        Run the ASR pipeline on `audio_file` and return the resulting timestamped visemes.
+
+        The pipeline is queried with character-level timestamps; each returned chunk is then
+        looked up as a phoneme by `_map_phonemes_to_visemes`.
+        """
+        return self._map_phonemes_to_visemes(self.asr_pipeline(audio_file, return_timestamps='char'))
+    
\ No newline at end of file
diff --git a/TTS/chatTTS_handler.py b/TTS/chatTTS_handler.py
index 6bdc6bf..1cee897 100644
--- a/TTS/chatTTS_handler.py
+++ b/TTS/chatTTS_handler.py
@@ -5,6 +5,7 @@
 import numpy as np
 from rich.console import Console
 import torch
+from .STV.speech_to_visemes import SpeechToVisemes
 
 logging.basicConfig(
     format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
@@ -22,6 +23,7 @@ def setup(
         gen_kwargs={},  # Unused
         stream=True,
         chunk_size=512,
+        viseme_flag = True
     ):
         self.should_listen = should_listen
         self.device = device
@@ -33,6 +35,9 @@ def setup(
         self.params_infer_code = ChatTTS.Chat.InferCodeParams(
             spk_emb=rnd_spk_emb,
         )
+        self.viseme_flag = viseme_flag
+        if self.viseme_flag:
+            self.speech_to_visemes = SpeechToVisemes()
         self.warmup()
 
     def warmup(self):
@@ -61,22 +66,65 @@ def process(self, llm_sentence):
                 if gen[0] is None or len(gen[0]) == 0:
                     self.should_listen.set()
                     return
+                
+                # Resample the audio to 16000 Hz
                 audio_chunk = librosa.resample(gen[0], orig_sr=24000, target_sr=16000)
-                audio_chunk = (audio_chunk * 32768).astype(np.int16)[0]
-                while len(audio_chunk) > self.chunk_size:
-                    yield audio_chunk[: self.chunk_size]  # 返回前 chunk_size 字节的数据
-                    audio_chunk = audio_chunk[self.chunk_size :]  # 移除已返回的数据
-                yield np.pad(audio_chunk, (0, self.chunk_size - len(audio_chunk)))
+                # Ensure the audio is converted to mono (single channel)
+                if len(audio_chunk.shape) > 1:
+                    audio_chunk = librosa.to_mono(audio_chunk)
+                audio_chunk = (audio_chunk * 32768).astype(np.int16)
+                
+                # Process visemes if viseme_flag is set
+                if self.viseme_flag:
+                    visemes = self.speech_to_visemes.process(audio_chunk)
+                    for viseme in visemes:
+                        console.print(f"[blue]ASSISTANT_MOUTH_SHAPE: {viseme['viseme']} -- {viseme['timestamp']}")
+                else:
+                    visemes = None
+                
+                # Loop through audio chunks, yielding dict for each chunk
+                for i in range(0, len(audio_chunk), self.chunk_size):
+                    chunk_data = {
+                        "audio": np.pad(
+                            audio_chunk[i : i + self.chunk_size],
+                            (0, self.chunk_size - len(audio_chunk[i : i + self.chunk_size])),
+                        )
+                    }
+                    # Include text and visemes for the first chunk
+                    if i == 0:
+                        chunk_data["text"] = llm_sentence  # the sentence passed to process()
+                        chunk_data["visemes"] = visemes
+                
+                    yield chunk_data
         else:
             wavs = wavs_gen
             if len(wavs[0]) == 0:
                 self.should_listen.set()
                 return
             audio_chunk = librosa.resample(wavs[0], orig_sr=24000, target_sr=16000)
+            # Ensure the audio is converted to mono (single channel)
+            if len(audio_chunk.shape) > 1:
+                audio_chunk = librosa.to_mono(audio_chunk)
             audio_chunk = (audio_chunk * 32768).astype(np.int16)
+
+            if self.viseme_flag:
+                visemes = self.speech_to_visemes.process(audio_chunk)
+                for viseme in visemes:
+                    console.print(f"[blue]ASSISTANT_MOUTH_SHAPE: {viseme['viseme']} -- {viseme['timestamp']}")
+            else:
+                visemes = None
+
             for i in range(0, len(audio_chunk), self.chunk_size):
-                yield np.pad(
-                    audio_chunk[i : i + self.chunk_size],
-                    (0, self.chunk_size - len(audio_chunk[i : i + self.chunk_size])),
-                )
+                chunk_data = {
+                    "audio": np.pad(
+                        audio_chunk[i : i + self.chunk_size],
+                        (0, self.chunk_size - len(audio_chunk[i : i + self.chunk_size])),
+                    )
+                }
+                # For the first chunk, include text and visemes
+                if i == 0:
+                    chunk_data["text"] = llm_sentence
+                    chunk_data["visemes"] = visemes            
+                yield chunk_data
+
         self.should_listen.set()
diff --git a/TTS/melo_handler.py b/TTS/melo_handler.py
index b1b2226..fc33730 100644
--- a/TTS/melo_handler.py
+++ b/TTS/melo_handler.py
@@ -6,6 +6,8 @@
 from rich.console import Console
 import torch
 
+from .STV.speech_to_visemes import SpeechToVisemes
+
 logger = logging.getLogger(__name__)
 
 console = Console()
@@ -28,7 +30,6 @@
     "ko": "KR",
 }
 
-
 class MeloTTSHandler(BaseHandler):
     def setup(
         self,
@@ -38,6 +39,7 @@ def setup(
         speaker_to_id="en",
         gen_kwargs={},  # Unused
         blocksize=512,
+        viseme_flag = True # To obtain timestamped visemes
     ):
         self.should_listen = should_listen
         self.device = device
@@ -49,6 +51,11 @@ def setup(
             WHISPER_LANGUAGE_TO_MELO_SPEAKER[speaker_to_id]
         ]
         self.blocksize = blocksize
+
+        self.viseme_flag = viseme_flag
+        if self.viseme_flag:
+            self.speech_to_visemes = SpeechToVisemes()
+
         self.warmup()
 
     def warmup(self):
@@ -100,10 +107,25 @@ def process(self, llm_sentence):
             return
         audio_chunk = librosa.resample(audio_chunk, orig_sr=44100, target_sr=16000)
         audio_chunk = (audio_chunk * 32768).astype(np.int16)
+
+        if self.viseme_flag:
+            visemes = self.speech_to_visemes.process(audio_chunk)
+            for viseme in visemes:
+                console.print(f"[blue]ASSISTANT_MOUTH_SHAPE: {viseme['viseme']} -- {viseme['timestamp']}")
+        else:
+            visemes = None
+
         for i in range(0, len(audio_chunk), self.blocksize):
-            yield np.pad(
-                audio_chunk[i : i + self.blocksize],
-                (0, self.blocksize - len(audio_chunk[i : i + self.blocksize])),
-            )
+            chunk_data = {
+                "audio": np.pad(
+                    audio_chunk[i : i + self.blocksize],
+                    (0, self.blocksize - len(audio_chunk[i : i + self.blocksize]))
+                )
+            }
+            # For the first chunk, include text and visemes
+            if i == 0:
+                chunk_data["text"] = llm_sentence
+                chunk_data["visemes"] = visemes            
+            yield chunk_data
 
         self.should_listen.set()
diff --git a/TTS/parler_handler.py b/TTS/parler_handler.py
index 5cc0ce9..4c52d5b 100644
--- a/TTS/parler_handler.py
+++ b/TTS/parler_handler.py
@@ -14,6 +14,7 @@
 from transformers.utils.import_utils import (
     is_flash_attn_2_available,
 )
+from .STV.speech_to_visemes import SpeechToVisemes
 
 torch._inductor.config.fx_graph_cache = True
 # mind about this parameter ! should be >= 2 * number of padded prompt sizes for TTS
@@ -47,6 +48,7 @@ def setup(
         ),
         play_steps_s=1,
         blocksize=512,
+        viseme_flag = True
     ):
         self.should_listen = should_listen
         self.device = device
@@ -78,6 +80,10 @@ def setup(
                 self.model.forward, mode=self.compile_mode, fullgraph=True
             )
 
+        self.viseme_flag = viseme_flag
+        if self.viseme_flag:
+            self.speech_to_visemes = SpeechToVisemes()
+
         self.warmup()
 
     def prepare_model_inputs(
@@ -179,10 +185,25 @@ def process(self, llm_sentence):
                 )
             audio_chunk = librosa.resample(audio_chunk, orig_sr=44100, target_sr=16000)
             audio_chunk = (audio_chunk * 32768).astype(np.int16)
+
+            if self.viseme_flag:
+                visemes = self.speech_to_visemes.process(audio_chunk)
+                for viseme in visemes:
+                    console.print(f"[blue]ASSISTANT_MOUTH_SHAPE: {viseme['viseme']} -- {viseme['timestamp']}")
+            else:
+                visemes = None
+
             for i in range(0, len(audio_chunk), self.blocksize):
-                yield np.pad(
-                    audio_chunk[i : i + self.blocksize],
-                    (0, self.blocksize - len(audio_chunk[i : i + self.blocksize])),
-                )
+                chunk_data = {
+                    "audio": np.pad(
+                        audio_chunk[i : i + self.blocksize],
+                        (0, self.blocksize - len(audio_chunk[i : i + self.blocksize]))
+                    )
+                }
+                # For the first chunk, include text and visemes
+                if i == 0:
+                    chunk_data["text"] = llm_sentence
+                    chunk_data["visemes"] = visemes            
+                yield chunk_data
 
         self.should_listen.set()
diff --git a/arguments_classes/parler_tts_arguments.py b/arguments_classes/parler_tts_arguments.py
index 5159432..1bb0f21 100644
--- a/arguments_classes/parler_tts_arguments.py
+++ b/arguments_classes/parler_tts_arguments.py
@@ -36,7 +36,7 @@ class ParlerTTSHandlerArguments:
     tts_gen_max_new_tokens: int = field(
         default=512,
         metadata={
-            "help": "Maximum number of new tokens to generate in a single completion. Default is 256, which corresponds to ~6 secs"
+            "help": "Maximum number of new tokens to generate in a single completion. Default is 512, which corresponds to ~6 secs"
         },
     )
     description: str = field(
diff --git a/connections/local_audio_streamer.py b/connections/local_audio_streamer.py
index 389dcb8..d42fbe7 100644
--- a/connections/local_audio_streamer.py
+++ b/connections/local_audio_streamer.py
@@ -27,7 +27,18 @@ def callback(indata, outdata, frames, time, status):
                 self.input_queue.put(indata.copy())
                 outdata[:] = 0 * outdata
             else:
-                outdata[:] = self.output_queue.get()[:, np.newaxis]
+                data = self.output_queue.get()
+                """
+                # Check if text data is present and log it
+                if data.get('text') is not None:
+                    text = data['text']
+                    logger.info(f"Text: {text}")
+                # Check if viseme data is present and log it
+                if data.get('visemes') is not None:
+                    visemes = data['visemes']
+                    logger.info(f"Visemes: {visemes}")
+                """
+                outdata[:] = data['audio'][:, np.newaxis]
 
         logger.debug("Available devices:")
         logger.debug(sd.query_devices())
diff --git a/connections/socket_sender.py b/connections/socket_sender.py
index 11ed210..fb5c7cb 100644
--- a/connections/socket_sender.py
+++ b/connections/socket_sender.py
@@ -1,6 +1,8 @@
 import socket
 from rich.console import Console
 import logging
+import pickle
+import struct
 
 logger = logging.getLogger(__name__)
 
@@ -11,7 +13,6 @@ class SocketSender:
     """
     Handles sending generated audio packets to the clients.
     """
-
     def __init__(self, stop_event, queue_in, host="0.0.0.0", port=12346):
         self.stop_event = stop_event
         self.queue_in = queue_in
@@ -28,9 +29,31 @@ def run(self):
         logger.info("sender connected")
 
         while not self.stop_event.is_set():
-            audio_chunk = self.queue_in.get()
-            self.conn.sendall(audio_chunk)
-            if isinstance(audio_chunk, bytes) and audio_chunk == b"END":
-                break
+            data = self.queue_in.get()
+            packet = {}
+            if 'audio' in data and data['audio'] is not None:
+                audio_chunk = data['audio']
+                packet['audio'] = data['audio']
+            if 'text' in data and data['text'] is not None:
+                packet['text'] = data['text']
+            if 'visemes' in data and data['visemes'] is not None:
+                packet['visemes'] = data['visemes']
+
+            # Serialize the packet using pickle
+            serialized_packet = pickle.dumps(packet)
+
+            # Compute the length of the serialized packet
+            packet_length = len(serialized_packet)
+
+            # Send the packet length as a 4-byte unsigned integer in network byte order
+            self.conn.sendall(struct.pack('!I', packet_length))
+
+            # Send the serialized packet
+            self.conn.sendall(serialized_packet)
+
+            if 'audio' in data and data['audio'] is not None:
+                if isinstance(audio_chunk, bytes) and audio_chunk == b"END":
+                    break
+            
         self.conn.close()
         logger.info("Sender closed")
diff --git a/listen_and_play.py b/listen_and_play.py
index 35eabd6..2082a5e 100644
--- a/listen_and_play.py
+++ b/listen_and_play.py
@@ -4,15 +4,16 @@
 from dataclasses import dataclass, field
 import sounddevice as sd
 from transformers import HfArgumentParser
-
+import struct
+import pickle
 
 @dataclass
 class ListenAndPlayArguments:
     send_rate: int = field(default=16000, metadata={"help": "In Hz. Default is 16000."})
     recv_rate: int = field(default=16000, metadata={"help": "In Hz. Default is 16000."})
     list_play_chunk_size: int = field(
-        default=1024,
-        metadata={"help": "The size of data chunks (in bytes). Default is 1024."},
+        default=512,
+        metadata={"help": "The size of data chunks (in bytes). Default is 512."},
     )
     host: str = field(
         default="localhost",
@@ -33,7 +34,7 @@ class ListenAndPlayArguments:
 def listen_and_play(
     send_rate=16000,
     recv_rate=44100,
-    list_play_chunk_size=1024,
+    list_play_chunk_size=512,
     host="localhost",
     send_port=12345,
     recv_port=12346,
@@ -79,9 +80,29 @@ def receive_full_chunk(conn, chunk_size):
             return data
 
         while not stop_event.is_set():
-            data = receive_full_chunk(recv_socket, list_play_chunk_size * 2)
-            if data:
-                recv_queue.put(data)
+            # Step 1: Receive the first 4 bytes to get the packet length
+            length_data = receive_full_chunk(recv_socket, 4)
+            if not length_data:
+                continue  # No length header received (disconnection or no data yet); retry
+
+            # Step 2: Unpack the length (4 bytes)
+            packet_length = struct.unpack('!I', length_data)[0]
+
+            # Step 3: Receive the full packet based on the length
+            serialized_packet = receive_full_chunk(recv_socket, packet_length)
+            if serialized_packet:
+                # Step 4: Deserialize the packet using pickle
+                packet = pickle.loads(serialized_packet)
+                # Step 5: Extract the packet contents
+                if 'text' in packet:
+                    pass
+                    # print(packet['text'])
+                if 'visemes' in packet:
+                    pass
+                    # print(packet['visemes'])
+                
+                # Step 6: Put the packet's audio, as raw bytes, into the receive queue
+                recv_queue.put(packet['audio'].tobytes())
 
     try:
         send_stream = sd.RawInputStream(
@@ -123,4 +144,4 @@ def receive_full_chunk(conn, chunk_size):
 if __name__ == "__main__":
     parser = HfArgumentParser((ListenAndPlayArguments,))
     (listen_and_play_kwargs,) = parser.parse_args_into_dataclasses()
-    listen_and_play(**vars(listen_and_play_kwargs))
+    listen_and_play(**vars(listen_and_play_kwargs))
\ No newline at end of file

From 7ff873cf43816e1b6434657d06328fd4cae09cf3 Mon Sep 17 00:00:00 2001
From: fabiocat93 <fabio-cat93@hotmail.it>
Date: Sun, 6 Oct 2024 10:41:14 -0400
Subject: [PATCH 2/7] picking phoneme_viseme_map from json file

---
 .gitignore                                   |   3 +-
 TTS/STV/phoneme_viseme_map.json              |   1 +
 TTS/STV/phoneme_viseme_map_readable.json.txt | 241 ++++++++++++++++
 TTS/STV/speech_to_visemes.py                 | 288 ++-----------------
 4 files changed, 268 insertions(+), 265 deletions(-)
 create mode 100644 TTS/STV/phoneme_viseme_map.json
 create mode 100644 TTS/STV/phoneme_viseme_map_readable.json.txt

diff --git a/.gitignore b/.gitignore
index 33b7875..95dc6c6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 __pycache__
 tmp
-cache
\ No newline at end of file
+cache
+mlx_models/
\ No newline at end of file
diff --git a/TTS/STV/phoneme_viseme_map.json b/TTS/STV/phoneme_viseme_map.json
new file mode 100644
index 0000000..8c91531
--- /dev/null
+++ b/TTS/STV/phoneme_viseme_map.json
@@ -0,0 +1 @@
+{"æ":[1],"ə":[1],"ʌ":[1],"ɑ":[2],"ɔ":[3],"ɛ":[4],"ʊ":[4],"ɝ":[5],"j":[6],"i":[6],"ɪ":[6],"w":[7],"u":[7],"o":[8],"aʊ":[9],"ɔɪ":[10],"aɪ":[11],"h":[12],"ɹ":[13],"l":[14],"s":[15],"z":[15],"ʃ":[16],"tʃ":[19,16],"dʒ":[19,16],"ʒ":[16],"ð":[17],"f":[18],"v":[18],"d":[19],"t":[19],"n":[19],"θ":[19],"k":[20],"g":[20],"ŋ":[20],"p":[21],"b":[21],"m":[21]," ":[0],"a":[2],"aː":[2],"iː":[6],"uː":[7],"dˤ":[19],"q":[20],"tˤ":[19],"ʔ":[19],"ħ":[12],"ðˤ":[17],"ɣ":[20],"x":[12],"sˤ":[15],"r":[13],"ʕ":[12],"j͡a":[6,2],"ɤ":[1],"j͡u":[6,7],"t͡s":[19,15],"zʲ":[15],"lʲ":[14],"nʲ":[19],"d͡ʒ":[19,16],"mʲ":[21],"tʲ":[19],"rʲ":[13],"pʲ":[21],"dʲ":[19],"vʲ":[18],"sʲ":[15],"bʲ":[21],"kʲ":[20],"gʲ":[20],"fʲ":[18],"t͡ʃ":[19,16],"d͡z":[19,15],"e":[4],"β":[21],"ʎ":[14],"ɲ":[19],"ɾ":[19],"ɛː":[4],"oː":[8],"o͡ʊ̯":[8,4],"a͡ʊ":[2,4],"ɛ͡ʊ̯":[4,4],"c":[16],"ɟ":[16],"r̝":[13],"ɦ":[12],"ɱ":[21],"r̝̊":[13],"ɑː":[2],"ɒ":[2],"ɒː":[2],"ɔː":[3],"ɐ":[4],"æː":[1],"ø":[1],"øː":[1],"eː":[4],"œ":[4],"œː":[4],"y":[4],"yː":[4],"kʰ":[20],"pʰ":[21],"ʁ":[13],"ɐ̯":[4],"ɕ":[16],"ʏ":[7],"ai":[2,6],"au":[2,7],"ɔy":[3,4],"ɔʏ̯":[3,4],"ʤ":[16],"pf":[21,18],"ʀ":[13],"ts":[19,15],"ç":[12],"ʝ":[12],"ɛə":[4,1],"ɜː":[5],"eɪ":[4,6],"ɪə":[6,1],"əʊ":[1,4],"ʊə":[4,1],"iy":[6],"oʊ":[8,4],"ju":[6,7],"ɪɹ":[6,13],"ɛɹ":[4,13],"ʊɹ":[4,13],"aɪɹ":[11,13],"aʊɹ":[9,13],"ɔɹ":[3,13],"ɑɹ":[2,13],"ɚ":[1],"j͡j":[6,6],"ɑ͡i":[2,6],"ɑ͡u":[2,7],"æ͡i":[1,6],"æ͡y":[1,4],"e͡i":[4,6],"ø͡i":[1,6],"ø͡y":[1,4],"e͡u":[4,7],"e͡y":[4,4],"i͡e":[6,4],"i͡u":[6,7],"i͡y":[6,4],"o͡i":[8,6],"o͡u":[8,7],"u͡i":[7,6],"u͡o":[7,8],"y͡ø":[4,1],"y͡i":[4,6],"ʋ":[18],"ɑ̃":[2],"ɛ̃":[4],"ɔ̃":[3],"œ̃":[4],"ɥ":[7],"n‿":[19],"t‿":[19],"z‿":[15],"ʨ":[16],"ʥ":[16],"bː":[21],"dː":[19],"ɟː":[16],"d͡ʒː":[19,16],"dz":[19,15],"dzː":[19,15],"fː":[18],"gː":[20],"hː":[12],"jː":[6],"ɲː":[19],"kː":[20],"lː":[14],"mː":[21],"nː":[19],"pː":[21],"rː":[13],"sː":[15],"ʃː":[16],"tː":[19],"cː":[16],"t͡sː":[19,15],"t͡ʃː":[19,16],"vː":[18],"ɰ":[20],"zː":[15],"ʒː":[16],"a͡i":[2,6],"ɔ͡i":[3,6],"ɛj":[4,6],"ɛ
u":[4,7],"ei":[4,6],"eu":[4,7],"ɔj":[3,6],"oi":[8,6],"ou":[8,7],"ʧ":[16],"tʃː":[19,16],"ʣ":[15],"ʣː":[15],"ʤː":[16],"ʎː":[14],"ʦ":[15],"ʦː":[15],"ɯ":[6],"ɰ͡i":[20,6],"w͡a":[7,2],"w͡ɛ":[7,4],"w͡e":[7,4],"w͡i":[7,6],"w͡ʌ":[7,1],"j͡ɛ":[6,4],"j͡e":[6,4],"j͡ʌ":[6,1],"j͡o":[6,8],"b̥":[21],"t͡ɕʰ":[19,16],"d̥":[19],"g̥":[20],"d͡ʑ":[19,16],"d͡ʑ̥":[19,16],"t͡ɕ":[19,16],"sʰ":[15],"tʰ":[19],"ʉ":[6],"ʉː":[6],"æɪ":[1,6],"æʉ":[1,6],"ɑɪ":[2,6],"œʏ":[4,7],"ɔʏ":[3,7],"ʉɪ":[6,6],"ʂ":[15],"ɖ":[19],"ɭ":[14],"ɳ":[19],"ʈ":[19],"ɛ͡i":[4,6],"œ͡y":[4,4],"χ":[12],"ɨ":[6],"t͡ʂ":[19,15],"d̪ʲ":[19],"ɡ":[20],"d͡ʐ":[19,15],"l̪ʲ":[14],"t̪ʲ":[19],"xʲ":[12],"ʑ":[16],"ĩ":[6],"ũ":[7],"ɐ̃":[4],"ẽ":[4],"õ":[8],"w̃":[7],"j̃":[6],"ɐj":[4,6],"ɐ̃j̃":[4,6],"ɐ̃w̃":[4,7],"ɐ͡w":[4,7],"a͡j":[2,6],"ɔ͡j":[3,6],"a͡w":[2,7],"ɛ͡w":[4,7],"e͡w":[4,7],"i͡w":[6,7],"o͡j":[8,6],"õj̃":[8,6],"u͡j":[7,6],"ũj̃":[7,6],"ɫ":[14],"e̯a":[4,2],"e̯o":[4,8],"o̯a":[8,2],"d͡ʒʲ":[19,16],"ʃʲ":[16],"t͡sʲ":[19,15],"t͡ʃʲ":[19,16],"ʒʲ":[16],"ʐ":[15],"ɕː":[16],"i͡a":[6,2],"r̩":[13],"r̩ː":[13],"l̩":[14],"l̩ː":[14],"ɴ":[19],"u̯":[7],"i̯":[6],"dˡ":[19],"dn":[19,19],"tˡ":[19],"tn":[19,19],"ʍ":[7],"a‿u":[2,7],"ɶ":[8],"ɵ":[1],"ɧ":[16],"ia":[6,2],"əː":[1],"ua":[7,2],"ɯː":[6],"ɯa":[6,2],"tɕʰ":[19,16],"œ͡ɟ":[4,16],"i͡ɟ":[6,16],"o͡ɟ":[8,16],"u͡ɟ":[7,16],"ɯ͡ɟ":[6,16],"y͡ɟ":[4,16],"ɮ":[6],"u͡a":[7,2],"ɛ̆j":[4,6],"ə͡j":[1,6],"i͡e͡w":[6,4,7],"ɨ͡ə":[6,1],"ie":[6,4],"ăw":[2,7],"ăj":[2,6],"ɨ͡ə͡j":[6,1,6],"ɔ̆w":[3,7],"ɨ͡w":[6,7],"e͡j":[4,6],"ɨ͡ʌ͡w":[6,1,7],"ɨ͡j":[6,6],"iə":[6,1],"a͡ʲ":[2],"ɓ":[21],"ɗ":[19]}
\ No newline at end of file
diff --git a/TTS/STV/phoneme_viseme_map_readable.json.txt b/TTS/STV/phoneme_viseme_map_readable.json.txt
new file mode 100644
index 0000000..911f8c7
--- /dev/null
+++ b/TTS/STV/phoneme_viseme_map_readable.json.txt
@@ -0,0 +1,241 @@
+{
+    # basic
+    'æ': [1], 'ə': [1], 'ʌ': [1], 'ɑ': [2], 'ɔ': [3], 'ɛ': [4], 'ʊ': [4], 'ɝ': [5], 'j': [6], 'i': [6], 'ɪ': [6], 
+    'w': [7], 'u': [7], 'o': [8], 'aʊ': [9], 'ɔɪ': [10], 'aɪ': [11], 'h': [12], 'ɹ': [13], 'l': [14], 's': [15], 
+    'z': [15], 'ʃ': [16], 'tʃ': [16], 'dʒ': [16], 'ʒ': [16], 'ð': [17], 'f': [18], 'v': [18], 'd': [19], 't': [19], 
+    'n': [19], 'θ': [19], 'k': [20], 'g': [20], 'ŋ': [20], 'p': [21], 'b': [21], 'm': [21], ' ': [0],
+
+    # ar-EG
+    "a": [2], "aː": [2], "i": [6], "iː": [6], "u": [7], "uː": [7], "b": [21], "d": [19], "g": [20], "k": [20], 
+    "t": [19], "dˤ": [19], "q": [20], "tˤ": [19], "ʔ": [19], "f": [18], "h": [12], "ħ": [12], "s": [15], "θ": [19], 
+    "z": [15], "ðˤ": [17], "ð": [17], "ɣ": [20], "x": [12], "ʃ": [16], "sˤ": [15], "j": [6], "w": [7], "l": [14], 
+    "m": [21], "n": [19], "r": [13], "ʕ": [12],
+
+    # bg-BG
+    "i": [6], "ɛ": [4], "ɔ": [3], "a": [2], "u": [7], "j͡a": [6, 2], "ɤ": [1], "j͡u": [6, 7], "n": [19], "ʒ": [16], 
+    "k": [20], "t͡s": [19, 15], "t": [19], "p": [21], "r": [13], "s": [15], "d": [19], "x": [12], "zʲ": [15], 
+    "lʲ": [14], "l": [14], "nʲ": [19], "v": [18], "m": [21], "b": [21], "g": [20], "d͡ʒ": [19, 16], "f": [18], 
+    "mʲ": [21], "tʲ": [19], "rʲ": [13], "pʲ": [21], "dʲ": [19], "j": [6], "vʲ": [18], "sʲ": [15], "bʲ": [21], 
+    "kʲ": [20], "gʲ": [20], "fʲ": [18], "z": [15], "ʃ": [16], "t͡ʃ": [19, 16], "d͡z": [19, 15],
+
+    # ca-ES
+    "a": [2], "ɔ": [3], "ə": [1], "e": [4], "ɛ": [4], "i": [6], "o": [8], "u": [7], "b": [21], "β": [21], "t͡ʃ": [19, 16], 
+    "d": [19], "ð": [17], "f": [18], "g": [20], "ɣ": [20], "j": [6], "d͡ʒ": [19, 16], "k": [20], "l": [14], "ʎ": [14], 
+    "m": [21], "n": [19], "ŋ": [20], "ɲ": [19], "p": [21], "ɾ": [19], "r": [13], "s": [15], "ʃ": [16], "t": [19], 
+    "θ": [19], "w": [7], "x": [12], "z": [15], "ʒ": [16],
+
+    # cs-CZ
+    "ɪ": [6], "ɛ": [4], "a": [2], "o": [8], "u": [7], "iː": [6], "ɛː": [4], "aː": [2], "oː": [8], "uː": [7], 
+    "o͡ʊ̯": [8, 4], "a͡ʊ": [2, 4], "ɛ͡ʊ̯": [4, 4], "ə": [1], "p": [21], "b": [21], "t": [19], "d": [19], "c": [16], 
+    "ɟ": [16], "k": [20], "g": [20], "t͡s": [19, 15], "d͡z": [19, 15], "t͡ʃ": [19, 16], "d͡ʒ": [19, 16], "f": [18], 
+    "v": [18], "s": [15], "z": [15], "r̝": [13], "ʃ": [16], "ʒ": [16], "j": [6], "x": [12], "ɦ": [12], "r": [13], 
+    "l": [14], "m": [21], "n": [19], "ŋ": [20], "ɲ": [19], "ɱ": [21], "r̝̊": [13],
+
+    # da-DK
+    "a": [2], "ɑ": [2], "ɑː": [2], "ɛ": [4], "ɛː": [4], "ɔ": [3], "ɒ": [2], "ɒː": [2], "ɔː": [3], "ɐ": [4], 
+    "æː": [1], "e": [4], "ø": [1], "øː": [1], "ə": [1], "eː": [4], "i": [6], "iː": [6], "o": [8], "œ": [4], 
+    "œː": [4], "oː": [8], "u": [7], "uː": [7], "y": [4], "yː": [4], "b": [21], "d": [19], "ð": [17], "f": [18], 
+    "g": [20], "h": [12], "j": [6], "kʰ": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "pʰ": [21], "ʔ": [19], 
+    "ʁ": [13], "ɐ̯": [4], "s": [15], "ɕ": [16], "t": [19], "v": [18], "w": [7],
+
+    # de-DE/de-CH/de-AT
+    "aː": [2], "a": [2], "ɔ": [3], "ɛː": [4], "ɛ": [4], "ə": [1], "iː": [6], "ɪ": [6], "øː": [1], "o": [8], 
+    "oː": [8], "œ": [4], "e": [4], "eː": [4], "uː": [7], "ʊ": [4], "yː": [4], "ʏ": [7], "ai": [2, 6], "au": [2, 7], 
+    "ɔy": [3, 4], "ɔʏ̯": [3, 4], "ɐ": [4], "b": [21], "d": [19], "ʤ": [16], "f": [18], "g": [20], "h": [12], 
+    "j": [6], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "pf": [21, 18], "ʀ": [13], "r": [13], 
+    "ʁ": [13], "s": [15], "ʃ": [16], "t": [19], "ts": [19, 15], "tʃ": [19, 16], "v": [18], "x": [12], "z": [15], 
+    "ʒ": [16], "ʔ": [19],
+
+    # el-GR
+    "a": [2], "e": [4], "i": [6], "o": [8], "u": [7], "b": [21], "c": [16], "ç": [12], "d": [19], "ð": [17], 
+    "d͡z": [19, 15], "f": [18], "g": [20], "ɣ": [20], "ɟ": [16], "j": [6], "ʝ": [12], "k": [20], "l": [14], 
+    "m": [21], "n": [19], "p": [21], "ɾ": [19], "s": [15], "t": [19], "θ": [19], "t͡s": [19, 15], "v": [18], 
+    "x": [12], "z": [15],
+
+    # en-GB/en-IE/en-AU
+    "ɑː": [2], "æ": [1], "ʌ": [1], "ɛə": [4, 1], "aʊ": [2, 4], "ə": [1], "aɪ": [2, 6], "ɛ": [4], "ɜː": [5], 
+    "eɪ": [4, 6], "ɪ": [6], "ɪə": [6, 1], "iː": [6], "ɒ": [2], "ɔː": [3], "əʊ": [1, 4], "ɔɪ": [3, 6], "ʊ": [4], 
+    "ʊə": [4, 1], "uː": [7], "b": [21], "tʃ": [19, 16], "d": [19], "ð": [17], "f": [18], "g": [20], "h": [12], 
+    "j": [6], "dʒ": [19, 16], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "ɹ": [13], 
+    "s": [15], "ʃ": [16], "t": [19], "θ": [19], "v": [18], "w": [7], "z": [15], "ʒ": [16],
+
+    # en-US/en-CA
+    "iy": [6], "ɪ": [6], "eɪ": [4, 6], "ɛ": [4], "æ": [1], "ɑ": [2], "ɔ": [3], "ʊ": [4], "oʊ": [8, 4], "u": [7], 
+    "ʌ": [1], "aɪ": [11], "aʊ": [9], "ɔɪ": [10], "ju": [6, 7], "ə": [1], "ɪɹ": [6, 13], "ɛɹ": [4, 13], "ʊɹ": [4, 13], 
+    "aɪɹ": [11, 13], "aʊɹ": [9, 13], "ɔɹ": [3, 13], "ɑɹ": [2, 13], "ɝ": [5], "ɚ": [1], "w": [7], "j": [6], 
+    "p": [21], "b": [21], "t": [19], "d": [19], "k": [20], "g": [20], "m": [21], "n": [19], "ŋ": [20], "f": [18], 
+    "v": [18], "θ": [19], "ð": [17], "s": [15], "z": [15], "ʃ": [16], "ʒ": [16], "h": [12], "tʃ": [19, 16], 
+    "dʒ": [19, 16], "l": [14], "ɹ": [13],
+
+    # es-ES
+    "a": [2], "i": [6], "e": [4], "o": [8], "u": [7], "b": [21], "β": [21], "t͡ʃ": [19, 16], "d": [19], "ð": [17], 
+    "f": [18], "g": [20], "ɣ": [20], "j": [6], "j͡j": [6, 6], "k": [20], "l": [14], "ʎ": [14], "m": [21], "n": [19], 
+    "ɲ": [19], "p": [21], "ɾ": [19], "r": [13], "s": [15], "t": [19], "θ": [19], "w": [7], "x": [12], "z": [15], "ʒ": [16],
+
+    # es-MX
+    "ɑ": [2], "e": [4], "i": [6], "o": [8], "u": [7], "b": [21], "β": [21], "t͡ʃ": [19, 16], "d": [19], "ð": [17], 
+    "f": [18], "g": [20], "ɣ": [20], "j": [6], "j͡j": [6, 6], "k": [20], "l": [14], "ʎ": [14], "m": [21], "n": [19], 
+    "ɲ": [19], "p": [21], "ɾ": [19], "r": [13], "s": [15], "t": [19], "θ": [19], "w": [7], "x": [12],
+
+    # fi-FI
+    "ɑ": [2], "ɑ͡i": [2, 6], "ɑ͡u": [2, 7], "ɑː": [2], "æ": [1], "æ͡i": [1, 6], "æ͡y": [1, 4], "æː": [1], "e": [4], 
+    "e͡i": [4, 6], "ø": [1], "ø͡i": [1, 6], "ø͡y": [1, 4], "øː": [1], "e͡u": [4, 7], "e͡y": [4, 4], "eː": [4], "i": [6], 
+    "i͡e": [6, 4], "i͡u": [6, 7], "i͡y": [6, 4], "iː": [6], "o": [8], "o͡i": [8, 6], "o͡u": [8, 7], "oː": [8], "u": [7], 
+    "u͡i": [7, 6], "u͡o": [7, 8], "uː": [7], "y": [4], "y͡ø": [4, 1], "y͡i": [4, 6], "yː": [4], "b": [21], "d": [19], 
+    "f": [18], "g": [20], "h": [12], "j": [6], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "r": [13], 
+    "s": [15], "ʃ": [16], "t": [19], "ʋ": [18],
+
+    # fr-FR/fr-CA/fr-CH
+    "a": [2], "ɑ": [2], "ɑ̃": [2], "ə": [1], "ɛ": [4], "ø": [1], "e": [4], "ɛ̃": [4], "i": [6], "œ": [4], "ɔ": [3], 
+    "ɔ̃": [3], "o": [8], "œ̃": [4], "u": [7], "y": [4], "b": [21], "d": [19], "f": [18], "g": [20], "ɲ": [19], 
+    "ɥ": [7], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "ʁ": [13], "s": [15], "ʃ": [16], 
+    "t": [19], "v": [18], "w": [7], "j": [6], "z": [15], "n‿": [19], "t‿": [19], "z‿": [15],
+
+    # he-IL
+    "i": [6], "e": [4], "a": [2], "o": [8], "u": [7], "p": [21], "b": [21], "t": [19], "d": [19], "k": [20], "g": [20], 
+    "ʔ": [19], "f": [18], "v": [18], "s": [15], "z": [15], "ʃ": [16], "x": [12], "h": [12], "t͡s": [19, 15], "m": [21], 
+    "n": [19], "l": [14], "ʁ": [13], "j": [6], "ʒ": [16], "tʃ": [19, 16], "dʒ": [19, 16],
+
+    # hr-HR
+    "e": [4], "eː": [4], "i": [6], "iː": [6], "u": [7], "uː": [7], "a": [2], "aː": [2], "o": [8], "oː": [8], 
+    "d": [19], "v": [18], "s": [15], "t": [19], "n": [19], "l": [14], "ʎ": [14], "t͡s": [19, 15], "t͡ʃ": [19, 16], 
+    "j": [6], "x": [12], "z": [15], "ʒ": [16], "r": [13], "k": [20], "m": [21], "p": [21], "g": [20], "ʨ": [16], 
+    "f": [18], "b": [21], "d͡ʒ": [19, 16], "ɲ": [19], "ʥ": [16], "ʃ": [16],
+
+    # hu-HU
+    "ø": [1], "øː": [1], "a": [2], "aː": [2], "ɛ": [4], "eː": [4], "i": [6], "iː": [6], "o": [8], "ɒ": [2], 
+    "oː": [8], "u": [7], "uː": [7], "y": [4], "yː": [4], "b": [21], "bː": [21], "d": [19], "ɟ": [16], "dː": [19], 
+    "ɟː": [16], "d͡ʒ": [19, 16], "d͡ʒː": [19, 16], "dz": [19, 15], "dzː": [19, 15], "f": [18], "fː": [18], 
+    "g": [20], "gː": [20], "h": [12], "hː": [12], "j": [6], "ɲ": [19], "jː": [6], "ɲː": [19], "k": [20], 
+    "kː": [20], "l": [14], "lː": [14], "m": [21], "mː": [21], "n": [19], "ŋ": [20], "nː": [19], "p": [21], 
+    "pː": [21], "r": [13], "rː": [13], "s": [15], "ʃ": [16], "sː": [15], "ʃː": [16], "t": [19], "c": [16], 
+    "tː": [19], "cː": [16], "t͡s": [19, 15], "t͡ʃ": [19, 16], "t͡sː": [19, 15], "t͡ʃː": [19, 16], "v": [18], 
+    "vː": [18], "x": [12], "ɰ": [20], "z": [15], "ʒ": [16], "zː": [15], "ʒː": [16],
+
+    # id-ID
+    "ə": [1], "a": [2], "a͡i": [2, 6], "a͡ʊ": [2, 4], "e": [4], "ɛ": [4], "ɪ": [6], "i": [6], "ɔ": [3], "o": [8], 
+    "ɔ͡i": [3, 6], "u": [7], "ʊ": [4], "ʔ": [19], "b": [21], "d": [19], "d͡ʒ": [19, 16], "f": [18], "g": [20], 
+    "h": [12], "ɲ": [19], "j": [6], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "r": [13], 
+    "s": [15], "ʃ": [16], "t": [19], "t͡ʃ": [19, 16], "w": [7], "x": [12], "z": [15],
+
+    # it-IT
+    "a": [2], "ai": [2, 6], "au": [2, 7], "e": [4], "ɛ": [4], "ɛj": [4, 6], "ɛu": [4, 7], "ei": [4, 6], "eu": [4, 7], 
+    "i": [6], "u": [7], "o": [8], "ɔ": [3], "ɔj": [3, 6], "oi": [8, 6], "ou": [8, 7], "b": [21], "bː": [21], 
+    "ʧ": [16], "tʃː": [19, 16], "kː": [20], "d": [19], "dː": [19], "ʣ": [15], "ʣː": [15], "f": [18], "fː": [18], 
+    "ʤ": [16], "ʤː": [16], "g": [20], "gː": [20], "ʎ": [14], "ʎː": [14], "ɲː": [19], "ɲ": [19], "j": [6], "k": [20], 
+    "l": [14], "lː": [14], "m": [21], "mː": [21], "n": [19], "nː": [19], "p": [21], "pː": [21], "ɾ": [19], 
+    "rː": [13], "s": [15], "sː": [15], "ʃ": [16], "ʃː": [16], "t": [19], "tː": [19], "ʦ": [15], "ʦː": [15], 
+    "v": [18], "vː": [18], "w": [7], "z": [15],
+
+    # ko-KR
+    "a": [2], "ɛ": [4], "e": [4], "ɯ": [6], "i": [6], "ʌ": [1], "o": [8], "u": [7], "ɰ͡i": [20, 6], "ø": [1], 
+    "w͡a": [7, 2], "w͡ɛ": [7, 4], "w͡e": [7, 4], "w͡i": [7, 6], "w͡ʌ": [7, 1], "j͡a": [6, 2], "j͡ɛ": [6, 4], 
+    "j͡e": [6, 4], "j͡ʌ": [6, 1], "j͡o": [6, 8], "j͡u": [6, 7], "b̥": [21], "p": [21], "b": [21], "t͡ɕʰ": [19, 16], 
+    "d̥": [19], "t": [19], "d": [19], "g̥": [20], "k": [20], "g": [20], "h": [12], "ɦ": [12], "d͡ʑ": [19, 16], 
+    "d͡ʑ̥": [19, 16], "t͡ɕ": [19, 16], "kʰ": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "pʰ": [21], 
+    "ɾ": [19], "sʰ": [15], "s": [15], "tʰ": [19],
+
+    # ms-MY
+    "i": [6], "u": [7], "ə": [1], "e": [4], "o": [8], "a": [2], "a͡i": [2, 6], "au": [2, 7], "oi": [8, 6], 
+    "p": [21], "b": [21], "t": [19], "d": [19], "k": [20], "g": [20], "ʔ": [19], "t͡ʃ": [19, 16], "d͡ʒ": [19, 16], 
+    "m": [21], "n": [19], "ɲ": [19], "ŋ": [20], "f": [18], "v": [18], "s": [15], "z": [15], "ʃ": [16], "x": [12], 
+    "r": [13], "h": [12], "j": [6], "w": [7], "l": [14],
+
+    # nb-NO
+    "ɑ": [2], "æ": [1], "æː": [1], "ɑː": [2], "ɛ": [4], "øː": [1], "eː": [4], "ɪ": [6], "iː": [6], "ɔ": [3], 
+    "œ": [4], "oː": [8], "u": [7], "uː": [7], "ʏ": [7], "ʉ": [6], "ʉː": [6], "yː": [4], "æɪ": [1, 6], 
+    "æʉ": [1, 6], "ɑɪ": [2, 6], "œʏ": [4, 7], "ɔʏ": [3, 7], "ʉɪ": [6, 6], "p": [21], "t": [19], "k": [20], 
+    "b": [21], "d": [19], "g": [20], "f": [18], "h": [12], "s": [15], "ʂ": [15], "ç": [12], "v": [18], "m": [21], 
+    "n": [19], "ŋ": [20], "l": [14], "r": [13], "j": [6], "ɖ": [19], "ɭ": [14], "ɳ": [19], "ʈ": [19],
+
+    # nl-NL/nl-BE
+    "ɑ": [2], "aː": [2], "ɑ̃": [2], "ɑ͡u": [2, 7], "ɛ": [4], "eː": [4], "ɛː": [4], "ɛ͡i": [4, 6], "ɛ̃": [4], 
+    "øː": [1], "ɪ": [6], "i": [6], "ɔ": [3], "u": [7], "ɔː": [3], "ɔ̃": [3], "oː": [8], "ʏ": [7], "ə": [1], 
+    "œ͡y": [4, 4], "œ": [4], "y": [4], "b": [21], "d": [19], "f": [18], "χ": [12], "ʔ": [19], "ɦ": [12], 
+    "g": [20], "j": [6], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "ʀ": [13], "s": [15], 
+    "ʃ": [16], "t": [19], "w": [7], "v": [18], "ʋ": [18], "z": [15], "ʒ": [16],
+
+    # pl-PL
+    "a": [2], "ɛ": [4], "ɛ̃": [4], "i": [6], "ɨ": [6], "ɔ": [3], "ɔ̃": [3], "u": [7], "b": [21], "bʲ": [21], 
+    "t͡ɕ": [19, 16], "t͡ʂ": [19, 15], "c": [16], "d": [19], "d̪ʲ": [19], "d͡z": [19, 15], "d͡ʑ": [19, 16], 
+    "f": [18], "fʲ": [18], "ɡ": [20], "ɟ": [16], "d͡ʐ": [19, 15], "k": [20], "l": [14], "l̪ʲ": [14], "m": [21], 
+    "mʲ": [21], "n": [19], "ŋ": [20], "ɲ": [19], "p": [21], "pʲ": [21], "r": [13], "rʲ": [13], "s": [15], 
+    "ɕ": [16], "ʃ": [16], "t": [19], "t̪ʲ": [19], "t͡s": [19, 15], "v": [18], "vʲ": [18], "w": [7], "x": [12], 
+    "xʲ": [12], "j": [6], "z": [15], "ʑ": [16], "ʒ": [16],
+
+    # pt-BR
+    "i": [6], "ĩ": [6], "a": [2], "ɔ": [3], "u": [7], "ũ": [7], "o": [8], "e": [4], "ɐ̃": [4], "ə": [1], 
+    "ɛ": [4], "ẽ": [4], "õ": [8], "w̃": [7], "w": [7], "p": [21], "b": [21], "t": [19], "d": [19], "g": [20], 
+    "m": [21], "n": [19], "ɲ": [19], "f": [18], "v": [18], "ɾ": [19], "s": [15], "z": [15], "ʃ": [16], "ʒ": [16], 
+    "x": [12], "tʃ": [19, 16], "dʒ": [19, 16], "l": [14], "ʎ": [14], "j̃": [6], "j": [6], "k": [20],
+
+    # pt-PT
+    "a": [2], "ɐ": [4], "ɐj": [4, 6], "ɐ̃": [4], "ɐ̃j̃": [4, 6], "ɐ̃w̃": [4, 7], "ɐ͡w": [4, 7], "a͡j": [2, 6], 
+    "ɔ": [3], "ɔ͡j": [3, 6], "a͡w": [2, 7], "ə": [1], "e": [4], "ɛ": [4], "ɛ͡w": [4, 7], "ẽ": [4], "e͡w": [4, 7], 
+    "i": [6], "ĩ": [6], "i͡w": [6, 7], "o": [8], "o͡j": [8, 6], "õ": [8], "õj̃": [8, 6], "u": [7], "u͡j": [7, 6], 
+    "ũ": [7], "ũj̃": [7, 6], "b": [21], "d": [19], "ɾ": [19], "f": [18], "g": [20], "j": [6], "k": [20], "l": [14], 
+    "ɫ": [14], "ʎ": [14], "m": [21], "n": [19], "ɲ": [19], "p": [21], "ʀ": [13], "s": [15], "ʃ": [16], "t": [19], 
+    "v": [18], "w": [7], "z": [15], "ʒ": [16],
+
+    # ro-RO
+    "ə": [1], "ɨ": [6], "a": [2], "e": [4], "e̯a": [4, 2], "e̯o": [4, 8], "i": [6], "o": [8], "o̯a": [8, 2], 
+    "u": [7], "b": [21], "bʲ": [21], "d": [19], "d͡ʒ": [19, 16], "d͡ʒʲ": [19, 16], "f": [18], "fʲ": [18], "g": [20], 
+    "gʲ": [20], "h": [12], "j": [6], "k": [20], "kʲ": [20], "l": [14], "lʲ": [14], "m": [21], "mʲ": [21], "n": [19], 
+    "ŋ": [20], "nʲ": [19], "p": [21], "pʲ": [21], "r": [13], "rʲ": [13], "s": [15], "ʃ": [16], "ʃʲ": [16], "t": [19], 
+    "tʲ": [19], "t͡s": [19, 15], "t͡ʃ": [19, 16], "t͡sʲ": [19, 15], "t͡ʃʲ": [19, 16], "v": [18], "vʲ": [18], "w": [7], 
+    "z": [15], "ʒ": [16], "zʲ": [15], "ʒʲ": [16],
+
+    # ru-RU
+    "a": [2], "ʌ": [1], "ə": [1], "ɛ": [4], "i": [6], "ɪ": [6], "ɨ": [6], "ɔ": [3], "u": [7], "p": [21], "pʲ": [21], 
+    "b": [21], "bʲ": [21], "t": [19], "tʲ": [19], "d": [19], "dʲ": [19], "k": [20], "kʲ": [20], "g": [20], 
+    "gʲ": [20], "x": [12], "xʲ": [12], "f": [18], "fʲ": [18], "v": [18], "vʲ": [18], "s": [15], "sʲ": [15], 
+    "z": [15], "zʲ": [15], "ʂ": [15], "ʐ": [15], "t͡s": [19, 15], "t͡ɕ": [19, 16], "ɕː": [16], "m": [21], 
+    "mʲ": [21], "n": [19], "nʲ": [19], "l": [14], "lʲ": [14], "r": [13], "rʲ": [13], "j": [6],
+
+    # sk-SK
+    "i": [6], "e": [4], "a": [2], "o": [8], "u": [7], "ʉ": [6], "iː": [6], "eː": [4], "aː": [2], "oː": [8], 
+    "uː": [7], "i͡a": [6, 2], "i͡e": [6, 4], "i͡u": [6, 7], "u͡o": [7, 8], "au": [2, 7], "ou": [8, 7], "ə": [1], 
+    "p": [21], "b": [21], "t": [19], "d": [19], "c": [16], "ɟ": [16], "k": [20], "g": [20], "t͡s": [19, 15], 
+    "d͡z": [19, 15], "t͡ʃ": [19, 16], "d͡ʒ": [19, 16], "f": [18], "v": [18], "s": [15], "z": [15], "ʃ": [16], 
+    "ʒ": [16], "x": [12], "ɦ": [12], "r": [13], "r̩": [13], "r̩ː": [13], "l": [14], "l̩": [14], "l̩ː": [14], 
+    "ʎ": [14], "m": [21], "ɱ": [21], "n": [19], "ɴ": [19], "ŋ": [20], "ɲ": [19], "u̯": [7], "i̯": [6], "j": [6], 
+    "w": [7],
+
+    # sl-SI
+    "ə": [1], "a": [2], "aː": [2], "ɛ": [4], "eː": [4], "ɛː": [4], "i": [6], "iː": [6], "ɔ": [3], "ɔː": [3], 
+    "oː": [8], "u": [7], "uː": [7], "b": [21], "d": [19], "dˡ": [19], "dn": [19, 19], "d͡ʒ": [19, 16], 
+    "d͡z": [19, 15], "f": [18], "ɱ": [21], "ɣ": [20], "g": [20], "ɪ": [6], "j": [6], "k": [20], "l": [14], "lʲ": [14], 
+    "m": [21], "ŋ": [20], "n": [19], "nʲ": [19], "p": [21], "r": [13], "s": [15], "ʃ": [16], "t": [19], "tˡ": [19], 
+    "tn": [19, 19], "t͡ʃ": [19, 16], "t͡s": [19, 15], "u̯": [7], "v": [18], "w": [7], "ʍ": [7], "x": [12], "ʒ": [16], 
+    "z": [15],
+
+    # sv-SE
+    "a": [2], "æ": [1], "æː": [1], "ɑː": [2], "ɔ": [3], "a‿u": [2, 7], "ə": [1], "e": [4], "ɛ": [4], "ɛː": [4], 
+    "eː": [4], "ɶ": [8], "œː": [4], "œ": [4], "øː": [1], "ɪ": [6], "iː": [6], "ʊ": [4], "uː": [7], "oː": [8], 
+    "ɵ": [1], "ʉː": [6], "y": [4], "yː": [4], "p": [21], "t": [19], "k": [20], "b": [21], "d": [19], "g": [20], 
+    "f": [18], "h": [12], "s": [15], "ɧ": [16], "ɕ": [16], "v": [18], "m": [21], "n": [19], "ŋ": [20], "l": [14], 
+    "r": [13], "j": [6], "ɖ": [19], "ɭ": [14], "ɳ": [19], "ʂ": [15], "ʈ": [19],
+
+    # th-TH
+    "a": [2], "aː": [2], "e": [4], "eː": [4], "i": [6], "iː": [6], "ia": [6, 2], "o": [8], "oː": [8], "ə": [1], 
+    "əː": [1], "u": [7], "uː": [7], "ua": [7, 2], "ɯ": [6], "ɯː": [6], "ɯa": [6, 2], "ɛ": [4], "ɛː": [4], 
+    "ɔ": [3], "ɔː": [3], "b": [21], "t͡ɕ": [19, 16], "tɕʰ": [19, 16], "d": [19], "f": [18], "h": [12], "j": [6], 
+    "k": [20], "kʰ": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "pʰ": [21], "r": [13], 
+    "s": [15], "t": [19], "tʰ": [19], "w": [7], "ʔ": [19],
+
+    # tr-TR
+    "a": [2], "ɑː": [2], "e": [4], "eː": [4], "œ": [4], "œ͡ɟ": [4, 16], "i": [6], "i͡ɟ": [6, 16], "o": [8], 
+    "o͡ɟ": [8, 16], "u": [7], "u͡ɟ": [7, 16], "ɯ": [6], "ɯ͡ɟ": [6, 16], "y": [4], "y͡ɟ": [4, 16], "b": [21], 
+    "c": [16], "t͡ʃ": [19, 16], "d": [19], "f": [18], "ɡ": [20], "ɣ": [20], "ɟ": [16], "h": [12], "j": [6], 
+    "d͡ʒ": [19, 16], "k": [20], "l": [14], "ɮ": [6], "m": [21], "n": [19], "ŋ": [20], "p": [21], "ɾ": [19], 
+    "s": [15], "ʃ": [16], "t": [19], "v": [18], "w": [7], "z": [15], "ʒ": [16],
+
+    # vi-VN
+    "a": [2], "ɛ": [4], "i": [6], "ɔ": [3], "u": [7], "u͡a": [7, 2], "a͡j": [2, 6], "ɛ̆j": [4, 6], "ə͡j": [1, 6], 
+    "o": [8], "i͡e͡w": [6, 4, 7], "ɨ͡ə": [6, 1], "ɔ͡i": [3, 6], "ə": [1], "ie": [6, 4], "u͡j": [7, 6], "a͡w": [2, 7], 
+    "ɨ": [6], "ɐ": [4], "ăw": [2, 7], "ăj": [2, 6], "ɨ͡ə͡j": [6, 1, 6], "o͡j": [8, 6], "əː": [1], "e": [4], 
+    "ɔ̆w": [3, 7], "ɛ͡w": [4, 7], "i͡w": [6, 7], "ɨ͡w": [6, 7], "e͡j": [4, 6], "ɨ͡ʌ͡w": [6, 1, 7], "ɨ͡j": [6, 6], 
+    "ɪ": [6], "iə": [6, 1], "a͡ʲ": [2], "ɓ": [21], "k": [20], "z": [15], "j": [6], "ɹ": [13], "f": [18], "ɣ": [20], 
+    "h": [12], "l": [14], "m": [21], "n": [19], "p": [21], "s": [15], "ʂ": [15], "t": [19], "v": [18], "ɗ": [19], 
+    "ŋ": [20], "x": [12], "ɲ": [19], "tʰ": [19], "ʈ": [19], "t͡ʃ": [19, 16], "w": [7]
+}
\ No newline at end of file
diff --git a/TTS/STV/speech_to_visemes.py b/TTS/STV/speech_to_visemes.py
index 11d16dd..861a7d9 100644
--- a/TTS/STV/speech_to_visemes.py
+++ b/TTS/STV/speech_to_visemes.py
@@ -1,10 +1,10 @@
 """This module contains the SpeechToVisemes class, which handles the conversion of speech to visemes."""
 from transformers import pipeline
 import logging
-import numpy as np
+import json
 
 logger = logging.getLogger(__name__)
-from typing import List, Dict, Any, Optional
+from typing import List, Dict, Any
 
 class SpeechToVisemes():
     """
@@ -34,6 +34,11 @@ def __init__(
         self.device = device
         self.gen_kwargs = gen_kwargs
 
+        # This dictionary is inspired by https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-ssml-phonetic-sets
+        phoneme_viseme_map_file="TTS/STV/phoneme_viseme_map.json"
+        with open(phoneme_viseme_map_file, 'r') as f:
+            self.phoneme_viseme_map = json.load(f)
+
         # Initialize the automatic speech recognition pipeline
         self.asr_pipeline = pipeline(
             "automatic-speech-recognition", model=model_name, device=device
@@ -51,275 +56,20 @@ def _map_phonemes_to_visemes(
             - https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-ssml-phonetic-sets
 
         Args:
-            data (Dict[str, Any]): A dictionary containing phoneme data, where `data['chunks']` 
+            data (Dict[str, Any]): A dictionary containing phoneme data, where data['chunks'] 
                 holds a list of phonemes and their timestamps.
 
         Returns:
-            List[Dict[str, Any]]: A list of dictionaries where each dictionary contains the viseme 
-            ID and the corresponding timestamp.
+            List[Dict[str, Any]]: A list of dictionaries where each dictionary contains the viseme ID 
+                and the corresponding timestamp.
         """
-
-        def _phoneme_to_viseme(phoneme: str) -> List[int]:
-            """
-            Converts a phoneme to its corresponding viseme(s).
-
-            Args:
-                phoneme (str): The phoneme to map to viseme.
-
-            Returns:
-                List[int]: A list of viseme IDs corresponding to the phoneme.
-            """
-            # inspired by https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-ssml-phonetic-sets
-            phoneme_viseme_map = {
-                # basic
-                'æ': [1], 'ə': [1], 'ʌ': [1], 'ɑ': [2], 'ɔ': [3], 'ɛ': [4], 'ʊ': [4], 'ɝ': [5], 'j': [6], 'i': [6], 'ɪ': [6], 
-                'w': [7], 'u': [7], 'o': [8], 'aʊ': [9], 'ɔɪ': [10], 'aɪ': [11], 'h': [12], 'ɹ': [13], 'l': [14], 's': [15], 
-                'z': [15], 'ʃ': [16], 'tʃ': [16], 'dʒ': [16], 'ʒ': [16], 'ð': [17], 'f': [18], 'v': [18], 'd': [19], 't': [19], 
-                'n': [19], 'θ': [19], 'k': [20], 'g': [20], 'ŋ': [20], 'p': [21], 'b': [21], 'm': [21], ' ': [0],
-
-                # ar-EG
-                "a": [2], "aː": [2], "i": [6], "iː": [6], "u": [7], "uː": [7], "b": [21], "d": [19], "g": [20], "k": [20], 
-                "t": [19], "dˤ": [19], "q": [20], "tˤ": [19], "ʔ": [19], "f": [18], "h": [12], "ħ": [12], "s": [15], "θ": [19], 
-                "z": [15], "ðˤ": [17], "ð": [17], "ɣ": [20], "x": [12], "ʃ": [16], "sˤ": [15], "j": [6], "w": [7], "l": [14], 
-                "m": [21], "n": [19], "r": [13], "ʕ": [12],
-
-                # bg-BG
-                "i": [6], "ɛ": [4], "ɔ": [3], "a": [2], "u": [7], "j͡a": [6, 2], "ɤ": [1], "j͡u": [6, 7], "n": [19], "ʒ": [16], 
-                "k": [20], "t͡s": [19, 15], "t": [19], "p": [21], "r": [13], "s": [15], "d": [19], "x": [12], "zʲ": [15], 
-                "lʲ": [14], "l": [14], "nʲ": [19], "v": [18], "m": [21], "b": [21], "g": [20], "d͡ʒ": [19, 16], "f": [18], 
-                "mʲ": [21], "tʲ": [19], "rʲ": [13], "pʲ": [21], "dʲ": [19], "j": [6], "vʲ": [18], "sʲ": [15], "bʲ": [21], 
-                "kʲ": [20], "gʲ": [20], "fʲ": [18], "z": [15], "ʃ": [16], "t͡ʃ": [19, 16], "d͡z": [19, 15],
-
-                # ca-ES
-                "a": [2], "ɔ": [3], "ə": [1], "e": [4], "ɛ": [4], "i": [6], "o": [8], "u": [7], "b": [21], "β": [21], "t͡ʃ": [19, 16], 
-                "d": [19], "ð": [17], "f": [18], "g": [20], "ɣ": [20], "j": [6], "d͡ʒ": [19, 16], "k": [20], "l": [14], "ʎ": [14], 
-                "m": [21], "n": [19], "ŋ": [20], "ɲ": [19], "p": [21], "ɾ": [19], "r": [13], "s": [15], "ʃ": [16], "t": [19], 
-                "θ": [19], "w": [7], "x": [12], "z": [15], "ʒ": [16],
-
-                # cs-CZ
-                "ɪ": [6], "ɛ": [4], "a": [2], "o": [8], "u": [7], "iː": [6], "ɛː": [4], "aː": [2], "oː": [8], "uː": [7], 
-                "o͡ʊ̯": [8, 4], "a͡ʊ": [2, 4], "ɛ͡ʊ̯": [4, 4], "ə": [1], "p": [21], "b": [21], "t": [19], "d": [19], "c": [16], 
-                "ɟ": [16], "k": [20], "g": [20], "t͡s": [19, 15], "d͡z": [19, 15], "t͡ʃ": [19, 16], "d͡ʒ": [19, 16], "f": [18], 
-                "v": [18], "s": [15], "z": [15], "r̝": [13], "ʃ": [16], "ʒ": [16], "j": [6], "x": [12], "ɦ": [12], "r": [13], 
-                "l": [14], "m": [21], "n": [19], "ŋ": [20], "ɲ": [19], "ɱ": [21], "r̝̊": [13],
-
-                # da-DK
-                "a": [2], "ɑ": [2], "ɑː": [2], "ɛ": [4], "ɛː": [4], "ɔ": [3], "ɒ": [2], "ɒː": [2], "ɔː": [3], "ɐ": [4], 
-                "æː": [1], "e": [4], "ø": [1], "øː": [1], "ə": [1], "eː": [4], "i": [6], "iː": [6], "o": [8], "œ": [4], 
-                "œː": [4], "oː": [8], "u": [7], "uː": [7], "y": [4], "yː": [4], "b": [21], "d": [19], "ð": [17], "f": [18], 
-                "g": [20], "h": [12], "j": [6], "kʰ": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "pʰ": [21], "ʔ": [19], 
-                "ʁ": [13], "ɐ̯": [4], "s": [15], "ɕ": [16], "t": [19], "v": [18], "w": [7],
-
-                # de-DE/de-CH/de-AT
-                "aː": [2], "a": [2], "ɔ": [3], "ɛː": [4], "ɛ": [4], "ə": [1], "iː": [6], "ɪ": [6], "øː": [1], "o": [8], 
-                "oː": [8], "œ": [4], "e": [4], "eː": [4], "uː": [7], "ʊ": [4], "yː": [4], "ʏ": [7], "ai": [2, 6], "au": [2, 7], 
-                "ɔy": [3, 4], "ɔʏ̯": [3, 4], "ɐ": [4], "b": [21], "d": [19], "ʤ": [16], "f": [18], "g": [20], "h": [12], 
-                "j": [6], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "pf": [21, 18], "ʀ": [13], "r": [13], 
-                "ʁ": [13], "s": [15], "ʃ": [16], "t": [19], "ts": [19, 15], "tʃ": [19, 16], "v": [18], "x": [12], "z": [15], 
-                "ʒ": [16], "ʔ": [19],
-
-                # el-GR
-                "a": [2], "e": [4], "i": [6], "o": [8], "u": [7], "b": [21], "c": [16], "ç": [12], "d": [19], "ð": [17], 
-                "d͡z": [19, 15], "f": [18], "g": [20], "ɣ": [20], "ɟ": [16], "j": [6], "ʝ": [12], "k": [20], "l": [14], 
-                "m": [21], "n": [19], "p": [21], "ɾ": [19], "s": [15], "t": [19], "θ": [19], "t͡s": [19, 15], "v": [18], 
-                "x": [12], "z": [15],
-
-                # en-GB/en-IE/en-AU
-                "ɑː": [2], "æ": [1], "ʌ": [1], "ɛə": [4, 1], "aʊ": [2, 4], "ə": [1], "aɪ": [2, 6], "ɛ": [4], "ɜː": [5], 
-                "eɪ": [4, 6], "ɪ": [6], "ɪə": [6, 1], "iː": [6], "ɒ": [2], "ɔː": [3], "əʊ": [1, 4], "ɔɪ": [3, 6], "ʊ": [4], 
-                "ʊə": [4, 1], "uː": [7], "b": [21], "tʃ": [19, 16], "d": [19], "ð": [17], "f": [18], "g": [20], "h": [12], 
-                "j": [6], "dʒ": [19, 16], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "ɹ": [13], 
-                "s": [15], "ʃ": [16], "t": [19], "θ": [19], "v": [18], "w": [7], "z": [15], "ʒ": [16],
-
-                # en-US/en-CA
-                "iy": [6], "ɪ": [6], "eɪ": [4, 6], "ɛ": [4], "æ": [1], "ɑ": [2], "ɔ": [3], "ʊ": [4], "oʊ": [8, 4], "u": [7], 
-                "ʌ": [1], "aɪ": [11], "aʊ": [9], "ɔɪ": [10], "ju": [6, 7], "ə": [1], "ɪɹ": [6, 13], "ɛɹ": [4, 13], "ʊɹ": [4, 13], 
-                "aɪɹ": [11, 13], "aʊɹ": [9, 13], "ɔɹ": [3, 13], "ɑɹ": [2, 13], "ɝ": [5], "ɚ": [1], "w": [7], "j": [6], 
-                "p": [21], "b": [21], "t": [19], "d": [19], "k": [20], "g": [20], "m": [21], "n": [19], "ŋ": [20], "f": [18], 
-                "v": [18], "θ": [19], "ð": [17], "s": [15], "z": [15], "ʃ": [16], "ʒ": [16], "h": [12], "tʃ": [19, 16], 
-                "dʒ": [19, 16], "l": [14], "ɹ": [13],
-
-                # es-ES
-                "a": [2], "i": [6], "e": [4], "o": [8], "u": [7], "b": [21], "β": [21], "t͡ʃ": [19, 16], "d": [19], "ð": [17], 
-                "f": [18], "g": [20], "ɣ": [20], "j": [6], "j͡j": [6, 6], "k": [20], "l": [14], "ʎ": [14], "m": [21], "n": [19], 
-                "ɲ": [19], "p": [21], "ɾ": [19], "r": [13], "s": [15], "t": [19], "θ": [19], "w": [7], "x": [12], "z": [15], "ʒ": [16],
-
-                # es-MX
-                "ɑ": [2], "e": [4], "i": [6], "o": [8], "u": [7], "b": [21], "β": [21], "t͡ʃ": [19, 16], "d": [19], "ð": [17], 
-                "f": [18], "g": [20], "ɣ": [20], "j": [6], "j͡j": [6, 6], "k": [20], "l": [14], "ʎ": [14], "m": [21], "n": [19], 
-                "ɲ": [19], "p": [21], "ɾ": [19], "r": [13], "s": [15], "t": [19], "θ": [19], "w": [7], "x": [12],
-
-                # fi-FI
-                "ɑ": [2], "ɑ͡i": [2, 6], "ɑ͡u": [2, 7], "ɑː": [2], "æ": [1], "æ͡i": [1, 6], "æ͡y": [1, 4], "æː": [1], "e": [4], 
-                "e͡i": [4, 6], "ø": [1], "ø͡i": [1, 6], "ø͡y": [1, 4], "øː": [1], "e͡u": [4, 7], "e͡y": [4, 4], "eː": [4], "i": [6], 
-                "i͡e": [6, 4], "i͡u": [6, 7], "i͡y": [6, 4], "iː": [6], "o": [8], "o͡i": [8, 6], "o͡u": [8, 7], "oː": [8], "u": [7], 
-                "u͡i": [7, 6], "u͡o": [7, 8], "uː": [7], "y": [4], "y͡ø": [4, 1], "y͡i": [4, 6], "yː": [4], "b": [21], "d": [19], 
-                "f": [18], "g": [20], "h": [12], "j": [6], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "r": [13], 
-                "s": [15], "ʃ": [16], "t": [19], "ʋ": [18],
-
-                # fr-FR/fr-CA/fr-CH
-                "a": [2], "ɑ": [2], "ɑ̃": [2], "ə": [1], "ɛ": [4], "ø": [1], "e": [4], "ɛ̃": [4], "i": [6], "œ": [4], "ɔ": [3], 
-                "ɔ̃": [3], "o": [8], "œ̃": [4], "u": [7], "y": [4], "b": [21], "d": [19], "f": [18], "g": [20], "ɲ": [19], 
-                "ɥ": [7], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "ʁ": [13], "s": [15], "ʃ": [16], 
-                "t": [19], "v": [18], "w": [7], "j": [6], "z": [15], "n‿": [19], "t‿": [19], "z‿": [15],
-
-                # he-IL
-                "i": [6], "e": [4], "a": [2], "o": [8], "u": [7], "p": [21], "b": [21], "t": [19], "d": [19], "k": [20], "g": [20], 
-                "ʔ": [19], "f": [18], "v": [18], "s": [15], "z": [15], "ʃ": [16], "x": [12], "h": [12], "t͡s": [19, 15], "m": [21], 
-                "n": [19], "l": [14], "ʁ": [13], "j": [6], "ʒ": [16], "tʃ": [19, 16], "dʒ": [19, 16],
-
-                # hr-HR
-                "e": [4], "eː": [4], "i": [6], "iː": [6], "u": [7], "uː": [7], "a": [2], "aː": [2], "o": [8], "oː": [8], 
-                "d": [19], "v": [18], "s": [15], "t": [19], "n": [19], "l": [14], "ʎ": [14], "t͡s": [19, 15], "t͡ʃ": [19, 16], 
-                "j": [6], "x": [12], "z": [15], "ʒ": [16], "r": [13], "k": [20], "m": [21], "p": [21], "g": [20], "ʨ": [16], 
-                "f": [18], "b": [21], "d͡ʒ": [19, 16], "ɲ": [19], "ʥ": [16], "ʃ": [16],
-
-                # hu-HU
-                "ø": [1], "øː": [1], "a": [2], "aː": [2], "ɛ": [4], "eː": [4], "i": [6], "iː": [6], "o": [8], "ɒ": [2], 
-                "oː": [8], "u": [7], "uː": [7], "y": [4], "yː": [4], "b": [21], "bː": [21], "d": [19], "ɟ": [16], "dː": [19], 
-                "ɟː": [16], "d͡ʒ": [19, 16], "d͡ʒː": [19, 16], "dz": [19, 15], "dzː": [19, 15], "f": [18], "fː": [18], 
-                "g": [20], "gː": [20], "h": [12], "hː": [12], "j": [6], "ɲ": [19], "jː": [6], "ɲː": [19], "k": [20], 
-                "kː": [20], "l": [14], "lː": [14], "m": [21], "mː": [21], "n": [19], "ŋ": [20], "nː": [19], "p": [21], 
-                "pː": [21], "r": [13], "rː": [13], "s": [15], "ʃ": [16], "sː": [15], "ʃː": [16], "t": [19], "c": [16], 
-                "tː": [19], "cː": [16], "t͡s": [19, 15], "t͡ʃ": [19, 16], "t͡sː": [19, 15], "t͡ʃː": [19, 16], "v": [18], 
-                "vː": [18], "x": [12], "ɰ": [20], "z": [15], "ʒ": [16], "zː": [15], "ʒː": [16],
-
-                # id-ID
-                "ə": [1], "a": [2], "a͡i": [2, 6], "a͡ʊ": [2, 4], "e": [4], "ɛ": [4], "ɪ": [6], "i": [6], "ɔ": [3], "o": [8], 
-                "ɔ͡i": [3, 6], "u": [7], "ʊ": [4], "ʔ": [19], "b": [21], "d": [19], "d͡ʒ": [19, 16], "f": [18], "g": [20], 
-                "h": [12], "ɲ": [19], "j": [6], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "r": [13], 
-                "s": [15], "ʃ": [16], "t": [19], "t͡ʃ": [19, 16], "w": [7], "x": [12], "z": [15],
-
-                # it-IT
-                "a": [2], "ai": [2, 6], "au": [2, 7], "e": [4], "ɛ": [4], "ɛj": [4, 6], "ɛu": [4, 7], "ei": [4, 6], "eu": [4, 7], 
-                "i": [6], "u": [7], "o": [8], "ɔ": [3], "ɔj": [3, 6], "oi": [8, 6], "ou": [8, 7], "b": [21], "bː": [21], 
-                "ʧ": [16], "tʃː": [19, 16], "kː": [20], "d": [19], "dː": [19], "ʣ": [15], "ʣː": [15], "f": [18], "fː": [18], 
-                "ʤ": [16], "ʤː": [16], "g": [20], "gː": [20], "ʎ": [14], "ʎː": [14], "ɲː": [19], "ɲ": [19], "j": [6], "k": [20], 
-                "l": [14], "lː": [14], "m": [21], "mː": [21], "n": [19], "nː": [19], "p": [21], "pː": [21], "ɾ": [19], 
-                "rː": [13], "s": [15], "sː": [15], "ʃ": [16], "ʃː": [16], "t": [19], "tː": [19], "ʦ": [15], "ʦː": [15], 
-                "v": [18], "vː": [18], "w": [7], "z": [15],
-
-                # ko-KR
-                "a": [2], "ɛ": [4], "e": [4], "ɯ": [6], "i": [6], "ʌ": [1], "o": [8], "u": [7], "ɰ͡i": [20, 6], "ø": [1], 
-                "w͡a": [7, 2], "w͡ɛ": [7, 4], "w͡e": [7, 4], "w͡i": [7, 6], "w͡ʌ": [7, 1], "j͡a": [6, 2], "j͡ɛ": [6, 4], 
-                "j͡e": [6, 4], "j͡ʌ": [6, 1], "j͡o": [6, 8], "j͡u": [6, 7], "b̥": [21], "p": [21], "b": [21], "t͡ɕʰ": [19, 16], 
-                "d̥": [19], "t": [19], "d": [19], "g̥": [20], "k": [20], "g": [20], "h": [12], "ɦ": [12], "d͡ʑ": [19, 16], 
-                "d͡ʑ̥": [19, 16], "t͡ɕ": [19, 16], "kʰ": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "pʰ": [21], 
-                "ɾ": [19], "sʰ": [15], "s": [15], "tʰ": [19],
-
-                # ms-MY
-                "i": [6], "u": [7], "ə": [1], "e": [4], "o": [8], "a": [2], "a͡i": [2, 6], "au": [2, 7], "oi": [8, 6], 
-                "p": [21], "b": [21], "t": [19], "d": [19], "k": [20], "g": [20], "ʔ": [19], "t͡ʃ": [19, 16], "d͡ʒ": [19, 16], 
-                "m": [21], "n": [19], "ɲ": [19], "ŋ": [20], "f": [18], "v": [18], "s": [15], "z": [15], "ʃ": [16], "x": [12], 
-                "r": [13], "h": [12], "j": [6], "w": [7], "l": [14],
-
-                # nb-NO
-                "ɑ": [2], "æ": [1], "æː": [1], "ɑː": [2], "ɛ": [4], "øː": [1], "eː": [4], "ɪ": [6], "iː": [6], "ɔ": [3], 
-                "œ": [4], "oː": [8], "u": [7], "uː": [7], "ʏ": [7], "ʉ": [6], "ʉː": [6], "yː": [4], "æɪ": [1, 6], 
-                "æʉ": [1, 6], "ɑɪ": [2, 6], "œʏ": [4, 7], "ɔʏ": [3, 7], "ʉɪ": [6, 6], "p": [21], "t": [19], "k": [20], 
-                "b": [21], "d": [19], "g": [20], "f": [18], "h": [12], "s": [15], "ʂ": [15], "ç": [12], "v": [18], "m": [21], 
-                "n": [19], "ŋ": [20], "l": [14], "r": [13], "j": [6], "ɖ": [19], "ɭ": [14], "ɳ": [19], "ʈ": [19],
-
-                # nl-NL/nl-BE
-                "ɑ": [2], "aː": [2], "ɑ̃": [2], "ɑ͡u": [2, 7], "ɛ": [4], "eː": [4], "ɛː": [4], "ɛ͡i": [4, 6], "ɛ̃": [4], 
-                "øː": [1], "ɪ": [6], "i": [6], "ɔ": [3], "u": [7], "ɔː": [3], "ɔ̃": [3], "oː": [8], "ʏ": [7], "ə": [1], 
-                "œ͡y": [4, 4], "œ": [4], "y": [4], "b": [21], "d": [19], "f": [18], "χ": [12], "ʔ": [19], "ɦ": [12], 
-                "g": [20], "j": [6], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "ʀ": [13], "s": [15], 
-                "ʃ": [16], "t": [19], "w": [7], "v": [18], "ʋ": [18], "z": [15], "ʒ": [16],
-
-                # pl-PL
-                "a": [2], "ɛ": [4], "ɛ̃": [4], "i": [6], "ɨ": [6], "ɔ": [3], "ɔ̃": [3], "u": [7], "b": [21], "bʲ": [21], 
-                "t͡ɕ": [19, 16], "t͡ʂ": [19, 15], "c": [16], "d": [19], "d̪ʲ": [19], "d͡z": [19, 15], "d͡ʑ": [19, 16], 
-                "f": [18], "fʲ": [18], "ɡ": [20], "ɟ": [16], "d͡ʐ": [19, 15], "k": [20], "l": [14], "l̪ʲ": [14], "m": [21], 
-                "mʲ": [21], "n": [19], "ŋ": [20], "ɲ": [19], "p": [21], "pʲ": [21], "r": [13], "rʲ": [13], "s": [15], 
-                "ɕ": [16], "ʃ": [16], "t": [19], "t̪ʲ": [19], "t͡s": [19, 15], "v": [18], "vʲ": [18], "w": [7], "x": [12], 
-                "xʲ": [12], "j": [6], "z": [15], "ʑ": [16], "ʒ": [16],
-
-                # pt-BR
-                "i": [6], "ĩ": [6], "a": [2], "ɔ": [3], "u": [7], "ũ": [7], "o": [8], "e": [4], "ɐ̃": [4], "ə": [1], 
-                "ɛ": [4], "ẽ": [4], "õ": [8], "w̃": [7], "w": [7], "p": [21], "b": [21], "t": [19], "d": [19], "g": [20], 
-                "m": [21], "n": [19], "ɲ": [19], "f": [18], "v": [18], "ɾ": [19], "s": [15], "z": [15], "ʃ": [16], "ʒ": [16], 
-                "x": [12], "tʃ": [19, 16], "dʒ": [19, 16], "l": [14], "ʎ": [14], "j̃": [6], "j": [6], "k": [20],
-
-                # pt-PT
-                "a": [2], "ɐ": [4], "ɐj": [4, 6], "ɐ̃": [4], "ɐ̃j̃": [4, 6], "ɐ̃w̃": [4, 7], "ɐ͡w": [4, 7], "a͡j": [2, 6], 
-                "ɔ": [3], "ɔ͡j": [3, 6], "a͡w": [2, 7], "ə": [1], "e": [4], "ɛ": [4], "ɛ͡w": [4, 7], "ẽ": [4], "e͡w": [4, 7], 
-                "i": [6], "ĩ": [6], "i͡w": [6, 7], "o": [8], "o͡j": [8, 6], "õ": [8], "õj̃": [8, 6], "u": [7], "u͡j": [7, 6], 
-                "ũ": [7], "ũj̃": [7, 6], "b": [21], "d": [19], "ɾ": [19], "f": [18], "g": [20], "j": [6], "k": [20], "l": [14], 
-                "ɫ": [14], "ʎ": [14], "m": [21], "n": [19], "ɲ": [19], "p": [21], "ʀ": [13], "s": [15], "ʃ": [16], "t": [19], 
-                "v": [18], "w": [7], "z": [15], "ʒ": [16],
-
-                # ro-RO
-                "ə": [1], "ɨ": [6], "a": [2], "e": [4], "e̯a": [4, 2], "e̯o": [4, 8], "i": [6], "o": [8], "o̯a": [8, 2], 
-                "u": [7], "b": [21], "bʲ": [21], "d": [19], "d͡ʒ": [19, 16], "d͡ʒʲ": [19, 16], "f": [18], "fʲ": [18], "g": [20], 
-                "gʲ": [20], "h": [12], "j": [6], "k": [20], "kʲ": [20], "l": [14], "lʲ": [14], "m": [21], "mʲ": [21], "n": [19], 
-                "ŋ": [20], "nʲ": [19], "p": [21], "pʲ": [21], "r": [13], "rʲ": [13], "s": [15], "ʃ": [16], "ʃʲ": [16], "t": [19], 
-                "tʲ": [19], "t͡s": [19, 15], "t͡ʃ": [19, 16], "t͡sʲ": [19, 15], "t͡ʃʲ": [19, 16], "v": [18], "vʲ": [18], "w": [7], 
-                "z": [15], "ʒ": [16], "zʲ": [15], "ʒʲ": [16],
-
-                # ru-RU
-                "a": [2], "ʌ": [1], "ə": [1], "ɛ": [4], "i": [6], "ɪ": [6], "ɨ": [6], "ɔ": [3], "u": [7], "p": [21], "pʲ": [21], 
-                "b": [21], "bʲ": [21], "t": [19], "tʲ": [19], "d": [19], "dʲ": [19], "k": [20], "kʲ": [20], "g": [20], 
-                "gʲ": [20], "x": [12], "xʲ": [12], "f": [18], "fʲ": [18], "v": [18], "vʲ": [18], "s": [15], "sʲ": [15], 
-                "z": [15], "zʲ": [15], "ʂ": [15], "ʐ": [15], "t͡s": [19, 15], "t͡ɕ": [19, 16], "ɕː": [16], "m": [21], 
-                "mʲ": [21], "n": [19], "nʲ": [19], "l": [14], "lʲ": [14], "r": [13], "rʲ": [13], "j": [6],
-
-                # sk-SK
-                "i": [6], "e": [4], "a": [2], "o": [8], "u": [7], "ʉ": [6], "iː": [6], "eː": [4], "aː": [2], "oː": [8], 
-                "uː": [7], "i͡a": [6, 2], "i͡e": [6, 4], "i͡u": [6, 7], "u͡o": [7, 8], "au": [2, 7], "ou": [8, 7], "ə": [1], 
-                "p": [21], "b": [21], "t": [19], "d": [19], "c": [16], "ɟ": [16], "k": [20], "g": [20], "t͡s": [19, 15], 
-                "d͡z": [19, 15], "t͡ʃ": [19, 16], "d͡ʒ": [19, 16], "f": [18], "v": [18], "s": [15], "z": [15], "ʃ": [16], 
-                "ʒ": [16], "x": [12], "ɦ": [12], "r": [13], "r̩": [13], "r̩ː": [13], "l": [14], "l̩": [14], "l̩ː": [14], 
-                "ʎ": [14], "m": [21], "ɱ": [21], "n": [19], "ɴ": [19], "ŋ": [20], "ɲ": [19], "u̯": [7], "i̯": [6], "j": [6], 
-                "w": [7],
-
-                # sl-SI
-                "ə": [1], "a": [2], "aː": [2], "ɛ": [4], "eː": [4], "ɛː": [4], "i": [6], "iː": [6], "ɔ": [3], "ɔː": [3], 
-                "oː": [8], "u": [7], "uː": [7], "b": [21], "d": [19], "dˡ": [19], "dn": [19, 19], "d͡ʒ": [19, 16], 
-                "d͡z": [19, 15], "f": [18], "ɱ": [21], "ɣ": [20], "g": [20], "ɪ": [6], "j": [6], "k": [20], "l": [14], "lʲ": [14], 
-                "m": [21], "ŋ": [20], "n": [19], "nʲ": [19], "p": [21], "r": [13], "s": [15], "ʃ": [16], "t": [19], "tˡ": [19], 
-                "tn": [19, 19], "t͡ʃ": [19, 16], "t͡s": [19, 15], "u̯": [7], "v": [18], "w": [7], "ʍ": [7], "x": [12], "ʒ": [16], 
-                "z": [15],
-
-                # sv-SE
-                "a": [2], "æ": [1], "æː": [1], "ɑː": [2], "ɔ": [3], "a‿u": [2, 7], "ə": [1], "e": [4], "ɛ": [4], "ɛː": [4], 
-                "eː": [4], "ɶ": [8], "œː": [4], "œ": [4], "øː": [1], "ɪ": [6], "iː": [6], "ʊ": [4], "uː": [7], "oː": [8], 
-                "ɵ": [1], "ʉː": [6], "y": [4], "yː": [4], "p": [21], "t": [19], "k": [20], "b": [21], "d": [19], "g": [20], 
-                "f": [18], "h": [12], "s": [15], "ɧ": [16], "ɕ": [16], "v": [18], "m": [21], "n": [19], "ŋ": [20], "l": [14], 
-                "r": [13], "j": [6], "ɖ": [19], "ɭ": [14], "ɳ": [19], "ʂ": [15], "ʈ": [19],
-
-                # th-TH
-                "a": [2], "aː": [2], "e": [4], "eː": [4], "i": [6], "iː": [6], "ia": [6, 2], "o": [8], "oː": [8], "ə": [1], 
-                "əː": [1], "u": [7], "uː": [7], "ua": [7, 2], "ɯ": [6], "ɯː": [6], "ɯa": [6, 2], "ɛ": [4], "ɛː": [4], 
-                "ɔ": [3], "ɔː": [3], "b": [21], "t͡ɕ": [19, 16], "tɕʰ": [19, 16], "d": [19], "f": [18], "h": [12], "j": [6], 
-                "k": [20], "kʰ": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "pʰ": [21], "r": [13], 
-                "s": [15], "t": [19], "tʰ": [19], "w": [7], "ʔ": [19],
-
-                # tr-TR
-                "a": [2], "ɑː": [2], "e": [4], "eː": [4], "œ": [4], "œ͡ɟ": [4, 16], "i": [6], "i͡ɟ": [6, 16], "o": [8], 
-                "o͡ɟ": [8, 16], "u": [7], "u͡ɟ": [7, 16], "ɯ": [6], "ɯ͡ɟ": [6, 16], "y": [4], "y͡ɟ": [4, 16], "b": [21], 
-                "c": [16], "t͡ʃ": [19, 16], "d": [19], "f": [18], "ɡ": [20], "ɣ": [20], "ɟ": [16], "h": [12], "j": [6], 
-                "d͡ʒ": [19, 16], "k": [20], "l": [14], "ɮ": [6], "m": [21], "n": [19], "ŋ": [20], "p": [21], "ɾ": [19], 
-                "s": [15], "ʃ": [16], "t": [19], "v": [18], "w": [7], "z": [15], "ʒ": [16],
-
-                # vi-VN
-                "a": [2], "ɛ": [4], "i": [6], "ɔ": [3], "u": [7], "u͡a": [7, 2], "a͡j": [2, 6], "ɛ̆j": [4, 6], "ə͡j": [1, 6], 
-                "o": [8], "i͡e͡w": [6, 4, 7], "ɨ͡ə": [6, 1], "ɔ͡i": [3, 6], "ə": [1], "ie": [6, 4], "u͡j": [7, 6], "a͡w": [2, 7], 
-                "ɨ": [6], "ɐ": [4], "ăw": [2, 7], "ăj": [2, 6], "ɨ͡ə͡j": [6, 1, 6], "o͡j": [8, 6], "əː": [1], "e": [4], 
-                "ɔ̆w": [3, 7], "ɛ͡w": [4, 7], "i͡w": [6, 7], "ɨ͡w": [6, 7], "e͡j": [4, 6], "ɨ͡ʌ͡w": [6, 1, 7], "ɨ͡j": [6, 6], 
-                "ɪ": [6], "iə": [6, 1], "a͡ʲ": [2], "ɓ": [21], "k": [20], "z": [15], "j": [6], "ɹ": [13], "f": [18], "ɣ": [20], 
-                "h": [12], "l": [14], "m": [21], "n": [19], "p": [21], "s": [15], "ʂ": [15], "t": [19], "v": [18], "ɗ": [19], 
-                "ŋ": [20], "x": [12], "ɲ": [19], "tʰ": [19], "ʈ": [19], "t͡ʃ": [19, 16], "w": [7]
-            }
-            return phoneme_viseme_map.get(phoneme, [])
-
         viseme_list = []
         chunks = data.get('chunks', [])
 
-        for i, chunk in enumerate(chunks):
+        for _, chunk in enumerate(chunks):
             phoneme = chunk.get('text', None)
             timestamp = chunk.get('timestamp', None)
-            visemes = _phoneme_to_viseme(phoneme)
+            visemes = self.phoneme_viseme_map.get(phoneme, [])
             
             for viseme in visemes:
                 viseme_list.append({
@@ -331,11 +81,21 @@ def _phoneme_to_viseme(phoneme: str) -> List[int]:
 
 
     def process(self, audio_file: str) -> List[Dict[str, Any]]:
-        """Process an audio file and convert speech to visemes."""
+        """Process an audio file and convert speech to visemes.
+        
+        Heuristically, we found that the model requires at least 0.5 seconds of audio to run phoneme recognition.
+        This value may also depend on the model, the language, and other factors.
+
+        Args:
+            audio_file (str): The path to the audio file.
+
+        Returns:
+            List[Dict[str, Any]]: A list of dictionaries where each dictionary contains the viseme 
+                ID and the corresponding timestamp.
+        """
         # Perform ASR to get phoneme data
         asr_result = self.asr_pipeline(audio_file, return_timestamps='char')
         # Map phonemes to visemes
         viseme_data = self._map_phonemes_to_visemes(asr_result)
-
         return viseme_data
     
\ No newline at end of file

From 20ec10bba6614125a19f994987c2ddf8409f6de0 Mon Sep 17 00:00:00 2001
From: fabiocat93 <fabio-cat93@hotmail.it>
Date: Sun, 6 Oct 2024 10:45:48 -0400
Subject: [PATCH 3/7] adding pre-commit hooks for codespell and ruff style
 check

---
 .pre-commit-config.yaml                   | 12 ++++++++++++
 LLM/chat.py                               |  2 +-
 LLM/language_model.py                     |  4 ++--
 LLM/mlx_language_model.py                 |  4 ++--
 LLM/openai_api_language_model.py          |  4 ++--
 README.md                                 |  4 ++--
 STT/lightning_whisper_mlx_handler.py      |  3 +--
 STT/paraformer_handler.py                 |  2 +-
 STT/whisper_stt_handler.py                |  2 +-
 TTS/STV/speech_to_visemes.py              |  2 +-
 arguments_classes/parler_tts_arguments.py |  2 +-
 11 files changed, 26 insertions(+), 15 deletions(-)
 create mode 100644 .pre-commit-config.yaml

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..185a1e0
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,12 @@
+repos:
+  - repo: https://github.com/codespell-project/codespell
+    rev: v2.2.5  # Specify the latest stable version
+    hooks:
+      - id: codespell
+        args: ["-w"]  # The -w flag tells codespell to automatically apply fixes
+
+  - repo: https://github.com/charliermarsh/ruff-pre-commit
+    rev: v0.1.1  # Replace with the latest stable version of ruff-pre-commit
+    hooks:
+      - id: ruff
+        args: ["--fix"]  # This will automatically fix linting issues
diff --git a/LLM/chat.py b/LLM/chat.py
index bc8ac4f..6f5569d 100644
--- a/LLM/chat.py
+++ b/LLM/chat.py
@@ -6,7 +6,7 @@ class Chat:
     def __init__(self, size):
         self.size = size
         self.init_chat_message = None
-        # maxlen is necessary pair, since a each new step we add an prompt and assitant answer
+        # maxlen is necessarily even, since at each new step we add a prompt and an assistant answer
         self.buffer = []
 
     def append(self, item):
diff --git a/LLM/language_model.py b/LLM/language_model.py
index ddeb34b..202e007 100644
--- a/LLM/language_model.py
+++ b/LLM/language_model.py
@@ -68,7 +68,7 @@ def setup(
         if init_chat_role:
             if not init_chat_prompt:
                 raise ValueError(
-                    "An initial promt needs to be specified when setting init_chat_role."
+                    "An initial prompt needs to be specified when setting init_chat_role."
                 )
             self.chat.init_chat({"role": init_chat_role, "content": init_chat_prompt})
         self.user_role = user_role
@@ -111,7 +111,7 @@ def warmup(self):
             )
 
     def process(self, prompt):
-        logger.debug("infering language model...")
+        logger.debug("inferring language model...")
         language_code = None
         if isinstance(prompt, tuple):
             prompt, language_code = prompt
diff --git a/LLM/mlx_language_model.py b/LLM/mlx_language_model.py
index 87812c5..8269b3b 100644
--- a/LLM/mlx_language_model.py
+++ b/LLM/mlx_language_model.py
@@ -42,7 +42,7 @@ def setup(
         if init_chat_role:
             if not init_chat_prompt:
                 raise ValueError(
-                    "An initial promt needs to be specified when setting init_chat_role."
+                    "An initial prompt needs to be specified when setting init_chat_role."
                 )
             self.chat.init_chat({"role": init_chat_role, "content": init_chat_prompt})
         self.user_role = user_role
@@ -68,7 +68,7 @@ def warmup(self):
             )
 
     def process(self, prompt):
-        logger.debug("infering language model...")
+        logger.debug("inferring language model...")
         language_code = None
 
         if isinstance(prompt, tuple):
diff --git a/LLM/openai_api_language_model.py b/LLM/openai_api_language_model.py
index dcbabe0..2866867 100644
--- a/LLM/openai_api_language_model.py
+++ b/LLM/openai_api_language_model.py
@@ -44,7 +44,7 @@ def setup(
         if init_chat_role:
             if not init_chat_prompt:
                 raise ValueError(
-                    "An initial promt needs to be specified when setting init_chat_role."
+                    "An initial prompt needs to be specified when setting init_chat_role."
                 )
             self.chat.init_chat({"role": init_chat_role, "content": init_chat_prompt})
         self.user_role = user_role
@@ -54,7 +54,7 @@ def setup(
     def warmup(self):
         logger.info(f"Warming up {self.__class__.__name__}")
         start = time.time()
-        response = self.client.chat.completions.create(
+        _ = self.client.chat.completions.create(
             model=self.model_name,
             messages=[
                 {"role": "system", "content": "You are a helpful assistant"},
diff --git a/README.md b/README.md
index 02c1676..9f0765c 100644
--- a/README.md
+++ b/README.md
@@ -80,7 +80,7 @@ The pipeline can be run in two ways:
 - **Server/Client approach**: Models run on a server, and audio input/output are streamed from a client.
 - **Local approach**: Runs locally.
 
-### Recommanded setup 
+### Recommended setup 
 
 ### Server/Client Approach
 
@@ -120,7 +120,7 @@ https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install
 
 ### Recommended usage with Cuda
 
-Leverage Torch Compile for Whisper and Parler-TTS. **The usage of Parler-TTS allows for audio output streaming, futher reducing the overeall latency** 🚀:
+Leverage Torch Compile for Whisper and Parler-TTS. **The usage of Parler-TTS allows for audio output streaming, further reducing the overall latency** 🚀:
 
 ```bash
 python s2s_pipeline.py \
diff --git a/STT/lightning_whisper_mlx_handler.py b/STT/lightning_whisper_mlx_handler.py
index 53b6b5a..2f2d657 100644
--- a/STT/lightning_whisper_mlx_handler.py
+++ b/STT/lightning_whisper_mlx_handler.py
@@ -4,7 +4,6 @@
 from lightning_whisper_mlx import LightningWhisperMLX
 import numpy as np
 from rich.console import Console
-from copy import copy
 import torch
 
 logger = logging.getLogger(__name__)
@@ -55,7 +54,7 @@ def warmup(self):
             _ = self.model.transcribe(dummy_input)["text"].strip()
 
     def process(self, spoken_prompt):
-        logger.debug("infering whisper...")
+        logger.debug("inferring whisper...")
 
         global pipeline_start
         pipeline_start = perf_counter()
diff --git a/STT/paraformer_handler.py b/STT/paraformer_handler.py
index 99fd6ac..dcadc02 100644
--- a/STT/paraformer_handler.py
+++ b/STT/paraformer_handler.py
@@ -45,7 +45,7 @@ def warmup(self):
             _ = self.model.generate(dummy_input)[0]["text"].strip().replace(" ", "")
 
     def process(self, spoken_prompt):
-        logger.debug("infering paraformer...")
+        logger.debug("inferring paraformer...")
 
         global pipeline_start
         pipeline_start = perf_counter()
diff --git a/STT/whisper_stt_handler.py b/STT/whisper_stt_handler.py
index 0930087..88c578f 100644
--- a/STT/whisper_stt_handler.py
+++ b/STT/whisper_stt_handler.py
@@ -109,7 +109,7 @@ def warmup(self):
             )
 
     def process(self, spoken_prompt):
-        logger.debug("infering whisper...")
+        logger.debug("inferring whisper...")
 
         global pipeline_start
         pipeline_start = perf_counter()
diff --git a/TTS/STV/speech_to_visemes.py b/TTS/STV/speech_to_visemes.py
index 861a7d9..16ad95c 100644
--- a/TTS/STV/speech_to_visemes.py
+++ b/TTS/STV/speech_to_visemes.py
@@ -1,10 +1,10 @@
 """This module contains the SpeechToVisemes class, which handles the conversion of speech to visemes."""
+from typing import List, Dict, Any
 from transformers import pipeline
 import logging
 import json
 
 logger = logging.getLogger(__name__)
-from typing import List, Dict, Any
 
 class SpeechToVisemes():
     """
diff --git a/arguments_classes/parler_tts_arguments.py b/arguments_classes/parler_tts_arguments.py
index 1bb0f21..b519751 100644
--- a/arguments_classes/parler_tts_arguments.py
+++ b/arguments_classes/parler_tts_arguments.py
@@ -57,6 +57,6 @@ class ParlerTTSHandlerArguments:
     max_prompt_pad_length: int = field(
         default=8,
         metadata={
-            "help": "When using compilation, the prompt as to be padded to closest power of 2. This parameters sets the maximun power of 2 possible."
+            "help": "When using compilation, the prompt as to be padded to closest power of 2. This parameters sets the maximum power of 2 possible."
         },
     )

From 522f716383a2fe8c6339c24992694ceddc8a6c67 Mon Sep 17 00:00:00 2001
From: fabiocat93 <fabio-cat93@hotmail.it>
Date: Sun, 6 Oct 2024 10:52:23 -0400
Subject: [PATCH 4/7] removing TTS/STV/phoneme_viseme_map_readable.json.txt

---
 TTS/STV/phoneme_viseme_map_readable.json.txt | 241 -------------------
 1 file changed, 241 deletions(-)
 delete mode 100644 TTS/STV/phoneme_viseme_map_readable.json.txt

diff --git a/TTS/STV/phoneme_viseme_map_readable.json.txt b/TTS/STV/phoneme_viseme_map_readable.json.txt
deleted file mode 100644
index 911f8c7..0000000
--- a/TTS/STV/phoneme_viseme_map_readable.json.txt
+++ /dev/null
@@ -1,241 +0,0 @@
-{
-    # basic
-    'æ': [1], 'ə': [1], 'ʌ': [1], 'ɑ': [2], 'ɔ': [3], 'ɛ': [4], 'ʊ': [4], 'ɝ': [5], 'j': [6], 'i': [6], 'ɪ': [6], 
-    'w': [7], 'u': [7], 'o': [8], 'aʊ': [9], 'ɔɪ': [10], 'aɪ': [11], 'h': [12], 'ɹ': [13], 'l': [14], 's': [15], 
-    'z': [15], 'ʃ': [16], 'tʃ': [16], 'dʒ': [16], 'ʒ': [16], 'ð': [17], 'f': [18], 'v': [18], 'd': [19], 't': [19], 
-    'n': [19], 'θ': [19], 'k': [20], 'g': [20], 'ŋ': [20], 'p': [21], 'b': [21], 'm': [21], ' ': [0],
-
-    # ar-EG
-    "a": [2], "aː": [2], "i": [6], "iː": [6], "u": [7], "uː": [7], "b": [21], "d": [19], "g": [20], "k": [20], 
-    "t": [19], "dˤ": [19], "q": [20], "tˤ": [19], "ʔ": [19], "f": [18], "h": [12], "ħ": [12], "s": [15], "θ": [19], 
-    "z": [15], "ðˤ": [17], "ð": [17], "ɣ": [20], "x": [12], "ʃ": [16], "sˤ": [15], "j": [6], "w": [7], "l": [14], 
-    "m": [21], "n": [19], "r": [13], "ʕ": [12],
-
-    # bg-BG
-    "i": [6], "ɛ": [4], "ɔ": [3], "a": [2], "u": [7], "j͡a": [6, 2], "ɤ": [1], "j͡u": [6, 7], "n": [19], "ʒ": [16], 
-    "k": [20], "t͡s": [19, 15], "t": [19], "p": [21], "r": [13], "s": [15], "d": [19], "x": [12], "zʲ": [15], 
-    "lʲ": [14], "l": [14], "nʲ": [19], "v": [18], "m": [21], "b": [21], "g": [20], "d͡ʒ": [19, 16], "f": [18], 
-    "mʲ": [21], "tʲ": [19], "rʲ": [13], "pʲ": [21], "dʲ": [19], "j": [6], "vʲ": [18], "sʲ": [15], "bʲ": [21], 
-    "kʲ": [20], "gʲ": [20], "fʲ": [18], "z": [15], "ʃ": [16], "t͡ʃ": [19, 16], "d͡z": [19, 15],
-
-    # ca-ES
-    "a": [2], "ɔ": [3], "ə": [1], "e": [4], "ɛ": [4], "i": [6], "o": [8], "u": [7], "b": [21], "β": [21], "t͡ʃ": [19, 16], 
-    "d": [19], "ð": [17], "f": [18], "g": [20], "ɣ": [20], "j": [6], "d͡ʒ": [19, 16], "k": [20], "l": [14], "ʎ": [14], 
-    "m": [21], "n": [19], "ŋ": [20], "ɲ": [19], "p": [21], "ɾ": [19], "r": [13], "s": [15], "ʃ": [16], "t": [19], 
-    "θ": [19], "w": [7], "x": [12], "z": [15], "ʒ": [16],
-
-    # cs-CZ
-    "ɪ": [6], "ɛ": [4], "a": [2], "o": [8], "u": [7], "iː": [6], "ɛː": [4], "aː": [2], "oː": [8], "uː": [7], 
-    "o͡ʊ̯": [8, 4], "a͡ʊ": [2, 4], "ɛ͡ʊ̯": [4, 4], "ə": [1], "p": [21], "b": [21], "t": [19], "d": [19], "c": [16], 
-    "ɟ": [16], "k": [20], "g": [20], "t͡s": [19, 15], "d͡z": [19, 15], "t͡ʃ": [19, 16], "d͡ʒ": [19, 16], "f": [18], 
-    "v": [18], "s": [15], "z": [15], "r̝": [13], "ʃ": [16], "ʒ": [16], "j": [6], "x": [12], "ɦ": [12], "r": [13], 
-    "l": [14], "m": [21], "n": [19], "ŋ": [20], "ɲ": [19], "ɱ": [21], "r̝̊": [13],
-
-    # da-DK
-    "a": [2], "ɑ": [2], "ɑː": [2], "ɛ": [4], "ɛː": [4], "ɔ": [3], "ɒ": [2], "ɒː": [2], "ɔː": [3], "ɐ": [4], 
-    "æː": [1], "e": [4], "ø": [1], "øː": [1], "ə": [1], "eː": [4], "i": [6], "iː": [6], "o": [8], "œ": [4], 
-    "œː": [4], "oː": [8], "u": [7], "uː": [7], "y": [4], "yː": [4], "b": [21], "d": [19], "ð": [17], "f": [18], 
-    "g": [20], "h": [12], "j": [6], "kʰ": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "pʰ": [21], "ʔ": [19], 
-    "ʁ": [13], "ɐ̯": [4], "s": [15], "ɕ": [16], "t": [19], "v": [18], "w": [7],
-
-    # de-DE/de-CH/de-AT
-    "aː": [2], "a": [2], "ɔ": [3], "ɛː": [4], "ɛ": [4], "ə": [1], "iː": [6], "ɪ": [6], "øː": [1], "o": [8], 
-    "oː": [8], "œ": [4], "e": [4], "eː": [4], "uː": [7], "ʊ": [4], "yː": [4], "ʏ": [7], "ai": [2, 6], "au": [2, 7], 
-    "ɔy": [3, 4], "ɔʏ̯": [3, 4], "ɐ": [4], "b": [21], "d": [19], "ʤ": [16], "f": [18], "g": [20], "h": [12], 
-    "j": [6], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "pf": [21, 18], "ʀ": [13], "r": [13], 
-    "ʁ": [13], "s": [15], "ʃ": [16], "t": [19], "ts": [19, 15], "tʃ": [19, 16], "v": [18], "x": [12], "z": [15], 
-    "ʒ": [16], "ʔ": [19],
-
-    # el-GR
-    "a": [2], "e": [4], "i": [6], "o": [8], "u": [7], "b": [21], "c": [16], "ç": [12], "d": [19], "ð": [17], 
-    "d͡z": [19, 15], "f": [18], "g": [20], "ɣ": [20], "ɟ": [16], "j": [6], "ʝ": [12], "k": [20], "l": [14], 
-    "m": [21], "n": [19], "p": [21], "ɾ": [19], "s": [15], "t": [19], "θ": [19], "t͡s": [19, 15], "v": [18], 
-    "x": [12], "z": [15],
-
-    # en-GB/en-IE/en-AU
-    "ɑː": [2], "æ": [1], "ʌ": [1], "ɛə": [4, 1], "aʊ": [2, 4], "ə": [1], "aɪ": [2, 6], "ɛ": [4], "ɜː": [5], 
-    "eɪ": [4, 6], "ɪ": [6], "ɪə": [6, 1], "iː": [6], "ɒ": [2], "ɔː": [3], "əʊ": [1, 4], "ɔɪ": [3, 6], "ʊ": [4], 
-    "ʊə": [4, 1], "uː": [7], "b": [21], "tʃ": [19, 16], "d": [19], "ð": [17], "f": [18], "g": [20], "h": [12], 
-    "j": [6], "dʒ": [19, 16], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "ɹ": [13], 
-    "s": [15], "ʃ": [16], "t": [19], "θ": [19], "v": [18], "w": [7], "z": [15], "ʒ": [16],
-
-    # en-US/en-CA
-    "iy": [6], "ɪ": [6], "eɪ": [4, 6], "ɛ": [4], "æ": [1], "ɑ": [2], "ɔ": [3], "ʊ": [4], "oʊ": [8, 4], "u": [7], 
-    "ʌ": [1], "aɪ": [11], "aʊ": [9], "ɔɪ": [10], "ju": [6, 7], "ə": [1], "ɪɹ": [6, 13], "ɛɹ": [4, 13], "ʊɹ": [4, 13], 
-    "aɪɹ": [11, 13], "aʊɹ": [9, 13], "ɔɹ": [3, 13], "ɑɹ": [2, 13], "ɝ": [5], "ɚ": [1], "w": [7], "j": [6], 
-    "p": [21], "b": [21], "t": [19], "d": [19], "k": [20], "g": [20], "m": [21], "n": [19], "ŋ": [20], "f": [18], 
-    "v": [18], "θ": [19], "ð": [17], "s": [15], "z": [15], "ʃ": [16], "ʒ": [16], "h": [12], "tʃ": [19, 16], 
-    "dʒ": [19, 16], "l": [14], "ɹ": [13],
-
-    # es-ES
-    "a": [2], "i": [6], "e": [4], "o": [8], "u": [7], "b": [21], "β": [21], "t͡ʃ": [19, 16], "d": [19], "ð": [17], 
-    "f": [18], "g": [20], "ɣ": [20], "j": [6], "j͡j": [6, 6], "k": [20], "l": [14], "ʎ": [14], "m": [21], "n": [19], 
-    "ɲ": [19], "p": [21], "ɾ": [19], "r": [13], "s": [15], "t": [19], "θ": [19], "w": [7], "x": [12], "z": [15], "ʒ": [16],
-
-    # es-MX
-    "ɑ": [2], "e": [4], "i": [6], "o": [8], "u": [7], "b": [21], "β": [21], "t͡ʃ": [19, 16], "d": [19], "ð": [17], 
-    "f": [18], "g": [20], "ɣ": [20], "j": [6], "j͡j": [6, 6], "k": [20], "l": [14], "ʎ": [14], "m": [21], "n": [19], 
-    "ɲ": [19], "p": [21], "ɾ": [19], "r": [13], "s": [15], "t": [19], "θ": [19], "w": [7], "x": [12],
-
-    # fi-FI
-    "ɑ": [2], "ɑ͡i": [2, 6], "ɑ͡u": [2, 7], "ɑː": [2], "æ": [1], "æ͡i": [1, 6], "æ͡y": [1, 4], "æː": [1], "e": [4], 
-    "e͡i": [4, 6], "ø": [1], "ø͡i": [1, 6], "ø͡y": [1, 4], "øː": [1], "e͡u": [4, 7], "e͡y": [4, 4], "eː": [4], "i": [6], 
-    "i͡e": [6, 4], "i͡u": [6, 7], "i͡y": [6, 4], "iː": [6], "o": [8], "o͡i": [8, 6], "o͡u": [8, 7], "oː": [8], "u": [7], 
-    "u͡i": [7, 6], "u͡o": [7, 8], "uː": [7], "y": [4], "y͡ø": [4, 1], "y͡i": [4, 6], "yː": [4], "b": [21], "d": [19], 
-    "f": [18], "g": [20], "h": [12], "j": [6], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "r": [13], 
-    "s": [15], "ʃ": [16], "t": [19], "ʋ": [18],
-
-    # fr-FR/fr-CA/fr-CH
-    "a": [2], "ɑ": [2], "ɑ̃": [2], "ə": [1], "ɛ": [4], "ø": [1], "e": [4], "ɛ̃": [4], "i": [6], "œ": [4], "ɔ": [3], 
-    "ɔ̃": [3], "o": [8], "œ̃": [4], "u": [7], "y": [4], "b": [21], "d": [19], "f": [18], "g": [20], "ɲ": [19], 
-    "ɥ": [7], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "ʁ": [13], "s": [15], "ʃ": [16], 
-    "t": [19], "v": [18], "w": [7], "j": [6], "z": [15], "n‿": [19], "t‿": [19], "z‿": [15],
-
-    # he-IL
-    "i": [6], "e": [4], "a": [2], "o": [8], "u": [7], "p": [21], "b": [21], "t": [19], "d": [19], "k": [20], "g": [20], 
-    "ʔ": [19], "f": [18], "v": [18], "s": [15], "z": [15], "ʃ": [16], "x": [12], "h": [12], "t͡s": [19, 15], "m": [21], 
-    "n": [19], "l": [14], "ʁ": [13], "j": [6], "ʒ": [16], "tʃ": [19, 16], "dʒ": [19, 16],
-
-    # hr-HR
-    "e": [4], "eː": [4], "i": [6], "iː": [6], "u": [7], "uː": [7], "a": [2], "aː": [2], "o": [8], "oː": [8], 
-    "d": [19], "v": [18], "s": [15], "t": [19], "n": [19], "l": [14], "ʎ": [14], "t͡s": [19, 15], "t͡ʃ": [19, 16], 
-    "j": [6], "x": [12], "z": [15], "ʒ": [16], "r": [13], "k": [20], "m": [21], "p": [21], "g": [20], "ʨ": [16], 
-    "f": [18], "b": [21], "d͡ʒ": [19, 16], "ɲ": [19], "ʥ": [16], "ʃ": [16],
-
-    # hu-HU
-    "ø": [1], "øː": [1], "a": [2], "aː": [2], "ɛ": [4], "eː": [4], "i": [6], "iː": [6], "o": [8], "ɒ": [2], 
-    "oː": [8], "u": [7], "uː": [7], "y": [4], "yː": [4], "b": [21], "bː": [21], "d": [19], "ɟ": [16], "dː": [19], 
-    "ɟː": [16], "d͡ʒ": [19, 16], "d͡ʒː": [19, 16], "dz": [19, 15], "dzː": [19, 15], "f": [18], "fː": [18], 
-    "g": [20], "gː": [20], "h": [12], "hː": [12], "j": [6], "ɲ": [19], "jː": [6], "ɲː": [19], "k": [20], 
-    "kː": [20], "l": [14], "lː": [14], "m": [21], "mː": [21], "n": [19], "ŋ": [20], "nː": [19], "p": [21], 
-    "pː": [21], "r": [13], "rː": [13], "s": [15], "ʃ": [16], "sː": [15], "ʃː": [16], "t": [19], "c": [16], 
-    "tː": [19], "cː": [16], "t͡s": [19, 15], "t͡ʃ": [19, 16], "t͡sː": [19, 15], "t͡ʃː": [19, 16], "v": [18], 
-    "vː": [18], "x": [12], "ɰ": [20], "z": [15], "ʒ": [16], "zː": [15], "ʒː": [16],
-
-    # id-ID
-    "ə": [1], "a": [2], "a͡i": [2, 6], "a͡ʊ": [2, 4], "e": [4], "ɛ": [4], "ɪ": [6], "i": [6], "ɔ": [3], "o": [8], 
-    "ɔ͡i": [3, 6], "u": [7], "ʊ": [4], "ʔ": [19], "b": [21], "d": [19], "d͡ʒ": [19, 16], "f": [18], "g": [20], 
-    "h": [12], "ɲ": [19], "j": [6], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "r": [13], 
-    "s": [15], "ʃ": [16], "t": [19], "t͡ʃ": [19, 16], "w": [7], "x": [12], "z": [15],
-
-    # it-IT
-    "a": [2], "ai": [2, 6], "au": [2, 7], "e": [4], "ɛ": [4], "ɛj": [4, 6], "ɛu": [4, 7], "ei": [4, 6], "eu": [4, 7], 
-    "i": [6], "u": [7], "o": [8], "ɔ": [3], "ɔj": [3, 6], "oi": [8, 6], "ou": [8, 7], "b": [21], "bː": [21], 
-    "ʧ": [16], "tʃː": [19, 16], "kː": [20], "d": [19], "dː": [19], "ʣ": [15], "ʣː": [15], "f": [18], "fː": [18], 
-    "ʤ": [16], "ʤː": [16], "g": [20], "gː": [20], "ʎ": [14], "ʎː": [14], "ɲː": [19], "ɲ": [19], "j": [6], "k": [20], 
-    "l": [14], "lː": [14], "m": [21], "mː": [21], "n": [19], "nː": [19], "p": [21], "pː": [21], "ɾ": [19], 
-    "rː": [13], "s": [15], "sː": [15], "ʃ": [16], "ʃː": [16], "t": [19], "tː": [19], "ʦ": [15], "ʦː": [15], 
-    "v": [18], "vː": [18], "w": [7], "z": [15],
-
-    # ko-KR
-    "a": [2], "ɛ": [4], "e": [4], "ɯ": [6], "i": [6], "ʌ": [1], "o": [8], "u": [7], "ɰ͡i": [20, 6], "ø": [1], 
-    "w͡a": [7, 2], "w͡ɛ": [7, 4], "w͡e": [7, 4], "w͡i": [7, 6], "w͡ʌ": [7, 1], "j͡a": [6, 2], "j͡ɛ": [6, 4], 
-    "j͡e": [6, 4], "j͡ʌ": [6, 1], "j͡o": [6, 8], "j͡u": [6, 7], "b̥": [21], "p": [21], "b": [21], "t͡ɕʰ": [19, 16], 
-    "d̥": [19], "t": [19], "d": [19], "g̥": [20], "k": [20], "g": [20], "h": [12], "ɦ": [12], "d͡ʑ": [19, 16], 
-    "d͡ʑ̥": [19, 16], "t͡ɕ": [19, 16], "kʰ": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "pʰ": [21], 
-    "ɾ": [19], "sʰ": [15], "s": [15], "tʰ": [19],
-
-    # ms-MY
-    "i": [6], "u": [7], "ə": [1], "e": [4], "o": [8], "a": [2], "a͡i": [2, 6], "au": [2, 7], "oi": [8, 6], 
-    "p": [21], "b": [21], "t": [19], "d": [19], "k": [20], "g": [20], "ʔ": [19], "t͡ʃ": [19, 16], "d͡ʒ": [19, 16], 
-    "m": [21], "n": [19], "ɲ": [19], "ŋ": [20], "f": [18], "v": [18], "s": [15], "z": [15], "ʃ": [16], "x": [12], 
-    "r": [13], "h": [12], "j": [6], "w": [7], "l": [14],
-
-    # nb-NO
-    "ɑ": [2], "æ": [1], "æː": [1], "ɑː": [2], "ɛ": [4], "øː": [1], "eː": [4], "ɪ": [6], "iː": [6], "ɔ": [3], 
-    "œ": [4], "oː": [8], "u": [7], "uː": [7], "ʏ": [7], "ʉ": [6], "ʉː": [6], "yː": [4], "æɪ": [1, 6], 
-    "æʉ": [1, 6], "ɑɪ": [2, 6], "œʏ": [4, 7], "ɔʏ": [3, 7], "ʉɪ": [6, 6], "p": [21], "t": [19], "k": [20], 
-    "b": [21], "d": [19], "g": [20], "f": [18], "h": [12], "s": [15], "ʂ": [15], "ç": [12], "v": [18], "m": [21], 
-    "n": [19], "ŋ": [20], "l": [14], "r": [13], "j": [6], "ɖ": [19], "ɭ": [14], "ɳ": [19], "ʈ": [19],
-
-    # nl-NL/nl-BE
-    "ɑ": [2], "aː": [2], "ɑ̃": [2], "ɑ͡u": [2, 7], "ɛ": [4], "eː": [4], "ɛː": [4], "ɛ͡i": [4, 6], "ɛ̃": [4], 
-    "øː": [1], "ɪ": [6], "i": [6], "ɔ": [3], "u": [7], "ɔː": [3], "ɔ̃": [3], "oː": [8], "ʏ": [7], "ə": [1], 
-    "œ͡y": [4, 4], "œ": [4], "y": [4], "b": [21], "d": [19], "f": [18], "χ": [12], "ʔ": [19], "ɦ": [12], 
-    "g": [20], "j": [6], "k": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "ʀ": [13], "s": [15], 
-    "ʃ": [16], "t": [19], "w": [7], "v": [18], "ʋ": [18], "z": [15], "ʒ": [16],
-
-    # pl-PL
-    "a": [2], "ɛ": [4], "ɛ̃": [4], "i": [6], "ɨ": [6], "ɔ": [3], "ɔ̃": [3], "u": [7], "b": [21], "bʲ": [21], 
-    "t͡ɕ": [19, 16], "t͡ʂ": [19, 15], "c": [16], "d": [19], "d̪ʲ": [19], "d͡z": [19, 15], "d͡ʑ": [19, 16], 
-    "f": [18], "fʲ": [18], "ɡ": [20], "ɟ": [16], "d͡ʐ": [19, 15], "k": [20], "l": [14], "l̪ʲ": [14], "m": [21], 
-    "mʲ": [21], "n": [19], "ŋ": [20], "ɲ": [19], "p": [21], "pʲ": [21], "r": [13], "rʲ": [13], "s": [15], 
-    "ɕ": [16], "ʃ": [16], "t": [19], "t̪ʲ": [19], "t͡s": [19, 15], "v": [18], "vʲ": [18], "w": [7], "x": [12], 
-    "xʲ": [12], "j": [6], "z": [15], "ʑ": [16], "ʒ": [16],
-
-    # pt-BR
-    "i": [6], "ĩ": [6], "a": [2], "ɔ": [3], "u": [7], "ũ": [7], "o": [8], "e": [4], "ɐ̃": [4], "ə": [1], 
-    "ɛ": [4], "ẽ": [4], "õ": [8], "w̃": [7], "w": [7], "p": [21], "b": [21], "t": [19], "d": [19], "g": [20], 
-    "m": [21], "n": [19], "ɲ": [19], "f": [18], "v": [18], "ɾ": [19], "s": [15], "z": [15], "ʃ": [16], "ʒ": [16], 
-    "x": [12], "tʃ": [19, 16], "dʒ": [19, 16], "l": [14], "ʎ": [14], "j̃": [6], "j": [6], "k": [20],
-
-    # pt-PT
-    "a": [2], "ɐ": [4], "ɐj": [4, 6], "ɐ̃": [4], "ɐ̃j̃": [4, 6], "ɐ̃w̃": [4, 7], "ɐ͡w": [4, 7], "a͡j": [2, 6], 
-    "ɔ": [3], "ɔ͡j": [3, 6], "a͡w": [2, 7], "ə": [1], "e": [4], "ɛ": [4], "ɛ͡w": [4, 7], "ẽ": [4], "e͡w": [4, 7], 
-    "i": [6], "ĩ": [6], "i͡w": [6, 7], "o": [8], "o͡j": [8, 6], "õ": [8], "õj̃": [8, 6], "u": [7], "u͡j": [7, 6], 
-    "ũ": [7], "ũj̃": [7, 6], "b": [21], "d": [19], "ɾ": [19], "f": [18], "g": [20], "j": [6], "k": [20], "l": [14], 
-    "ɫ": [14], "ʎ": [14], "m": [21], "n": [19], "ɲ": [19], "p": [21], "ʀ": [13], "s": [15], "ʃ": [16], "t": [19], 
-    "v": [18], "w": [7], "z": [15], "ʒ": [16],
-
-    # ro-RO
-    "ə": [1], "ɨ": [6], "a": [2], "e": [4], "e̯a": [4, 2], "e̯o": [4, 8], "i": [6], "o": [8], "o̯a": [8, 2], 
-    "u": [7], "b": [21], "bʲ": [21], "d": [19], "d͡ʒ": [19, 16], "d͡ʒʲ": [19, 16], "f": [18], "fʲ": [18], "g": [20], 
-    "gʲ": [20], "h": [12], "j": [6], "k": [20], "kʲ": [20], "l": [14], "lʲ": [14], "m": [21], "mʲ": [21], "n": [19], 
-    "ŋ": [20], "nʲ": [19], "p": [21], "pʲ": [21], "r": [13], "rʲ": [13], "s": [15], "ʃ": [16], "ʃʲ": [16], "t": [19], 
-    "tʲ": [19], "t͡s": [19, 15], "t͡ʃ": [19, 16], "t͡sʲ": [19, 15], "t͡ʃʲ": [19, 16], "v": [18], "vʲ": [18], "w": [7], 
-    "z": [15], "ʒ": [16], "zʲ": [15], "ʒʲ": [16],
-
-    # ru-RU
-    "a": [2], "ʌ": [1], "ə": [1], "ɛ": [4], "i": [6], "ɪ": [6], "ɨ": [6], "ɔ": [3], "u": [7], "p": [21], "pʲ": [21], 
-    "b": [21], "bʲ": [21], "t": [19], "tʲ": [19], "d": [19], "dʲ": [19], "k": [20], "kʲ": [20], "g": [20], 
-    "gʲ": [20], "x": [12], "xʲ": [12], "f": [18], "fʲ": [18], "v": [18], "vʲ": [18], "s": [15], "sʲ": [15], 
-    "z": [15], "zʲ": [15], "ʂ": [15], "ʐ": [15], "t͡s": [19, 15], "t͡ɕ": [19, 16], "ɕː": [16], "m": [21], 
-    "mʲ": [21], "n": [19], "nʲ": [19], "l": [14], "lʲ": [14], "r": [13], "rʲ": [13], "j": [6],
-
-    # sk-SK
-    "i": [6], "e": [4], "a": [2], "o": [8], "u": [7], "ʉ": [6], "iː": [6], "eː": [4], "aː": [2], "oː": [8], 
-    "uː": [7], "i͡a": [6, 2], "i͡e": [6, 4], "i͡u": [6, 7], "u͡o": [7, 8], "au": [2, 7], "ou": [8, 7], "ə": [1], 
-    "p": [21], "b": [21], "t": [19], "d": [19], "c": [16], "ɟ": [16], "k": [20], "g": [20], "t͡s": [19, 15], 
-    "d͡z": [19, 15], "t͡ʃ": [19, 16], "d͡ʒ": [19, 16], "f": [18], "v": [18], "s": [15], "z": [15], "ʃ": [16], 
-    "ʒ": [16], "x": [12], "ɦ": [12], "r": [13], "r̩": [13], "r̩ː": [13], "l": [14], "l̩": [14], "l̩ː": [14], 
-    "ʎ": [14], "m": [21], "ɱ": [21], "n": [19], "ɴ": [19], "ŋ": [20], "ɲ": [19], "u̯": [7], "i̯": [6], "j": [6], 
-    "w": [7],
-
-    # sl-SI
-    "ə": [1], "a": [2], "aː": [2], "ɛ": [4], "eː": [4], "ɛː": [4], "i": [6], "iː": [6], "ɔ": [3], "ɔː": [3], 
-    "oː": [8], "u": [7], "uː": [7], "b": [21], "d": [19], "dˡ": [19], "dn": [19, 19], "d͡ʒ": [19, 16], 
-    "d͡z": [19, 15], "f": [18], "ɱ": [21], "ɣ": [20], "g": [20], "ɪ": [6], "j": [6], "k": [20], "l": [14], "lʲ": [14], 
-    "m": [21], "ŋ": [20], "n": [19], "nʲ": [19], "p": [21], "r": [13], "s": [15], "ʃ": [16], "t": [19], "tˡ": [19], 
-    "tn": [19, 19], "t͡ʃ": [19, 16], "t͡s": [19, 15], "u̯": [7], "v": [18], "w": [7], "ʍ": [7], "x": [12], "ʒ": [16], 
-    "z": [15],
-
-    # sv-SE
-    "a": [2], "æ": [1], "æː": [1], "ɑː": [2], "ɔ": [3], "a‿u": [2, 7], "ə": [1], "e": [4], "ɛ": [4], "ɛː": [4], 
-    "eː": [4], "ɶ": [8], "œː": [4], "œ": [4], "øː": [1], "ɪ": [6], "iː": [6], "ʊ": [4], "uː": [7], "oː": [8], 
-    "ɵ": [1], "ʉː": [6], "y": [4], "yː": [4], "p": [21], "t": [19], "k": [20], "b": [21], "d": [19], "g": [20], 
-    "f": [18], "h": [12], "s": [15], "ɧ": [16], "ɕ": [16], "v": [18], "m": [21], "n": [19], "ŋ": [20], "l": [14], 
-    "r": [13], "j": [6], "ɖ": [19], "ɭ": [14], "ɳ": [19], "ʂ": [15], "ʈ": [19],
-
-    # th-TH
-    "a": [2], "aː": [2], "e": [4], "eː": [4], "i": [6], "iː": [6], "ia": [6, 2], "o": [8], "oː": [8], "ə": [1], 
-    "əː": [1], "u": [7], "uː": [7], "ua": [7, 2], "ɯ": [6], "ɯː": [6], "ɯa": [6, 2], "ɛ": [4], "ɛː": [4], 
-    "ɔ": [3], "ɔː": [3], "b": [21], "t͡ɕ": [19, 16], "tɕʰ": [19, 16], "d": [19], "f": [18], "h": [12], "j": [6], 
-    "k": [20], "kʰ": [20], "l": [14], "m": [21], "n": [19], "ŋ": [20], "p": [21], "pʰ": [21], "r": [13], 
-    "s": [15], "t": [19], "tʰ": [19], "w": [7], "ʔ": [19],
-
-    # tr-TR
-    "a": [2], "ɑː": [2], "e": [4], "eː": [4], "œ": [4], "œ͡ɟ": [4, 16], "i": [6], "i͡ɟ": [6, 16], "o": [8], 
-    "o͡ɟ": [8, 16], "u": [7], "u͡ɟ": [7, 16], "ɯ": [6], "ɯ͡ɟ": [6, 16], "y": [4], "y͡ɟ": [4, 16], "b": [21], 
-    "c": [16], "t͡ʃ": [19, 16], "d": [19], "f": [18], "ɡ": [20], "ɣ": [20], "ɟ": [16], "h": [12], "j": [6], 
-    "d͡ʒ": [19, 16], "k": [20], "l": [14], "ɮ": [6], "m": [21], "n": [19], "ŋ": [20], "p": [21], "ɾ": [19], 
-    "s": [15], "ʃ": [16], "t": [19], "v": [18], "w": [7], "z": [15], "ʒ": [16],
-
-    # vi-VN
-    "a": [2], "ɛ": [4], "i": [6], "ɔ": [3], "u": [7], "u͡a": [7, 2], "a͡j": [2, 6], "ɛ̆j": [4, 6], "ə͡j": [1, 6], 
-    "o": [8], "i͡e͡w": [6, 4, 7], "ɨ͡ə": [6, 1], "ɔ͡i": [3, 6], "ə": [1], "ie": [6, 4], "u͡j": [7, 6], "a͡w": [2, 7], 
-    "ɨ": [6], "ɐ": [4], "ăw": [2, 7], "ăj": [2, 6], "ɨ͡ə͡j": [6, 1, 6], "o͡j": [8, 6], "əː": [1], "e": [4], 
-    "ɔ̆w": [3, 7], "ɛ͡w": [4, 7], "i͡w": [6, 7], "ɨ͡w": [6, 7], "e͡j": [4, 6], "ɨ͡ʌ͡w": [6, 1, 7], "ɨ͡j": [6, 6], 
-    "ɪ": [6], "iə": [6, 1], "a͡ʲ": [2], "ɓ": [21], "k": [20], "z": [15], "j": [6], "ɹ": [13], "f": [18], "ɣ": [20], 
-    "h": [12], "l": [14], "m": [21], "n": [19], "p": [21], "s": [15], "ʂ": [15], "t": [19], "v": [18], "ɗ": [19], 
-    "ŋ": [20], "x": [12], "ɲ": [19], "tʰ": [19], "ʈ": [19], "t͡ʃ": [19, 16], "w": [7]
-}
\ No newline at end of file

From 62cd4e13996616af147eb70d57ef70736949ecae Mon Sep 17 00:00:00 2001
From: fabiocat93 <fabio-cat93@hotmail.it>
Date: Mon, 7 Oct 2024 18:45:22 -0400
Subject: [PATCH 5/7] fixing style issues

---
 {TTS/STV => STV}/phoneme_viseme_map.json |   0
 {TTS/STV => STV}/speech_to_visemes.py    |   0
 STV/w2v_stv_handler.py                   | 257 +++++++++++++++++++++++
 TTS/chatTTS_handler.py                   |  79 +++----
 TTS/melo_handler.py                      |  46 ++--
 TTS/parler_handler.py                    |  39 ++--
 arguments_classes/w2v_stv_arguments.py   |  29 +++
 connections/local_audio_streamer.py      |   2 +-
 connections/socket_sender.py             |   2 +-
 listen_and_play.py                       |  15 +-
 s2s_pipeline.py                          |  40 +++-
 11 files changed, 391 insertions(+), 118 deletions(-)
 rename {TTS/STV => STV}/phoneme_viseme_map.json (100%)
 rename {TTS/STV => STV}/speech_to_visemes.py (100%)
 create mode 100644 STV/w2v_stv_handler.py
 create mode 100644 arguments_classes/w2v_stv_arguments.py

diff --git a/TTS/STV/phoneme_viseme_map.json b/STV/phoneme_viseme_map.json
similarity index 100%
rename from TTS/STV/phoneme_viseme_map.json
rename to STV/phoneme_viseme_map.json
diff --git a/TTS/STV/speech_to_visemes.py b/STV/speech_to_visemes.py
similarity index 100%
rename from TTS/STV/speech_to_visemes.py
rename to STV/speech_to_visemes.py
diff --git a/STV/w2v_stv_handler.py b/STV/w2v_stv_handler.py
new file mode 100644
index 0000000..7e65403
--- /dev/null
+++ b/STV/w2v_stv_handler.py
@@ -0,0 +1,257 @@
+import json
+import logging
+import time
+from typing import Any, Dict, Generator, List
+
+import numpy as np
+from rich.console import Console
+from transformers import pipeline
+
+from baseHandler import BaseHandler
+
+logger = logging.getLogger(__name__)
+console = Console()
+
+
+class Wav2Vec2STVHandler(BaseHandler):
+    """
+    Handles the Speech-To-Viseme generation using a Wav2Vec2 model for automatic
+    speech recognition (ASR) and phoneme mapping to visemes.
+
+    Attributes:
+        MIN_AUDIO_LENGTH (float): Minimum length of audio (in seconds) required
+                                  for phoneme extraction.
+    """
+
+    MIN_AUDIO_LENGTH = 0.5  # Minimum audio length in seconds for phoneme extraction
+
+    def setup(
+        self,
+        should_listen: bool,
+        model_name: str = "bookbot/wav2vec2-ljspeech-gruut",
+        blocksize: int = 512,
+        device: str = "cuda",
+        skip: bool = False,
+        gen_kwargs: Dict[str, Any] = {},  # Not used
+    ) -> None:
+        """
+        Initializes the handler by loading the ASR model and phoneme-to-viseme map.
+
+        Args:
+            should_listen (bool): Flag indicating whether the speech-to-speech pipeline should start
+                listening to the user or not.
+            model_name (str): Name of the ASR model to use.
+                Defaults to "bookbot/wav2vec2-ljspeech-gruut".
+            blocksize (int): Size of each audio block when processing audio.
+                Defaults to 512.
+            device (str): Device to run the model on ("cuda", "mps", or "cpu").
+                Defaults to "cuda".
+            skip (bool): If True, the speech-to-viseme process is skipped.
+                Defaults to False.
+            gen_kwargs (dict): Additional parameters for speech generation.
+
+        Returns:
+            None
+        """
+        self.device = device
+        self.gen_kwargs = gen_kwargs
+        self.blocksize = blocksize
+        self.should_listen = should_listen
+        self.skip = skip
+
+        # Load phoneme-to-viseme map from the JSON file
+        phoneme_viseme_map_file = "STV/phoneme_viseme_map.json"
+        with open(phoneme_viseme_map_file, "r") as f:
+            self.phoneme_viseme_map = json.load(f)
+
+        # Initialize the ASR pipeline using the specified model and device
+        self.asr_pipeline = pipeline(
+            "automatic-speech-recognition",
+            model=model_name,
+            device=device,
+            torch_dtype="auto",
+        )
+        self.expected_sampling_rate = self.asr_pipeline.feature_extractor.sampling_rate
+
+        # Initialize an empty dictionary to store audio batch data
+        self.audio_batch = {
+            "waveform": np.array([]),
+            "sampling_rate": self.expected_sampling_rate,
+        }
+        self.text_batch = None
+        self.should_listen_flag = False
+
+        self.warmup()  # Perform model warmup
+
+    def warmup(self) -> None:
+        """Warms up the model with dummy input to prepare it for inference.
+
+        Returns:
+            None
+        """
+        logger.info(f"Warming up {self.__class__.__name__}")
+        start_time = time.time()
+
+        # Create dummy input for warmup inference
+        dummy_input = np.random.randn(self.blocksize).astype(np.int16)
+        _ = self.speech_to_visemes(dummy_input)
+
+        warmup_time = time.time() - start_time
+        logger.info(
+            f"{self.__class__.__name__}: warmed up in {warmup_time:.4f} seconds!"
+        )
+
+    def speech_to_visemes(self, audio: Any) -> List[Dict[str, Any]]:
+        """
+        Converts speech audio to visemes by performing Automatic Speech Recognition (ASR)
+        and mapping phonemes to visemes.
+
+        Args:
+            audio (Any): The input audio data.
+
+        Returns:
+            List[Dict[str, Any]]: A list of dictionaries containing mapped visemes
+                                  and their corresponding timestamps.
+
+        Note:
+            Heuristically, the input audio should be at least 0.5 seconds long for proper phoneme extraction.
+        """
+
+        def _map_phonemes_to_visemes(
+            data: Dict[str, Any],
+        ) -> List[Dict[str, Any]]:
+            """
+            Maps extracted phonemes to their corresponding visemes based on a predefined map.
+
+            Args:
+                data (Dict[str, Any]): Dictionary containing phoneme data where data['chunks']
+                                    holds a list of phonemes and their timestamps.
+
+            Returns:
+                List[Dict[str, Any]]: A list of dictionaries with viseme IDs and their corresponding timestamps.
+            """
+            viseme_list = []
+            chunks = data.get("chunks", [])
+
+            # Map each phoneme to corresponding visemes
+            for chunk in chunks:
+                phoneme = chunk.get("text", None)
+                timestamp = chunk.get("timestamp", None)
+                visemes = self.phoneme_viseme_map.get(phoneme, [])
+
+                for viseme in visemes:
+                    viseme_list.append({"viseme": viseme, "timestamp": timestamp})
+
+            return viseme_list
+
+        # Perform ASR to extract phoneme data, including timestamps
+        try:
+            asr_result = self.asr_pipeline(audio, return_timestamps="char")
+        except Exception as e:
+            logger.error(f"ASR error: {e}")
+            return []
+        # Map the phonemes obtained from ASR to visemes
+        return _map_phonemes_to_visemes(asr_result)
+
+    def process(self, data: Dict[str, Any]) -> Generator[Dict[str, Any], None, None]:
+        """
+        Processes an audio file to generate visemes and output blocks of audio data
+        along with corresponding viseme data.
+
+        Args:
+            data (Dict[str, Any]): Dictionary containing audio, text, and potentially additional information.
+
+        Yields:
+            Dict: A dictionary containing audio waveform, and optionally viseme data, text, and potentially additional information.
+        """
+
+        if "sentence_end" in data and data["sentence_end"]:
+            self.should_listen_flag = True
+        if self.skip:  # Skip viseme extraction if the flag is set
+            yield {
+                "audio": {
+                    "waveform": data["audio"]["waveform"],
+                    "sampling_rate": data["audio"]["sampling_rate"],
+                },
+                "text": data["text"] if "text" in data else None,
+            }
+        else:
+            # Check if text data is present and save it for later
+            if "text" in data and data["text"] is not None:
+                self.text_batch = data["text"]
+            # Concatenate new audio data into the buffer if available and valid
+            if "audio" in data and data["audio"] is not None:
+                audio_data = data["audio"]
+                # Check if the sampling rate is valid and matches the expected one
+                if audio_data.get("sampling_rate", None) != self.expected_sampling_rate:
+                    logger.error(
+                        f"Expected sampling rate {self.expected_sampling_rate}, "
+                        f"but got {audio_data['sampling_rate']}."
+                    )
+                    return
+                # Append the waveform to the audio buffer
+                self.audio_batch["waveform"] = np.concatenate(
+                    (self.audio_batch["waveform"], audio_data["waveform"]), axis=0
+                )
+
+            # Ensure the total audio length is sufficient for phoneme extraction
+            if (
+                len(self.audio_batch["waveform"]) / self.audio_batch["sampling_rate"]
+                < self.MIN_AUDIO_LENGTH
+            ):
+                return
+            else:
+                logger.debug("Starting viseme inference...")
+
+                # Perform viseme inference using the accumulated audio batch
+                viseme_data = self.speech_to_visemes(self.audio_batch["waveform"])
+                logger.debug("Viseme inference completed.")
+
+                # Print the visemes and timestamps to the console
+                for viseme in viseme_data:
+                    console.print(
+                        f"[blue]ASSISTANT_MOUTH_SHAPE: {viseme['viseme']} -- {viseme['timestamp']}"
+                    )
+
+                # Process the audio in chunks of the defined blocksize
+                self.audio_batch["waveform"] = self.audio_batch["waveform"].astype(
+                    np.int16
+                )
+                for i in range(0, len(self.audio_batch["waveform"]), self.blocksize):
+                    chunk_waveform = self.audio_batch["waveform"][
+                        i : i + self.blocksize
+                    ]
+                    padded_waveform = np.pad(
+                        chunk_waveform, (0, self.blocksize - len(chunk_waveform))
+                    )
+
+                    chunk_data = {
+                        "audio": {
+                            "waveform": padded_waveform,
+                            "sample_rate": self.audio_batch["sampling_rate"],
+                        }
+                    }
+
+                    # Add text and viseme data only in the first chunk
+                    if i == 0:
+                        if self.text_batch:
+                            chunk_data["text"] = self.text_batch
+                        if viseme_data and len(viseme_data) > 0:
+                            chunk_data["visemes"] = viseme_data
+                    yield chunk_data
+
+                # Reset the audio and text buffer after processing
+                self.audio_batch = {
+                    "waveform": np.array([]),
+                    "sampling_rate": self.expected_sampling_rate,
+                }
+                self.text_batch = ""
+        
+        if self.should_listen_flag:
+            self.should_listen.set()
+            self.should_listen_flag = False
+
+
+# TODO: Test in all modalities and with all TTS models to ensure compatibility. This requires integration testing with your models and modalities.
+# TODO: Rename some identifiers in s2s_pipeline.
+# TODO: Remove some debug prints.
\ No newline at end of file
diff --git a/TTS/chatTTS_handler.py b/TTS/chatTTS_handler.py
index 1cee897..6c177c4 100644
--- a/TTS/chatTTS_handler.py
+++ b/TTS/chatTTS_handler.py
@@ -5,7 +5,6 @@
 import numpy as np
 from rich.console import Console
 import torch
-from .STV.speech_to_visemes import SpeechToVisemes
 
 logging.basicConfig(
     format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
@@ -18,14 +17,11 @@
 class ChatTTSHandler(BaseHandler):
     def setup(
         self,
-        should_listen,
         device="cuda",
         gen_kwargs={},  # Unused
         stream=True,
         chunk_size=512,
-        viseme_flag = True
     ):
-        self.should_listen = should_listen
         self.device = device
         self.model = ChatTTS.Chat()
         self.model.load(compile=False)  # Doesn't work for me with True
@@ -35,9 +31,7 @@ def setup(
         self.params_infer_code = ChatTTS.Chat.InferCodeParams(
             spk_emb=rnd_spk_emb,
         )
-        self.viseme_flag = viseme_flag
-        if self.viseme_flag:
-            self.speech_to_visemes = SpeechToVisemes()
+        self.output_sampling_rate = 16000
         self.warmup()
 
     def warmup(self):
@@ -45,6 +39,8 @@ def warmup(self):
         _ = self.model.infer("text")
 
     def process(self, llm_sentence):
+        if isinstance(llm_sentence, tuple):
+            llm_sentence, _ = llm_sentence # Ignore language
         console.print(f"[green]ASSISTANT: {llm_sentence}")
         if self.device == "mps":
             import time
@@ -64,67 +60,62 @@ def process(self, llm_sentence):
             wavs = [np.array([])]
             for gen in wavs_gen:
                 if gen[0] is None or len(gen[0]) == 0:
-                    self.should_listen.set()
-                    return
+                    return {
+                        "text": llm_sentence,
+                        "sentence_end": True
+                    }
                 
                 # Resample the audio to 16000 Hz
-                audio_chunk = librosa.resample(gen[0], orig_sr=24000, target_sr=16000)
+                audio_chunk = librosa.resample(gen[0], orig_sr=24000, target_sr=self.output_sampling_rate)
                 # Ensure the audio is converted to mono (single channel)
                 if len(audio_chunk.shape) > 1:
                     audio_chunk = librosa.to_mono(audio_chunk)
                 audio_chunk = (audio_chunk * 32768).astype(np.int16)
-                
-                # Process visemes if viseme_flag is set
-                if self.viseme_flag:
-                    visemes = self.speech_to_visemes.process(audio_chunk)
-                    for viseme in visemes:
-                        console.print(f"[blue]ASSISTANT_MOUTH_SHAPE: {viseme['viseme']} -- {viseme['timestamp']}")
-                else:
-                    visemes = None
-                
+                                
                 # Loop through audio chunks, yielding dict for each chunk
                 for i in range(0, len(audio_chunk), self.chunk_size):
                     chunk_data = {
-                        "audio": np.pad(
-                            audio_chunk[i : i + self.chunk_size],
-                            (0, self.chunk_size - len(audio_chunk[i : i + self.chunk_size])),
-                        )
+                        "audio": {
+                            "waveform": np.pad(
+                                audio_chunk[i : i + self.chunk_size],
+                                (0, self.chunk_size - len(audio_chunk[i : i + self.chunk_size])),
+                            ),
+                            "sampling_rate": self.output_sampling_rate,
+                        }
                     }
-                    # Include text and visemes for the first chunk
+                    # Include text for the first chunk
                     if i == 0:
                         chunk_data["text"] = llm_sentence  # Assuming llm_sentence is defined elsewhere
-                        chunk_data["visemes"] = visemes
-                
+                    if i >= len(audio_chunk) - self.chunk_size:
+                        # This is the last round
+                        chunk_data["sentence_end"] = True
                     yield chunk_data
         else:
             wavs = wavs_gen
             if len(wavs[0]) == 0:
-                self.should_listen.set()
-                return
-            audio_chunk = librosa.resample(wavs[0], orig_sr=24000, target_sr=16000)
+                return {
+                    "sentence_end": True
+                }
+            audio_chunk = librosa.resample(wavs[0], orig_sr=24000, target_sr=self.output_sampling_rate)
             # Ensure the audio is converted to mono (single channel)
             if len(audio_chunk.shape) > 1:
                 audio_chunk = librosa.to_mono(audio_chunk)
             audio_chunk = (audio_chunk * 32768).astype(np.int16)
 
-            if self.viseme_flag:
-                visemes = self.speech_to_visemes.process(audio_chunk)
-                for viseme in visemes:
-                    console.print(f"[blue]ASSISTANT_MOUTH_SHAPE: {viseme['viseme']} -- {viseme['timestamp']}")
-            else:
-                visemes = None
-
             for i in range(0, len(audio_chunk), self.chunk_size):
                 chunk_data = {
-                    "audio": np.pad(
-                        audio_chunk[i : i + self.chunk_size],
-                        (0, self.chunk_size - len(audio_chunk[i : i + self.chunk_size])),
-                    )
+                    "audio": {
+                        "waveform": np.pad(
+                            audio_chunk[i : i + self.chunk_size],
+                            (0, self.chunk_size - len(audio_chunk[i : i + self.chunk_size])),
+                        ),
+                        "sampling_rate": self.output_sampling_rate,
+                    }
                 }
-                # For the first chunk, include text and visemes
+                # For the first chunk, include text
                 if i == 0:
                     chunk_data["text"] = llm_sentence
-                    chunk_data["visemes"] = visemes            
+                if i >= len(audio_chunk) - self.chunk_size:
+                    # This is the last round
+                    chunk_data["sentence_end"] = True
                 yield chunk_data
-
-        self.should_listen.set()
diff --git a/TTS/melo_handler.py b/TTS/melo_handler.py
index 64fbbbc..be25007 100644
--- a/TTS/melo_handler.py
+++ b/TTS/melo_handler.py
@@ -6,8 +6,6 @@
 from rich.console import Console
 import torch
 
-from .STV.speech_to_visemes import SpeechToVisemes
-
 logger = logging.getLogger(__name__)
 
 console = Console()
@@ -33,15 +31,12 @@
 class MeloTTSHandler(BaseHandler):
     def setup(
         self,
-        should_listen,
-        device="mps",
+        device="auto",
         language="en",
         speaker_to_id="en",
         gen_kwargs={},  # Unused
         blocksize=512,
-        viseme_flag = True # To obtain timestamped visemes
     ):
-        self.should_listen = should_listen
         self.device = device
         self.language = language
         self.model = TTS(
@@ -51,10 +46,7 @@ def setup(
             WHISPER_LANGUAGE_TO_MELO_SPEAKER[speaker_to_id]
         ]
         self.blocksize = blocksize
-
-        self.viseme_flag = viseme_flag
-        if self.viseme_flag:
-            self.speech_to_visemes = SpeechToVisemes()
+        self.output_sampling_rate = 16000
 
         self.warmup()
 
@@ -103,29 +95,27 @@ def process(self, llm_sentence):
             logger.error(f"Error in MeloTTSHandler: {e}")
             audio_chunk = np.array([])
         if len(audio_chunk) == 0:
-            self.should_listen.set()
-            return
-        audio_chunk = librosa.resample(audio_chunk, orig_sr=44100, target_sr=16000)
+            return {
+                "text": llm_sentence,
+                "sentence_end": True
+            }
+        audio_chunk = librosa.resample(audio_chunk, orig_sr=44100, target_sr=self.output_sampling_rate)
         audio_chunk = (audio_chunk * 32768).astype(np.int16)
 
-        if self.viseme_flag:
-            visemes = self.speech_to_visemes.process(audio_chunk)
-            for viseme in visemes:
-                console.print(f"[blue]ASSISTANT_MOUTH_SHAPE: {viseme['viseme']} -- {viseme['timestamp']}")
-        else:
-            visemes = None
-
         for i in range(0, len(audio_chunk), self.blocksize):
             chunk_data = {
-                "audio": np.pad(
-                    audio_chunk[i : i + self.blocksize],
-                    (0, self.blocksize - len(audio_chunk[i : i + self.blocksize]))
-                )
+                "audio": {
+                    "waveform": np.pad(
+                        audio_chunk[i : i + self.blocksize],
+                        (0, self.blocksize - len(audio_chunk[i : i + self.blocksize]))
+                    ), 
+                    "sampling_rate": self.output_sampling_rate
+                }
             }
-            # For the first chunk, include text and visemes
+            # For the first chunk, include text
             if i == 0:
                 chunk_data["text"] = llm_sentence
-                chunk_data["visemes"] = visemes            
+            if i >= len(audio_chunk) - self.blocksize:
+                # This is the last round
+                chunk_data["sentence_end"] = True
             yield chunk_data
-
-        self.should_listen.set()
diff --git a/TTS/parler_handler.py b/TTS/parler_handler.py
index 0703180..2b84e8d 100644
--- a/TTS/parler_handler.py
+++ b/TTS/parler_handler.py
@@ -14,8 +14,6 @@
 from transformers.utils.import_utils import (
     is_flash_attn_2_available,
 )
-from .STV.speech_to_visemes import SpeechToVisemes
-
 torch._inductor.config.fx_graph_cache = True
 # mind about this parameter ! should be >= 2 * number of padded prompt sizes for TTS
 torch._dynamo.config.cache_size_limit = 15
@@ -35,7 +33,6 @@
 class ParlerTTSHandler(BaseHandler):
     def setup(
         self,
-        should_listen,
         model_name="ylacombe/parler-tts-mini-jenny-30H",
         device="cuda",
         torch_dtype="float16",
@@ -48,9 +45,7 @@ def setup(
         ),
         play_steps_s=1,
         blocksize=512,
-        viseme_flag = True
     ):
-        self.should_listen = should_listen
         self.device = device
         self.torch_dtype = getattr(torch, torch_dtype)
         self.gen_kwargs = gen_kwargs
@@ -79,10 +74,7 @@ def setup(
             self.model.forward = torch.compile(
                 self.model.forward, mode=self.compile_mode, fullgraph=True
             )
-
-        self.viseme_flag = viseme_flag
-        if self.viseme_flag:
-            self.speech_to_visemes = SpeechToVisemes()
+        self.output_sampling_rate = 16000
 
         self.warmup()
 
@@ -186,27 +178,24 @@ def process(self, llm_sentence):
                 logger.info(
                     f"Time to first audio: {perf_counter() - pipeline_start:.3f}"
                 )
-            audio_chunk = librosa.resample(audio_chunk, orig_sr=44100, target_sr=16000)
+            audio_chunk = librosa.resample(audio_chunk, orig_sr=44100, target_sr=self.output_sampling_rate)
             audio_chunk = (audio_chunk * 32768).astype(np.int16)
 
-            if self.viseme_flag:
-                visemes = self.speech_to_visemes.process(audio_chunk)
-                for viseme in visemes:
-                    console.print(f"[blue]ASSISTANT_MOUTH_SHAPE: {viseme['viseme']} -- {viseme['timestamp']}")
-            else:
-                visemes = None
-
             for i in range(0, len(audio_chunk), self.blocksize):
                 chunk_data = {
-                    "audio": np.pad(
-                        audio_chunk[i : i + self.blocksize],
-                        (0, self.blocksize - len(audio_chunk[i : i + self.blocksize]))
-                    )
+                    "audio": {
+                        "waveform": np.pad(
+                            audio_chunk[i : i + self.blocksize],
+                            (0, self.blocksize - len(audio_chunk[i : i + self.blocksize]))
+                        ), 
+                        "sampling_rate": self.output_sampling_rate
+                    }
                 }
-                # For the first chunk, include text and visemes
+                # For the first chunk, include text
                 if i == 0:
                     chunk_data["text"] = llm_sentence
-                    chunk_data["visemes"] = visemes            
-                yield chunk_data
+                if i >= len(audio_chunk) - self.blocksize:
+                    # This is the last round
+                    chunk_data["sentence_end"] = True
 
-        self.should_listen.set()
+                yield chunk_data
diff --git a/arguments_classes/w2v_stv_arguments.py b/arguments_classes/w2v_stv_arguments.py
new file mode 100644
index 0000000..229610a
--- /dev/null
+++ b/arguments_classes/w2v_stv_arguments.py
@@ -0,0 +1,29 @@
+"""This file contains the arguments for the Wav2Vec2STVHandler."""
+from dataclasses import dataclass, field
+
+@dataclass
+class Wav2Vec2STVHandlerArguments:
+    stv_model_name: str = field(
+        default="bookbot/wav2vec2-ljspeech-gruut",
+        metadata={
+            "help": "The pretrained phoneme recognition model used to derive visemes. Default is 'bookbot/wav2vec2-ljspeech-gruut'."
+        },
+    )
+    stv_device: str = field(
+        default="cuda",
+        metadata={
+            "help": "The device type on which the model will run. Default is 'cuda' for GPU acceleration."
+        },
+    )
+    stv_blocksize: int = field(
+        default=512,
+        metadata={
+            "help": "The number of audio samples per output chunk. Default is 512."
+        },
+    )
+    stv_skip: bool = field(
+        default=False,
+        metadata={
+            "help": "If True, skips the STV generation. Default is False."
+        },
+    )
diff --git a/connections/local_audio_streamer.py b/connections/local_audio_streamer.py
index d42fbe7..5faef4a 100644
--- a/connections/local_audio_streamer.py
+++ b/connections/local_audio_streamer.py
@@ -38,7 +38,7 @@ def callback(indata, outdata, frames, time, status):
                     visemes = data['visemes']
                     logger.info(f"Visemes: {visemes}")
                 """
-                outdata[:] = data['audio'][:, np.newaxis]
+                outdata[:] = data['audio']['waveform'][:, np.newaxis]
 
         logger.debug("Available devices:")
         logger.debug(sd.query_devices())
diff --git a/connections/socket_sender.py b/connections/socket_sender.py
index fb5c7cb..f849bf3 100644
--- a/connections/socket_sender.py
+++ b/connections/socket_sender.py
@@ -33,7 +33,7 @@ def run(self):
             packet = {}
             if 'audio' in data and data['audio'] is not None:
                 audio_chunk = data['audio']
-                packet['audio'] = data['audio']
+                packet['audio'] = audio_chunk
             if 'text' in data and data['text'] is not None:
                 packet['text'] = data['text']
             if 'visemes' in data and data['visemes'] is not None:
diff --git a/listen_and_play.py b/listen_and_play.py
index 2082a5e..b1f282f 100644
--- a/listen_and_play.py
+++ b/listen_and_play.py
@@ -92,17 +92,10 @@ def receive_full_chunk(conn, chunk_size):
             serialized_packet = receive_full_chunk(recv_socket, packet_length)
             if serialized_packet:
                 # Step 4: Deserialize the packet using pickle
-                packet = pickle.loads(serialized_packet)
-                # Step 5: Extract the packet contents
-                if 'text' in packet:
-                    pass
-                    # print(packet['text'])
-                if 'visemes' in packet:
-                    pass
-                    # print(packet['visemes'])
-                
-                # Step 6: Put the packet audio data into the queue for sending
-                recv_queue.put(packet['audio'].tobytes())
+                packet = pickle.loads(serialized_packet)                
+                # Step 5: Put the packet audio data into the queue for sending, if any
+                if 'audio' in packet and packet['audio'] is not None and 'waveform' in packet['audio'] and packet['audio']['waveform'] is not None:
+                    recv_queue.put(packet['audio']['waveform'].tobytes())
 
     try:
         send_stream = sd.RawInputStream(
diff --git a/s2s_pipeline.py b/s2s_pipeline.py
index 1da202e..4c86bf5 100644
--- a/s2s_pipeline.py
+++ b/s2s_pipeline.py
@@ -8,6 +8,7 @@
 from typing import Optional
 from sys import platform
 from VAD.vad_handler import VADHandler
+from STV.w2v_stv_handler import Wav2Vec2STVHandler
 from arguments_classes.chat_tts_arguments import ChatTTSHandlerArguments
 from arguments_classes.language_model_arguments import LanguageModelHandlerArguments
 from arguments_classes.mlx_language_model_arguments import (
@@ -22,6 +23,7 @@
 from arguments_classes.whisper_stt_arguments import WhisperSTTHandlerArguments
 from arguments_classes.melo_tts_arguments import MeloTTSHandlerArguments
 from arguments_classes.open_api_language_model_arguments import OpenApiLanguageModelHandlerArguments
+from arguments_classes.w2v_stv_arguments import Wav2Vec2STVHandlerArguments
 import torch
 import nltk
 from rich.console import Console
@@ -82,6 +84,7 @@ def parse_arguments():
             ParlerTTSHandlerArguments,
             MeloTTSHandlerArguments,
             ChatTTSHandlerArguments,
+            Wav2Vec2STVHandlerArguments,
         )
     )
 
@@ -148,6 +151,8 @@ def overwrite_device_argument(common_device: Optional[str], *handler_kwargs):
                 kwargs.stt_device = common_device
             if hasattr(kwargs, "paraformer_stt_device"):
                 kwargs.paraformer_stt_device = common_device
+            if hasattr(kwargs, "stv_device"):
+                kwargs.stv_device = common_device
 
 
 def prepare_module_args(module_kwargs, *handler_kwargs):
@@ -167,6 +172,7 @@ def prepare_all_args(
     parler_tts_handler_kwargs,
     melo_tts_handler_kwargs,
     chat_tts_handler_kwargs,
+    stv_handler_kwargs,
 ):
     prepare_module_args(
         module_kwargs,
@@ -178,6 +184,7 @@ def prepare_all_args(
         parler_tts_handler_kwargs,
         melo_tts_handler_kwargs,
         chat_tts_handler_kwargs,
+        stv_handler_kwargs
     )
 
 
@@ -189,6 +196,7 @@ def prepare_all_args(
     rename_args(parler_tts_handler_kwargs, "tts")
     rename_args(melo_tts_handler_kwargs, "melo")
     rename_args(chat_tts_handler_kwargs, "chat_tts")
+    rename_args(stv_handler_kwargs, "stv")
 
 
 def initialize_queues_and_events():
@@ -200,6 +208,7 @@ def initialize_queues_and_events():
         "spoken_prompt_queue": Queue(),
         "text_prompt_queue": Queue(),
         "lm_response_queue": Queue(),
+        "send_viseme_queue": Queue(),
     }
 
 
@@ -216,6 +225,7 @@ def build_pipeline(
     parler_tts_handler_kwargs,
     melo_tts_handler_kwargs,
     chat_tts_handler_kwargs,
+    stv_handler_kwargs,
     queues_and_events,
 ):
     stop_event = queues_and_events["stop_event"]
@@ -225,11 +235,13 @@ def build_pipeline(
     spoken_prompt_queue = queues_and_events["spoken_prompt_queue"]
     text_prompt_queue = queues_and_events["text_prompt_queue"]
     lm_response_queue = queues_and_events["lm_response_queue"]
+    send_viseme_queue = queues_and_events["send_viseme_queue"]
+
     if module_kwargs.mode == "local":
         from connections.local_audio_streamer import LocalAudioStreamer
 
         local_audio_streamer = LocalAudioStreamer(
-            input_queue=recv_audio_chunks_queue, output_queue=send_audio_chunks_queue
+            input_queue=recv_audio_chunks_queue, output_queue=send_viseme_queue
         )
         comms_handlers = [local_audio_streamer]
         should_listen.set()
@@ -248,7 +260,7 @@ def build_pipeline(
             ),
             SocketSender(
                 stop_event,
-                send_audio_chunks_queue,
+                send_viseme_queue,
                 host=socket_sender_kwargs.send_host,
                 port=socket_sender_kwargs.send_port,
             ),
@@ -264,9 +276,17 @@ def build_pipeline(
 
     stt = get_stt_handler(module_kwargs, stop_event, spoken_prompt_queue, text_prompt_queue, whisper_stt_handler_kwargs, paraformer_stt_handler_kwargs)
     lm = get_llm_handler(module_kwargs, stop_event, text_prompt_queue, lm_response_queue, language_model_handler_kwargs, open_api_language_model_handler_kwargs, mlx_language_model_handler_kwargs)
-    tts = get_tts_handler(module_kwargs, stop_event, lm_response_queue, send_audio_chunks_queue, should_listen, parler_tts_handler_kwargs, melo_tts_handler_kwargs, chat_tts_handler_kwargs)
+    tts = get_tts_handler(module_kwargs, stop_event, lm_response_queue, send_audio_chunks_queue, parler_tts_handler_kwargs, melo_tts_handler_kwargs, chat_tts_handler_kwargs)
+
+    stv = Wav2Vec2STVHandler(
+        stop_event,
+        queue_in=send_audio_chunks_queue,
+        queue_out=send_viseme_queue,
+        setup_args=(should_listen,),
+        setup_kwargs=vars(stv_handler_kwargs),
+    )
 
-    return ThreadManager([*comms_handlers, vad, stt, lm, tts])
+    return ThreadManager([*comms_handlers, vad, stt, lm, tts, stv])
 
 
 def get_stt_handler(module_kwargs, stop_event, spoken_prompt_queue, text_prompt_queue, whisper_stt_handler_kwargs, paraformer_stt_handler_kwargs):
@@ -337,14 +357,14 @@ def get_llm_handler(
         raise ValueError("The LLM should be either transformers or mlx-lm")
 
 
-def get_tts_handler(module_kwargs, stop_event, lm_response_queue, send_audio_chunks_queue, should_listen, parler_tts_handler_kwargs, melo_tts_handler_kwargs, chat_tts_handler_kwargs):
+def get_tts_handler(module_kwargs, stop_event, lm_response_queue, send_audio_chunks_queue, parler_tts_handler_kwargs, melo_tts_handler_kwargs, chat_tts_handler_kwargs):
     if module_kwargs.tts == "parler":
         from TTS.parler_handler import ParlerTTSHandler
         return ParlerTTSHandler(
             stop_event,
             queue_in=lm_response_queue,
             queue_out=send_audio_chunks_queue,
-            setup_args=(should_listen,),
+            setup_args=(),
             setup_kwargs=vars(parler_tts_handler_kwargs),
         )
     elif module_kwargs.tts == "melo":
@@ -355,11 +375,12 @@ def get_tts_handler(module_kwargs, stop_event, lm_response_queue, send_audio_chu
                 "Error importing MeloTTSHandler. You might need to run: python -m unidic download"
             )
             raise e
+            
         return MeloTTSHandler(
             stop_event,
             queue_in=lm_response_queue,
             queue_out=send_audio_chunks_queue,
-            setup_args=(should_listen,),
+            setup_args=(),
             setup_kwargs=vars(melo_tts_handler_kwargs),
         )
     elif module_kwargs.tts == "chatTTS":
@@ -372,7 +393,7 @@ def get_tts_handler(module_kwargs, stop_event, lm_response_queue, send_audio_chu
             stop_event,
             queue_in=lm_response_queue,
             queue_out=send_audio_chunks_queue,
-            setup_args=(should_listen,),
+            setup_args=(),
             setup_kwargs=vars(chat_tts_handler_kwargs),
         )
     else:
@@ -393,6 +414,7 @@ def main():
         parler_tts_handler_kwargs,
         melo_tts_handler_kwargs,
         chat_tts_handler_kwargs,
+        stv_handler_kwargs,
     ) = parse_arguments()
 
     setup_logger(module_kwargs.log_level)
@@ -407,6 +429,7 @@ def main():
         parler_tts_handler_kwargs,
         melo_tts_handler_kwargs,
         chat_tts_handler_kwargs,
+        stv_handler_kwargs
     )
 
     queues_and_events = initialize_queues_and_events()
@@ -424,6 +447,7 @@ def main():
         parler_tts_handler_kwargs,
         melo_tts_handler_kwargs,
         chat_tts_handler_kwargs,
+        stv_handler_kwargs,
         queues_and_events,
     )
 

From 66c4f78b632d4c5b68ee720fea4505a63faa1a7e Mon Sep 17 00:00:00 2001
From: fabiocat93 <fabio-cat93@hotmail.it>
Date: Tue, 8 Oct 2024 18:46:58 -0400
Subject: [PATCH 6/7] integrating speech to visemes as part of the s2s flow

---
 .gitignore                          |  4 +++-
 README.md                           | 11 +++++++++++
 STT/paraformer_handler.py           |  1 -
 STV/w2v_stv_handler.py              |  6 +-----
 connections/local_audio_streamer.py | 10 ----------
 5 files changed, 15 insertions(+), 17 deletions(-)

diff --git a/.gitignore b/.gitignore
index 95dc6c6..344d83d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,6 @@
 __pycache__
 tmp
 cache
-mlx_models/
\ No newline at end of file
+mlx_models/
+asset/
+config/
\ No newline at end of file
diff --git a/README.md b/README.md
index 9f0765c..1d517ba 100644
--- a/README.md
+++ b/README.md
@@ -28,6 +28,7 @@ This repository implements a speech-to-speech cascaded pipeline consisting of th
 2. **Speech to Text (STT)**
 3. **Language Model (LM)**
 4. **Text to Speech (TTS)**
+5. **Speech to Visemes (STV)**
 
 ### Modularity
 The pipeline provides a fully open and modular approach, with a focus on leveraging models available through the Transformers library on the Hugging Face hub. The code is designed for easy modification, and we already support device-specific and external library implementations:
@@ -50,6 +51,9 @@ The pipeline provides a fully open and modular approach, with a focus on leverag
 - [MeloTTS](https://github.com/myshell-ai/MeloTTS)
 - [ChatTTS](https://github.com/2noise/ChatTTS?tab=readme-ov-file)
 
+**STV**
+- [Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/en/model_doc/wav2vec2_phoneme) + [Phoneme to viseme mapping](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/how-to-speech-synthesis-viseme?tabs=visemeid&pivots=programming-language-python#map-phonemes-to-visemes)
+
 ## Setup
 
 Clone the repository:
@@ -216,6 +220,13 @@ For example:
 --lm_model_name google/gemma-2b-it
 ```
 
+
+### STV parameters
+See [Wav2Vec2STVHandlerArguments](arguments_classes/w2v_stv_arguments.py) class. Notably:
+- `stv_model_name` defaults to `bookbot/wav2vec2-ljspeech-gruut`, which was chosen because it is accurate and fast enough
+- `stv_skip`: set it to `True` if you don't need visemes
+
+
 ### Generation parameters
 
 Other generation parameters of the model's generate method can be set using the part's prefix + `_gen_`, e.g., `--stt_gen_max_new_tokens 128`. These parameters can be added to the pipeline part's arguments class if not already exposed.
diff --git a/STT/paraformer_handler.py b/STT/paraformer_handler.py
index dcadc02..09d481b 100644
--- a/STT/paraformer_handler.py
+++ b/STT/paraformer_handler.py
@@ -28,7 +28,6 @@ def setup(
         device="cuda",
         gen_kwargs={},
     ):
-        print(model_name)
         if len(model_name.split("/")) > 1:
             model_name = model_name.split("/")[-1]
         self.device = device
diff --git a/STV/w2v_stv_handler.py b/STV/w2v_stv_handler.py
index 7e65403..20c2cef 100644
--- a/STV/w2v_stv_handler.py
+++ b/STV/w2v_stv_handler.py
@@ -60,6 +60,7 @@ def setup(
         self.skip = skip
 
         # Load phoneme-to-viseme map from the JSON file
+        # inspired by https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-ssml-phonetic-sets
         phoneme_viseme_map_file = "STV/phoneme_viseme_map.json"
         with open(phoneme_viseme_map_file, "r") as f:
             self.phoneme_viseme_map = json.load(f)
@@ -250,8 +251,3 @@ def process(self, data: Dict[str, Any]) -> Generator[Dict[str, Any], None, None]
         if self.should_listen_flag:
             self.should_listen.set()
             self.should_listen_flag = False
-
-
-# TODO: Test in all modalities and TTS models**: Ensure compatibility with the different models. This requires integration testing with your models and modalities.
-# in s2s_pipeline change some names
-# remove some prints
\ No newline at end of file
diff --git a/connections/local_audio_streamer.py b/connections/local_audio_streamer.py
index 5faef4a..99b9d83 100644
--- a/connections/local_audio_streamer.py
+++ b/connections/local_audio_streamer.py
@@ -28,16 +28,6 @@ def callback(indata, outdata, frames, time, status):
                 outdata[:] = 0 * outdata
             else:
                 data = self.output_queue.get()
-                """
-                # Check if text data is present and log it
-                if data.get('text') is not None:
-                    text = data['text']
-                    logger.info(f"Text: {text}")
-                # Check if viseme data is present and log it
-                if data.get('visemes') is not None:
-                    visemes = data['visemes']
-                    logger.info(f"Visemes: {visemes}")
-                """
                 outdata[:] = data['audio']['waveform'][:, np.newaxis]
 
         logger.debug("Available devices:")

From c7b85e18e978b988e84e5b2db368b7599aacb101 Mon Sep 17 00:00:00 2001
From: Fabio Catania <fabiocat@mit.edu>
Date: Tue, 8 Oct 2024 18:53:31 -0400
Subject: [PATCH 7/7] Delete STV/speech_to_visemes.py

---
 STV/speech_to_visemes.py | 101 ---------------------------------------
 1 file changed, 101 deletions(-)
 delete mode 100644 STV/speech_to_visemes.py

diff --git a/STV/speech_to_visemes.py b/STV/speech_to_visemes.py
deleted file mode 100644
index 16ad95c..0000000
--- a/STV/speech_to_visemes.py
+++ /dev/null
@@ -1,101 +0,0 @@
-"""This module contains the SpeechToVisemes class, which handles the conversion of speech to visemes."""
-from typing import List, Dict, Any
-from transformers import pipeline
-import logging
-import json
-
-logger = logging.getLogger(__name__)
-
-class SpeechToVisemes():
-    """
-    Handles the conversion of speech to visemes using a phoneme-to-viseme mapping.
-
-    Attributes:
-        model_name (str): The name of the model to use for speech recognition.
-        device (str): The device to run the model on (e.g., "cpu", "mps", "cuda").
-        gen_kwargs (dict): Additional generation parameters for the speech recognition pipeline.
-        asr_pipeline (transformers.Pipeline): The automatic speech recognition pipeline.
-    """
-
-    def __init__(
-        self,
-        model_name="bookbot/wav2vec2-ljspeech-gruut",
-        device="mps",
-        gen_kwargs={},
-    ):
-        """
-        Initializes the SpeechToVisemes class with the specified parameters.
-
-        Args:
-            model_name (str, optional): The name of the model to use for speech recognition.
-            device (str, optional): The device to run the model on.
-            gen_kwargs (dict, optional): Additional generation parameters for the speech recognition pipeline.
-        """
-        self.device = device
-        self.gen_kwargs = gen_kwargs
-
-        # This dictionary is inspired by https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-ssml-phonetic-sets
-        phoneme_viseme_map_file="TTS/STV/phoneme_viseme_map.json"
-        with open(phoneme_viseme_map_file, 'r') as f:
-            self.phoneme_viseme_map = json.load(f)
-
-        # Initialize the automatic speech recognition pipeline
-        self.asr_pipeline = pipeline(
-            "automatic-speech-recognition", model=model_name, device=device
-        )
-
-    def _map_phonemes_to_visemes(
-        self, 
-        data: Dict[str, Any], 
-    ) -> List[Dict[str, Any]]:
-        """
-        Maps phonemes to corresponding visemes with timestamps.
-
-        Refer to the following references for more information on the phoneme-to-viseme mapping:
-            - https://learn.microsoft.com/en-us/azure/ai-services/speech-service/how-to-speech-synthesis-viseme?tabs=visemeid&pivots=programming-language-python#map-phonemes-to-visemes
-            - https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-ssml-phonetic-sets
-
-        Args:
-            data (Dict[str, Any]): A dictionary containing phoneme data, where data['chunks'] 
-                holds a list of phonemes and their timestamps.
-
-        Returns:
-            List[Dict[str, Any]]: A list of dictionaries where each dictionary contains the viseme ID 
-                and the corresponding timestamp.
-        """
-        viseme_list = []
-        chunks = data.get('chunks', [])
-
-        for _, chunk in enumerate(chunks):
-            phoneme = chunk.get('text', None)
-            timestamp = chunk.get('timestamp', None)
-            visemes = self.phoneme_viseme_map.get(phoneme, [])
-            
-            for viseme in visemes:
-                viseme_list.append({
-                    'viseme': viseme,
-                    'timestamp': timestamp
-                })
-
-        return viseme_list
-
-
-    def process(self, audio_file: str) -> List[Dict[str, Any]]:
-        """Process an audio file and convert speech to visemes.
-        
-        Heuristically, we found that the model requires at least 0.5 seconds of audio to run phoneme recognition.
-        This value may be also depended on the model, the language, and other factors.
-
-        Args:
-            audio_file (str): The path to the audio file.
-
-        Returns:
-            List[Dict[str, Any]]: A list of dictionaries where each dictionary contains the viseme 
-                ID and the corresponding timestamp.
-        """
-        # Perform ASR to get phoneme data
-        asr_result = self.asr_pipeline(audio_file, return_timestamps='char')
-        # Map phonemes to visemes
-        viseme_data = self._map_phonemes_to_visemes(asr_result)
-        return viseme_data
-    
\ No newline at end of file