huggingface
diff --git a/‎Dockerfile.arm64
+13 b/‎Dockerfile.arm64
+13
diff --git a/‎LLM/mlx_language_model.py
+16-3 b/‎LLM/mlx_language_model.py
+16-3
diff --git a/‎LLM/openai_api_language_model.py
+89 b/‎LLM/openai_api_language_model.py
+89
diff --git a/‎README.md
+70-18 b/‎README.md
+70-18
diff --git a/‎STT/lightning_whisper_mlx_handler.py
+32-3 b/‎STT/lightning_whisper_mlx_handler.py
+32-3
diff --git a/‎TTS/melo_handler.py
+2-2 b/‎TTS/melo_handler.py
+2-2
diff --git a/‎TTS/parler_handler.py
+1-1 b/‎TTS/parler_handler.py
+1-1
diff --git a/‎VAD/vad_handler.py
+4 b/‎VAD/vad_handler.py
+4
@@ -0,0 +1,13 @@
+# Image for running the pipeline on top of NVIDIA's L4T PyTorch base image.
+FROM nvcr.io/nvidia/l4t-pytorch:r35.2.1-pth2.0-py3
+
+# Use the key=value form; the space-separated `ENV key value` form is legacy syntax.
+ENV PYTHONUNBUFFERED=1
+
+WORKDIR /usr/src/app
+
+# Install packages (the apt lists are removed in the same layer to keep it small)
+RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
+
+# Copy requirements.txt on its own first so the pip layer is only rebuilt
+# when the requirements change, not on every source change.
+COPY requirements.txt ./
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY . .
@@ -9,6 +9,14 @@
 
 console = Console()
 
+WHISPER_LANGUAGE_TO_LLM_LANGUAGE = {
+    "en": "english",
+    "fr": "french",
+    "es": "spanish",
+    "zh": "chinese",
+    "ja": "japanese",
+    "ko": "korean",
+}
 
 class MLXLanguageModelHandler(BaseHandler):
     """
@@ -44,7 +52,7 @@ def setup(
     def warmup(self):
         logger.info(f"Warming up {self.__class__.__name__}")
 
-        dummy_input_text = "Write me a poem about Machine Learning."
+        dummy_input_text = "Repeat the word 'home'."
         dummy_chat = [{"role": self.user_role, "content": dummy_input_text}]
 
         n_steps = 2
@@ -61,6 +69,11 @@ def warmup(self):
 
     def process(self, prompt):
         logger.debug("infering language model...")
+        language_code = None
+
+        if isinstance(prompt, tuple):
+            prompt, language_code = prompt
+            prompt = f"Please reply to my message in {WHISPER_LANGUAGE_TO_LLM_LANGUAGE[language_code]}. " + prompt
 
         self.chat.append({"role": self.user_role, "content": prompt})
 
@@ -86,9 +99,9 @@ def process(self, prompt):
             output += t
             curr_output += t
             if curr_output.endswith((".", "?", "!", "<|end|>")):
-                yield curr_output.replace("<|end|>", "")
+                yield (curr_output.replace("<|end|>", ""), language_code)
                 curr_output = ""
         generated_text = output.replace("<|end|>", "")
         torch.mps.empty_cache()
 
-        self.chat.append({"role": "assistant", "content": generated_text})
+        self.chat.append({"role": "assistant", "content": generated_text})
@@ -0,0 +1,89 @@
+from openai import OpenAI
+from LLM.chat import Chat
+from baseHandler import BaseHandler
+from rich.console import Console
+import logging
+import time
+logger = logging.getLogger(__name__)
+
+console = Console()
+from nltk import sent_tokenize
+
+class OpenApiModelHandler(BaseHandler):
+    """
+    Handles the language model part.
+    """
+    def setup(
+        self,
+        model_name="deepseek-chat",
+        device="cuda",
+        gen_kwargs={},
+        base_url =None,
+        api_key=None,
+        stream=False,
+        user_role="user",
+        chat_size=1,
+        init_chat_role="system",
+        init_chat_prompt="You are a helpful AI assistant.",
+    ):
+        self.model_name = model_name
+        self.stream = stream
+        self.chat = Chat(chat_size)
+        if init_chat_role:
+            if not init_chat_prompt:
+                raise ValueError(
+                    "An initial promt needs to be specified when setting init_chat_role."
+                )
+            self.chat.init_chat({"role": init_chat_role, "content": init_chat_prompt})
+        self.user_role = user_role
+        self.client = OpenAI(api_key=api_key, base_url=base_url)
+        self.warmup()
+
+    def warmup(self):
+        logger.info(f"Warming up {self.__class__.__name__}")
+        start = time.time()
+        response = self.client.chat.completions.create(
+            model=self.model_name,
+            messages=[
+                {"role": "system", "content": "You are a helpful assistant"},
+                {"role": "user", "content": "Hello"},
+            ],
+            stream=self.stream
+        )
+        end = time.time()
+        logger.info(
+            f"{self.__class__.__name__}:  warmed up! time: {(end - start):.3f} s"
+        )
+    def process(self, prompt):
+            logger.debug("call api language model...")
+            self.chat.append({"role": self.user_role, "content": prompt})
+
+            language_code = None
+            if isinstance(prompt, tuple):
+                prompt, language_code = prompt
+
+            response = self.client.chat.completions.create(
+                model=self.model_name,
+                messages=[
+                    {"role": self.user_role, "content": prompt},
+                ],
+                stream=self.stream
+            )
+            if self.stream:
+                generated_text, printable_text = "", ""
+                for chunk in response:
+                    new_text = chunk.choices[0].delta.content or ""
+                    generated_text += new_text
+                    printable_text += new_text
+                    sentences = sent_tokenize(printable_text)
+                    if len(sentences) > 1:
+                        yield sentences[0], language_code
+                        printable_text = new_text
+                self.chat.append({"role": "assistant", "content": generated_text})
+                # don't forget last sentence
+                yield printable_text, language_code
+            else:
+                generated_text = response.choices[0].message.content
+                self.chat.append({"role": "assistant", "content": generated_text})
+                yield generated_text, language_code
+
@@ -79,27 +79,28 @@ https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install
 
 ### Server/Client Approach
 
-To run the pipeline on the server:
-```bash
-python s2s_pipeline.py --recv_host 0.0.0.0 --send_host 0.0.0.0
-```
+1. Run the pipeline on the server:
+   ```bash
+   python s2s_pipeline.py --recv_host 0.0.0.0 --send_host 0.0.0.0
+   ```
 
-Then run the client locally to handle sending microphone input and receiving generated audio:
-```bash
-python listen_and_play.py --host <IP address of your server>
-```
+2. Run the client locally to handle microphone input and receive generated audio:
+   ```bash
+   python listen_and_play.py --host <IP address of your server>
+   ```
 
-### Local approach (running on Mac)
-To run on mac, we recommend setting the flag `--local_mac_optimal_settings`:
-```bash
-python s2s_pipeline.py --local_mac_optimal_settings
-```
+### Local Approach (Mac)
+
+1. For optimal settings on Mac:
+   ```bash
+   python s2s_pipeline.py --local_mac_optimal_settings
+   ```
 
-You can also pass `--device mps` to have all the models set to device mps.
-The local mac optimal settings set the mode to be local as explained above and change the models to:
-- LightningWhisperMLX
-- MLX LM
-- MeloTTS
+This setting:
+- Adds `--device mps` to use MPS for all models.
+- Sets LightningWhisperMLX for STT.
+- Sets MLX LM for the language model.
+- Sets MeloTTS for TTS.
 
 ### Recommended usage with Cuda
 
@@ -117,6 +118,57 @@ python s2s_pipeline.py \
 
 For the moment, modes capturing CUDA Graphs are not compatible with streaming Parler-TTS (`reduce-overhead`, `max-autotune`).
 
+
+### Multi-language Support
+
+The pipeline supports multiple languages, allowing for automatic language detection or specific language settings. Here are examples for both local (Mac) and server setups:
+
+#### With the server version:
+
+
+For automatic language detection:
+
+```bash
+python s2s_pipeline.py \
+    --stt_model_name large-v3 \
+    --language auto \
+    --mlx_lm_model_name mlx-community/Meta-Llama-3.1-8B-Instruct
+```
+
+Or for one language in particular (Chinese in this example):
+
+```bash
+python s2s_pipeline.py \
+    --stt_model_name large-v3 \
+    --language zh \
+    --mlx_lm_model_name mlx-community/Meta-Llama-3.1-8B-Instruct
+```
+
+#### Local Mac Setup
+
+For automatic language detection:
+
+```bash
+python s2s_pipeline.py \
+    --local_mac_optimal_settings \
+    --device mps \
+    --stt_model_name large-v3 \
+    --language auto \
+    --mlx_lm_model_name mlx-community/Meta-Llama-3.1-8B-Instruct-4bit
+```
+
+Or for one language in particular (Chinese in this example):
+
+```bash
+python s2s_pipeline.py \
+    --local_mac_optimal_settings \
+    --device mps \
+    --stt_model_name large-v3 \
+    --language zh \
+    --mlx_lm_model_name mlx-community/Meta-Llama-3.1-8B-Instruct-4bit
+```
+
+
 ## Command-line Usage
 
 ### Model Parameters
 
@@ -4,12 +4,22 @@
 from lightning_whisper_mlx import LightningWhisperMLX
 import numpy as np
 from rich.console import Console
+from copy import copy
 import torch
 
 logger = logging.getLogger(__name__)
 
 console = Console()
 
+SUPPORTED_LANGUAGES = [
+    "en",
+    "fr",
+    "es",
+    "zh",
+    "ja",
+    "ko",
+]
+
 
 class LightningWhisperSTTHandler(BaseHandler):
     """
@@ -19,7 +29,7 @@ class LightningWhisperSTTHandler(BaseHandler):
     def setup(
         self,
         model_name="distil-large-v3",
-        device="cuda",
+        device="mps",
         torch_dtype="float16",
         compile_mode=None,
         language=None,
@@ -29,6 +39,9 @@ def setup(
             model_name = model_name.split("/")[-1]
         self.device = device
         self.model = LightningWhisperMLX(model=model_name, batch_size=6, quant=None)
+        self.start_language = language
+        self.last_language = language
+
         self.warmup()
 
     def warmup(self):
@@ -47,10 +60,26 @@ def process(self, spoken_prompt):
         global pipeline_start
         pipeline_start = perf_counter()
 
-        pred_text = self.model.transcribe(spoken_prompt)["text"].strip()
+        if self.start_language != 'auto':
+            transcription_dict = self.model.transcribe(spoken_prompt, language=self.start_language)
+        else:
+            transcription_dict = self.model.transcribe(spoken_prompt)
+            language_code = transcription_dict["language"]
+            if language_code not in SUPPORTED_LANGUAGES:
+                logger.warning(f"Whisper detected unsupported language: {language_code}")
+                if self.last_language in SUPPORTED_LANGUAGES:  # reprocess with the last language
+                    transcription_dict = self.model.transcribe(spoken_prompt, language=self.last_language)
+                else:
+                    transcription_dict = {"text": "", "language": "en"}
+            else:
+                self.last_language = language_code
+
+        pred_text = transcription_dict["text"].strip()
+        language_code = transcription_dict["language"]
         torch.mps.empty_cache()
 
         logger.debug("finished whisper inference")
         console.print(f"[yellow]USER: {pred_text}")
+        logger.debug(f"Language Code Whisper: {language_code}")
 
-        yield pred_text
+        yield (pred_text, language_code)
@@ -13,7 +13,7 @@
 console = Console()
 
 WHISPER_LANGUAGE_TO_MELO_LANGUAGE = {
-    "en": "EN_NEWEST",
+    "en": "EN",
     "fr": "FR",
     "es": "ES",
     "zh": "ZH",
@@ -22,7 +22,7 @@
 }
 
 WHISPER_LANGUAGE_TO_MELO_SPEAKER = {
-    "en": "EN-Newest",
+    "en": "EN-BR",
     "fr": "FR",
     "es": "ES",
     "zh": "ZH",
 
@@ -72,7 +72,7 @@ def setup(
 
         if self.compile_mode not in (None, "default"):
             logger.warning(
-                "Torch compilation modes that captures CUDA graphs are not yet compatible with the STT part. Reverting to 'default'"
+                "Torch compilation modes that captures CUDA graphs are not yet compatible with the TTS part. Reverting to 'default'"
             )
             self.compile_mode = "default"
 
 
@@ -86,3 +86,7 @@ def process(self, audio_chunk):
                         )
                     array = enhanced.numpy().squeeze()
                 yield array
+
+    @property
+    def min_time_to_debug(self):
+        # Fixed value of 0.00001 for this handler.
+        # NOTE(review): presumably a minimum processing time (in seconds) used
+        # by the base handler to decide when to emit timing debug logs --
+        # confirm against BaseHandler.
+        return 0.00001
Original file line number	Diff line number	Diff line change
`@@ -72,7 +72,7 @@ def setup(`
`72`	`72`
`73`	`73`	`if self.compile_mode not in (None, "default"):`
`74`	`74`	`logger.warning(`
`75`		`- "Torch compilation modes that captures CUDA graphs are not yet compatible with the STT part. Reverting to 'default'"`
	`75`	`+ "Torch compilation modes that captures CUDA graphs are not yet compatible with the TTS part. Reverting to 'default'"`
`76`	`76`	`)`
`77`	`77`	`self.compile_mode = "default"`
`78`	`78`
Original file line number	Diff line number	Diff line change
`@@ -86,3 +86,7 @@ def process(self, audio_chunk):`
`86`	`86`	`)`
`87`	`87`	`array = enhanced.numpy().squeeze()`
`88`	`88`	`yield array`
	`89`	`+`
	`90`	`+ @property`
	`91`	`+ def min_time_to_debug(self):`
	`92`	`+ return 0.00001`