diff --git a/06_gpu_and_ml/audio-to-text/parakeet.py b/06_gpu_and_ml/audio-to-text/parakeet.py
index 90cdbe34b..f3a486f41 100644
--- a/06_gpu_and_ml/audio-to-text/parakeet.py
+++ b/06_gpu_and_ml/audio-to-text/parakeet.py
@@ -1,66 +1,73 @@
-# # Real time audio transcription using Parakeet 🦜
+# # Real-time audio transcription using Parakeet

-# [Parakeet](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/asr/models.html#parakeet) is the name of a family of ASR models built using [NVIDIA's NeMo Framework](https://docs.nvidia.com/nemo-framework/user-guide/latest/overview.html).
-# We'll show you how to use Parakeet for real-time audio transcription,
-# with a simple Python client and a GPU server you can spin up easily in Modal.
+# This example demonstrates the use of Parakeet ASR models for real-time speech-to-text on Modal.

-# This example uses the `nvidia/parakeet-tdt-0.6b-v2` model, which, as of May 13, 2025, sits at the
-# top of Hugging Face's [ASR leaderboard](https://huggingface.co/spaces/hf-audio/open_asr_leaderboard).
+# [Parakeet](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/asr/models.html#parakeet)
+# is the name of a family of ASR models built using [NVIDIA's NeMo Framework](https://docs.nvidia.com/nemo-framework/user-guide/latest/overview.html).
+# We'll show you how to use Parakeet for real-time audio transcription on Modal GPUs,
+# with simple Python and browser clients.

-# To run this example either:
+# This example uses the `nvidia/parakeet-tdt-0.6b-v2` model, which, as of June 2025, sits at the
+# top of Hugging Face's [Open ASR leaderboard](https://huggingface.co/spaces/hf-audio/open_asr_leaderboard).
+
+# To try out transcription from your terminal,
+# provide a URL for a `.wav` file to `modal run`:

-# - run the browser/microphone frontend, or
-# ```bash
-# modal serve 06_gpu_and_ml/audio-to-text/parakeet.py
-# ```
-# - stream a .wav file from a URL (optional, default is "Dream Within a Dream" by Edgar Allan Poe).
# ```bash
# modal run 06_gpu_and_ml/audio-to-text/parakeet.py --audio-url="https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/mono_44100/156550__acclivity__a-dream-within-a-dream.wav"
# ```

-# See [Troubleshooting](https://modal.com/docs/examples/parakeet#client) at the bottom if you run into issues.
-
-# Here's what your final output might look like:
+# You should see output like the following:

# ```bash
-# 🌐 Downloading audio file...
-# šŸŽ§ Downloaded 6331478 bytes
-# ā˜€ļø Waking up model, this may take a few seconds on cold start...
-# šŸ“ Transcription: A Dream Within A Dream Edgar Allan Poe
-# šŸ“ Transcription:
-# šŸ“ Transcription: take this kiss upon the brow, And in parting from you now, Thus much let me avow You are not wrong who deem That my days have been a dream.
+# šŸŽ¤ Starting Transcription
+# A Dream Within A Dream Edgar Allan Poe
+# take this kiss upon the brow, And in parting from you now, Thus much let me avow You are not wrong who deem That my days have been a dream.
# ...
# ```

+# Running a web service you can hit from any browser isn't any harder -- Modal handles the deployment of both the frontend and backend in a single App!
+# Just run
+
+# ```bash
+# modal serve 06_gpu_and_ml/audio-to-text/parakeet.py
+# ```
+
+# and go to the link printed in your terminal.
+
+# The full frontend code can be found [here](https://github.com/modal-labs/modal-examples/tree/main/06_gpu_and_ml/audio-to-text/frontend).
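+
+# To keep the web app up after you close your terminal, you can deploy it instead:
+
+# ```bash
+# modal deploy 06_gpu_and_ml/audio-to-text/parakeet.py
+# ```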
+
# ## Setup
+
import asyncio
import os
+import sys
from pathlib import Path

import modal

-os.environ["MODAL_LOGLEVEL"] = "INFO"
-app_name = "parakeet-websocket"
+app = modal.App("example-parakeet")

-app = modal.App(app_name)
-SILENCE_THRESHOLD = -45
-SILENCE_MIN_LENGTH_MSEC = 1000
-END_OF_STREAM = b"END_OF_STREAM"

# ## Volume for caching model weights
+
# We use a [Modal Volume](https://modal.com/docs/guide/volumes) to cache the model weights.
# This allows us to avoid downloading the model weights every time we start a new instance.
+# For more on storing models on Modal, see [this guide](https://modal.com/docs/guide/model-weights).
+
model_cache = modal.Volume.from_name("parakeet-model-cache", create_if_missing=True)

+
# ## Configuring dependencies

-# The model runs remotely inside a [custom container](https://modal.com/docs/guide/custom-container). We can define the environment
-# and install our Python dependencies in that container's `Image`.
-# For inference, we recommend using the official NVIDIA CUDA Docker images from Docker Hub.
+# The model runs remotely inside a container on Modal. We can define the environment
+# and install our Python dependencies in that container's [`Image`](https://modal.com/docs/guide/images).
+
+# For finicky setups like NeMo's, we recommend using the official NVIDIA CUDA Docker images from Docker Hub.
# You'll need to install Python and pip with the `add_python` option because the image
# doesn't have these by default.
# Additionally, we install `ffmpeg` for handling audio data and `fastapi` to create a web
-# server for our websocket.
+# server for our WebSocket.

image = (
    modal.Image.from_registry(
@@ -82,30 +89,35 @@
        "nemo_toolkit[asr]==2.3.0",
        "cuda-python==12.8.0",
        "fastapi==0.115.12",
-        "numpy==1.26.4",  # downgrading numpy to avoid issues with CUDA
+        "numpy<2",
        "pydub==0.25.1",
    )
-    .entrypoint([])
-    .add_local_dir(
-        os.path.join(Path(__file__).parent.resolve(), "frontend"),
+    .entrypoint([])  # silence chatty logs printed by the container on start
+    .add_local_dir(  # changes fastest, so make this the last layer
+        Path(__file__).parent / "frontend",
        remote_path="/frontend",
    )
)

# ## Implementing real-time audio transcription on Modal

-# Now we're ready to implement the transcription model. We wrap inference in a [modal.Cls](https://modal.com/docs/guide/lifecycle-functions) that
-# ensures models are loaded and then moved to the GPU once when a new container starts. Couple of notes:
+# Now we're ready to implement transcription. We wrap inference in a [`modal.Cls`](https://modal.com/docs/guide/lifecycle-functions) that
+# ensures models are loaded and then moved to the GPU once when a new container starts.

-# - The `load` method loads the model at start, instead of during inference, using [`modal.enter()`](https://modal.com/docs/reference/modal.enter#modalenter).
-# - The `transcribe` method takes bytes of audio data, and returns the transcribed text.
+# A couple of notes about this code:
+# - The `transcribe` method takes bytes of audio data and returns the transcribed text.
# - The `web` method creates a FastAPI app using [`modal.asgi_app`](https://modal.com/docs/reference/modal.asgi_app#modalasgi_app) that serves a
# [WebSocket](https://modal.com/docs/guide/webhooks#websockets) endpoint for real-time audio transcription and a browser frontend for transcribing audio from your microphone.
+# - The `run_with_queue` method takes a [`modal.Queue`](https://modal.com/docs/reference/modal.Queue) and passes audio data and transcriptions between our local machine and the GPU container.

# Parakeet tries really hard to transcribe everything to English!
# Hence it tends to output utterances like "Yeah" or "Mm-hmm" when it runs on silent audio.
-# We can pre-process the incoming audio in the server by using `pydub`'s silence detection,
-# ensuring that we only pass audio with speech to our model.
+# We pre-process the incoming audio in the server using `pydub`'s silence detection,
+# ensuring that we don't pass silence into our model.
+
+END_OF_STREAM = (
+    b"END_OF_STREAM_8f13d09"  # byte sequence indicating a stream is finished
+)


@app.cls(volumes={"/cache": model_cache}, gpu="a10g", image=image)
@@ -113,17 +125,25 @@ class Parakeet:
    @modal.enter()
    def load(self):
+        import logging
+
        import nemo.collections.asr as nemo_asr
+
+        # silence chatty logs from nemo
+        logging.getLogger("nemo_logger").setLevel(logging.CRITICAL)
+
        self.model = nemo_asr.models.ASRModel.from_pretrained(
            model_name="nvidia/parakeet-tdt-0.6b-v2"
        )

-    async def transcribe(self, audio_bytes: bytes) -> str:
+    def transcribe(self, audio_bytes: bytes) -> str:
        import numpy as np

        audio_data = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32)
-        output = self.model.transcribe([audio_data])
+
+        with NoStdStreams():  # hide output, see https://github.com/NVIDIA/NeMo/discussions/3281#discussioncomment-2251217
+            output = self.model.transcribe([audio_data])
+
        return output[0].text

    @modal.asgi_app()
@@ -139,7 +159,7 @@ def web(self):
        async def status():
            return Response(status_code=200)

-        # server frontend
+        # serve frontend
        @web_app.get("/")
        async def index():
            return HTMLResponse(content=open("/frontend/index.html").read())
@@ -201,7 +221,13 @@ async def run_with_queue(self, q: modal.Queue):
            print(f"Error handling queue: {type(e)}: {e}")
            return

-    async def handle_audio_chunk(self, chunk: bytes, audio_segment):
+    async def handle_audio_chunk(
+        self,
+        chunk: bytes,
+        audio_segment,
+        silence_thresh=-45,  # dB
+        min_silence_len=1000,  # ms
+    ):
        from pydub import AudioSegment, silence

        new_audio_segment = AudioSegment(
@@ -210,118 +236,103 @@ async def handle_audio_chunk(self, chunk: bytes, audio_segment):
            sample_width=2,
            frame_rate=TARGET_SAMPLE_RATE,
        )
+        # append the new audio segment to the existing audio segment
        audio_segment += new_audio_segment

+        # detect windows of silence
        silent_windows = silence.detect_silence(
            audio_segment,
-            min_silence_len=SILENCE_MIN_LENGTH_MSEC,
-            silence_thresh=SILENCE_THRESHOLD,
+            min_silence_len=min_silence_len,
+            silence_thresh=silence_thresh,
        )

        # if there are no silent windows, continue
        if len(silent_windows) == 0:
            return audio_segment, None

+        # get the last silent window because
        # we want to transcribe until the final pause
        last_window = silent_windows[-1]

+        # if the entire audio segment is silent, reset the audio segment
        if last_window[0] == 0 and last_window[1] == len(audio_segment):
            audio_segment = AudioSegment.empty()
            return audio_segment, None

+        # get the segment to transcribe: beginning until last pause
        segment_to_transcribe = audio_segment[: last_window[1]]
+        # remove the segment to transcribe from the audio segment
        audio_segment = audio_segment[last_window[1] :]

        try:
-            text = await self.transcribe(segment_to_transcribe.raw_data)
+            text = self.transcribe(segment_to_transcribe.raw_data)
            return audio_segment, text
        except Exception as e:
            print("āŒ Transcription error:", e)
            raise e

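+
+# As a rough illustration of the `pydub` silence detection used in `handle_audio_chunk` above:
+# `detect_silence` returns `[start_ms, end_ms]` windows, and we transcribe everything up to the end
+# of the last window, buffering the rest for the next chunk. (The file name and numbers below are
+# just hypothetical stand-ins.)
+
+# ```python
+# from pydub import AudioSegment, silence
+#
+# clip = AudioSegment.from_wav("speech_with_a_pause.wav")  # hypothetical example file
+# windows = silence.detect_silence(clip, min_silence_len=1000, silence_thresh=-45)
+# print(windows)  # e.g. [[3070, 4180]]
+# # transcribe clip[:4180]; keep clip[4180:] buffered for the next chunk
+# ```
+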
-# ## Client
+# ## Running transcription from a local Python client
+
# Next, let's test the model with a [`local_entrypoint`](https://modal.com/docs/reference/modal.App#local_entrypoint) that streams audio data to the server and prints
-# out the transcriptions to our terminal in real-time.
+# out the transcriptions to our terminal as they arrive.

-# Instead of using the WebSocket endpoint like the frontend,
+# Instead of using the WebSocket endpoint like the browser frontend,
# we'll use a [`modal.Queue`](https://modal.com/docs/reference/modal.Queue)
# to pass audio data and transcriptions between our local machine and the GPU container.

AUDIO_URL = "https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/mono_44100/156550__acclivity__a-dream-within-a-dream.wav"
-TARGET_SAMPLE_RATE = 16000
-CHUNK_SIZE = 16000  # send one second of audio at a time
+TARGET_SAMPLE_RATE = 16_000
+CHUNK_SIZE = 16_000  # send one second of audio at a time


@app.local_entrypoint()
-def main(audio_url: str = AUDIO_URL):
+async def main(audio_url: str = AUDIO_URL):
    from urllib.request import urlopen

-    print("🌐 Downloading audio file...")
+    print(f"🌐 Downloading audio file from {audio_url}")
    audio_bytes = urlopen(audio_url).read()
    print(f"šŸŽ§ Downloaded {len(audio_bytes)} bytes")

    audio_data = preprocess_audio(audio_bytes)

-    print("ā˜€ļø Waking up model, this may take a few seconds on cold start...")
-    try:
-        asyncio.run(run(audio_data))
-        print("āœ… Transcription complete!")
-    except KeyboardInterrupt:
-        print("\nšŸ›‘ Stopped by user.")
+    print("šŸŽ¤ Starting Transcription")
+    with modal.Queue.ephemeral() as q:
+        Parakeet().run_with_queue.spawn(q)
+        send = asyncio.create_task(send_audio(q, audio_data))
+        recv = asyncio.create_task(receive_text(q))
+        await asyncio.gather(send, recv)
+    print("āœ… Transcription complete!")
+
+# Below are the two functions that coordinate streaming audio and receiving transcriptions.

-# Below are the three main functions that coordinate streaming audio and receiving transcriptions.
-#
-# `send_audio` transmits chunks of audio data and then pauses to approximate streaming
-# speech at a natural rate. That said, we set it to faster
-# than real-time to compensate for network latency. Plus, we're not
-# trying to wait forever for this to finish.
+# `send_audio` transmits chunks of audio data with a slight delay,
+# as though it were being streamed from a live source, like a microphone.
+# `receive_text` waits for transcribed text to arrive and prints it.


async def send_audio(q, audio_bytes):
    for chunk in chunk_audio(audio_bytes, CHUNK_SIZE):
        await q.put.aio(chunk, partition="audio")
-        await asyncio.sleep(
-            CHUNK_SIZE / TARGET_SAMPLE_RATE / 8
-        )  # simulate real-time pacing
+        await asyncio.sleep(CHUNK_SIZE / TARGET_SAMPLE_RATE / 8)
    await q.put.aio(END_OF_STREAM, partition="audio")


-# `receive_transcriptions` is straightforward.
-# It just waits for a transcription and prints it after a small delay to avoid colliding with the print statements
-# from the GPU container.
-
-
-async def receive_transcriptions(q):
+async def receive_text(q):
    while True:
        message = await q.get.aio(partition="transcription")
        if message == END_OF_STREAM:
            break
-        await asyncio.sleep(1.00)  # add a delay to avoid stdout collision
-        print(f"šŸ“ Transcription: {message}")
+        print(message)


-# We take full advantage of Modal's asynchronous capabilities here. In `run`, we spawn our function call
-# so it doesn't block, and then we create and wait on the send and receive tasks.
-
-
-async def run(audio_bytes):
-    with modal.Queue.ephemeral() as q:
-        Parakeet().run_with_queue.spawn(q)
-        send_task = asyncio.create_task(send_audio(q, audio_bytes))
-        receive_task = asyncio.create_task(receive_transcriptions(q))
-        await asyncio.gather(send_task, receive_task)
-
-
-# ## Troubleshooting
-# - Make sure you have the latest version of the Modal CLI installed.
-# - The server takes a few seconds to start up on cold start. If your local client times out, try
-# restarting the client.

# ## Addenda

-# Helper functions for converting audio to Parakeet's input format and iterating over audio chunks.
+
+# The remainder of the code in this example is boilerplate,
+# mostly for handling Parakeet's input format.


def preprocess_audio(audio_bytes: bytes) -> bytes:
@@ -383,3 +394,17 @@ def preprocess_audio(audio_bytes: bytes) -> bytes:
def chunk_audio(data: bytes, chunk_size: int):
    for i in range(0, len(data), chunk_size):
        yield data[i : i + chunk_size]
+
+
+class NoStdStreams(object):
+    def __init__(self):
+        self.devnull = open(os.devnull, "w")
+
+    def __enter__(self):
+        self._stdout, self._stderr = sys.stdout, sys.stderr
+        self._stdout.flush(), self._stderr.flush()
+        sys.stdout, sys.stderr = self.devnull, self.devnull
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        sys.stdout, sys.stderr = self._stdout, self._stderr
+        self.devnull.close()
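+
+
+# For a quick local sanity check, the helpers above can also be exercised without Modal
+# (the file name here is just a hypothetical stand-in):
+
+# ```python
+# pcm = preprocess_audio(open("sample.wav", "rb").read())
+# print(len(list(chunk_audio(pcm, CHUNK_SIZE))), "chunks")
+# ```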