Commit 1d80e4f

Parakeet text cleanups (#1193)

* more text cleanups
* nits
* minor text changes
* rm troubleshooting
* update
* minor text fixes and reorganization
* silence unnecessary logs
* refactor out one level of async nesting

Co-authored-by: Charles Frye <[email protected]>

1 parent 33fb820 commit 1d80e4f

File tree

1 file changed: +120 -95 lines

06_gpu_and_ml/audio-to-text/parakeet.py

Lines changed: 120 additions & 95 deletions
@@ -1,66 +1,73 @@
-# # Real time audio transcription using Parakeet 🦜
+# # Real-time audio transcription using Parakeet
 
-# [Parakeet](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/asr/models.html#parakeet) is the name of a family of ASR models built using [NVIDIA's NeMo Framework](https://docs.nvidia.com/nemo-framework/user-guide/latest/overview.html).
-# We'll show you how to use Parakeet for real-time audio transcription,
-# with a simple Python client and a GPU server you can spin up easily in Modal.
+# This example demonstrates the use of Parakeet ASR models for real-time speech-to-text on Modal.
 
-# This example uses the `nvidia/parakeet-tdt-0.6b-v2` model, which, as of May 13, 2025, sits at the
-# top of Hugging Face's [ASR leaderboard](https://huggingface.co/spaces/hf-audio/open_asr_leaderboard).
+# [Parakeet](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/asr/models.html#parakeet)
+# is the name of a family of ASR models built using [NVIDIA's NeMo Framework](https://docs.nvidia.com/nemo-framework/user-guide/latest/overview.html).
+# We'll show you how to use Parakeet for real-time audio transcription on Modal GPUs,
+# with simple Python and browser clients.
 
-# To run this example either:
+# This example uses the `nvidia/parakeet-tdt-0.6b-v2` model which, as of June 2025, sits at the
+# top of Hugging Face's [Open ASR leaderboard](https://huggingface.co/spaces/hf-audio/open_asr_leaderboard).
+
+# To try out transcription from your terminal,
+# provide a URL for a `.wav` file to `modal run`:
 
-# - run the browser/microphone frontend, or
-# ```bash
-# modal serve 06_gpu_and_ml/audio-to-text/parakeet.py
-# ```
-# - stream a .wav file from a URL (optional, default is "Dream Within a Dream" by Edgar Allan Poe).
 # ```bash
 # modal run 06_gpu_and_ml/audio-to-text/parakeet.py --audio-url="https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/mono_44100/156550__acclivity__a-dream-within-a-dream.wav"
 # ```
 
-# See [Troubleshooting](https://modal.com/docs/examples/parakeet#client) at the bottom if you run into issues.
-
-# Here's what your final output might look like:
+# You should see output like the following:
 
 # ```bash
-# 🌐 Downloading audio file...
-# 🎧 Downloaded 6331478 bytes
-# ☀️ Waking up model, this may take a few seconds on cold start...
-# 📝 Transcription: A Dream Within A Dream Edgar Allan Poe
-# 📝 Transcription:
-# 📝 Transcription: take this kiss upon the brow, And in parting from you now, Thus much let me avow You are not wrong who deem That my days have been a dream.
+# 🎤 Starting Transcription
+# A Dream Within A Dream Edgar Allan Poe
+# take this kiss upon the brow, And in parting from you now, Thus much let me avow You are not wrong who deem That my days have been a dream.
 # ...
 # ```
 
+# Running a web service you can hit from any browser isn't any harder -- Modal handles the deployment of both the frontend and backend in a single App!
+# Just run
+
+# ```bash
+# modal serve 06_gpu_and_ml/audio-to-text/parakeet.py
+# ```
+
+# and go to the link printed in your terminal.
+
+# The full frontend code can be found [here](https://github.com/modal-labs/modal-examples/tree/main/06_gpu_and_ml/audio-to-text/frontend).
+
 # ## Setup
+
 import asyncio
 import os
+import sys
 from pathlib import Path
 
 import modal
 
-os.environ["MODAL_LOGLEVEL"] = "INFO"
-app_name = "parakeet-websocket"
+app = modal.App("example-parakeet")
 
-app = modal.App(app_name)
-SILENCE_THRESHOLD = -45
-SILENCE_MIN_LENGTH_MSEC = 1000
-END_OF_STREAM = b"END_OF_STREAM"
 # ## Volume for caching model weights
+
 # We use a [Modal Volume](https://modal.com/docs/guide/volumes) to cache the model weights.
 # This allows us to avoid downloading the model weights every time we start a new instance.
 
+# For more on storing models on Modal, see [this guide](https://modal.com/docs/guide/model-weights).
+
 model_cache = modal.Volume.from_name("parakeet-model-cache", create_if_missing=True)
+
 # ## Configuring dependencies
-# The model runs remotely inside a [custom container](https://modal.com/docs/guide/custom-container). We can define the environment
-# and install our Python dependencies in that container's `Image`.
 
-# For inference, we recommend using the official NVIDIA CUDA Docker images from Docker Hub.
+# The model runs remotely inside a container on Modal. We can define the environment
+# and install our Python dependencies in that container's [`Image`](https://modal.com/docs/guide/images).
+
+# For finicky setups like NeMo's, we recommend using the official NVIDIA CUDA Docker images from Docker Hub.
 # You'll need to install Python and pip with the `add_python` option because the image
 # doesn't have these by default.
 
 # Additionally, we install `ffmpeg` for handling audio data and `fastapi` to create a web
-# server for our websocket.
+# server for our WebSocket.
 
 image = (
     modal.Image.from_registry(
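Note: the hunk above cuts off at the start of the `image` definition. As a rough sketch of the pattern the comments describe, a CUDA base image from Docker Hub with Python added via `add_python`, plus system and Python packages; the tag and pins below are illustrative assumptions, not necessarily this example's exact choices:

```python
import modal

# Hypothetical sketch of the image-building pattern described above.
sketch_image = (
    modal.Image.from_registry(
        "nvidia/cuda:12.8.0-cudnn-devel-ubuntu22.04",  # assumed base tag
        add_python="3.12",  # the CUDA base image ships without Python or pip
    )
    .apt_install("ffmpeg")  # system package for audio decoding
    .pip_install("fastapi==0.115.12", "pydub==0.25.1")  # illustrative pins
)
```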
@@ -82,48 +89,61 @@
         "nemo_toolkit[asr]==2.3.0",
         "cuda-python==12.8.0",
         "fastapi==0.115.12",
-        "numpy==1.26.4",  # downgrading numpy to avoid issues with CUDA
+        "numpy<2",
         "pydub==0.25.1",
     )
-    .entrypoint([])
-    .add_local_dir(
-        os.path.join(Path(__file__).parent.resolve(), "frontend"),
+    .entrypoint([])  # silence chatty logs by container on start
+    .add_local_dir(  # changes fastest, so make this the last layer
+        Path(__file__).parent / "frontend",
         remote_path="/frontend",
     )
 )
 
 # ## Implementing real-time audio transcription on Modal
 
-# Now we're ready to implement the transcription model. We wrap inference in a [modal.Cls](https://modal.com/docs/guide/lifecycle-functions) that
-# ensures models are loaded and then moved to the GPU once when a new container starts. Couple of notes:
+# Now we're ready to implement transcription. We wrap inference in a [`modal.Cls`](https://modal.com/docs/guide/lifecycle-functions) that
+# ensures models are loaded and then moved to the GPU once when a new container starts.
 
-# - The `load` method loads the model at start, instead of during inference, using [`modal.enter()`](https://modal.com/docs/reference/modal.enter#modalenter).
-# - The `transcribe` method takes bytes of audio data, and returns the transcribed text.
+# A couple of notes about this code:
+# - The `transcribe` method takes bytes of audio data and returns the transcribed text.
 # - The `web` method creates a FastAPI app using [`modal.asgi_app`](https://modal.com/docs/reference/modal.asgi_app#modalasgi_app) that serves a
 # [WebSocket](https://modal.com/docs/guide/webhooks#websockets) endpoint for real-time audio transcription and a browser frontend for transcribing audio from your microphone.
+# - The `run_with_queue` method takes a [`modal.Queue`](https://modal.com/docs/reference/modal.Queue) and passes audio data and transcriptions between our local machine and the GPU container.
 
 # Parakeet tries really hard to transcribe everything to English!
 # Hence it tends to output utterances like "Yeah" or "Mm-hmm" when it runs on silent audio.
-# We can pre-process the incoming audio in the server by using `pydub`'s silence detection,
-# ensuring that we only pass audio with speech to our model.
+# We pre-process the incoming audio in the server using `pydub`'s silence detection,
+# ensuring that we don't pass silence into our model.
+
+END_OF_STREAM = (
+    b"END_OF_STREAM_8f13d09"  # byte sequence indicating a stream is finished
+)
 
 
 @app.cls(volumes={"/cache": model_cache}, gpu="a10g", image=image)
 @modal.concurrent(max_inputs=14, target_inputs=10)
 class Parakeet:
     @modal.enter()
     def load(self):
+        import logging
+
         import nemo.collections.asr as nemo_asr
 
+        # silence chatty logs from nemo
+        logging.getLogger("nemo_logger").setLevel(logging.CRITICAL)
+
         self.model = nemo_asr.models.ASRModel.from_pretrained(
             model_name="nvidia/parakeet-tdt-0.6b-v2"
         )
 
-    async def transcribe(self, audio_bytes: bytes) -> str:
+    def transcribe(self, audio_bytes: bytes) -> str:
         import numpy as np
 
         audio_data = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32)
-        output = self.model.transcribe([audio_data])
+
+        with NoStdStreams():  # hide output, see https://github.com/NVIDIA/NeMo/discussions/3281#discussioncomment-2251217
+            output = self.model.transcribe([audio_data])
+
        return output[0].text
 
     @modal.asgi_app()
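The silence pre-processing described in the comments above relies on `pydub`'s `silence.detect_silence`. A minimal, standalone sketch of that call follows; the input segment and thresholds here are illustrative, and the example's own values appear in `handle_audio_chunk` further down:

```python
from pydub import AudioSegment, silence

# One second of 16 kHz, 16-bit mono silence as stand-in input data.
seg = AudioSegment(
    data=b"\x00\x00" * 16_000,
    sample_width=2,
    frame_rate=16_000,
    channels=1,
)

# Returns [start_ms, end_ms] pairs for every window quieter than the threshold.
windows = silence.detect_silence(seg, min_silence_len=1000, silence_thresh=-45)
print(windows)  # e.g. [[0, 1000]] for an all-silent segment
```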
@@ -139,7 +159,7 @@ def web(self):
         async def status():
             return Response(status_code=200)
 
-        # server frontend
+        # serve frontend
         @web_app.get("/")
         async def index():
             return HTMLResponse(content=open("/frontend/index.html").read())
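The WebSocket route itself is not visible in this hunk. For orientation, here is a minimal sketch of a FastAPI WebSocket endpoint that accepts audio bytes and sends back text, a simplified stand-in rather than the example's actual handler:

```python
from fastapi import FastAPI, WebSocket, WebSocketDisconnect

web_app = FastAPI()


@web_app.websocket("/ws")
async def transcribe_ws(ws: WebSocket):
    await ws.accept()
    try:
        while True:
            chunk = await ws.receive_bytes()  # raw PCM audio from the browser
            text = f"({len(chunk)} bytes received)"  # stand-in for real transcription
            await ws.send_text(text)
    except WebSocketDisconnect:
        pass
```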
@@ -201,7 +221,13 @@ async def run_with_queue(self, q: modal.Queue):
             print(f"Error handling queue: {type(e)}: {e}")
             return
 
-    async def handle_audio_chunk(self, chunk: bytes, audio_segment):
+    async def handle_audio_chunk(
+        self,
+        chunk: bytes,
+        audio_segment,
+        silence_thresh=-45,  # dB
+        min_silence_len=1000,  # ms
+    ):
         from pydub import AudioSegment, silence
 
         new_audio_segment = AudioSegment(
@@ -210,118 +236,103 @@ async def handle_audio_chunk(self, chunk: bytes, audio_segment):
             sample_width=2,
             frame_rate=TARGET_SAMPLE_RATE,
         )
+
         # append the new audio segment to the existing audio segment
         audio_segment += new_audio_segment
 
+        # detect windows of silence
         silent_windows = silence.detect_silence(
             audio_segment,
-            min_silence_len=SILENCE_MIN_LENGTH_MSEC,
-            silence_thresh=SILENCE_THRESHOLD,
+            min_silence_len=min_silence_len,
+            silence_thresh=silence_thresh,
         )
 
         # if there are no silent windows, continue
         if len(silent_windows) == 0:
             return audio_segment, None
+
         # get the last silent window because
         # we want to transcribe until the final pause
         last_window = silent_windows[-1]
+
         # if the entire audio segment is silent, reset the audio segment
         if last_window[0] == 0 and last_window[1] == len(audio_segment):
             audio_segment = AudioSegment.empty()
             return audio_segment, None
+
         # get the segment to transcribe: beginning until last pause
         segment_to_transcribe = audio_segment[: last_window[1]]
+
         # remove the segment to transcribe from the audio segment
         audio_segment = audio_segment[last_window[1] :]
         try:
-            text = await self.transcribe(segment_to_transcribe.raw_data)
+            text = self.transcribe(segment_to_transcribe.raw_data)
             return audio_segment, text
         except Exception as e:
             print("❌ Transcription error:", e)
             raise e
 
 
-# ## Client
+# ## Running transcription from a local Python client
+
 # Next, let's test the model with a [`local_entrypoint`](https://modal.com/docs/reference/modal.App#local_entrypoint) that streams audio data to the server and prints
-# out the transcriptions to our terminal in real-time.
+# out the transcriptions to our terminal as they arrive.
 
-# Instead of using the WebSocket endpoint like the frontend,
+# Instead of using the WebSocket endpoint like the browser frontend,
 # we'll use a [`modal.Queue`](https://modal.com/docs/reference/modal.Queue)
 # to pass audio data and transcriptions between our local machine and the GPU container.
 
 AUDIO_URL = "https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/mono_44100/156550__acclivity__a-dream-within-a-dream.wav"
-TARGET_SAMPLE_RATE = 16000
-CHUNK_SIZE = 16000  # send one second of audio at a time
+TARGET_SAMPLE_RATE = 16_000
+CHUNK_SIZE = 16_000  # send one second of audio at a time
 
 
 @app.local_entrypoint()
-def main(audio_url: str = AUDIO_URL):
+async def main(audio_url: str = AUDIO_URL):
     from urllib.request import urlopen
 
-    print("🌐 Downloading audio file...")
+    print(f"🌐 Downloading audio file from {audio_url}")
     audio_bytes = urlopen(audio_url).read()
     print(f"🎧 Downloaded {len(audio_bytes)} bytes")
 
     audio_data = preprocess_audio(audio_bytes)
 
-    print("☀️ Waking up model, this may take a few seconds on cold start...")
-    try:
-        asyncio.run(run(audio_data))
-        print("✅ Transcription complete!")
-    except KeyboardInterrupt:
-        print("\n🛑 Stopped by user.")
+    print("🎤 Starting Transcription")
+    with modal.Queue.ephemeral() as q:
+        Parakeet().run_with_queue.spawn(q)
+        send = asyncio.create_task(send_audio(q, audio_data))
+        recv = asyncio.create_task(receive_text(q))
+        await asyncio.gather(send, recv)
+    print("✅ Transcription complete!")
+
 
+# Below are the two functions that coordinate streaming audio and receiving transcriptions.
 
-# Below are the three main functions that coordinate streaming audio and receiving transcriptions.
-#
-# `send_audio` transmits chunks of audio data and then pauses to approximate streaming
-# speech at a natural rate. That said, we set it to faster
-# than real-time to compensate for network latency. Plus, we're not
-# trying to wait forever for this to finish.
+# `send_audio` transmits chunks of audio data with a slight delay,
+# as though it was being streamed from a live source, like a microphone.
+# `receive_text` waits for transcribed text to arrive and prints it.
 
 
 async def send_audio(q, audio_bytes):
     for chunk in chunk_audio(audio_bytes, CHUNK_SIZE):
         await q.put.aio(chunk, partition="audio")
-        await asyncio.sleep(
-            CHUNK_SIZE / TARGET_SAMPLE_RATE / 8
-        )  # simulate real-time pacing
+        await asyncio.sleep(CHUNK_SIZE / TARGET_SAMPLE_RATE / 8)
     await q.put.aio(END_OF_STREAM, partition="audio")
 
 
-# `receive_transcriptions` is straightforward.
-# It just waits for a transcription and prints it after a small delay to avoid colliding with the print statements
-# from the GPU container.
-
-
-async def receive_transcriptions(q):
+async def receive_text(q):
     while True:
         message = await q.get.aio(partition="transcription")
         if message == END_OF_STREAM:
             break
-        await asyncio.sleep(1.00)  # add a delay to avoid stdout collision
-        print(f"📝 Transcription: {message}")
 
+        print(message)
 
-# We take full advantage of Modal's asynchronous capabilities here. In `run`, we spawn our function call
-# so it doesn't block, and then we create and wait on the send and receive tasks.
-
-
-async def run(audio_bytes):
-    with modal.Queue.ephemeral() as q:
-        Parakeet().run_with_queue.spawn(q)
-        send_task = asyncio.create_task(send_audio(q, audio_bytes))
-        receive_task = asyncio.create_task(receive_transcriptions(q))
-        await asyncio.gather(send_task, receive_task)
-
-
-# ## Troubleshooting
-# - Make sure you have the latest version of the Modal CLI installed.
-# - The server takes a few seconds to start up on cold start. If your local client times out, try
-# restarting the client.
 
 # ## Addenda
-# Helper functions for converting audio to Parakeet's input format and iterating over audio chunks.
+
+# The remainder of the code in this example is boilerplate,
+# mostly for handling Parakeet's input format.
 
 
 def preprocess_audio(audio_bytes: bytes) -> bytes:
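The client side of this hunk boils down to one pattern: open an ephemeral `modal.Queue`, `spawn` the remote method with it, and use a separate partition for each direction of traffic. A stripped-down sketch under those assumptions; helper names like `stream_bytes` and `drain_text` are illustrative, not part of the example:

```python
import asyncio

import modal

END_OF_STREAM = b"END_OF_STREAM_8f13d09"  # sentinel, as in the example


async def stream_bytes(q: modal.Queue, payload: bytes, chunk_size: int = 16_000):
    # Producer: push fixed-size chunks onto the "audio" partition, then the sentinel.
    for i in range(0, len(payload), chunk_size):
        await q.put.aio(payload[i : i + chunk_size], partition="audio")
    await q.put.aio(END_OF_STREAM, partition="audio")


async def drain_text(q: modal.Queue):
    # Consumer: read transcriptions until the sentinel comes back.
    while True:
        message = await q.get.aio(partition="transcription")
        if message == END_OF_STREAM:
            break
        print(message)


# usage sketch, mirroring the local entrypoint above:
# with modal.Queue.ephemeral() as q:
#     Parakeet().run_with_queue.spawn(q)  # remote side reads "audio", writes "transcription"
#     await asyncio.gather(stream_bytes(q, data), drain_text(q))
```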
@@ -383,3 +394,17 @@ def preprocess_audio(audio_bytes: bytes) -> bytes:
 def chunk_audio(data: bytes, chunk_size: int):
     for i in range(0, len(data), chunk_size):
         yield data[i : i + chunk_size]
+
+
+class NoStdStreams(object):
+    def __init__(self):
+        self.devnull = open(os.devnull, "w")
+
+    def __enter__(self):
+        self._stdout, self._stderr = sys.stdout, sys.stderr
+        self._stdout.flush(), self._stderr.flush()
+        sys.stdout, sys.stderr = self.devnull, self.devnull
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        sys.stdout, sys.stderr = self._stdout, self._stderr
+        self.devnull.close()
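`NoStdStreams`, added at the end of the file, is a plain context manager that temporarily points `sys.stdout` and `sys.stderr` at `os.devnull` so NeMo's console output doesn't drown the transcriptions. The standard library can achieve roughly the same effect; here is a small sketch of an alternative built on `contextlib` (not what this commit uses):

```python
import contextlib
import os


def quiet_call(fn, *args, **kwargs):
    # Swallow stdout and stderr for the duration of one noisy call.
    with open(os.devnull, "w") as devnull:
        with contextlib.redirect_stdout(devnull), contextlib.redirect_stderr(devnull):
            return fn(*args, **kwargs)


# usage: result = quiet_call(print, "this line is suppressed")
```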
