From f82d88113ac0d48364934f6735f5cf31afc2eded Mon Sep 17 00:00:00 2001 From: Lucy Zhang <36051089+zhang-lucy@users.noreply.github.com> Date: Thu, 22 May 2025 16:09:41 -0400 Subject: [PATCH 1/8] more text cleanups --- 06_gpu_and_ml/audio-to-text/parakeet.py | 28 +++++++++++++------------ 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/06_gpu_and_ml/audio-to-text/parakeet.py b/06_gpu_and_ml/audio-to-text/parakeet.py index 90cdbe34b..b9ae0006b 100644 --- a/06_gpu_and_ml/audio-to-text/parakeet.py +++ b/06_gpu_and_ml/audio-to-text/parakeet.py @@ -7,19 +7,17 @@ # This example uses the `nvidia/parakeet-tdt-0.6b-v2` model, which, as of May 13, 2025, sits at the # top of Hugging Face's [ASR leaderboard](https://huggingface.co/spaces/hf-audio/open_asr_leaderboard). -# To run this example either: +# To run this example, either: -# - run the browser/microphone frontend, or +# - Run the browser/microphone frontend - Modal can handle the deployment of both the frontend and backend in a single app! You should see a browser window pop up - make sure you allow access to your microphone. The full frontend code can be found [here](https://github.com/modal-labs/modal-examples/tree/main/06_gpu_and_ml/audio-to-text/frontend). # ```bash # modal serve 06_gpu_and_ml/audio-to-text/parakeet.py # ``` -# - stream a .wav file from a URL (optional, default is "Dream Within a Dream" by Edgar Allan Poe). +# - Or, stream a `.wav` file directly from a URL to simulate real-time transcription! # ```bash # modal run 06_gpu_and_ml/audio-to-text/parakeet.py --audio-url="https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/mono_44100/156550__acclivity__a-dream-within-a-dream.wav" # ``` -# See [Troubleshooting](https://modal.com/docs/examples/parakeet#client) at the bottom if you run into issues. - # Here's what your final output might look like: # ```bash @@ -28,10 +26,17 @@ # ā˜€ļø Waking up model, this may take a few seconds on cold start... # šŸ“ Transcription: A Dream Within A Dream Edgar Allan Poe # šŸ“ Transcription: -# šŸ“ Transcription: take this kiss upon the brow, And in parting from you now, Thus much let me avow You are not wrong who deem That my days have been a dream. +# šŸ“ Transcription: Take this kiss upon the brow, +# šŸ“ Transcription: And in parting from you now, +# šŸ“ Transcription: Thus much let me avow, +# šŸ“ Transcription: You are not wrong who deem +# šŸ“ Transcription: That my days have been a dream. # ... # ``` +# See [Troubleshooting](https://modal.com/docs/examples/parakeet#client) at the bottom if you run into issues. + + # ## Setup import asyncio import os @@ -40,9 +45,8 @@ import modal os.environ["MODAL_LOGLEVEL"] = "INFO" -app_name = "parakeet-websocket" -app = modal.App(app_name) +app = modal.App("parakeet-websocket") SILENCE_THRESHOLD = -45 SILENCE_MIN_LENGTH_MSEC = 1000 END_OF_STREAM = b"END_OF_STREAM" @@ -101,6 +105,7 @@ # - The `transcribe` method takes bytes of audio data, and returns the transcribed text. # - The `web` method creates a FastAPI app using [`modal.asgi_app`](https://modal.com/docs/reference/modal.asgi_app#modalasgi_app) that serves a # [WebSocket](https://modal.com/docs/guide/webhooks#websockets) endpoint for real-time audio transcription and a browser frontend for transcribing audio from your microphone. +# - The `run_with_queue` method takes a [`modal.Queue`](https://modal.com/docs/reference/modal.Queue) and passes audio data and transcriptions between our local machine and the GPU container. 
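+
+# To make that queue flow concrete, here is a minimal sketch of the pattern
+# (the full client appears further below; `audio_chunk` is a hypothetical
+# stand-in for real audio bytes):
+# ```python
+# with modal.Queue.ephemeral() as q:
+#     Parakeet().run_with_queue.spawn(q)  # start transcription on the GPU without blocking
+#     q.put(audio_chunk, partition="audio")  # local -> GPU: raw audio bytes
+#     text = q.get(partition="transcription")  # GPU -> local: transcribed text
+# ```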
# Parakeet tries really hard to transcribe everything to English! # Hence it tends to output utterances like "Yeah" or "Mm-hmm" when it runs on silent audio. @@ -275,9 +280,7 @@ def main(audio_url: str = AUDIO_URL): # Below are the three main functions that coordinate streaming audio and receiving transcriptions. # # `send_audio` transmits chunks of audio data and then pauses to approximate streaming -# speech at a natural rate. That said, we set it to faster -# than real-time to compensate for network latency. Plus, we're not -# trying to wait forever for this to finish. +# speech at a natural rate. async def send_audio(q, audio_bytes): @@ -289,8 +292,7 @@ async def send_audio(q, audio_bytes): await q.put.aio(END_OF_STREAM, partition="audio") -# `receive_transcriptions` is straightforward. -# It just waits for a transcription and prints it after a small delay to avoid colliding with the print statements +# `receive_transcriptions` waits for a transcription and prints it after a small delay to avoid colliding with the print statements # from the GPU container. From eabfaf10fbeebc7d9502f844b03f7798e402b1fb Mon Sep 17 00:00:00 2001 From: Lucy Zhang <36051089+zhang-lucy@users.noreply.github.com> Date: Thu, 22 May 2025 16:26:39 -0400 Subject: [PATCH 2/8] nits --- 06_gpu_and_ml/audio-to-text/parakeet.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/06_gpu_and_ml/audio-to-text/parakeet.py b/06_gpu_and_ml/audio-to-text/parakeet.py index b9ae0006b..c0a054be0 100644 --- a/06_gpu_and_ml/audio-to-text/parakeet.py +++ b/06_gpu_and_ml/audio-to-text/parakeet.py @@ -9,16 +9,15 @@ # To run this example, either: -# - Run the browser/microphone frontend - Modal can handle the deployment of both the frontend and backend in a single app! You should see a browser window pop up - make sure you allow access to your microphone. The full frontend code can be found [here](https://github.com/modal-labs/modal-examples/tree/main/06_gpu_and_ml/audio-to-text/frontend). +# - Run the browser/microphone frontend. Modal handles the deployment of both the frontend and backend in a single app! You should see a browser window pop up - make sure you allow access to your microphone. The full frontend code can be found [here](https://github.com/modal-labs/modal-examples/tree/main/06_gpu_and_ml/audio-to-text/frontend). # ```bash # modal serve 06_gpu_and_ml/audio-to-text/parakeet.py # ``` -# - Or, stream a `.wav` file directly from a URL to simulate real-time transcription! +# - Or, stream a `.wav` file directly from a URL to simulate real-time transcription in your terminal: # ```bash # modal run 06_gpu_and_ml/audio-to-text/parakeet.py --audio-url="https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/mono_44100/156550__acclivity__a-dream-within-a-dream.wav" # ``` - -# Here's what your final output might look like: +# You should see output like the following in your terminal: # ```bash # 🌐 Downloading audio file... @@ -33,7 +32,6 @@ # šŸ“ Transcription: That my days have been a dream. # ... # ``` - # See [Troubleshooting](https://modal.com/docs/examples/parakeet#client) at the bottom if you run into issues. 
From 7b4724fd43bb3575ce0e4c9da23f7d769fc6e6eb Mon Sep 17 00:00:00 2001 From: Charles Frye Date: Thu, 22 May 2025 13:46:59 -0700 Subject: [PATCH 3/8] minor text changes --- 06_gpu_and_ml/audio-to-text/parakeet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/06_gpu_and_ml/audio-to-text/parakeet.py b/06_gpu_and_ml/audio-to-text/parakeet.py index c0a054be0..bebb47884 100644 --- a/06_gpu_and_ml/audio-to-text/parakeet.py +++ b/06_gpu_and_ml/audio-to-text/parakeet.py @@ -13,7 +13,7 @@ # ```bash # modal serve 06_gpu_and_ml/audio-to-text/parakeet.py # ``` -# - Or, stream a `.wav` file directly from a URL to simulate real-time transcription in your terminal: +# - Or, stream a `.wav` file directly from a URL to run transcription from your terminal: # ```bash # modal run 06_gpu_and_ml/audio-to-text/parakeet.py --audio-url="https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/mono_44100/156550__acclivity__a-dream-within-a-dream.wav" # ``` @@ -44,7 +44,7 @@ os.environ["MODAL_LOGLEVEL"] = "INFO" -app = modal.App("parakeet-websocket") +app = modal.App("example-parakeet-websocket") SILENCE_THRESHOLD = -45 SILENCE_MIN_LENGTH_MSEC = 1000 END_OF_STREAM = b"END_OF_STREAM" From cce3bed16414774e4316090f15925e742e9e4228 Mon Sep 17 00:00:00 2001 From: Lucy Zhang <36051089+zhang-lucy@users.noreply.github.com> Date: Thu, 22 May 2025 16:54:36 -0400 Subject: [PATCH 4/8] rm troubleshooting --- 06_gpu_and_ml/audio-to-text/parakeet.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/06_gpu_and_ml/audio-to-text/parakeet.py b/06_gpu_and_ml/audio-to-text/parakeet.py index bebb47884..230252f34 100644 --- a/06_gpu_and_ml/audio-to-text/parakeet.py +++ b/06_gpu_and_ml/audio-to-text/parakeet.py @@ -32,8 +32,6 @@ # šŸ“ Transcription: That my days have been a dream. # ... # ``` -# See [Troubleshooting](https://modal.com/docs/examples/parakeet#client) at the bottom if you run into issues. - # ## Setup import asyncio @@ -315,11 +313,6 @@ async def run(audio_bytes): await asyncio.gather(send_task, receive_task) -# ## Troubleshooting -# - Make sure you have the latest version of the Modal CLI installed. -# - The server takes a few seconds to start up on cold start. If your local client times out, try -# restarting the client. - # ## Addenda # Helper functions for converting audio to Parakeet's input format and iterating over audio chunks. From 8f13d09e6fe67a5040460cc1737846570adf9122 Mon Sep 17 00:00:00 2001 From: Lucy Zhang <36051089+zhang-lucy@users.noreply.github.com> Date: Sun, 25 May 2025 09:50:06 -0400 Subject: [PATCH 5/8] update --- 06_gpu_and_ml/audio-to-text/parakeet.py | 35 +++++++++++++++++++++---- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/06_gpu_and_ml/audio-to-text/parakeet.py b/06_gpu_and_ml/audio-to-text/parakeet.py index 230252f34..8ebf70e55 100644 --- a/06_gpu_and_ml/audio-to-text/parakeet.py +++ b/06_gpu_and_ml/audio-to-text/parakeet.py @@ -9,7 +9,7 @@ # To run this example, either: -# - Run the browser/microphone frontend. Modal handles the deployment of both the frontend and backend in a single app! You should see a browser window pop up - make sure you allow access to your microphone. The full frontend code can be found [here](https://github.com/modal-labs/modal-examples/tree/main/06_gpu_and_ml/audio-to-text/frontend). +# - Run the browser/microphone frontend. Modal handles the deployment of both the frontend and backend in a single app! 
Click on the link in your terminal to open the frontend in your browser - make sure you allow access to your microphone. The full frontend code can be found [here](https://github.com/modal-labs/modal-examples/tree/main/06_gpu_and_ml/audio-to-text/frontend).
 # ```bash
 # modal serve 06_gpu_and_ml/audio-to-text/parakeet.py
 # ```
@@ -27,9 +27,9 @@
 # šŸ“ Transcription:
 # šŸ“ Transcription: Take this kiss upon the brow,
 # šŸ“ Transcription: And in parting from you now,
-# šŸ“ Transcription: Thus much let me avow,
-# šŸ“ Transcription: You are not wrong who deem
-# šŸ“ Transcription: That my days have been a dream.
+# šŸ“ Transcription: Thus much let me avow You
+# šŸ“ Transcription: Are not wrong who deem That
+# šŸ“ Transcription: My days have been a dream.
 # ...
 # ```
@@ -298,7 +298,8 @@ async def receive_transcriptions(q):
         if message == END_OF_STREAM:
             break
         await asyncio.sleep(1.00)  # add a delay to avoid stdout collision
-        print(f"šŸ“ Transcription: {message}")
+
+        output_message_as_transcript(message)


 # We take full advantage of Modal's asynchronous capabilities here. In `run`, we spawn our function call
@@ -376,3 +377,27 @@ def preprocess_audio(audio_bytes: bytes) -> bytes:
 def chunk_audio(data: bytes, chunk_size: int):
     for i in range(0, len(data), chunk_size):
         yield data[i : i + chunk_size]
+
+
+def output_message_as_transcript(message: str):
+    words = message.strip().split()
+
+    # Group into 5–6 word chunks
+    chunks = []
+    chunk = []
+
+    for word in words:
+        chunk.append(word)
+        if len(chunk) >= 6:
+            chunks.append(" ".join(chunk))
+            chunk = []
+
+    if chunk:
+        chunks.append(" ".join(chunk))
+
+    # Capitalize first word in each chunk and print
+    for chunk in chunks:
+        # Capitalize the first letter of the first word
+        parts = chunk.split(" ", 1)
+        capitalized = parts[0].capitalize() + (" " + parts[1] if len(parts) > 1 else "")
+        print(f"šŸ“ Transcription: {capitalized}")

From e361fe36c15fddebc2614a2471472e5f3c0c3b28 Mon Sep 17 00:00:00 2001
From: Charles Frye
Date: Wed, 28 May 2025 04:09:08 +0000
Subject: [PATCH 6/8] minor text fixes and reorganization

---
 06_gpu_and_ml/audio-to-text/parakeet.py | 105 ++++++++++++++----------
 1 file changed, 61 insertions(+), 44 deletions(-)

diff --git a/06_gpu_and_ml/audio-to-text/parakeet.py b/06_gpu_and_ml/audio-to-text/parakeet.py
index 8ebf70e55..bc3cd91c1 100644
--- a/06_gpu_and_ml/audio-to-text/parakeet.py
+++ b/06_gpu_and_ml/audio-to-text/parakeet.py
@@ -1,66 +1,73 @@
-# # Real time audio transcription using Parakeet 🦜
+# # Real-time audio transcription using Parakeet

-# [Parakeet](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/asr/models.html#parakeet) is the name of a family of ASR models built using [NVIDIA's NeMo Framework](https://docs.nvidia.com/nemo-framework/user-guide/latest/overview.html).
-# We'll show you how to use Parakeet for real-time audio transcription,
-# with a simple Python client and a GPU server you can spin up easily in Modal.
+# This example demonstrates the use of Parakeet ASR models for real-time speech-to-text on Modal.

-# This example uses the `nvidia/parakeet-tdt-0.6b-v2` model, which, as of May 13, 2025, sits at the
-# top of Hugging Face's [ASR leaderboard](https://huggingface.co/spaces/hf-audio/open_asr_leaderboard).
+# [Parakeet](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/asr/models.html#parakeet)
+# is the name of a family of ASR models built using [NVIDIA's NeMo Framework](https://docs.nvidia.com/nemo-framework/user-guide/latest/overview.html).
+# We'll show you how to use Parakeet for real-time audio transcription on Modal GPUs,
+# with simple Python and browser clients.

+# This example uses the `nvidia/parakeet-tdt-0.6b-v2` model, which, as of June 2025, sits at the
+# top of Hugging Face's [Open ASR leaderboard](https://huggingface.co/spaces/hf-audio/open_asr_leaderboard).
+
+# To try out transcription from your terminal,
+# provide a URL for a `.wav` file to `modal run`:

-# - Run the browser/microphone frontend. Modal handles the deployment of both the frontend and backend in a single app! Click on the link in your terminal to open the frontend in your browser - make sure you allow access to your microphone. The full frontend code can be found [here](https://github.com/modal-labs/modal-examples/tree/main/06_gpu_and_ml/audio-to-text/frontend).
-# ```bash
-# modal serve 06_gpu_and_ml/audio-to-text/parakeet.py
-# ```
-# - Or, stream a `.wav` file directly from a URL to run transcription from your terminal:
 # ```bash
 # modal run 06_gpu_and_ml/audio-to-text/parakeet.py --audio-url="https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/mono_44100/156550__acclivity__a-dream-within-a-dream.wav"
 # ```
-# You should see output like the following in your terminal:
+
+# You should see output like the following:

 # ```bash
-# 🌐 Downloading audio file...
-# šŸŽ§ Downloaded 6331478 bytes
-# ā˜€ļø Waking up model, this may take a few seconds on cold start...
-# šŸ“ Transcription: A Dream Within A Dream Edgar Allan Poe
-# šŸ“ Transcription:
-# šŸ“ Transcription: Take this kiss upon the brow,
-# šŸ“ Transcription: And in parting from you now,
-# šŸ“ Transcription: Thus much let me avow You
-# šŸ“ Transcription: Are not wrong who deem That
-# šŸ“ Transcription: My days have been a dream.
+# šŸŽ¤ Starting Transcription
+# A Dream Within A Dream Edgar Allan Poe
+# take this kiss upon the brow, And in parting from you now, Thus much let me avow You are not wrong who deem That my days have been a dream.
 # ...
 # ```

+# Running a web service you can hit from any browser isn't any harder -- Modal handles the deployment of both the frontend and backend in a single App!
+# Just run
+
+# ```bash
+# modal serve 06_gpu_and_ml/audio-to-text/parakeet.py
+# ```
+
+# and go to the link printed in your terminal.
+
+# The full frontend code can be found [here](https://github.com/modal-labs/modal-examples/tree/main/06_gpu_and_ml/audio-to-text/frontend).

 # ## Setup
+
 import asyncio
 import os
+import sys
 from pathlib import Path

 import modal

-os.environ["MODAL_LOGLEVEL"] = "INFO"
+app = modal.App("example-parakeet")

-app = modal.App("example-parakeet-websocket")
-SILENCE_THRESHOLD = -45
-SILENCE_MIN_LENGTH_MSEC = 1000
-END_OF_STREAM = b"END_OF_STREAM"

 # ## Volume for caching model weights
+
 # We use a [Modal Volume](https://modal.com/docs/guide/volumes) to cache the model weights.
 # This allows us to avoid downloading the model weights every time we start a new instance.
+# For more on storing models on Modal, see [this guide](https://modal.com/docs/guide/model-weights).

 model_cache = modal.Volume.from_name("parakeet-model-cache", create_if_missing=True)

+
 # ## Configuring dependencies
-# The model runs remotely inside a [custom container](https://modal.com/docs/guide/custom-container). We can define the environment
-# and install our Python dependencies in that container's `Image`.
-# For inference, we recommend using the official NVIDIA CUDA Docker images from Docker Hub.
+# The model runs remotely inside a container on Modal. We can define the environment
+# and install our Python dependencies in that container's [`Image`](https://modal.com/docs/guide/images).
+
+# For finicky setups like NeMo's, we recommend using the official NVIDIA CUDA Docker images from Docker Hub.
 # You'll need to install Python and pip with the `add_python` option because the image
 # doesn't have these by default.

 # Additionally, we install `ffmpeg` for handling audio data and `fastapi` to create a web
-# server for our websocket.
+# server for our WebSocket.

 image = (
     modal.Image.from_registry(
@@ -82,31 +89,35 @@
         "nemo_toolkit[asr]==2.3.0",
         "cuda-python==12.8.0",
         "fastapi==0.115.12",
-        "numpy==1.26.4",  # downgrading numpy to avoid issues with CUDA
+        "numpy<2",
         "pydub==0.25.1",
     )
-    .entrypoint([])
-    .add_local_dir(
-        os.path.join(Path(__file__).parent.resolve(), "frontend"),
+    .entrypoint([])  # silence chatty logs from the container on start
+    .add_local_dir(  # changes fastest, so make this the last layer
+        Path(__file__).parent / "frontend",
         remote_path="/frontend",
     )
 )

 # ## Implementing real-time audio transcription on Modal

-# Now we're ready to implement the transcription model. We wrap inference in a [modal.Cls](https://modal.com/docs/guide/lifecycle-functions) that
-# ensures models are loaded and then moved to the GPU once when a new container starts. Couple of notes:
+# Now we're ready to implement transcription. We wrap inference in a [`modal.Cls`](https://modal.com/docs/guide/lifecycle-functions) that
+# ensures models are loaded and then moved to the GPU once when a new container starts.

-# - The `load` method loads the model at start, instead of during inference, using [`modal.enter()`](https://modal.com/docs/reference/modal.enter#modalenter).
-# - The `transcribe` method takes bytes of audio data, and returns the transcribed text.
+# A couple of notes about this code:
+# - The `transcribe` method takes bytes of audio data and returns the transcribed text.
 # - The `web` method creates a FastAPI app using [`modal.asgi_app`](https://modal.com/docs/reference/modal.asgi_app#modalasgi_app) that serves a
 # [WebSocket](https://modal.com/docs/guide/webhooks#websockets) endpoint for real-time audio transcription and a browser frontend for transcribing audio from your microphone.
 # - The `run_with_queue` method takes a [`modal.Queue`](https://modal.com/docs/reference/modal.Queue) and passes audio data and transcriptions between our local machine and the GPU container.

 # Parakeet tries really hard to transcribe everything to English!
 # Hence it tends to output utterances like "Yeah" or "Mm-hmm" when it runs on silent audio.
-# We can pre-process the incoming audio in the server by using `pydub`'s silence detection,
-# ensuring that we only pass audio with speech to our model.
+# We pre-process the incoming audio in the server using `pydub`'s silence detection,
+# ensuring that we don't pass silence into our model.
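+
+# In isolation, that silence check looks roughly like the sketch below
+# ("speech.wav" is a hypothetical input file; the two thresholds match the
+# defaults used in `handle_audio_chunk` further down):
+# ```python
+# from pydub import AudioSegment, silence
+#
+# seg = AudioSegment.from_wav("speech.wav")
+# silent_windows = silence.detect_silence(
+#     seg,
+#     min_silence_len=1000,  # a pause must last at least 1000 ms to count
+#     silence_thresh=-45,  # anything quieter than -45 dBFS counts as silence
+# )
+# # silent_windows is a list of [start_ms, end_ms] pairs; we transcribe up to
+# # the last detected pause and keep the remainder buffered
+# ```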
+ +END_OF_STREAM = ( + b"END_OF_STREAM_8f13d09" # byte sequence indicating a stream is finished +) @app.cls(volumes={"/cache": model_cache}, gpu="a10g", image=image) @@ -140,7 +151,7 @@ def web(self): async def status(): return Response(status_code=200) - # server frontend + # serve frontend @web_app.get("/") async def index(): return HTMLResponse(content=open("/frontend/index.html").read()) @@ -202,7 +213,13 @@ async def run_with_queue(self, q: modal.Queue): print(f"Error handling queue: {type(e)}: {e}") return - async def handle_audio_chunk(self, chunk: bytes, audio_segment): + async def handle_audio_chunk( + self, + chunk: bytes, + audio_segment, + silence_thresh=-45, # dB + min_silence_len=1000, # ms + ): from pydub import AudioSegment, silence new_audio_segment = AudioSegment( From 484bd0cb935fdf8aff7fa28bbbbea75eacdd381f Mon Sep 17 00:00:00 2001 From: Charles Frye Date: Wed, 28 May 2025 04:10:03 +0000 Subject: [PATCH 7/8] silence unnecessary logs --- 06_gpu_and_ml/audio-to-text/parakeet.py | 42 ++++++++++++------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/06_gpu_and_ml/audio-to-text/parakeet.py b/06_gpu_and_ml/audio-to-text/parakeet.py index bc3cd91c1..eb10c369a 100644 --- a/06_gpu_and_ml/audio-to-text/parakeet.py +++ b/06_gpu_and_ml/audio-to-text/parakeet.py @@ -125,17 +125,25 @@ class Parakeet: @modal.enter() def load(self): + import logging + import nemo.collections.asr as nemo_asr + # silence chatty logs from nemo + logging.getLogger("nemo_logger").setLevel(logging.CRITICAL) + self.model = nemo_asr.models.ASRModel.from_pretrained( model_name="nvidia/parakeet-tdt-0.6b-v2" ) - async def transcribe(self, audio_bytes: bytes) -> str: + def transcribe(self, audio_bytes: bytes) -> str: import numpy as np audio_data = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) - output = self.model.transcribe([audio_data]) + + with NoStdStreams(): # hide output, see https://github.com/NVIDIA/NeMo/discussions/3281#discussioncomment-2251217 + output = self.model.transcribe([audio_data]) + return output[0].text @modal.asgi_app() @@ -396,25 +404,15 @@ def chunk_audio(data: bytes, chunk_size: int): yield data[i : i + chunk_size] -def output_message_as_transcript(message: str): - words = message.strip().split() - - # Group into 5–6 word chunks - chunks = [] - chunk = [] - - for word in words: - chunk.append(word) - if len(chunk) >= 6: - chunks.append(" ".join(chunk)) - chunk = [] +class NoStdStreams(object): + def __init__(self): + self.devnull = open(os.devnull, "w") - if chunk: - chunks.append(" ".join(chunk)) + def __enter__(self): + self._stdout, self._stderr = sys.stdout, sys.stderr + self._stdout.flush(), self._stderr.flush() + sys.stdout, sys.stderr = self.devnull, self.devnull - # Capitalize first word in each chunk and print - for chunk in chunks: - # Capitalize the first letter of the first word - parts = chunk.split(" ", 1) - capitalized = parts[0].capitalize() + (" " + parts[1] if len(parts) > 1 else "") - print(f"šŸ“ Transcription: {capitalized}") + def __exit__(self, exc_type, exc_value, traceback): + sys.stdout, sys.stderr = self._stdout, self._stderr + self.devnull.close() From 882a0b74b704cc398b9cefc984c7183c448ebd62 Mon Sep 17 00:00:00 2001 From: Charles Frye Date: Wed, 28 May 2025 04:13:43 +0000 Subject: [PATCH 8/8] refactor out one level of async nesting --- 06_gpu_and_ml/audio-to-text/parakeet.py | 78 +++++++++++-------------- 1 file changed, 35 insertions(+), 43 deletions(-) diff --git a/06_gpu_and_ml/audio-to-text/parakeet.py 
b/06_gpu_and_ml/audio-to-text/parakeet.py index eb10c369a..f3a486f41 100644 --- a/06_gpu_and_ml/audio-to-text/parakeet.py +++ b/06_gpu_and_ml/audio-to-text/parakeet.py @@ -236,111 +236,103 @@ async def handle_audio_chunk( sample_width=2, frame_rate=TARGET_SAMPLE_RATE, ) + # append the new audio segment to the existing audio segment audio_segment += new_audio_segment + # detect windows of silence silent_windows = silence.detect_silence( audio_segment, - min_silence_len=SILENCE_MIN_LENGTH_MSEC, - silence_thresh=SILENCE_THRESHOLD, + min_silence_len=min_silence_len, + silence_thresh=silence_thresh, ) # if there are no silent windows, continue if len(silent_windows) == 0: return audio_segment, None + # get the last silent window because # we want to transcribe until the final pause last_window = silent_windows[-1] + # if the entire audio segment is silent, reset the audio segment if last_window[0] == 0 and last_window[1] == len(audio_segment): audio_segment = AudioSegment.empty() return audio_segment, None + # get the segment to transcribe: beginning until last pause segment_to_transcribe = audio_segment[: last_window[1]] + # remove the segment to transcribe from the audio segment audio_segment = audio_segment[last_window[1] :] try: - text = await self.transcribe(segment_to_transcribe.raw_data) + text = self.transcribe(segment_to_transcribe.raw_data) return audio_segment, text except Exception as e: print("āŒ Transcription error:", e) raise e -# ## Client +# ## Running transcription from a local Python client + # Next, let's test the model with a [`local_entrypoint`](https://modal.com/docs/reference/modal.App#local_entrypoint) that streams audio data to the server and prints -# out the transcriptions to our terminal in real-time. +# out the transcriptions to our terminal as they arrive. -# Instead of using the WebSocket endpoint like the frontend, +# Instead of using the WebSocket endpoint like the browser frontend, # we'll use a [`modal.Queue`](https://modal.com/docs/reference/modal.Queue) # to pass audio data and transcriptions between our local machine and the GPU container. AUDIO_URL = "https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/mono_44100/156550__acclivity__a-dream-within-a-dream.wav" -TARGET_SAMPLE_RATE = 16000 -CHUNK_SIZE = 16000 # send one second of audio at a time +TARGET_SAMPLE_RATE = 16_000 +CHUNK_SIZE = 16_000 # send one second of audio at a time @app.local_entrypoint() -def main(audio_url: str = AUDIO_URL): +async def main(audio_url: str = AUDIO_URL): from urllib.request import urlopen - print("🌐 Downloading audio file...") + print(f"🌐 Downloading audio file from {audio_url}") audio_bytes = urlopen(audio_url).read() print(f"šŸŽ§ Downloaded {len(audio_bytes)} bytes") audio_data = preprocess_audio(audio_bytes) - print("ā˜€ļø Waking up model, this may take a few seconds on cold start...") - try: - asyncio.run(run(audio_data)) - print("āœ… Transcription complete!") - except KeyboardInterrupt: - print("\nšŸ›‘ Stopped by user.") + print("šŸŽ¤ Starting Transcription") + with modal.Queue.ephemeral() as q: + Parakeet().run_with_queue.spawn(q) + send = asyncio.create_task(send_audio(q, audio_data)) + recv = asyncio.create_task(receive_text(q)) + await asyncio.gather(send, recv) + print("āœ… Transcription complete!") + +# Below are the two functions that coordinate streaming audio and receiving transcriptions. -# Below are the three main functions that coordinate streaming audio and receiving transcriptions. 
-# -# `send_audio` transmits chunks of audio data and then pauses to approximate streaming -# speech at a natural rate. +# `send_audio` transmits chunks of audio data with a slight delay, +# as though it was being streamed from a live source, like a microphone. +# `receive_text` waits for transcribed text to arrive and prints it. async def send_audio(q, audio_bytes): for chunk in chunk_audio(audio_bytes, CHUNK_SIZE): await q.put.aio(chunk, partition="audio") - await asyncio.sleep( - CHUNK_SIZE / TARGET_SAMPLE_RATE / 8 - ) # simulate real-time pacing + await asyncio.sleep(CHUNK_SIZE / TARGET_SAMPLE_RATE / 8) await q.put.aio(END_OF_STREAM, partition="audio") -# `receive_transcriptions` waits for a transcription and prints it after a small delay to avoid colliding with the print statements -# from the GPU container. - - -async def receive_transcriptions(q): +async def receive_text(q): while True: message = await q.get.aio(partition="transcription") if message == END_OF_STREAM: break - await asyncio.sleep(1.00) # add a delay to avoid stdout collision - - output_message_as_transcript(message) - -# We take full advantage of Modal's asynchronous capabilities here. In `run`, we spawn our function call -# so it doesn't block, and then we create and wait on the send and receive tasks. - - -async def run(audio_bytes): - with modal.Queue.ephemeral() as q: - Parakeet().run_with_queue.spawn(q) - send_task = asyncio.create_task(send_audio(q, audio_bytes)) - receive_task = asyncio.create_task(receive_transcriptions(q)) - await asyncio.gather(send_task, receive_task) + print(message) # ## Addenda -# Helper functions for converting audio to Parakeet's input format and iterating over audio chunks. + +# The remainder of the code in this example is boilerplate, +# mostly for handling Parakeet's input format. def preprocess_audio(audio_bytes: bytes) -> bytes: