From f82d88113ac0d48364934f6735f5cf31afc2eded Mon Sep 17 00:00:00 2001 From: Lucy Zhang <36051089+zhang-lucy@users.noreply.github.com> Date: Thu, 22 May 2025 16:09:41 -0400 Subject: [PATCH 1/8] more text cleanups --- 06_gpu_and_ml/audio-to-text/parakeet.py | 28 +++++++++++++------------ 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/06_gpu_and_ml/audio-to-text/parakeet.py b/06_gpu_and_ml/audio-to-text/parakeet.py index 90cdbe34b..b9ae0006b 100644 --- a/06_gpu_and_ml/audio-to-text/parakeet.py +++ b/06_gpu_and_ml/audio-to-text/parakeet.py @@ -7,19 +7,17 @@ # This example uses the `nvidia/parakeet-tdt-0.6b-v2` model, which, as of May 13, 2025, sits at the # top of Hugging Face's [ASR leaderboard](https://huggingface.co/spaces/hf-audio/open_asr_leaderboard). -# To run this example either: +# To run this example, either: -# - run the browser/microphone frontend, or +# - Run the browser/microphone frontend - Modal can handle the deployment of both the frontend and backend in a single app! You should see a browser window pop up - make sure you allow access to your microphone. The full frontend code can be found [here](https://github.com/modal-labs/modal-examples/tree/main/06_gpu_and_ml/audio-to-text/frontend). # ```bash # modal serve 06_gpu_and_ml/audio-to-text/parakeet.py # ``` -# - stream a .wav file from a URL (optional, default is "Dream Within a Dream" by Edgar Allan Poe). +# - Or, stream a `.wav` file directly from a URL to simulate real-time transcription! # ```bash # modal run 06_gpu_and_ml/audio-to-text/parakeet.py --audio-url="https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/mono_44100/156550__acclivity__a-dream-within-a-dream.wav" # ``` -# See [Troubleshooting](https://modal.com/docs/examples/parakeet#client) at the bottom if you run into issues. - # Here's what your final output might look like: # ```bash @@ -28,10 +26,17 @@ # ā˜€ļø Waking up model, this may take a few seconds on cold start... # šŸ“ Transcription: A Dream Within A Dream Edgar Allan Poe # šŸ“ Transcription: -# šŸ“ Transcription: take this kiss upon the brow, And in parting from you now, Thus much let me avow You are not wrong who deem That my days have been a dream. +# šŸ“ Transcription: Take this kiss upon the brow, +# šŸ“ Transcription: And in parting from you now, +# šŸ“ Transcription: Thus much let me avow, +# šŸ“ Transcription: You are not wrong who deem +# šŸ“ Transcription: That my days have been a dream. # ... # ``` +# See [Troubleshooting](https://modal.com/docs/examples/parakeet#client) at the bottom if you run into issues. + + # ## Setup import asyncio import os @@ -40,9 +45,8 @@ import modal os.environ["MODAL_LOGLEVEL"] = "INFO" -app_name = "parakeet-websocket" -app = modal.App(app_name) +app = modal.App("parakeet-websocket") SILENCE_THRESHOLD = -45 SILENCE_MIN_LENGTH_MSEC = 1000 END_OF_STREAM = b"END_OF_STREAM" @@ -101,6 +105,7 @@ # - The `transcribe` method takes bytes of audio data, and returns the transcribed text. # - The `web` method creates a FastAPI app using [`modal.asgi_app`](https://modal.com/docs/reference/modal.asgi_app#modalasgi_app) that serves a # [WebSocket](https://modal.com/docs/guide/webhooks#websockets) endpoint for real-time audio transcription and a browser frontend for transcribing audio from your microphone. +# - The `run_with_queue` method takes a [`modal.Queue`](https://modal.com/docs/reference/modal.Queue) and passes audio data and transcriptions between our local machine and the GPU container. 
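+
+# To make that queue flow concrete, here is a minimal sketch of the pattern
+# (the full client appears further below; `audio_chunk` is a hypothetical
+# stand-in for real audio bytes):
+# ```python
+# with modal.Queue.ephemeral() as q:
+#     Parakeet().run_with_queue.spawn(q)  # start transcription on the GPU without blocking
+#     q.put(audio_chunk, partition="audio")  # local -> GPU: raw audio bytes
+#     text = q.get(partition="transcription")  # GPU -> local: transcribed text
+# ```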
# Parakeet tries really hard to transcribe everything to English! # Hence it tends to output utterances like "Yeah" or "Mm-hmm" when it runs on silent audio. @@ -275,9 +280,7 @@ def main(audio_url: str = AUDIO_URL): # Below are the three main functions that coordinate streaming audio and receiving transcriptions. # # `send_audio` transmits chunks of audio data and then pauses to approximate streaming -# speech at a natural rate. That said, we set it to faster -# than real-time to compensate for network latency. Plus, we're not -# trying to wait forever for this to finish. +# speech at a natural rate. async def send_audio(q, audio_bytes): @@ -289,8 +292,7 @@ async def send_audio(q, audio_bytes): await q.put.aio(END_OF_STREAM, partition="audio") -# `receive_transcriptions` is straightforward. -# It just waits for a transcription and prints it after a small delay to avoid colliding with the print statements +# `receive_transcriptions` waits for a transcription and prints it after a small delay to avoid colliding with the print statements # from the GPU container. From eabfaf10fbeebc7d9502f844b03f7798e402b1fb Mon Sep 17 00:00:00 2001 From: Lucy Zhang <36051089+zhang-lucy@users.noreply.github.com> Date: Thu, 22 May 2025 16:26:39 -0400 Subject: [PATCH 2/8] nits --- 06_gpu_and_ml/audio-to-text/parakeet.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/06_gpu_and_ml/audio-to-text/parakeet.py b/06_gpu_and_ml/audio-to-text/parakeet.py index b9ae0006b..c0a054be0 100644 --- a/06_gpu_and_ml/audio-to-text/parakeet.py +++ b/06_gpu_and_ml/audio-to-text/parakeet.py @@ -9,16 +9,15 @@ # To run this example, either: -# - Run the browser/microphone frontend - Modal can handle the deployment of both the frontend and backend in a single app! You should see a browser window pop up - make sure you allow access to your microphone. The full frontend code can be found [here](https://github.com/modal-labs/modal-examples/tree/main/06_gpu_and_ml/audio-to-text/frontend). +# - Run the browser/microphone frontend. Modal handles the deployment of both the frontend and backend in a single app! You should see a browser window pop up - make sure you allow access to your microphone. The full frontend code can be found [here](https://github.com/modal-labs/modal-examples/tree/main/06_gpu_and_ml/audio-to-text/frontend). # ```bash # modal serve 06_gpu_and_ml/audio-to-text/parakeet.py # ``` -# - Or, stream a `.wav` file directly from a URL to simulate real-time transcription! +# - Or, stream a `.wav` file directly from a URL to simulate real-time transcription in your terminal: # ```bash # modal run 06_gpu_and_ml/audio-to-text/parakeet.py --audio-url="https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/mono_44100/156550__acclivity__a-dream-within-a-dream.wav" # ``` - -# Here's what your final output might look like: +# You should see output like the following in your terminal: # ```bash # 🌐 Downloading audio file... @@ -33,7 +32,6 @@ # šŸ“ Transcription: That my days have been a dream. # ... # ``` - # See [Troubleshooting](https://modal.com/docs/examples/parakeet#client) at the bottom if you run into issues. 
From 7b4724fd43bb3575ce0e4c9da23f7d769fc6e6eb Mon Sep 17 00:00:00 2001 From: Charles Frye Date: Thu, 22 May 2025 13:46:59 -0700 Subject: [PATCH 3/8] minor text changes --- 06_gpu_and_ml/audio-to-text/parakeet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/06_gpu_and_ml/audio-to-text/parakeet.py b/06_gpu_and_ml/audio-to-text/parakeet.py index c0a054be0..bebb47884 100644 --- a/06_gpu_and_ml/audio-to-text/parakeet.py +++ b/06_gpu_and_ml/audio-to-text/parakeet.py @@ -13,7 +13,7 @@ # ```bash # modal serve 06_gpu_and_ml/audio-to-text/parakeet.py # ``` -# - Or, stream a `.wav` file directly from a URL to simulate real-time transcription in your terminal: +# - Or, stream a `.wav` file directly from a URL to run transcription from your terminal: # ```bash # modal run 06_gpu_and_ml/audio-to-text/parakeet.py --audio-url="https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/mono_44100/156550__acclivity__a-dream-within-a-dream.wav" # ``` @@ -44,7 +44,7 @@ os.environ["MODAL_LOGLEVEL"] = "INFO" -app = modal.App("parakeet-websocket") +app = modal.App("example-parakeet-websocket") SILENCE_THRESHOLD = -45 SILENCE_MIN_LENGTH_MSEC = 1000 END_OF_STREAM = b"END_OF_STREAM" From cce3bed16414774e4316090f15925e742e9e4228 Mon Sep 17 00:00:00 2001 From: Lucy Zhang <36051089+zhang-lucy@users.noreply.github.com> Date: Thu, 22 May 2025 16:54:36 -0400 Subject: [PATCH 4/8] rm troubleshooting --- 06_gpu_and_ml/audio-to-text/parakeet.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/06_gpu_and_ml/audio-to-text/parakeet.py b/06_gpu_and_ml/audio-to-text/parakeet.py index bebb47884..230252f34 100644 --- a/06_gpu_and_ml/audio-to-text/parakeet.py +++ b/06_gpu_and_ml/audio-to-text/parakeet.py @@ -32,8 +32,6 @@ # šŸ“ Transcription: That my days have been a dream. # ... # ``` -# See [Troubleshooting](https://modal.com/docs/examples/parakeet#client) at the bottom if you run into issues. - # ## Setup import asyncio @@ -315,11 +313,6 @@ async def run(audio_bytes): await asyncio.gather(send_task, receive_task) -# ## Troubleshooting -# - Make sure you have the latest version of the Modal CLI installed. -# - The server takes a few seconds to start up on cold start. If your local client times out, try -# restarting the client. - # ## Addenda # Helper functions for converting audio to Parakeet's input format and iterating over audio chunks. From 8f13d09e6fe67a5040460cc1737846570adf9122 Mon Sep 17 00:00:00 2001 From: Lucy Zhang <36051089+zhang-lucy@users.noreply.github.com> Date: Sun, 25 May 2025 09:50:06 -0400 Subject: [PATCH 5/8] update --- 06_gpu_and_ml/audio-to-text/parakeet.py | 35 +++++++++++++++++++++---- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/06_gpu_and_ml/audio-to-text/parakeet.py b/06_gpu_and_ml/audio-to-text/parakeet.py index 230252f34..8ebf70e55 100644 --- a/06_gpu_and_ml/audio-to-text/parakeet.py +++ b/06_gpu_and_ml/audio-to-text/parakeet.py @@ -9,7 +9,7 @@ # To run this example, either: -# - Run the browser/microphone frontend. Modal handles the deployment of both the frontend and backend in a single app! You should see a browser window pop up - make sure you allow access to your microphone. The full frontend code can be found [here](https://github.com/modal-labs/modal-examples/tree/main/06_gpu_and_ml/audio-to-text/frontend). +# - Run the browser/microphone frontend. Modal handles the deployment of both the frontend and backend in a single app! 
Click on the link in your terminal to open the frontend in your browser - make sure you allow access to your microphone. The full frontend code can be found [here](https://github.com/modal-labs/modal-examples/tree/main/06_gpu_and_ml/audio-to-text/frontend).
 # ```bash
 # modal serve 06_gpu_and_ml/audio-to-text/parakeet.py
 # ```
@@ -27,9 +27,9 @@
 # šŸ“ Transcription:
 # šŸ“ Transcription: Take this kiss upon the brow,
 # šŸ“ Transcription: And in parting from you now,
-# šŸ“ Transcription: Thus much let me avow,
-# šŸ“ Transcription: You are not wrong who deem
-# šŸ“ Transcription: That my days have been a dream.
+# šŸ“ Transcription: Thus much let me avow You
+# šŸ“ Transcription: Are not wrong who deem That
+# šŸ“ Transcription: My days have been a dream.
 # ...
 # ```
@@ -298,7 +298,8 @@ async def receive_transcriptions(q):
         if message == END_OF_STREAM:
             break
         await asyncio.sleep(1.00)  # add a delay to avoid stdout collision
-        print(f"šŸ“ Transcription: {message}")
+
+        output_message_as_transcript(message)


 # We take full advantage of Modal's asynchronous capabilities here. In `run`, we spawn our function call
@@ -376,3 +377,27 @@ def preprocess_audio(audio_bytes: bytes) -> bytes:
 def chunk_audio(data: bytes, chunk_size: int):
     for i in range(0, len(data), chunk_size):
         yield data[i : i + chunk_size]
+
+
+def output_message_as_transcript(message: str):
+    words = message.strip().split()
+
+    # Group into 5–6 word chunks
+    chunks = []
+    chunk = []
+
+    for word in words:
+        chunk.append(word)
+        if len(chunk) >= 6:
+            chunks.append(" ".join(chunk))
+            chunk = []
+
+    if chunk:
+        chunks.append(" ".join(chunk))
+
+    # Capitalize first word in each chunk and print
+    for chunk in chunks:
+        # Capitalize the first letter of the first word
+        parts = chunk.split(" ", 1)
+        capitalized = parts[0].capitalize() + (" " + parts[1] if len(parts) > 1 else "")
+        print(f"šŸ“ Transcription: {capitalized}")

From e361fe36c15fddebc2614a2471472e5f3c0c3b28 Mon Sep 17 00:00:00 2001
From: Charles Frye
Date: Wed, 28 May 2025 04:09:08 +0000
Subject: [PATCH 6/8] minor text fixes and reorganization

---
 06_gpu_and_ml/audio-to-text/parakeet.py | 105 ++++++++++++++----------
 1 file changed, 61 insertions(+), 44 deletions(-)

diff --git a/06_gpu_and_ml/audio-to-text/parakeet.py b/06_gpu_and_ml/audio-to-text/parakeet.py
index 8ebf70e55..bc3cd91c1 100644
--- a/06_gpu_and_ml/audio-to-text/parakeet.py
+++ b/06_gpu_and_ml/audio-to-text/parakeet.py
@@ -1,66 +1,73 @@
-# # Real time audio transcription using Parakeet 🦜
+# # Real-time audio transcription using Parakeet

-# [Parakeet](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/asr/models.html#parakeet) is the name of a family of ASR models built using [NVIDIA's NeMo Framework](https://docs.nvidia.com/nemo-framework/user-guide/latest/overview.html).
-# We'll show you how to use Parakeet for real-time audio transcription,
-# with a simple Python client and a GPU server you can spin up easily in Modal.
+# This example demonstrates the use of Parakeet ASR models for real-time speech-to-text on Modal.

-# This example uses the `nvidia/parakeet-tdt-0.6b-v2` model, which, as of May 13, 2025, sits at the
-# top of Hugging Face's [ASR leaderboard](https://huggingface.co/spaces/hf-audio/open_asr_leaderboard).
+# [Parakeet](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/asr/models.html#parakeet)
+# is the name of a family of ASR models built using [NVIDIA's NeMo Framework](https://docs.nvidia.com/nemo-framework/user-guide/latest/overview.html).
+# We'll show you how to use Parakeet for real-time audio transcription on Modal GPUs,
+# with simple Python and browser clients.

+# This example uses the `nvidia/parakeet-tdt-0.6b-v2` model, which, as of June 2025, sits at the
+# top of Hugging Face's [Open ASR leaderboard](https://huggingface.co/spaces/hf-audio/open_asr_leaderboard).
+
+# To try out transcription from your terminal,
+# provide a URL for a `.wav` file to `modal run`:

-# - Run the browser/microphone frontend. Modal handles the deployment of both the frontend and backend in a single app! Click on the link in your terminal to open the frontend in your browser - make sure you allow access to your microphone. The full frontend code can be found [here](https://github.com/modal-labs/modal-examples/tree/main/06_gpu_and_ml/audio-to-text/frontend).
-# ```bash
-# modal serve 06_gpu_and_ml/audio-to-text/parakeet.py
-# ```
-# - Or, stream a `.wav` file directly from a URL to run transcription from your terminal:
 # ```bash
 # modal run 06_gpu_and_ml/audio-to-text/parakeet.py --audio-url="https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/mono_44100/156550__acclivity__a-dream-within-a-dream.wav"
 # ```
-# You should see output like the following in your terminal:
+
+# You should see output like the following:

 # ```bash
-# 🌐 Downloading audio file...
-# šŸŽ§ Downloaded 6331478 bytes
-# ā˜€ļø Waking up model, this may take a few seconds on cold start...
-# šŸ“ Transcription: A Dream Within A Dream Edgar Allan Poe
-# šŸ“ Transcription:
-# šŸ“ Transcription: Take this kiss upon the brow,
-# šŸ“ Transcription: And in parting from you now,
-# šŸ“ Transcription: Thus much let me avow You
-# šŸ“ Transcription: Are not wrong who deem That
-# šŸ“ Transcription: My days have been a dream.
+# šŸŽ¤ Starting Transcription
+# A Dream Within A Dream Edgar Allan Poe
+# take this kiss upon the brow, And in parting from you now, Thus much let me avow You are not wrong who deem That my days have been a dream.
 # ...
 # ```

+# Running a web service you can hit from any browser isn't any harder -- Modal handles the deployment of both the frontend and backend in a single App!
+# Just run
+
+# ```bash
+# modal serve 06_gpu_and_ml/audio-to-text/parakeet.py
+# ```
+
+# and go to the link printed in your terminal.
+
+# The full frontend code can be found [here](https://github.com/modal-labs/modal-examples/tree/main/06_gpu_and_ml/audio-to-text/frontend).

 # ## Setup
+
 import asyncio
 import os
+import sys
 from pathlib import Path

 import modal

-os.environ["MODAL_LOGLEVEL"] = "INFO"
+app = modal.App("example-parakeet")

-app = modal.App("example-parakeet-websocket")
-SILENCE_THRESHOLD = -45
-SILENCE_MIN_LENGTH_MSEC = 1000
-END_OF_STREAM = b"END_OF_STREAM"

 # ## Volume for caching model weights
+
 # We use a [Modal Volume](https://modal.com/docs/guide/volumes) to cache the model weights.
 # This allows us to avoid downloading the model weights every time we start a new instance.
+# For more on storing models on Modal, see [this guide](https://modal.com/docs/guide/model-weights).

 model_cache = modal.Volume.from_name("parakeet-model-cache", create_if_missing=True)

+
 # ## Configuring dependencies
-# The model runs remotely inside a [custom container](https://modal.com/docs/guide/custom-container). We can define the environment
-# and install our Python dependencies in that container's `Image`.
-# For inference, we recommend using the official NVIDIA CUDA Docker images from Docker Hub.
+# The model runs remotely inside a container on Modal. We can define the environment
+# and install our Python dependencies in that container's [`Image`](https://modal.com/docs/guide/images).
+
+# For finicky setups like NeMo's, we recommend using the official NVIDIA CUDA Docker images from Docker Hub.
 # You'll need to install Python and pip with the `add_python` option because the image
 # doesn't have these by default.

 # Additionally, we install `ffmpeg` for handling audio data and `fastapi` to create a web
-# server for our websocket.
+# server for our WebSocket.

 image = (
     modal.Image.from_registry(
@@ -82,31 +89,35 @@
         "nemo_toolkit[asr]==2.3.0",
         "cuda-python==12.8.0",
         "fastapi==0.115.12",
-        "numpy==1.26.4",  # downgrading numpy to avoid issues with CUDA
+        "numpy<2",
         "pydub==0.25.1",
     )
-    .entrypoint([])
-    .add_local_dir(
-        os.path.join(Path(__file__).parent.resolve(), "frontend"),
+    .entrypoint([])  # silence chatty logs from the container on start
+    .add_local_dir(  # changes fastest, so make this the last layer
+        Path(__file__).parent / "frontend",
         remote_path="/frontend",
     )
 )

 # ## Implementing real-time audio transcription on Modal

-# Now we're ready to implement the transcription model. We wrap inference in a [modal.Cls](https://modal.com/docs/guide/lifecycle-functions) that
-# ensures models are loaded and then moved to the GPU once when a new container starts. Couple of notes:
+# Now we're ready to implement transcription. We wrap inference in a [`modal.Cls`](https://modal.com/docs/guide/lifecycle-functions) that
+# ensures models are loaded and then moved to the GPU once when a new container starts.

-# - The `load` method loads the model at start, instead of during inference, using [`modal.enter()`](https://modal.com/docs/reference/modal.enter#modalenter).
-# - The `transcribe` method takes bytes of audio data, and returns the transcribed text.
+# A couple of notes about this code:
+# - The `transcribe` method takes bytes of audio data and returns the transcribed text.
 # - The `web` method creates a FastAPI app using [`modal.asgi_app`](https://modal.com/docs/reference/modal.asgi_app#modalasgi_app) that serves a
 # [WebSocket](https://modal.com/docs/guide/webhooks#websockets) endpoint for real-time audio transcription and a browser frontend for transcribing audio from your microphone.
 # - The `run_with_queue` method takes a [`modal.Queue`](https://modal.com/docs/reference/modal.Queue) and passes audio data and transcriptions between our local machine and the GPU container.

 # Parakeet tries really hard to transcribe everything to English!
 # Hence it tends to output utterances like "Yeah" or "Mm-hmm" when it runs on silent audio.
-# We can pre-process the incoming audio in the server by using `pydub`'s silence detection,
-# ensuring that we only pass audio with speech to our model.
+# We pre-process the incoming audio in the server using `pydub`'s silence detection,
+# ensuring that we don't pass silence into our model.
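+
+# In isolation, that silence check looks roughly like the sketch below
+# ("speech.wav" is a hypothetical input file; the two thresholds match the
+# defaults used in `handle_audio_chunk` further down):
+# ```python
+# from pydub import AudioSegment, silence
+#
+# seg = AudioSegment.from_wav("speech.wav")
+# silent_windows = silence.detect_silence(
+#     seg,
+#     min_silence_len=1000,  # a pause must last at least 1000 ms to count
+#     silence_thresh=-45,  # anything quieter than -45 dBFS counts as silence
+# )
+# # silent_windows is a list of [start_ms, end_ms] pairs; we transcribe up to
+# # the last detected pause and keep the remainder buffered
+# ```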
+ +END_OF_STREAM = ( + b"END_OF_STREAM_8f13d09" # byte sequence indicating a stream is finished +) @app.cls(volumes={"/cache": model_cache}, gpu="a10g", image=image) @@ -140,7 +151,7 @@ def web(self): async def status(): return Response(status_code=200) - # server frontend + # serve frontend @web_app.get("/") async def index(): return HTMLResponse(content=open("/frontend/index.html").read()) @@ -202,7 +213,13 @@ async def run_with_queue(self, q: modal.Queue): print(f"Error handling queue: {type(e)}: {e}") return - async def handle_audio_chunk(self, chunk: bytes, audio_segment): + async def handle_audio_chunk( + self, + chunk: bytes, + audio_segment, + silence_thresh=-45, # dB + min_silence_len=1000, # ms + ): from pydub import AudioSegment, silence new_audio_segment = AudioSegment( From 484bd0cb935fdf8aff7fa28bbbbea75eacdd381f Mon Sep 17 00:00:00 2001 From: Charles Frye Date: Wed, 28 May 2025 04:10:03 +0000 Subject: [PATCH 7/8] silence unnecessary logs --- 06_gpu_and_ml/audio-to-text/parakeet.py | 42 ++++++++++++------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/06_gpu_and_ml/audio-to-text/parakeet.py b/06_gpu_and_ml/audio-to-text/parakeet.py index bc3cd91c1..eb10c369a 100644 --- a/06_gpu_and_ml/audio-to-text/parakeet.py +++ b/06_gpu_and_ml/audio-to-text/parakeet.py @@ -125,17 +125,25 @@ class Parakeet: @modal.enter() def load(self): + import logging + import nemo.collections.asr as nemo_asr + # silence chatty logs from nemo + logging.getLogger("nemo_logger").setLevel(logging.CRITICAL) + self.model = nemo_asr.models.ASRModel.from_pretrained( model_name="nvidia/parakeet-tdt-0.6b-v2" ) - async def transcribe(self, audio_bytes: bytes) -> str: + def transcribe(self, audio_bytes: bytes) -> str: import numpy as np audio_data = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) - output = self.model.transcribe([audio_data]) + + with NoStdStreams(): # hide output, see https://github.com/NVIDIA/NeMo/discussions/3281#discussioncomment-2251217 + output = self.model.transcribe([audio_data]) + return output[0].text @modal.asgi_app() @@ -396,25 +404,15 @@ def chunk_audio(data: bytes, chunk_size: int): yield data[i : i + chunk_size] -def output_message_as_transcript(message: str): - words = message.strip().split() - - # Group into 5–6 word chunks - chunks = [] - chunk = [] - - for word in words: - chunk.append(word) - if len(chunk) >= 6: - chunks.append(" ".join(chunk)) - chunk = [] +class NoStdStreams(object): + def __init__(self): + self.devnull = open(os.devnull, "w") - if chunk: - chunks.append(" ".join(chunk)) + def __enter__(self): + self._stdout, self._stderr = sys.stdout, sys.stderr + self._stdout.flush(), self._stderr.flush() + sys.stdout, sys.stderr = self.devnull, self.devnull - # Capitalize first word in each chunk and print - for chunk in chunks: - # Capitalize the first letter of the first word - parts = chunk.split(" ", 1) - capitalized = parts[0].capitalize() + (" " + parts[1] if len(parts) > 1 else "") - print(f"šŸ“ Transcription: {capitalized}") + def __exit__(self, exc_type, exc_value, traceback): + sys.stdout, sys.stderr = self._stdout, self._stderr + self.devnull.close() From 882a0b74b704cc398b9cefc984c7183c448ebd62 Mon Sep 17 00:00:00 2001 From: Charles Frye Date: Wed, 28 May 2025 04:13:43 +0000 Subject: [PATCH 8/8] refactor out one level of async nesting --- 06_gpu_and_ml/audio-to-text/parakeet.py | 78 +++++++++++-------------- 1 file changed, 35 insertions(+), 43 deletions(-) diff --git a/06_gpu_and_ml/audio-to-text/parakeet.py 
b/06_gpu_and_ml/audio-to-text/parakeet.py index eb10c369a..f3a486f41 100644 --- a/06_gpu_and_ml/audio-to-text/parakeet.py +++ b/06_gpu_and_ml/audio-to-text/parakeet.py @@ -236,111 +236,103 @@ async def handle_audio_chunk( sample_width=2, frame_rate=TARGET_SAMPLE_RATE, ) + # append the new audio segment to the existing audio segment audio_segment += new_audio_segment + # detect windows of silence silent_windows = silence.detect_silence( audio_segment, - min_silence_len=SILENCE_MIN_LENGTH_MSEC, - silence_thresh=SILENCE_THRESHOLD, + min_silence_len=min_silence_len, + silence_thresh=silence_thresh, ) # if there are no silent windows, continue if len(silent_windows) == 0: return audio_segment, None + # get the last silent window because # we want to transcribe until the final pause last_window = silent_windows[-1] + # if the entire audio segment is silent, reset the audio segment if last_window[0] == 0 and last_window[1] == len(audio_segment): audio_segment = AudioSegment.empty() return audio_segment, None + # get the segment to transcribe: beginning until last pause segment_to_transcribe = audio_segment[: last_window[1]] + # remove the segment to transcribe from the audio segment audio_segment = audio_segment[last_window[1] :] try: - text = await self.transcribe(segment_to_transcribe.raw_data) + text = self.transcribe(segment_to_transcribe.raw_data) return audio_segment, text except Exception as e: print("āŒ Transcription error:", e) raise e -# ## Client +# ## Running transcription from a local Python client + # Next, let's test the model with a [`local_entrypoint`](https://modal.com/docs/reference/modal.App#local_entrypoint) that streams audio data to the server and prints -# out the transcriptions to our terminal in real-time. +# out the transcriptions to our terminal as they arrive. -# Instead of using the WebSocket endpoint like the frontend, +# Instead of using the WebSocket endpoint like the browser frontend, # we'll use a [`modal.Queue`](https://modal.com/docs/reference/modal.Queue) # to pass audio data and transcriptions between our local machine and the GPU container. AUDIO_URL = "https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/mono_44100/156550__acclivity__a-dream-within-a-dream.wav" -TARGET_SAMPLE_RATE = 16000 -CHUNK_SIZE = 16000 # send one second of audio at a time +TARGET_SAMPLE_RATE = 16_000 +CHUNK_SIZE = 16_000 # send one second of audio at a time @app.local_entrypoint() -def main(audio_url: str = AUDIO_URL): +async def main(audio_url: str = AUDIO_URL): from urllib.request import urlopen - print("🌐 Downloading audio file...") + print(f"🌐 Downloading audio file from {audio_url}") audio_bytes = urlopen(audio_url).read() print(f"šŸŽ§ Downloaded {len(audio_bytes)} bytes") audio_data = preprocess_audio(audio_bytes) - print("ā˜€ļø Waking up model, this may take a few seconds on cold start...") - try: - asyncio.run(run(audio_data)) - print("āœ… Transcription complete!") - except KeyboardInterrupt: - print("\nšŸ›‘ Stopped by user.") + print("šŸŽ¤ Starting Transcription") + with modal.Queue.ephemeral() as q: + Parakeet().run_with_queue.spawn(q) + send = asyncio.create_task(send_audio(q, audio_data)) + recv = asyncio.create_task(receive_text(q)) + await asyncio.gather(send, recv) + print("āœ… Transcription complete!") + +# Below are the two functions that coordinate streaming audio and receiving transcriptions. -# Below are the three main functions that coordinate streaming audio and receiving transcriptions. 
-# -# `send_audio` transmits chunks of audio data and then pauses to approximate streaming -# speech at a natural rate. +# `send_audio` transmits chunks of audio data with a slight delay, +# as though it was being streamed from a live source, like a microphone. +# `receive_text` waits for transcribed text to arrive and prints it. async def send_audio(q, audio_bytes): for chunk in chunk_audio(audio_bytes, CHUNK_SIZE): await q.put.aio(chunk, partition="audio") - await asyncio.sleep( - CHUNK_SIZE / TARGET_SAMPLE_RATE / 8 - ) # simulate real-time pacing + await asyncio.sleep(CHUNK_SIZE / TARGET_SAMPLE_RATE / 8) await q.put.aio(END_OF_STREAM, partition="audio") -# `receive_transcriptions` waits for a transcription and prints it after a small delay to avoid colliding with the print statements -# from the GPU container. - - -async def receive_transcriptions(q): +async def receive_text(q): while True: message = await q.get.aio(partition="transcription") if message == END_OF_STREAM: break - await asyncio.sleep(1.00) # add a delay to avoid stdout collision - - output_message_as_transcript(message) - -# We take full advantage of Modal's asynchronous capabilities here. In `run`, we spawn our function call -# so it doesn't block, and then we create and wait on the send and receive tasks. - - -async def run(audio_bytes): - with modal.Queue.ephemeral() as q: - Parakeet().run_with_queue.spawn(q) - send_task = asyncio.create_task(send_audio(q, audio_bytes)) - receive_task = asyncio.create_task(receive_transcriptions(q)) - await asyncio.gather(send_task, receive_task) + print(message) # ## Addenda -# Helper functions for converting audio to Parakeet's input format and iterating over audio chunks. + +# The remainder of the code in this example is boilerplate, +# mostly for handling Parakeet's input format. def preprocess_audio(audio_bytes: bytes) -> bytes: