2 changes: 1 addition & 1 deletion .github/workflows/cleanup-cache.yml
@@ -44,7 +44,7 @@ jobs:
        VERSION_ID=$(curl -s -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
          -H "Accept: application/vnd.github.v3+json" \
          "https://api.github.com/orgs/${{ github.repository_owner }}/packages/container/${PACKAGE_NAME}/versions" \
-         | jq -r ".[] | select(.metadata.container.tags[] == \"${CACHE_TAG}\") | .id")
+         | jq -r --arg tag "${CACHE_TAG}" '.[]? | select((.metadata.container.tags? // []) | index($tag) != null) | .id')
        if [ -n "$VERSION_ID" ] && [ "$VERSION_ID" != "null" ]; then
          echo "Found cache version ID: $VERSION_ID. Deleting..."
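The revised filter is worth a local sanity check: the old `.metadata.container.tags[] == ...` comparison errors out on package versions whose `tags` array is missing, while `--arg` plus `index($tag)` handles them gracefully and also avoids shell-escaping the tag into the jq program. A quick sketch against a mocked API payload (assumes `jq` is installed; the payload and tag value are illustrative):

```shell
# Mocked /packages/container/.../versions payload: version 1 has no tags
# (the old filter raised "Cannot iterate over null" on it), version 2
# carries the cache tag we are looking for.
RESPONSE='[{"id":1,"metadata":{"container":{}}},{"id":2,"metadata":{"container":{"tags":["buildcache"]}}}]'
CACHE_TAG="buildcache"

# Missing tags arrays fall back to [] via the // alternative operator,
# so only version 2 survives the select().
echo "$RESPONSE" | jq -r --arg tag "$CACHE_TAG" \
  '.[]? | select((.metadata.container.tags? // []) | index($tag) != null) | .id'
# prints: 2
```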
1 change: 1 addition & 0 deletions .github/workflows/docker-build.yml
@@ -219,6 +219,7 @@ jobs:
      if: matrix.container == 'python-apps-base'
      env:
        PUBLIC_IMAGE_REGISTRY_BASE: ${{ env.DEV_REGISTRY_PATH }}
+       DEV_TAG_VERSION: ${{ needs.detect-changes.outputs.image_tag }}
      run: |
        pip install go-task-bin==${{ env.TASKFILE_VERSION }}
        task init:ci
6 changes: 3 additions & 3 deletions models/models-list.yaml
@@ -1265,11 +1265,11 @@ models:
      source: "qualcomm-ai-hub"
      source-model-id: "mediapipe_hand_gesture"
      source-model-url: "https://aihub.qualcomm.com/models/mediapipe_hand_gesture"
-   - whisper-small:
+   - whisper-small-quantized:
      runner: audio-analytics
-     name : "Whisper Small"
+     name : "Whisper Small (quantized)"
      description: >-
-       Whisper-Small ASR (Automatic Speech Recognition) model is a state-of-the-art system designed for transcribing spoken language into written text.
+       Whisper ASR (Automatic Speech Recognition) model is a state-of-the-art system designed for transcribing spoken language into written text.
        This model is based on the transformer architecture and has been optimized for edge inference by replacing Multi-Head Attention (MHA)
        with Single-Head Attention (SHA) and linear layers with convolutional (conv) layers. It exhibits robust performance in realistic, noisy environments,
        making it highly reliable for real-world applications.
23 changes: 18 additions & 5 deletions src/arduino/app_bricks/asr/README.md
@@ -1,10 +1,23 @@
  # Automatic Speech Recognition Brick
  
- The `AutomaticSpeechRecognition` brick provides on-device automatic speech recognition (ASR) capabilities for audio streams and files. It offers a high-level interface for transcribing audio using a local model, with support for both real-time and batch processing.
+ The `AutomaticSpeechRecognition` brick provides on-device automatic speech recognition (ASR) capabilities for audio streams and files. It offers a high-level interface for transcribing audio using a local model, with support for both real-time microphone capture and in-memory audio (WAV bytes or raw PCM arrays).
  
- ## LocalASR Class Features
+ ## Features
  
  - **Offline Operation:** All transcriptions are performed locally, ensuring data privacy and eliminating network dependencies.
- - **Multi Language Support:** Supports the transcription of spoken multiple languages.
- - **Audio Input Formats**: Designed to work with the Microphone peripheral, WAV and PCM audio.
- - **Concurrency Control**: Limits the number of simultaneous transcription sessions to avoid resource exhaustion.
+ - **Multi-Language Support:** Supports the transcription of multiple spoken languages.
+ - **Flexible Audio Input:** The constructor accepts a `BaseMicrophone` instance, a `bytes` WAV container, a raw `np.ndarray` of PCM samples, or `None` to use a default `Microphone()`.
+ - **Single-Session Semantics:** Each instance handles one transcription session at a time. For concurrent transcriptions on different microphones, create multiple `AutomaticSpeechRecognition` instances.
+ 
+ ## Errors
+ 
+ - `ASRBusyError`: raised if you call `transcribe()` / `transcribe_stream()` while the instance already has an active session. Fix by awaiting the current session or using a separate instance.
+ - `ASRServiceBusyError`: raised when the inference server rejects session creation because it is currently serving another client. The caller decides whether to retry.
+ - `ASRUnavailableError`: raised when the inference service is unreachable (container down, network error) or the WebSocket connection drops mid-session. The caller decides whether to retry.
+ - `ASRError`: base class for all of the above.
+ 
+ ## Source Ownership
+ 
+ - When `source` is `None`, ASR constructs a default `Microphone()` and manages its lifecycle through `asr.start()` / `asr.stop()`.
+ - When `source` is a `BaseMicrophone` you pass in, **you** own its lifecycle — call `mic.start()` before transcribing and `mic.stop()` when done.
+ - In-memory sources (`bytes`, `np.ndarray`) have no device lifecycle.
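The README's split between retryable errors (`ASRServiceBusyError`, `ASRUnavailableError`, where "the caller decides whether to retry") and the non-retryable `ASRBusyError` suggests a small retry wrapper on the caller's side. A hedged sketch: the exception classes below are local stand-ins so the logic runs without the Arduino runtime, and `transcribe_with_retry` is an illustrative helper, not part of the brick:

```python
import time


# Local stand-ins mirroring the hierarchy the README documents; in real
# code these come from arduino.app_bricks.asr.
class ASRError(Exception): ...
class ASRBusyError(ASRError): ...
class ASRServiceBusyError(ASRError): ...
class ASRUnavailableError(ASRError): ...


def transcribe_with_retry(transcribe, retries=3, delay=0.01):
    """Retry only the errors the README marks as retryable.

    ASRBusyError is deliberately NOT caught: it signals a caller bug
    (reusing an instance with an active session), not a transient fault.
    """
    for attempt in range(retries):
        try:
            return transcribe()
        except (ASRServiceBusyError, ASRUnavailableError):
            if attempt == retries - 1:
                raise
            time.sleep(delay * (2 ** attempt))  # exponential backoff


# Simulated service: busy on the first two attempts, then succeeds.
calls = {"n": 0}

def fake_transcribe():
    calls["n"] += 1
    if calls["n"] < 3:
        raise ASRServiceBusyError("server busy")
    return "hello world"

result = transcribe_with_retry(fake_transcribe)
print(result)  # hello world
```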
20 changes: 18 additions & 2 deletions src/arduino/app_bricks/asr/__init__.py
@@ -2,6 +2,22 @@
  #
  # SPDX-License-Identifier: MPL-2.0
  
- from .local_asr import AutomaticSpeechRecognition, ASREvent, TranscriptionStream
+ from .local_asr import (
+     ASREvent,
+     ASRBusyError,
+     ASRError,
+     ASRServiceBusyError,
+     ASRUnavailableError,
+     AutomaticSpeechRecognition,
+     TranscriptionStream,
+ )
  
- __all__ = ["AutomaticSpeechRecognition", "ASREvent", "TranscriptionStream"]
+ __all__ = [
+     "ASREvent",
+     "ASRError",
+     "ASRBusyError",
+     "ASRServiceBusyError",
+     "ASRUnavailableError",
+     "AutomaticSpeechRecognition",
+     "TranscriptionStream",
+ ]
2 changes: 1 addition & 1 deletion src/arduino/app_bricks/asr/brick_config.yaml
@@ -4,7 +4,7 @@ description: Automatic Speech Recognition brick for offline speech-to-text proce
  category: audio
  requires_model: true
  requires_services: ["arduino:genie_audio"]
- model: whisper-small
+ model: whisper-small-quantized
  supported_boards: ["ventunoq"]
  model_configuration_variables:
  - VAD_LEN_HANGOVER
4 changes: 2 additions & 2 deletions src/arduino/app_bricks/asr/examples/00_transcribe_wav.py
@@ -7,7 +7,7 @@
  from arduino.app_bricks.asr import AutomaticSpeechRecognition
  
  
- asr = AutomaticSpeechRecognition()
  with open("recording_01.wav", "rb") as wav_file:
-     text = asr.transcribe_wav(wav_file.read())
+     asr = AutomaticSpeechRecognition(wav_file.read())
+     text = asr.transcribe()
  print(f"Transcription: {text}")
@@ -7,9 +7,9 @@
  from arduino.app_bricks.asr import AutomaticSpeechRecognition
  
  
- asr = AutomaticSpeechRecognition()
  with open("recording_01.wav", "rb") as wav_file:
-     with asr.transcribe_wav_stream(wav_file.read()) as stream:
+     asr = AutomaticSpeechRecognition(wav_file.read())
+     with asr.transcribe_stream() as stream:
          for chunk in stream:
              match chunk.type:
                  case "partial_text":
4 changes: 2 additions & 2 deletions src/arduino/app_bricks/asr/examples/02_transcribe_mic.py
@@ -11,8 +11,8 @@
  mic = Microphone()
  mic.start()
  
- asr = AutomaticSpeechRecognition()
- text = asr.transcribe_mic(mic, duration=5)
+ asr = AutomaticSpeechRecognition(mic)
+ text = asr.transcribe(duration=5)
  print(f"Transcription: {text}")
  
  mic.stop()
@@ -11,8 +11,8 @@
  mic = Microphone()
  mic.start()
  
- asr = AutomaticSpeechRecognition()
- with asr.transcribe_mic_stream(mic, duration=5) as stream:
+ asr = AutomaticSpeechRecognition(mic)
+ with asr.transcribe_stream(duration=5) as stream:
      for chunk in stream:
          match chunk.type:
              case "partial_text":