
Commit 9174ccf

feat: async encoder cache impl
1 parent b03fe26 commit 9174ccf

File tree

6 files changed: +454 -65 lines changed

components/src/dynamo/common/memory/encoder_cache_manager.py

Lines changed: 37 additions & 29 deletions
@@ -9,10 +9,10 @@
 
 Usage:
     cache = EncoderCacheManager(capacity_bytes=4 * 1024**3)  # 4GB
-
+
     # Store embedding
     cache.set("abc123", embedding_tensor)
-
+
     # Retrieve embedding
     tensor = cache.get("abc123")  # Returns None if not found
 """
@@ -29,10 +29,10 @@
 class EncoderCacheManager:
     """
     LRU cache for encoder embeddings.
-
+
     Stores tensors keyed by content hash with automatic eviction
     when capacity is exceeded.
-
+
     Thread Safety:
         This class is NOT thread-safe. It is designed to run within a single
         thread (e.g., an asyncio event loop). All access must be from the same
@@ -43,55 +43,59 @@ class EncoderCacheManager:
     def __init__(self, capacity_bytes: int):
         """
         Initialize the encoder cache.
-
+
         Args:
             capacity_bytes: Maximum cache capacity in bytes.
         """
         if capacity_bytes <= 0:
             raise ValueError("capacity_bytes must be positive")
-
+
         self._cache: OrderedDict[str, torch.Tensor] = OrderedDict()
         self._capacity_bytes = capacity_bytes
         self._current_bytes = 0
-
+
         # Stats
         self._hits = 0
         self._misses = 0
-
-        logger.info(f"EncoderCacheManager initialized: capacity={capacity_bytes / 1024**3:.2f}GB")
+
+        logger.info(
+            f"EncoderCacheManager initialized: capacity={capacity_bytes / 1024**3:.2f}GB"
+        )
 
     @staticmethod
     def _tensor_size(tensor: torch.Tensor) -> int:
         """Calculate tensor size in bytes.
-
+
         Args:
             tensor: Must be a contiguous tensor.
-
+
         Returns:
             Size of the tensor in bytes.
-
+
         Raises:
             AssertionError: If tensor is not contiguous.
         """
-        assert tensor.is_contiguous(), "Tensor must be contiguous for accurate size calculation"
+        assert (
+            tensor.is_contiguous()
+        ), "Tensor must be contiguous for accurate size calculation"
         return tensor.element_size() * tensor.numel()
 
     def get(self, key: str) -> Optional[torch.Tensor]:
         """
         Get a tensor from the cache.
-
+
         If found, the entry is moved to the end (most recently used).
-
+
         Args:
             key: Cache key (typically content hash).
-
+
         Returns:
             The cached tensor, or None if not found.
         """
         if key not in self._cache:
             self._misses += 1
             return None
-
+
         # Move to end (most recently used)
         self._cache.move_to_end(key)
         self._hits += 1
@@ -100,44 +104,46 @@ def get(self, key: str) -> Optional[torch.Tensor]:
     def set(self, key: str, tensor: torch.Tensor) -> bool:
         """
         Store a tensor in the cache.
-
+
         If the key already exists, the old value is replaced.
         If adding the tensor would exceed capacity, LRU entries are evicted.
         If the tensor itself is larger than capacity, it is not stored.
-
+
         Args:
             key: Cache key (typically content hash).
             tensor: Tensor to cache.
-
+
         Returns:
             True if the tensor was stored, False if it was too large.
         """
         size = self._tensor_size(tensor)
-
+
         # Don't cache if single tensor exceeds capacity
        if size > self._capacity_bytes:
             logger.warning(
                 f"Tensor too large to cache: {size / 1024**2:.1f}MB > "
                 f"{self._capacity_bytes / 1024**3:.2f}GB capacity"
             )
             return False
-
+
         # If key exists, remove old entry first
         if key in self._cache:
             old_tensor = self._cache.pop(key)
             self._current_bytes -= self._tensor_size(old_tensor)
-
+
         # Evict LRU entries until we have space
         while self._current_bytes + size > self._capacity_bytes and self._cache:
             evicted_key, evicted_tensor = self._cache.popitem(last=False)
             evicted_size = self._tensor_size(evicted_tensor)
             self._current_bytes -= evicted_size
-            logger.debug(f"Evicted key={evicted_key[:16]}..., size={evicted_size / 1024**2:.2f}MB")
-
+            logger.debug(
+                f"Evicted key={evicted_key[:16]}..., size={evicted_size / 1024**2:.2f}MB"
+            )
+
         # Store new entry
         self._cache[key] = tensor
         self._current_bytes += size
-
+
         logger.debug(
             f"Cached key={key[:16] if len(key) > 16 else key}, "
             f"size={size / 1024**2:.2f}MB, "
@@ -149,19 +155,21 @@ def set(self, key: str, tensor: torch.Tensor) -> bool:
     def stats(self) -> dict:
         """
         Get cache statistics.
-
+
         Returns:
             Dictionary with cache stats including entries, memory usage,
             hit/miss counts, and hit rate.
         """
         total_requests = self._hits + self._misses
         hit_rate = self._hits / total_requests if total_requests > 0 else 0.0
-
+
         return {
             "entries": len(self._cache),
             "current_bytes": self._current_bytes,
             "capacity_bytes": self._capacity_bytes,
-            "utilization": self._current_bytes / self._capacity_bytes if self._capacity_bytes > 0 else 0,
+            "utilization": self._current_bytes / self._capacity_bytes
+            if self._capacity_bytes > 0
+            else 0,
             "hits": self._hits,
             "misses": self._misses,
             "hit_rate": hit_rate,
components/src/dynamo/common/multimodal/__init__.py

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Multimodal utilities for Dynamo components."""
+
+from dynamo.common.multimodal.async_encoder_cache import AsyncEncoderCache
+
+__all__ = ["AsyncEncoderCache"]
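
Since the package __init__ re-exports the class, both import forms below resolve to the same object (a small sketch assuming this commit's layout):

from dynamo.common.multimodal.async_encoder_cache import AsyncEncoderCache

# Equivalent, via the re-export above:
from dynamo.common.multimodal import AsyncEncoderCache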
components/src/dynamo/common/multimodal/async_encoder_cache.py

Lines changed: 135 additions & 0 deletions
@@ -0,0 +1,135 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Async Encoder Cache
+
+Async wrapper over EncoderCacheManager with request coalescing.
+Prevents duplicate encoding when multiple requests arrive for the same content.
+
+Usage:
+    cache = EncoderCacheManager(capacity_bytes=4 * 1024**3)
+    async_cache = AsyncEncoderCache(cache)
+
+    # Get from cache or compute with coalescing
+    tensor = await async_cache.get_or_compute("hash123", encoder.encode)
+"""
+
+import asyncio
+import logging
+from typing import Awaitable, Callable, Dict, Optional
+
+import torch
+
+from dynamo.common.memory.encoder_cache_manager import EncoderCacheManager
+
+logger = logging.getLogger(__name__)
+
+
+def _suppress_unhandled_future_exception(future: asyncio.Future) -> None:
+    """
+    Callback to prevent 'Future exception was never retrieved' warning.
+
+    When a Future has set_exception() called but no one awaits it (e.g., a single
+    caller that gets the exception via re-raise), asyncio warns. This callback
+    retrieves the exception to suppress that warning.
+    """
+    if future.done() and not future.cancelled():
+        try:
+            future.exception()
+        except asyncio.CancelledError:
+            pass
+
+
+class AsyncEncoderCache:
+    """
+    Async wrapper with request coalescing over EncoderCacheManager.
+
+    Provides async get_or_compute that deduplicates concurrent requests
+    for the same key, ensuring only one encoding runs at a time per key.
+
+    Thread Safety:
+        This class is NOT thread-safe. It is designed to run within a single
+        asyncio event loop. All access must be from the same thread.
+    """
+
+    def __init__(self, cache: EncoderCacheManager):
+        """
+        Initialize the async encoder cache.
+
+        Args:
+            cache: Underlying EncoderCacheManager for storage.
+        """
+        self._cache = cache
+        self._in_flight: Dict[str, asyncio.Future[torch.Tensor]] = {}
+
+    def get(self, key: str) -> Optional[torch.Tensor]:
+        """
+        Synchronous get from underlying cache.
+
+        Args:
+            key: Cache key.
+
+        Returns:
+            Cached tensor or None if not found.
+        """
+        return self._cache.get(key)
+
+    async def get_or_compute(
+        self,
+        key: str,
+        compute_fn: Callable[[], Awaitable[torch.Tensor]],
+    ) -> torch.Tensor:
+        """
+        Get from cache or compute with request coalescing.
+
+        If the key is in cache, returns immediately.
+        If another coroutine is already computing this key, waits for that result.
+        Otherwise, computes and caches the result.
+
+        Args:
+            key: Cache key (typically content hash).
+            compute_fn: Async function to compute the tensor if not cached.
+
+        Returns:
+            The cached or computed tensor.
+
+        Raises:
+            Exception: Re-raises any exception from compute_fn.
+        """
+        # Check cache first
+        cached = self._cache.get(key)
+        if cached is not None:
+            return cached
+
+        # Wait if already in-flight
+        if key in self._in_flight:
+            logger.debug(f"Waiting for in-flight computation: key={key[:16]}...")
+            return await self._in_flight[key]
+
+        # Compute with coalescing
+        future: asyncio.Future[torch.Tensor] = asyncio.Future()
+        future.add_done_callback(_suppress_unhandled_future_exception)
+        self._in_flight[key] = future
+        try:
+            tensor = await compute_fn()
+            self._cache.set(key, tensor)
+            future.set_result(tensor)
+            return tensor
+        except Exception as e:
+            future.set_exception(e)
+            raise
+        finally:
+            del self._in_flight[key]
+
+    @property
+    def stats(self) -> dict:
+        """
+        Get cache statistics from underlying cache.
+
+        Returns:
+            Dictionary with cache stats.
+        """
+        base_stats = self._cache.stats
+        base_stats["in_flight"] = len(self._in_flight)
+        return base_stats
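
To see the coalescing in action, a self-contained sketch: three concurrent get_or_compute() calls for one key share a single in-flight Future, so the encoder runs once. slow_encode and its call counter are illustrative, not part of this commit.

import asyncio

import torch

from dynamo.common.memory.encoder_cache_manager import EncoderCacheManager
from dynamo.common.multimodal import AsyncEncoderCache

calls = 0

async def slow_encode() -> torch.Tensor:
    global calls
    calls += 1
    await asyncio.sleep(0.1)  # stand-in for real encoder latency
    return torch.ones(4, 8)

async def main() -> None:
    async_cache = AsyncEncoderCache(EncoderCacheManager(capacity_bytes=1024**2))

    # All three awaiters resolve from the same Future; compute_fn runs once.
    results = await asyncio.gather(
        *(async_cache.get_or_compute("hash123", slow_encode) for _ in range(3))
    )
    assert calls == 1
    assert all(torch.equal(r, results[0]) for r in results)

    # Subsequent calls are plain cache hits.
    await async_cache.get_or_compute("hash123", slow_encode)
    assert calls == 1

asyncio.run(main())

Note the finally: del self._in_flight[key] in get_or_compute: a failed computation is removed from the in-flight map rather than cached, so a later caller retries the encode instead of receiving a stale error.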
