ai-dynamo · alec-flowers · Jan 27, 2026
@@ -96,7 +96,6 @@ documentation:
 multimodal:
   - changed-files:
       - any-glob-to-any-file:
-          - examples/multimodal/**
           - components/src/dynamo/vllm/multimodal_handlers/**
           - components/src/dynamo/vllm/multimodal_utils/**
           - components/src/dynamo/sglang/multimodal_utils/**

@@ -13,7 +13,6 @@ Cargo.toml @ai-dynamo/dynamo-rust-codeowners
 
 # Examples
 /examples/ @ai-dynamo/Devops @ai-dynamo/dynamo-rust-codeowners @ai-dynamo/python-codeowners
-/examples/multimodal/ @ai-dynamo/python-codeowners @ai-dynamo/Devops
 
 # Dynamo deploy
 /deploy/ @ai-dynamo/dynamo-deploy-codeowners

diff --git a/README.md b/README.md
@@ -360,7 +360,7 @@ If you use vscode or cursor, we have a .devcontainer folder built on [Microsofts
 [kv-routing]: docs/router/kv_cache_routing.md
 [planner]: docs/planner/sla_planner.md
 [kvbm]: docs/kvbm/kvbm_architecture.md
-[mm]: examples/multimodal/
+[mm]: examples/backends/vllm/
 [migration]: docs/fault_tolerance/request_migration.md
 [lora]: examples/backends/vllm/deploy/lora/README.md
 [tools]: docs/agents/tool-calling.md
@@ -65,11 +65,14 @@ class Config:
     # TODO: Have a single processor for all cases and adopting rust based processor
     ec_processor: bool = False
     multimodal_encode_worker: bool = False
+    multimodal_audio_encode_worker: bool = False
+    multimodal_video_encode_worker: bool = False
     multimodal_worker: bool = False
     multimodal_decode_worker: bool = False
     enable_multimodal: bool = False
     multimodal_encode_prefill_worker: bool = False
     mm_prompt_template: str = "USER: <image>\n<prompt> ASSISTANT:"
+    num_frames_to_sample: int = 8
 
     # vLLM-native encoder worker (ECConnector mode)
     vllm_native_encoder_worker: bool = False
@@ -183,6 +186,16 @@ def parse_args() -> Config:
         action="store_true",
         help="Run as multimodal encode worker component for processing images/videos",
     )
+    parser.add_argument(
+        "--multimodal-audio-encode-worker",
+        action="store_true",
+        help="Run as multimodal audio encode worker component for processing audio inputs",
+    )
+    parser.add_argument(
+        "--multimodal-video-encode-worker",
+        action="store_true",
+        help="Run as multimodal video encode worker component for processing video inputs",
+    )
     parser.add_argument(
         "--multimodal-worker",
         action="store_true",
@@ -216,6 +229,12 @@ def parse_args() -> Config:
             "'USER: <image> please describe the image ASSISTANT:'."
         ),
     )
+    parser.add_argument(
+        "--num-frames-to-sample",
+        type=int,
+        default=8,
+        help="Number of video frames to sample for multimodal video encoding",
+    )
     parser.add_argument(
         "--vllm-native-encoder-worker",
         action="store_true",
@@ -326,14 +345,17 @@ def parse_args() -> Config:
         int(bool(args.multimodal_processor))
         + int(bool(args.ec_processor))
         + int(bool(args.multimodal_encode_worker))
+        + int(bool(args.multimodal_audio_encode_worker))
+        + int(bool(args.multimodal_video_encode_worker))
         + int(bool(args.multimodal_worker))
         + int(bool(args.multimodal_decode_worker))
         + int(bool(args.multimodal_encode_prefill_worker))
         + int(bool(args.vllm_native_encoder_worker))
     )
     if mm_flags > 1:
         raise ValueError(
-            "Use only one of --multimodal-processor, --ec-processor, --multimodal-encode-worker, --multimodal-worker, "
+            "Use only one of --multimodal-processor, --ec-processor, --multimodal-encode-worker, "
+            "--multimodal-audio-encode-worker, --multimodal-video-encode-worker, --multimodal-worker, "
             "--multimodal-decode-worker, --multimodal-encode-prefill-worker, or --vllm-native-encoder-worker"
         )
 
@@ -358,6 +380,8 @@ def parse_args() -> Config:
     elif (
         args.vllm_native_encoder_worker
         or args.multimodal_encode_worker
+        or args.multimodal_audio_encode_worker
+        or args.multimodal_video_encode_worker
         or args.multimodal_encode_prefill_worker
     ):
         config.component = "encoder"
@@ -389,11 +413,14 @@ def parse_args() -> Config:
     config.multimodal_processor = args.multimodal_processor
     config.ec_processor = args.ec_processor
     config.multimodal_encode_worker = args.multimodal_encode_worker
+    config.multimodal_audio_encode_worker = args.multimodal_audio_encode_worker
+    config.multimodal_video_encode_worker = args.multimodal_video_encode_worker
     config.multimodal_worker = args.multimodal_worker
     config.multimodal_decode_worker = args.multimodal_decode_worker
     config.multimodal_encode_prefill_worker = args.multimodal_encode_prefill_worker
     config.enable_multimodal = args.enable_multimodal
     config.mm_prompt_template = args.mm_prompt_template
+    config.num_frames_to_sample = args.num_frames_to_sample
     config.vllm_native_encoder_worker = args.vllm_native_encoder_worker
     config.ec_connector_backend = args.ec_connector_backend
     config.ec_storage_path = args.ec_storage_path

@@ -30,11 +30,13 @@
 from dynamo.runtime import DistributedRuntime
 from dynamo.runtime.logging import configure_dynamo_logging
 from dynamo.vllm.multimodal_handlers import (
+    AudioEncodeWorkerHandler,
     ECProcessorHandler,
     EncodeWorkerHandler,
     MultimodalDecodeWorkerHandler,
     MultimodalPDWorkerHandler,
     PreprocessedHandler,
+    VideoEncodeWorkerHandler,
     VLLMEncodeWorkerHandler,
 )
 from dynamo.vllm.multimodal_utils.encode_utils import create_ec_transfer_config
@@ -125,6 +127,12 @@ def signal_handler():
     elif config.multimodal_encode_worker:
         await init_multimodal_encode_worker(runtime, config)
         logger.debug("init_multimodal_encode_worker completed")
+    elif config.multimodal_audio_encode_worker:
+        await init_multimodal_audio_encode_worker(runtime, config)
+        logger.debug("init_multimodal_audio_encode_worker completed")
+    elif config.multimodal_video_encode_worker:
+        await init_multimodal_video_encode_worker(runtime, config)
+        logger.debug("init_multimodal_video_encode_worker completed")
     elif (
         config.multimodal_worker
         or config.multimodal_decode_worker
@@ -783,6 +791,81 @@ async def init_multimodal_encode_worker(runtime: DistributedRuntime, config: Con
         handler.cleanup()
 
 
+async def init_multimodal_audio_encode_worker(
+    runtime: DistributedRuntime, config: Config
+):
+    """Initialize multimodal audio encode worker component"""
+    component = runtime.namespace(config.namespace).component(config.component)
+
+    generate_endpoint = component.endpoint(config.endpoint)
+
+    pd_worker_client = (
+        await runtime.namespace(config.namespace)
+        .component("backend")
+        .endpoint("generate")
+        .client()
+    )
+
+    handler = AudioEncodeWorkerHandler(
+        config.engine_args,
+        pd_worker_client,
+    )
+    await handler.async_init(runtime)
+    logger.info("Waiting for PD Worker Instances ...")
+    await pd_worker_client.wait_for_instances()
+    logger.info("Starting to serve the audio encode worker endpoint...")
+
+    try:
+        await asyncio.gather(
+            generate_endpoint.serve_endpoint(
+                handler.generate, metrics_labels=[("model", config.model)]
+            ),
+        )
+    except Exception as e:
+        logger.error(f"Failed to serve audio encode worker endpoint: {e}")
+        raise
+    finally:
+        handler.cleanup()
+
+
+async def init_multimodal_video_encode_worker(
+    runtime: DistributedRuntime, config: Config
+):
+    """Initialize multimodal video encode worker component"""
+    component = runtime.namespace(config.namespace).component(config.component)
+
+    generate_endpoint = component.endpoint(config.endpoint)
+
+    pd_worker_client = (
+        await runtime.namespace(config.namespace)
+        .component("backend")
+        .endpoint("generate")
+        .client()
+    )
+
+    handler = VideoEncodeWorkerHandler(
+        config.engine_args,
+        pd_worker_client,
+        num_frames_to_sample=config.num_frames_to_sample,
+    )
+    await handler.async_init(runtime)
+    logger.info("Waiting for PD Worker Instances ...")
+    await pd_worker_client.wait_for_instances()
+    logger.info("Starting to serve the video encode worker endpoint...")
+
+    try:
+        await asyncio.gather(
+            generate_endpoint.serve_endpoint(
+                handler.generate, metrics_labels=[("model", config.model)]
+            ),
+        )
+    except Exception as e:
+        logger.error(f"Failed to serve video encode worker endpoint: {e}")
+        raise
+    finally:
+        handler.cleanup()
+
+
 async def init_vllm_native_encoder(runtime: DistributedRuntime, config: Config):
     """
     Initialize vLLM-native encoder worker component (ECConnector mode).

@@ -2,7 +2,9 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from dynamo.vllm.multimodal_handlers.encode_worker_handler import (
+    AudioEncodeWorkerHandler,
     EncodeWorkerHandler,
+    VideoEncodeWorkerHandler,
     VLLMEncodeWorkerHandler,
 )
 from dynamo.vllm.multimodal_handlers.preprocessed_handler import (
@@ -16,6 +18,8 @@
 
 __all__ = [
     "EncodeWorkerHandler",
+    "AudioEncodeWorkerHandler",
+    "VideoEncodeWorkerHandler",
     "VLLMEncodeWorkerHandler",
     "PreprocessedHandler",
     "ECProcessorHandler",