Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .github/labeler.yml
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,6 @@ documentation:
multimodal:
- changed-files:
- any-glob-to-any-file:
- examples/multimodal/**
- components/src/dynamo/vllm/multimodal_handlers/**
- components/src/dynamo/vllm/multimodal_utils/**
- components/src/dynamo/sglang/multimodal_utils/**
Expand Down
1 change: 0 additions & 1 deletion CODEOWNERS
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ Cargo.toml @ai-dynamo/dynamo-rust-codeowners

# Examples
/examples/ @ai-dynamo/Devops @ai-dynamo/dynamo-rust-codeowners @ai-dynamo/python-codeowners
/examples/multimodal/ @ai-dynamo/python-codeowners @ai-dynamo/Devops

# Dynamo deploy
/deploy/ @ai-dynamo/dynamo-deploy-codeowners
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -360,7 +360,7 @@ If you use vscode or cursor, we have a .devcontainer folder built on [Microsofts
[kv-routing]: docs/router/kv_cache_routing.md
[planner]: docs/planner/sla_planner.md
[kvbm]: docs/kvbm/kvbm_architecture.md
[mm]: examples/multimodal/
[mm]: examples/backends/vllm/
[migration]: docs/fault_tolerance/request_migration.md
[lora]: examples/backends/vllm/deploy/lora/README.md
[tools]: docs/agents/tool-calling.md
29 changes: 28 additions & 1 deletion components/src/dynamo/vllm/args.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,11 +65,14 @@ class Config:
# TODO: Have a single processor for all cases and adopting rust based processor
ec_processor: bool = False
multimodal_encode_worker: bool = False
multimodal_audio_encode_worker: bool = False
multimodal_video_encode_worker: bool = False
multimodal_worker: bool = False
multimodal_decode_worker: bool = False
enable_multimodal: bool = False
multimodal_encode_prefill_worker: bool = False
mm_prompt_template: str = "USER: <image>\n<prompt> ASSISTANT:"
num_frames_to_sample: int = 8

# vLLM-native encoder worker (ECConnector mode)
vllm_native_encoder_worker: bool = False
Expand Down Expand Up @@ -183,6 +186,16 @@ def parse_args() -> Config:
action="store_true",
help="Run as multimodal encode worker component for processing images/videos",
)
parser.add_argument(
"--multimodal-audio-encode-worker",
action="store_true",
help="Run as multimodal audio encode worker component for processing audio inputs",
)
parser.add_argument(
"--multimodal-video-encode-worker",
action="store_true",
help="Run as multimodal video encode worker component for processing video inputs",
)
parser.add_argument(
"--multimodal-worker",
action="store_true",
Expand Down Expand Up @@ -216,6 +229,12 @@ def parse_args() -> Config:
"'USER: <image> please describe the image ASSISTANT:'."
),
)
parser.add_argument(
"--num-frames-to-sample",
type=int,
default=8,
help="Number of video frames to sample for multimodal video encoding",
)
parser.add_argument(
"--vllm-native-encoder-worker",
action="store_true",
Expand Down Expand Up @@ -326,14 +345,17 @@ def parse_args() -> Config:
int(bool(args.multimodal_processor))
+ int(bool(args.ec_processor))
+ int(bool(args.multimodal_encode_worker))
+ int(bool(args.multimodal_audio_encode_worker))
+ int(bool(args.multimodal_video_encode_worker))
+ int(bool(args.multimodal_worker))
+ int(bool(args.multimodal_decode_worker))
+ int(bool(args.multimodal_encode_prefill_worker))
+ int(bool(args.vllm_native_encoder_worker))
)
if mm_flags > 1:
raise ValueError(
"Use only one of --multimodal-processor, --ec-processor, --multimodal-encode-worker, --multimodal-worker, "
"Use only one of --multimodal-processor, --ec-processor, --multimodal-encode-worker, "
"--multimodal-audio-encode-worker, --multimodal-video-encode-worker, --multimodal-worker, "
"--multimodal-decode-worker, --multimodal-encode-prefill-worker, or --vllm-native-encoder-worker"
)

Expand All @@ -358,6 +380,8 @@ def parse_args() -> Config:
elif (
args.vllm_native_encoder_worker
or args.multimodal_encode_worker
or args.multimodal_audio_encode_worker
or args.multimodal_video_encode_worker
or args.multimodal_encode_prefill_worker
):
config.component = "encoder"
Expand Down Expand Up @@ -389,11 +413,14 @@ def parse_args() -> Config:
config.multimodal_processor = args.multimodal_processor
config.ec_processor = args.ec_processor
config.multimodal_encode_worker = args.multimodal_encode_worker
config.multimodal_audio_encode_worker = args.multimodal_audio_encode_worker
config.multimodal_video_encode_worker = args.multimodal_video_encode_worker
config.multimodal_worker = args.multimodal_worker
config.multimodal_decode_worker = args.multimodal_decode_worker
config.multimodal_encode_prefill_worker = args.multimodal_encode_prefill_worker
config.enable_multimodal = args.enable_multimodal
config.mm_prompt_template = args.mm_prompt_template
config.num_frames_to_sample = args.num_frames_to_sample
config.vllm_native_encoder_worker = args.vllm_native_encoder_worker
config.ec_connector_backend = args.ec_connector_backend
config.ec_storage_path = args.ec_storage_path
Expand Down
83 changes: 83 additions & 0 deletions components/src/dynamo/vllm/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,13 @@
from dynamo.runtime import DistributedRuntime
from dynamo.runtime.logging import configure_dynamo_logging
from dynamo.vllm.multimodal_handlers import (
AudioEncodeWorkerHandler,
ECProcessorHandler,
EncodeWorkerHandler,
MultimodalDecodeWorkerHandler,
MultimodalPDWorkerHandler,
PreprocessedHandler,
VideoEncodeWorkerHandler,
VLLMEncodeWorkerHandler,
)
from dynamo.vllm.multimodal_utils.encode_utils import create_ec_transfer_config
Expand Down Expand Up @@ -125,6 +127,12 @@ def signal_handler():
elif config.multimodal_encode_worker:
await init_multimodal_encode_worker(runtime, config)
logger.debug("init_multimodal_encode_worker completed")
elif config.multimodal_audio_encode_worker:
await init_multimodal_audio_encode_worker(runtime, config)
logger.debug("init_multimodal_audio_encode_worker completed")
elif config.multimodal_video_encode_worker:
await init_multimodal_video_encode_worker(runtime, config)
logger.debug("init_multimodal_video_encode_worker completed")
elif (
config.multimodal_worker
or config.multimodal_decode_worker
Expand Down Expand Up @@ -783,6 +791,81 @@ async def init_multimodal_encode_worker(runtime: DistributedRuntime, config: Con
handler.cleanup()


async def init_multimodal_audio_encode_worker(
runtime: DistributedRuntime, config: Config
):
"""Initialize multimodal audio encode worker component"""
component = runtime.namespace(config.namespace).component(config.component)

generate_endpoint = component.endpoint(config.endpoint)

pd_worker_client = (
await runtime.namespace(config.namespace)
.component("backend")
.endpoint("generate")
.client()
)

handler = AudioEncodeWorkerHandler(
config.engine_args,
pd_worker_client,
)
await handler.async_init(runtime)
logger.info("Waiting for PD Worker Instances ...")
await pd_worker_client.wait_for_instances()
logger.info("Starting to serve the audio encode worker endpoint...")

try:
await asyncio.gather(
generate_endpoint.serve_endpoint(
handler.generate, metrics_labels=[("model", config.model)]
),
)
except Exception as e:
logger.error(f"Failed to serve audio encode worker endpoint: {e}")
raise
finally:
handler.cleanup()


async def init_multimodal_video_encode_worker(
runtime: DistributedRuntime, config: Config
):
"""Initialize multimodal video encode worker component"""
component = runtime.namespace(config.namespace).component(config.component)

generate_endpoint = component.endpoint(config.endpoint)

pd_worker_client = (
await runtime.namespace(config.namespace)
.component("backend")
.endpoint("generate")
.client()
)

handler = VideoEncodeWorkerHandler(
config.engine_args,
pd_worker_client,
num_frames_to_sample=config.num_frames_to_sample,
)
await handler.async_init(runtime)
logger.info("Waiting for PD Worker Instances ...")
await pd_worker_client.wait_for_instances()
logger.info("Starting to serve the video encode worker endpoint...")

try:
await asyncio.gather(
generate_endpoint.serve_endpoint(
handler.generate, metrics_labels=[("model", config.model)]
),
)
except Exception as e:
logger.error(f"Failed to serve video encode worker endpoint: {e}")
raise
finally:
handler.cleanup()


async def init_vllm_native_encoder(runtime: DistributedRuntime, config: Config):
"""
Initialize vLLM-native encoder worker component (ECConnector mode).
Expand Down
4 changes: 4 additions & 0 deletions components/src/dynamo/vllm/multimodal_handlers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
# SPDX-License-Identifier: Apache-2.0

from dynamo.vllm.multimodal_handlers.encode_worker_handler import (
AudioEncodeWorkerHandler,
EncodeWorkerHandler,
VideoEncodeWorkerHandler,
VLLMEncodeWorkerHandler,
)
from dynamo.vllm.multimodal_handlers.preprocessed_handler import (
Expand All @@ -16,6 +18,8 @@

__all__ = [
"EncodeWorkerHandler",
"AudioEncodeWorkerHandler",
"VideoEncodeWorkerHandler",
"VLLMEncodeWorkerHandler",
"PreprocessedHandler",
"ECProcessorHandler",
Expand Down
Loading
Loading