Skip to content

Commit 725a19e

Browse files
feat: add dynamo-platform and dynamo-crds for AI inference serving (#83)
Signed-off-by: Yuan Chen <yuanchen8911@gmail.com> Co-authored-by: Mark Chmarny <mchmarny@users.noreply.github.com>
1 parent c32b059 commit 725a19e

File tree

4 files changed

+139
-1
lines changed

4 files changed

+139
-1
lines changed
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# Dynamo CRDs Helm values
16+
# Installs Custom Resource Definitions for NVIDIA Dynamo inference platform.
17+
# CRDs: DynamoGraphDeployment, DynamoGraphDeploymentRequest,
18+
# DynamoComponentDeployment, DynamoModel, DynamoWorkerMetadata,
19+
# DynamoGraphDeploymentScalingAdapter
20+
21+
# This chart has no configurable values of its own — it only installs CRDs.
22+
# `enabled` is the one key set in this file; it is managed by the umbrella
# chart condition.
23+
enabled: true
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# Dynamo Platform Helm values
16+
# NVIDIA Dynamo inference serving platform: operator, etcd, NATS.
17+
# Provides OpenAI-compatible endpoints, KV-cache-aware routing,
18+
# disaggregated prefill/decode, and SLA-driven autoscaling.
19+
20+
# Override release name prefix to avoid eidos-stack- prefix in resource names
# (fullnameOverride below pins the name to "dynamo-operator").
21+
dynamo-operator:
22+
fullnameOverride: dynamo-operator
23+
controllerManager:
24+
# A toleration with only `operator: Exists` (no key, no effect) tolerates
# every taint, so tainted nodes do not exclude the controller manager.
# NOTE(review): confirm this blanket toleration is intended.
tolerations:
25+
- operator: Exists
26+
27+
# Use Kubernetes-native service discovery, so discovery itself has no external
# etcd dependency. etcd is still enabled further down in this file, where it is
# described as operator state storage.
28+
discoveryBackend: "kubernetes"
29+
30+
# Point Dynamo to the existing kube-prometheus-stack for metrics collection.
31+
# PodMonitor CRs are auto-created by the operator for metric discovery.
32+
dynamo:
33+
metrics:
34+
# NOTE(review): hard-codes service "kube-prometheus-prometheus" in namespace
# "monitoring" on port 9090 — confirm this matches the release name and
# namespace kube-prometheus-stack is actually installed with.
prometheusEndpoint: "http://kube-prometheus-prometheus.monitoring.svc.cluster.local:9090"
35+
36+
# Disable the bundled kai-scheduler sub-chart — kai-scheduler is managed as a
# separate eidos component (presumably to avoid installing it twice; confirm).
37+
kai-scheduler:
38+
enabled: false
39+
40+
# grove is disabled here; set `enabled: true` when multinode inference
# coordination is needed.
41+
grove:
42+
enabled: false
43+
44+
# etcd for operator state storage
# (fullnameOverride pins the resource name to "dynamo-etcd").
45+
etcd:
46+
enabled: true
47+
fullnameOverride: dynamo-etcd
48+
image:
49+
# NOTE(review): the image is pulled from the `bitnamilegacy` repository with
# a pinned tag — presumably an archive that no longer receives updates;
# confirm, and plan a move to a maintained image source.
repository: bitnamilegacy/etcd
50+
tag: 3.5.18-debian-12-r5
51+
# Persistent storage is enabled with a 1Gi volume request.
persistence:
52+
enabled: true
53+
size: 1Gi
54+
# Requests 100m CPU / 128Mi memory; limits 500m CPU / 512Mi memory.
resources:
55+
requests:
56+
cpu: 100m
57+
memory: 128Mi
58+
limits:
59+
cpu: 500m
60+
memory: 512Mi
61+
62+
# NATS message broker for component communication
# (fullnameOverride pins the resource name to "dynamo-nats").
63+
nats:
64+
enabled: true
65+
fullnameOverride: dynamo-nats
66+
# NATS clustering is turned off and JetStream is turned on.
config:
67+
cluster:
68+
enabled: false
69+
jetstream:
70+
enabled: true

pkg/recipe/data/overlays/inference.yaml

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,11 @@ metadata:
1919

2020
spec:
2121
# Inherits from base (implicit when spec.base is empty)
22-
# This recipe adds inference-serving components for AI workloads.
22+
# This recipe adds inference components for AI workloads.
2323
# Satisfies CNCF AI Conformance requirements:
2424
# - Gang scheduling (kai-scheduler)
25+
# - AI inference serving (dynamo-platform)
26+
# - AI service metrics (dynamo metrics + kube-prometheus-stack)
2527

2628
criteria:
2729
intent: inference
@@ -34,3 +36,19 @@ spec:
3436
valuesFile: components/kai-scheduler/values.yaml
3537
dependencyRefs:
3638
- gpu-operator
39+
40+
# Dynamo CRDs Helm component. It declares no dependencyRefs of its own; the
# dynamo-platform component below lists it as a dependency.
- name: dynamo-crds
41+
type: Helm
42+
source: https://helm.ngc.nvidia.com/nvidia/ai-dynamo
43+
# Quoted so the version stays a string; same version as dynamo-platform.
version: "0.8.1"
44+
valuesFile: components/dynamo-crds/values.yaml
45+
46+
# Dynamo platform Helm component, from the same repository and version as
# dynamo-crds.
- name: dynamo-platform
47+
type: Helm
48+
source: https://helm.ngc.nvidia.com/nvidia/ai-dynamo
49+
version: "0.8.1"
50+
valuesFile: components/dynamo-platform/values.yaml
51+
# dynamo-crds supplies the CRDs; kube-prometheus-stack is the metrics stack
# the platform values point their Prometheus endpoint at.
# NOTE(review): cert-manager is presumably needed for operator certificates —
# confirm.
dependencyRefs:
52+
- dynamo-crds
53+
- cert-manager
54+
- kube-prometheus-stack

pkg/recipe/data/registry.yaml

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -287,6 +287,33 @@ components:
287287
tolerationPaths:
288288
- global.tolerations
289289

290+
# Registry entry for the Dynamo CRDs chart. Unlike dynamo-platform, it declares
# no nodeScheduling paths.
- name: dynamo-crds
291+
displayName: dynamo-crds
292+
valueOverrideKeys:
293+
- dynamocrds
294+
helm:
295+
defaultRepository: https://helm.ngc.nvidia.com/nvidia/ai-dynamo
296+
defaultChart: dynamo-crds
297+
defaultVersion: "0.8.1"
298+
# Same default namespace as the dynamo-platform entry.
defaultNamespace: dynamo-system
299+
300+
# Registry entry for the Dynamo platform chart.
- name: dynamo-platform
301+
displayName: dynamo-platform
302+
# Two override keys are accepted: "dynamoplatform" and the shorter "dynamo".
valueOverrideKeys:
303+
- dynamoplatform
304+
- dynamo
305+
helm:
306+
defaultRepository: https://helm.ngc.nvidia.com/nvidia/ai-dynamo
307+
defaultChart: dynamo-platform
308+
defaultVersion: "0.8.1"
309+
defaultNamespace: dynamo-system
310+
# Both paths target the operator's controllerManager; the tolerations path is
# the same key the dynamo-platform values file sets to `operator: Exists`.
# NOTE(review): presumably an injected toleration replaces that default —
# confirm the merge behaviour.
nodeScheduling:
311+
system:
312+
nodeSelectorPaths:
313+
- dynamo-operator.controllerManager.nodeSelector
314+
tolerationPaths:
315+
- dynamo-operator.controllerManager.tolerations
316+
290317
- name: kubeflow-trainer
291318
displayName: kubeflow-trainer
292319
valueOverrideKeys:

0 commit comments

Comments
 (0)