Skip to content

Commit 43b43d2

Browse files
refactor DRA support for multiple templates
Signed-off-by: greg pereira <grpereir@redhat.com>
1 parent 9361821 commit 43b43d2

5 files changed

Lines changed: 178 additions & 4 deletions

File tree

charts/llm-d-modelservice/templates/_helpers-dra.tpl

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,14 +112,52 @@ false
112112
{{- $count -}}
113113
{{- end }}
114114

115-
{{/* Generate resourceClaims Variable (merges accelerator + user-defined claims) */}}
115+
{{/*
Get claim name for an additional resource claim template.

Expects a dict context with:
  templateKey - key identifying the additional claim template entry
  role        - optional role; treated as "" when unset
  config      - the entry's config map; config.name, when set, is the base name

The base name falls back to "<templateKey>-claim-template" when config.name is
empty. A trailing "-claim-template" is stripped from the base name, and the
result is "<stem>-<role>-claim" when role is non-empty, otherwise "<stem>-claim".
*/}}
116+
{{- define "llm-d-modelservice.additionalClaimName" -}}
117+
{{- $templateKey := .templateKey -}}
118+
{{- $role := .role | default "" -}}
119+
{{- $config := .config -}}
120+
{{- $baseName := $config.name | default (printf "%s-claim-template" $templateKey) -}}
121+
{{- if $role -}}
122+
{{- printf "%s-%s-claim" (trimSuffix "-claim-template" $baseName) $role -}}
123+
{{- else -}}
124+
{{- printf "%s-claim" (trimSuffix "-claim-template" $baseName) -}}
125+
{{- end -}}
126+
{{- end }}
127+
128+
{{/*
Get claim template name for an additional resource claim template.

Expects a dict context with:
  templateKey - key identifying the additional claim template entry
  role        - optional role; treated as "" when unset
  config      - the entry's config map; config.name, when set, is the base name

The base name falls back to "<templateKey>-claim-template" when config.name is
empty. The result is "<baseName>-<role>" when role is non-empty, otherwise the
base name unchanged.
*/}}
129+
{{- define "llm-d-modelservice.additionalClaimTemplateName" -}}
130+
{{- $templateKey := .templateKey -}}
131+
{{- $role := .role | default "" -}}
132+
{{- $config := .config -}}
133+
{{- $baseName := $config.name | default (printf "%s-claim-template" $templateKey) -}}
134+
{{- if $role -}}
135+
{{- printf "%s-%s" $baseName $role -}}
136+
{{- else -}}
137+
{{- $baseName -}}
138+
{{- end -}}
139+
{{- end }}
140+
141+
{{/* Generate resourceClaims Variable (merges accelerator + additional + user-defined claims) */}}
116142
{{- define "llm-d-modelservice.resourceClaimsBase" -}}
117143
{{- $claims := list -}}
118144
{{- $draEnabled := eq (include "llm-d-modelservice.draEnabled" .) "true" -}}
119145
{{- if $draEnabled -}}
120146
{{- $claimName := include "llm-d-modelservice.acceleratorClaimName" . -}}
121147
{{- $templateName := include "llm-d-modelservice.acceleratorClaimTemplateName" . -}}
122148
{{- $claims = append $claims (dict "name" $claimName "resourceClaimTemplateName" $templateName) -}}
149+
150+
{{- /* Add claims for additional resource claim templates (e.g., RDMA) */}}
151+
{{- $additionalTemplates := .Values.accelerator.additionalResourceClaimTemplates | default list -}}
152+
{{- range $templateKey := $additionalTemplates -}}
153+
{{- if hasKey $.Values.accelerator.resourceClaimTemplates $templateKey -}}
154+
{{- $config := index $.Values.accelerator.resourceClaimTemplates $templateKey -}}
155+
{{- $ctx := dict "templateKey" $templateKey "role" $.role "config" $config -}}
156+
{{- $addClaimName := include "llm-d-modelservice.additionalClaimName" $ctx -}}
157+
{{- $addTemplateName := include "llm-d-modelservice.additionalClaimTemplateName" $ctx -}}
158+
{{- $claims = append $claims (dict "name" $addClaimName "resourceClaimTemplateName" $addTemplateName) -}}
159+
{{- end -}}
160+
{{- end -}}
123161
{{- end -}}
124162
{{- if .pdSpec.resourceClaims -}}
125163
{{- $claims = concat $claims .pdSpec.resourceClaims -}}

charts/llm-d-modelservice/templates/resource-claim-template.yaml

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,42 @@ spec:
6565
{{- toYaml $selectors | nindent 10 }}
6666
{{- end }}
6767
{{- end -}}
68+
69+
{{- /* Generate ResourceClaimTemplates for additional resources (e.g., RDMA) */}}
70+
{{- $additionalTemplates := $.Values.accelerator.additionalResourceClaimTemplates | default list -}}
71+
{{- range $templateKey := $additionalTemplates -}}
72+
{{- if hasKey $.Values.accelerator.resourceClaimTemplates $templateKey -}}
73+
{{- $addConfig := index $.Values.accelerator.resourceClaimTemplates $templateKey -}}
74+
{{- $addCtx := dict "templateKey" $templateKey "role" $role.name "config" $addConfig -}}
75+
{{- $addTemplateName := include "llm-d-modelservice.additionalClaimTemplateName" $addCtx -}}
76+
77+
{{- $addCount := $addConfig.count | default 1 -}}
78+
{{- $addClass := $addConfig.class | default $templateKey -}}
79+
{{- $addMatch := $addConfig.match | default "exactly" -}}
80+
{{- $addSelectors := $addConfig.selectors | default list -}}
81+
{{- $addRequestName := $addConfig.requestName | default $templateKey }}
82+
---
83+
apiVersion: resource.k8s.io/v1
84+
kind: ResourceClaimTemplate
85+
metadata:
86+
name: {{ $addTemplateName }}
87+
labels:
88+
{{- include "llm-d-modelservice.labels" $ | nindent 4 }}
89+
llm-d.ai/role: {{ $role.name }}
90+
spec:
91+
spec:
92+
devices:
93+
requests:
94+
- name: {{ $addRequestName }}
95+
{{ $addMatch }}:
96+
deviceClassName: {{ $addClass }}
97+
count: {{ $addCount }}
98+
{{- if $addSelectors }}
99+
selectors:
100+
{{- toYaml $addSelectors | nindent 10 }}
101+
{{- end }}
102+
{{- end -}}
103+
{{- end -}}
68104
{{- end -}}
69105
{{- end -}}
70106
{{- end }}

charts/llm-d-modelservice/values.schema.json

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,14 @@
66
"additionalProperties": false,
77
"description": " Supported types: nvidia, intel-i915, intel-xe, intel-gaudi, amd, google, cpu",
88
"properties": {
9+
"additionalResourceClaimTemplates": {
10+
"description": " Each entry references a key in accelerator.resourceClaimTemplates below. This allows composing multiple device types (e.g., GPU + RDMA) for a single workload. Example: additionalResourceClaimTemplates: - rdma",
11+
"items": {
12+
"required": []
13+
},
14+
"required": [],
15+
"title": "additionalResourceClaimTemplates"
16+
},
917
"dra": {
1018
"default": false,
1119
"description": " type: boolean @schema Enable Dynamic Resource Allocation (DRA) for accelerators When true, uses resourceClaimTemplates instead of device plugin resources",
@@ -109,7 +117,7 @@
109117
},
110118
"resourceClaimTemplates": {
111119
"additionalProperties": false,
112-
"description": " Each accelerator type can have its own claim template configuration",
120+
"description": " Each accelerator type can have its own claim template configuration. Custom entries can be added here and referenced via additionalResourceClaimTemplates.",
113121
"properties": {
114122
"amd": {
115123
"additionalProperties": false,
@@ -321,6 +329,41 @@
321329
"required": [],
322330
"title": "nvidia",
323331
"type": "object"
332+
},
333+
"rdma": {
334+
"additionalProperties": false,
335+
"properties": {
336+
"class": {
337+
"default": "rdma-dranet",
338+
"title": "class",
339+
"type": "string"
340+
},
341+
"count": {
342+
"default": 1,
343+
"title": "count",
344+
"type": "integer"
345+
},
346+
"match": {
347+
"default": "exactly",
348+
"title": "match",
349+
"type": "string"
350+
},
351+
"name": {
352+
"default": "rdma-claim-template",
353+
"title": "name",
354+
"type": "string"
355+
},
356+
"selectors": {
357+
"items": {
358+
"required": []
359+
},
360+
"title": "selectors",
361+
"type": "array"
362+
}
363+
},
364+
"required": [],
365+
"title": "rdma",
366+
"type": "object"
324367
}
325368
},
326369
"required": [],

charts/llm-d-modelservice/values.schema.tmpl.json

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,14 @@
66
"additionalProperties": false,
77
"description": " Supported types: nvidia, intel-i915, intel-xe, intel-gaudi, amd, google, cpu",
88
"properties": {
9+
"additionalResourceClaimTemplates": {
10+
"description": " Each entry references a key in accelerator.resourceClaimTemplates below. This allows composing multiple device types (e.g., GPU + RDMA) for a single workload. Example: additionalResourceClaimTemplates: - rdma",
11+
"items": {
12+
"required": []
13+
},
14+
"required": [],
15+
"title": "additionalResourceClaimTemplates"
16+
},
917
"dra": {
1018
"default": false,
1119
"description": " type: boolean @schema Enable Dynamic Resource Allocation (DRA) for accelerators When true, uses resourceClaimTemplates instead of device plugin resources",
@@ -109,7 +117,7 @@
109117
},
110118
"resourceClaimTemplates": {
111119
"additionalProperties": false,
112-
"description": " Each accelerator type can have its own claim template configuration",
120+
"description": " Each accelerator type can have its own claim template configuration. Custom entries can be added here and referenced via additionalResourceClaimTemplates.",
113121
"properties": {
114122
"amd": {
115123
"additionalProperties": false,
@@ -321,6 +329,41 @@
321329
"required": [],
322330
"title": "nvidia",
323331
"type": "object"
332+
},
333+
"rdma": {
334+
"additionalProperties": false,
335+
"properties": {
336+
"class": {
337+
"default": "rdma-dranet",
338+
"title": "class",
339+
"type": "string"
340+
},
341+
"count": {
342+
"default": 1,
343+
"title": "count",
344+
"type": "integer"
345+
},
346+
"match": {
347+
"default": "exactly",
348+
"title": "match",
349+
"type": "string"
350+
},
351+
"name": {
352+
"default": "rdma-claim-template",
353+
"title": "name",
354+
"type": "string"
355+
},
356+
"selectors": {
357+
"items": {
358+
"required": []
359+
},
360+
"title": "selectors",
361+
"type": "array"
362+
}
363+
},
364+
"required": [],
365+
"title": "rdma",
366+
"type": "object"
324367
}
325368
},
326369
"required": [],

charts/llm-d-modelservice/values.yaml

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,8 +111,16 @@ accelerator:
111111
intel-xe:
112112
- name: VLLM_WORKER_MULTIPROC_METHOD
113113
value: "spawn"
114+
# Additional DRA resource claim templates to include alongside the primary accelerator.
115+
# Each entry references a key in accelerator.resourceClaimTemplates below.
116+
# This allows composing multiple device types (e.g., GPU + RDMA) for a single workload.
117+
# Example:
118+
# additionalResourceClaimTemplates:
119+
# - rdma
120+
additionalResourceClaimTemplates: []
114121
# ResourceClaimTemplate configurations for DRA (used when dra: true)
115-
# Each accelerator type can have its own claim template configuration
122+
# Each accelerator type can have its own claim template configuration.
123+
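# Entries defined here (e.g., rdma) can be referenced via additionalResourceClaimTemplates.
# Note: the chart schema sets additionalProperties: false on resourceClaimTemplates, so a new custom key must also be added to values.schema.json.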
116124
resourceClaimTemplates:
117125
nvidia:
118126
name: nvidia-claim-template
@@ -150,6 +158,12 @@ accelerator:
150158
class: tpu.google.com
151159
match: "exactly"
152160
selectors: []
161+
rdma:
162+
name: rdma-claim-template
163+
class: rdma-dranet
164+
match: "exactly"
165+
count: 1
166+
selectors: []
153167

154168
# @schema
155169
# additionalProperties: true

0 commit comments

Comments
 (0)