llm-d · kalantar · Jun 12, 2025 · Jun 12, 2025 · Jun 13, 2025 · Jun 13, 2025
diff --git a/helm/.helmignore b/helm/.helmignore
@@ -0,0 +1,28 @@
+# Patterns to ignore when building packages.
+# This supports shell glob matching, relative path matching, and
+# negation (prefixed with !). Only one pattern per line.
+.DS_Store
+# Common VCS dirs
+.git/
+.gitignore
+.bzr/
+.bzrignore
+.hg/
+.hgignore
+.svn/
+# Common backup files
+*.swp
+*.bak
+*.tmp
+*.orig
+*~
+# Various IDEs
+.project
+.idea/
+*.tmproj
+.vscode/
+# READMEs and other Markdown documentation
+*.md
+# Rendered templates
+values-*.yaml
+output-*.yaml
diff --git a/helm/Chart.yaml b/helm/Chart.yaml
@@ -0,0 +1,24 @@
+apiVersion: v2
+name: llm-d-modelservice
+description: A Helm chart for ModelService
+
+# A chart can be either an 'application' or a 'library' chart.
+#
+# Application charts are a collection of templates that can be packaged into versioned archives
+# to be deployed.
+#
+# Library charts provide useful utilities or functions for the chart developer. They're included as
+# a dependency of application charts to inject those utilities and functions into the rendering
+# pipeline. Library charts do not define any templates and therefore cannot be deployed.
+type: application
+
+# This is the chart version. This version number should be incremented each time you make changes
+# to the chart and its templates, including the app version.
+# Versions are expected to follow Semantic Versioning (https://semver.org/)
+version: 0.0.1
+
+# This is the version number of the application being deployed. This version number should be
+# incremented each time you make changes to the application. Versions are not expected to
+# follow Semantic Versioning. They should reflect the version the application is using.
+# Keep the value quoted so that YAML always parses it as a string.
+appVersion: "0.0.1"
diff --git a/helm/templates/_helpers.tpl b/helm/templates/_helpers.tpl
@@ -0,0 +1,203 @@
+{{/*
+Short name of the chart.
+Uses .Values.nameOverride when it is set and .Chart.Name otherwise; the result is
+cut to 63 characters and a trailing "-" is removed.
+*/}}
+{{- define "llm-d-modelservice.name" -}}
+{{- $base := default .Chart.Name .Values.nameOverride -}}
+{{- $base | trunc 63 | trimSuffix "-" }}
+{{- end }}
+
+{{/*
+Fully qualified app name.
+Resolution order:
+  1. .Values.fullnameOverride, when set;
+  2. .Release.Name alone, when it already contains the chart name (or .Values.nameOverride);
+  3. "<release name>-<chart name>" otherwise.
+Whatever is chosen is cut to 63 characters (some Kubernetes name fields are limited to this
+by the DNS naming spec) and a trailing "-" is removed.
+*/}}
+{{- define "llm-d-modelservice.fullname" -}}
+{{- $full := .Values.fullnameOverride -}}
+{{- if not $full -}}
+{{- $chartName := default .Chart.Name .Values.nameOverride -}}
+{{- $full = .Release.Name -}}
+{{- if not (contains $chartName .Release.Name) -}}
+{{- $full = printf "%s-%s" .Release.Name $chartName -}}
+{{- end -}}
+{{- end -}}
+{{- $full | trunc 63 | trimSuffix "-" }}
+{{- end }}
+
+{{/*
+Value for the helm.sh/chart label: "<chart name>-<chart version>".
+Every "+" is replaced by "_", then the result is cut to 63 characters and a
+trailing "-" is removed.
+*/}}
+{{- define "llm-d-modelservice.chart" -}}
+{{- $nameAndVersion := printf "%s-%s" .Chart.Name .Chart.Version -}}
+{{- $nameAndVersion | replace "+" "_" | trunc 63 | trimSuffix "-" }}
+{{- end }}
+
+{{/*
+Common labels.
+Emits helm.sh/chart and app.kubernetes.io/managed-by, plus app.kubernetes.io/version
+(quoted) when the chart declares an appVersion.
+*/}}
+{{- define "llm-d-modelservice.labels" -}}
+helm.sh/chart: {{ include "llm-d-modelservice.chart" . }}
+{{- with .Chart.AppVersion }}
+app.kubernetes.io/version: {{ quote . }}
+{{- end }}
+app.kubernetes.io/managed-by: {{ .Release.Service }}
+{{- end }}
+
+{{/*
+Sanitized model name.
+Built from .Release.Name in this order:
+  1. lower-case and trim surrounding whitespace;
+  2. replace every character outside [a-z0-9_.-] with "-";
+  3. strip leading and trailing runs of "-", "." and "_";
+  4. replace each remaining "." with "-";
+  5. keep at most the first 63 characters.
+*/}}
+{{- define "llm-d-modelservice.sanitizedModelName" -}}
+{{- $clean := trim (lower .Release.Name) -}}
+{{- $clean = regexReplaceAll "[^a-z0-9_.-]" $clean "-" -}}
+{{- $clean = regexReplaceAll "^[\\-._]+" $clean "" -}}
+{{- $clean = regexReplaceAll "[\\-._]+$" $clean "" -}}
+{{- $clean = regexReplaceAll "\\." $clean "-" -}}
+{{- trunc 63 $clean -}}
+{{- end }}
+
+{{/*
+Labels shared by prefill and decode pods: the inferenceServing marker and the
+sanitized model name.
+*/}}
+{{- define "llm-d-modelservice.pdlabels" -}}
+{{- $model := include "llm-d-modelservice.sanitizedModelName" . -}}
+llm-d.ai/inferenceServing: "true"
+llm-d.ai/model: {{ $model }}
+{{- end }}
+
+{{/* Labels for prefill pods: the common P/D labels plus llm-d.ai/role: prefill */}}
+{{- define "llm-d-modelservice.prefilllabels" -}}
+{{- $shared := include "llm-d-modelservice.pdlabels" . -}}
+{{ $shared }}
+llm-d.ai/role: prefill
+{{- end }}
+
+{{/* Labels for decode pods: the common P/D labels plus llm-d.ai/role: decode */}}
+{{- define "llm-d-modelservice.decodelabels" -}}
+{{- $shared := include "llm-d-modelservice.pdlabels" . -}}
+{{ $shared }}
+llm-d.ai/role: decode
+{{- end }}
+
+{{/*
+Node affinity derived from an acceleratorTypes value.
+Expects a context with:
+  labelKey    - node label key to match
+  labelValues - list of accepted values; the "values:" key is omitted when this is empty
+Renders a required node affinity with a single "In" match expression.
+*/}}
+{{- define "llm-d-modelservice.acceleratorTypes" -}}
+affinity:
+  nodeAffinity:
+    requiredDuringSchedulingIgnoredDuringExecution:
+      nodeSelectorTerms:
+        - matchExpressions:
+          - key: {{ .labelKey }}
+            operator: In
+            {{- if .labelValues }}
+            values:
+            {{- toYaml .labelValues | nindent 14 }}
+            {{- end }}
+{{- end }}
+
+{{/*
+Routing proxy -- sidecar for decode pods.
+Rendered as an init container with restartPolicy: Always.
+Expects a context with:
+  servicePort      - port the proxy listens on (default 8080)
+  proxy.targetPort - port passed as --vllm-port (default 8200)
+  proxy.debugLevel - verbosity passed as -v (default 5)
+  image            - proxy container image
+"protocol" belongs to the port entry and the two security settings belong under
+"securityContext", so they are indented as children of those keys.
+*/}}
+{{- define "llm-d-modelservice.routingProxy" -}}
+initContainers:
+  - name: routing-proxy
+    args:
+      - --port={{ default 8080 .servicePort }}
+      - --vllm-port={{ default 8200 .proxy.targetPort }}
+      - --connector=nixlv2
+      - -v={{ default 5 .proxy.debugLevel }}
+    image: {{ .image }}
+    imagePullPolicy: Always
+    ports:
+      - containerPort: {{ default 8080 .servicePort }}
+        protocol: TCP
+    resources: {}
+    restartPolicy: Always
+    securityContext:
+      allowPrivilegeEscalation: false
+      runAsNonRoot: true
+{{- end }}
+
+{{/*
+Parallelism settings with defaults applied.
+Takes the parallelism value (may be empty) as context and renders a YAML mapping with
+the keys "tensor" and "data". Each defaults to 1 and is replaced by the caller's value
+only when that value is set and non-empty. The output begins with a newline (nindent 0).
+*/}}
+{{- define "llm-d-modelservice.parallelism" -}}
+{{- $result := dict "tensor" 1 "data" 1 -}}
+{{- if and . .tensor -}}
+{{- $_ := set $result "tensor" .tensor -}}
+{{- end -}}
+{{- if and . .data -}}
+{{- $_ := set $result "data" .data -}}
+{{- end -}}
+{{- toYaml $result | nindent 0 }}
+{{- end }}
+
+{{/*
+Container resources with the GPU count taken from the tensor parallelism.
+Expects a context with:
+  resources   - optional mapping with "limits" and/or "requests"
+  parallelism - mapping whose "tensor" value becomes the nvidia.com/gpu quantity
+Any nvidia.com/gpu entry supplied by the caller is dropped; both sections always end
+with nvidia.com/gpu set to .parallelism.tensor.
+*/}}
+{{- define "llm-d-modelservice.resources" -}}
+resources:
+{{- range $section := list "limits" "requests" }}
+  {{ $section }}:
+    {{- $extra := dict -}}
+    {{- if and $.resources (index $.resources $section) -}}
+    {{- $extra = omit (index $.resources $section) "nvidia.com/gpu" -}}
+    {{- end -}}
+    {{- if $extra }}
+    {{- toYaml $extra | nindent 4 }}
+    {{- end }}
+    nvidia.com/gpu: {{ $.parallelism.tensor }}
+{{- end }}
+{{- end }}
+
+{{/* Service account name for P/D pods: "<sanitized model name>-sa" */}}
+{{- define "llm-d-modelservice.pdServiceAccountName" -}}
+{{- printf "%s-sa" (include "llm-d-modelservice.sanitizedModelName" .) }}
+{{- end }}
+
+{{/* Service account name for the EPP: "<sanitized model name>-epp-sa" */}}
+{{- define "llm-d-modelservice.eppServiceAccountName" -}}
+{{- printf "%s-epp-sa" (include "llm-d-modelservice.sanitizedModelName" .) }}
+{{- end }}
+
+{{/*
+EPP selector labels.
+Combines the chart name, the release name and an llm-d.ai/epp label whose value is
+"<fullname>-epp".
+*/}}
+{{- define "llm-d-modelservice.eppSelectorLabels" -}}
+{{- $chartName := include "llm-d-modelservice.name" . -}}
+{{- $eppName := printf "%s-epp" (include "llm-d-modelservice.fullname" .) -}}
+app.kubernetes.io/name: {{ $chartName }}
+app.kubernetes.io/instance: {{ .Release.Name }}
+llm-d.ai/epp: {{ $eppName }}
+{{- end }}
+
+{{/*
+Volumes for PD containers based on model artifact prefix.
+Renders a single list entry named "model-storage" whose source depends on
+.Values.modelArtifacts.prefix:
+  hf  - emptyDir with sizeLimit .Values.modelArtifacts.size (default "0")
+  pvc - read-only persistentVolumeClaim named by .Values.modelArtifacts.artifact
+  oci - image volume referencing .Values.modelArtifacts.artifact, with pullPolicy
+        .Values.modelArtifacts.imagePullPolicy (default "Always")
+Any other prefix renders nothing. Every branch trims the same way, so the output
+never starts with a blank line.
+*/}}
+{{- define "llm-d-modelservice.mountModelVolumeVolumes" -}}
+{{- if eq .Values.modelArtifacts.prefix "hf" -}}
+- name: model-storage
+  emptyDir:
+    sizeLimit: {{ default "0" .Values.modelArtifacts.size }}
+{{- else if eq .Values.modelArtifacts.prefix "pvc" -}}
+- name: model-storage
+  persistentVolumeClaim:
+    claimName: {{ .Values.modelArtifacts.artifact }}
+    readOnly: true
+{{- else if eq .Values.modelArtifacts.prefix "oci" -}}
+- name: model-storage
+  image:
+    reference: {{ .Values.modelArtifacts.artifact }}
+    pullPolicy: {{ default "Always" .Values.modelArtifacts.imagePullPolicy }}
+{{- end }}
+{{- end }}
+
+{{/*
+VolumeMount for a PD container.
+Takes one container entry as context and renders its "volumeMounts:" section:
+  - the container's own .volumeMounts, if any;
+  - a model-storage mount at /model-cache when .mountModelVolume is true.
+Renders nothing when neither is set.
+The output is relative to column 0 and list items are indented by 2, so the caller
+supplies the final indentation (e.g. "| nindent 8"). User-supplied mounts and the
+model-storage mount therefore end up at the same indentation level.
+*/}}
+{{- define "llm-d-modelservice.mountModelVolumeVolumeMounts" -}}
+{{- if or .volumeMounts .mountModelVolume -}}
+volumeMounts:
+{{- end }}
+{{- /* user supplied volume mount in values */}}
+{{- with .volumeMounts }}
+  {{- toYaml . | nindent 2 }}
+{{- end }}
+{{- /* what we add if mountModelVolume is true */}}
+{{- if .mountModelVolume }}
+  - name: model-storage
+    mountPath: /model-cache
+{{- end }}
+{{- end }}
diff --git a/helm/templates/decode-deployment.yaml b/helm/templates/decode-deployment.yaml
@@ -0,0 +1,106 @@
+{{- /*
+Deployment for the decode pods.
+Rendered only when .Values.decode is set. One container is emitted per entry of
+.Values.decode.containers. A container with mountModelVolume: true also gets the
+HF_HOME env var, the HF_TOKEN env var (when .Values.modelArtifacts.authSecretName is
+set) and the model-storage volume mount.
+Any nvidia.com/gpu entry in a container's resources is dropped; the lines that would
+re-add it from the tensor parallelism are currently commented out.
+*/ -}}
+{{- if .Values.decode }}
+{{- /* resolved inside the guard so that an unset .Values.decode is never dereferenced */}}
+{{- $parallelism := (include "llm-d-modelservice.parallelism" .Values.decode.parallelism) | fromYaml -}}
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: {{ include "llm-d-modelservice.fullname" . }}-decode
+  labels:
+    {{- include "llm-d-modelservice.labels" . | nindent 4 }}
+spec:
+  replicas: {{ default 1 .Values.decode.replicas }}
+  selector:
+    matchLabels:
+      {{- include "llm-d-modelservice.decodelabels" . | nindent 6 }}
+  template:
+    metadata:
+      labels:
+        {{- include "llm-d-modelservice.decodelabels" . | nindent 8 }}
+    spec:
+      {{- with .Values.decode.imagePullSecrets }}
+      imagePullSecrets:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      serviceAccountName: {{ include "llm-d-modelservice.pdServiceAccountName" . }}
+      {{- with .Values.podSecurityContext }}
+      securityContext:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.decode.acceleratorTypes }}
+      {{- include "llm-d-modelservice.acceleratorTypes" . | nindent 6 }}
+      {{- end }}
+      {{- /* initContainers */}}
+      {{- with .Values.decode.initContainers }}
+      initContainers:
+        {{- toYaml . | nindent 6 }}
+      {{- end }}
+      {{- /* range $.Values.decode.containers */}}
+      {{- with .Values.decode.containers }}
+      containers:
+      {{- range . }}
+      - name: {{ default "vllm" .name }}
+        image: {{ required "image of container is required" .image }}
+        {{- with .securityContext }}
+        securityContext:
+          {{- toYaml . | nindent 10 }}
+        {{- end }}
+        {{- with .imagePullPolicy }}
+        imagePullPolicy: {{ . }}
+        {{- end }}
+        {{- with .command }}
+        command:
+          {{- toYaml . | nindent 10 }}
+        {{- end }}
+        {{- with .args }}
+        args:
+          {{- toYaml . | nindent 10 }}
+        {{- end }}
+        {{- /* insert user's env for this container */}}
+        {{- if or .env .mountModelVolume }}
+        env:
+        {{- end }}
+        {{- with .env }}
+          {{- toYaml . | nindent 10 }}
+        {{- end }}
+        {{- /* insert envs based on what modelArtifact prefix */}}
+        {{- if .mountModelVolume }}
+          - name: HF_HOME
+            value: /model-cache
+          {{- with $.Values.modelArtifacts.authSecretName }}
+          - name: HF_TOKEN
+            valueFrom:
+              secretKeyRef:
+                name: {{ . }}
+                key: HF_TOKEN
+          {{- end }}
+        {{- end }}
+        {{- with .livenessProbe }}
+        livenessProbe:
+          {{- toYaml . | nindent 10 }}
+        {{- end }}
+        {{- with .readinessProbe }}
+        readinessProbe:
+          {{- toYaml . | nindent 10 }}
+        {{- end }}
+        {{- with .resources }}
+        resources:
+          limits:
+            {{- if .limits -}}
+            {{- omit .limits "nvidia.com/gpu"  | toYaml | nindent 12 }}
+            {{- end }}
+            {{- /* nvidia.com/gpu: "{{ $parallelism.tensor }}" */}}
+          requests:
+            {{- /* guard on .requests: requests are rendered independently of limits */}}
+            {{- if .requests -}}
+            {{- omit .requests "nvidia.com/gpu" | toYaml | nindent 12 }}
+            {{- end }}
+            {{- /* nvidia.com/gpu: "{{ $parallelism.tensor }}" */}}
+        {{- end }}
+        {{- /* volumeMount */}}
+        {{- include "llm-d-modelservice.mountModelVolumeVolumeMounts" . | nindent 8 }}
+        {{- end }}
+      {{- end }}
+      volumes:
+      {{- with .Values.decode.volumes }}
+        {{- toYaml . | nindent 8 }}
+      {{- end -}}
+      {{- include "llm-d-modelservice.mountModelVolumeVolumes" . | nindent 8 }}
+{{- end }}