Skip to content
This repository was archived by the owner on Jul 24, 2025. It is now read-only.

Commit 81c048c

Browse files
committed
Address inconsistencies
Signed-off-by: Jing Chen <[email protected]>
1 parent 3126aea commit 81c048c

File tree

6 files changed

+67
-69
lines changed

6 files changed

+67
-69
lines changed

helm/templates/_helpers.tpl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -123,12 +123,12 @@ initContainers:
123123

124124
{{/* P/D service account name */}}
125125
{{- define "llm-d-modelservice.pdServiceAccountName" -}}
126-
{{ include "llm-d-modelservice.sanitizedModelName" . }}-sa
126+
{{ include "llm-d-modelservice.fullname" . }}-sa
127127
{{- end }}
128128

129129
{{/* EPP service account name */}}
130130
{{- define "llm-d-modelservice.eppServiceAccountName" -}}
131-
{{ include "llm-d-modelservice.sanitizedModelName" . }}-epp-sa
131+
{{ include "llm-d-modelservice.fullname" . }}-epp-sa
132132
{{- end }}
133133

134134
{{/*

helm/templates/decode-deployment.yaml

Lines changed: 21 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,14 @@ spec:
2828
{{- with .Values.decode.acceleratorTypes }}
2929
{{- include "llm-d-modelservice.acceleratorTypes" . | nindent 6 }}
3030
{{- end }}
31-
{{- /* initContainers */}}
31+
{{- /* Sidecar init container */}}
32+
{{- with .Values.routing }}
33+
{{ (include "llm-d-modelservice.routingProxy" .) | nindent 6 }}
34+
{{- end }}
35+
{{- /* User's other init containers */}}
3236
{{- with .Values.decode.initContainers }}
33-
initContainers:
34-
{{- toYaml . | nindent 6 }}
35-
{{- end }}
37+
{{- toYaml . | nindent 8 }}
38+
{{- end }}
3639
{{- /* range $.Values.decode.containers */}}
3740
{{- with .Values.decode.containers }}
3841
containers:
@@ -81,19 +84,25 @@ spec:
8184
readinessProbe:
8285
{{- toYaml . | nindent 10 }}
8386
{{- end }}
84-
{{- with .resources }}
8587
resources:
8688
limits:
87-
{{- if .limits -}}
88-
{{- omit .limits "nvidia.com/gpu" | toYaml | nindent 12 }}
89+
{{- $limits := dict -}}
90+
{{- if and .resources .resources.limits -}}
91+
{{- $limits = omit .resources.limits "nvidia.com/gpu" }}
92+
{{- if gt (len $limits) 0 }}
93+
{{- toYaml $limits | nindent 12 }}
94+
{{- end }}
8995
{{- end }}
90-
{{- /* nvidia.com/gpu: "{{ $parallelism.tensor }}" */}}
96+
nvidia.com/gpu: {{ $parallelism.tensor }}
9197
requests:
92-
{{- if .limits -}}
93-
{{- omit .requests "nvidia.com/gpu" | toYaml | nindent 12 }}
98+
{{- $requests := dict -}}
99+
{{- if and .resources .resources.requests -}}
100+
{{- $requests = omit .resources.requests "nvidia.com/gpu" }}
94101
{{- end }}
95-
{{- /* nvidia.com/gpu: "{{ $parallelism.tensor }}" */}}
96-
{{- end }}
102+
{{- if gt (len $requests) 0 }}
103+
{{- toYaml $requests | nindent 12 }}
104+
{{- end }}
105+
nvidia.com/gpu: {{ $parallelism.tensor }}
97106
{{- /* volumeMount */}}
98107
{{- include "llm-d-modelservice.mountModelVolumeVolumeMounts" . | nindent 8 }}
99108
{{- end }}

helm/templates/examples/output-facebook.yaml

Lines changed: 25 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
apiVersion: v1
44
kind: ServiceAccount
55
metadata:
6-
name: facebook-epp-sa
6+
name: facebook-llm-d-modelservice-epp-sa
77
labels:
88
helm.sh/chart: llm-d-modelservice-0.0.1
99
app.kubernetes.io/version: "0.0.1"
@@ -14,7 +14,7 @@ automountServiceAccountToken: true
1414
apiVersion: v1
1515
kind: ServiceAccount
1616
metadata:
17-
name: facebook-sa
17+
name: facebook-llm-d-modelservice-sa
1818
labels:
1919
helm.sh/chart: llm-d-modelservice-0.0.1
2020
app.kubernetes.io/version: "0.0.1"
@@ -65,21 +65,23 @@ spec:
6565
llm-d.ai/model: facebook
6666
llm-d.ai/role: decode
6767
spec:
68-
serviceAccountName: facebook-sa
68+
serviceAccountName: facebook-llm-d-modelservice-sa
69+
6970
initContainers:
70-
- args:
71-
- --port=8000
72-
- --vllm-port=8200
73-
- --connector=nixlv2
74-
- -v=6
75-
image: ghcr.io/llm-d/llm-d-routing-sidecar:0.0.6
76-
imagePullPolicy: Always
77-
name: routing-proxy
78-
ports:
79-
- containerPort: 8000
71+
- name: routing-proxy
72+
args:
73+
- --port=8000
74+
- --vllm-port=8200
75+
- --connector=nixlv2
76+
- -v=5
77+
image:
78+
imagePullPolicy: Always
79+
ports:
80+
- containerPort: 8000
8081
protocol: TCP
81-
restartPolicy: Always
82-
securityContext:
82+
resources: {}
83+
restartPolicy: Always
84+
securityContext:
8385
allowPrivilegeEscalation: false
8486
runAsNonRoot: true
8587
containers:
@@ -113,10 +115,11 @@ spec:
113115
value: /model-cache
114116
resources:
115117
limits:
116-
{}
118+
nvidia.com/gpu: 1
117119
requests:
118120
cpu: "16"
119121
memory: 16Gi
122+
nvidia.com/gpu: 1
120123

121124
volumeMounts:
122125
- name: model-storage
@@ -149,7 +152,7 @@ spec:
149152
app.kubernetes.io/version: "0.0.1"
150153
app.kubernetes.io/managed-by: Helm
151154
spec:
152-
serviceAccountName: facebook-epp-sa
155+
serviceAccountName: facebook-llm-d-modelservice-epp-sa
153156
---
154157
# Source: llm-d-modelservice/templates/epp-deployment.yaml
155158
apiVersion: apps/v1
@@ -235,8 +238,8 @@ spec:
235238
- containerPort: 9090
236239
name: metrics
237240
protocol: TCP
238-
serviceAccount: facebook-epp-sa
239-
serviceAccountName: facebook-epp-sa
241+
serviceAccount: facebook-llm-d-modelservice-epp-sa
242+
serviceAccountName: facebook-llm-d-modelservice-epp-sa
240243
readinessProbe:
241244
grpc:
242245
port: 9003
@@ -279,7 +282,7 @@ spec:
279282
llm-d.ai/model: facebook
280283
llm-d.ai/role: prefill
281284
spec:
282-
serviceAccountName: facebook-sa
285+
serviceAccountName: facebook-llm-d-modelservice-sa
283286
containers:
284287
- name: vllm
285288
image: ghcr.io/llm-d/llm-d:0.0.8
@@ -307,10 +310,11 @@ spec:
307310
value: DEBUG
308311
resources:
309312
limits:
310-
{}
313+
nvidia.com/gpu: 1
311314
requests:
312315
cpu: "16"
313316
memory: 16Gi
317+
nvidia.com/gpu: 1
314318

315319
volumes:
316320
- name: model-storage

helm/templates/examples/values-facebook.yaml

Lines changed: 4 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,10 @@ httpRoute: true
88
routing:
99
# This is the model name for the OpenAI request
1010
modelName: facebook/opt-125m
11-
ports:
12-
servicePort: 8000 # Sidecar listens on this port for requests. If there's no sidecar, the request goes here
13-
internalPort: 8200 # Sidecar forwards request to vllm container on this port
14-
proxy:
15-
targetPort: 8000
11+
servicePort: 8000 # Sidecar listens on this port for requests. If there's no sidecar, the request goes here
12+
proxy:
13+
image: ghcr.io/llm-d/llm-d-routing-sidecar:0.0.6
14+
targetPort: 8200
1615
parentRefs:
1716
- group: gateway.networking.k8s.io
1817
kind: Istio
@@ -27,26 +26,6 @@ modelArtifacts:
2726
decode:
2827
enableService: false
2928
replicas: 1
30-
# parallelism:
31-
# tensor: 3
32-
# data: 2
33-
# dataLocal: 1
34-
initContainers:
35-
- name: routing-proxy
36-
image: ghcr.io/llm-d/llm-d-routing-sidecar:0.0.6
37-
imagePullPolicy: Always
38-
securityContext:
39-
allowPrivilegeEscalation: false
40-
runAsNonRoot: true
41-
args:
42-
- "--port=8000" # servicePort
43-
- "--vllm-port=8200" # internalPort
44-
- "--connector=nixlv2"
45-
- "-v=6"
46-
ports:
47-
- containerPort: 8000 # servicePort
48-
protocol: TCP
49-
restartPolicy: Always
5029
containers:
5130
- name: "vllm"
5231
image: "ghcr.io/llm-d/llm-d:0.0.8"

helm/templates/prefill-deployment.yaml

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -81,19 +81,25 @@ spec:
8181
readinessProbe:
8282
{{- toYaml . | nindent 10 }}
8383
{{- end }}
84-
{{- with .resources }}
8584
resources:
8685
limits:
87-
{{- if .limits -}}
88-
{{- omit .limits "nvidia.com/gpu" | toYaml | nindent 12 }}
86+
{{- $limits := dict -}}
87+
{{- if and .resources .resources.limits -}}
88+
{{- $limits = omit .resources.limits "nvidia.com/gpu" }}
89+
{{- if gt (len $limits) 0 }}
90+
{{- toYaml $limits | nindent 12 }}
8991
{{- end }}
90-
{{- /* nvidia.com/gpu: "{{ $parallelism.tensor }}" */}}
92+
{{- end }}
93+
nvidia.com/gpu: {{ $parallelism.tensor }}
9194
requests:
92-
{{- if .limits -}}
93-
{{- omit .requests "nvidia.com/gpu" | toYaml | nindent 12 }}
95+
{{- $requests := dict -}}
96+
{{- if and .resources .resources.requests -}}
97+
{{- $requests = omit .resources.requests "nvidia.com/gpu" }}
9498
{{- end }}
95-
{{- /* nvidia.com/gpu: "{{ $parallelism.tensor }}" */}}
96-
{{- end }}
99+
{{- if gt (len $requests) 0 }}
100+
{{- toYaml $requests | nindent 12 }}
101+
{{- end }}
102+
nvidia.com/gpu: {{ $parallelism.tensor }}
97103
{{- /* volumeMount */}}
98104
{{- include "llm-d-modelservice.mountModelVolumeVolumeMounts" . | nindent 8 }}
99105
{{- end }}

helm/templates/routing.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ spec:
5050
- group: inference.networking.x-k8s.io
5151
kind: InferencePool
5252
name: {{ include "llm-d-modelservice.fullname" . }}-inference-pool
53-
port: {{ .Values.routing.ports.servicePort }}
53+
port: {{ .Values.routing.servicePort }}
5454
weight: 1
5555
matches:
5656
- headers:

0 commit comments

Comments
 (0)