This repository was archived by the owner on Jul 24, 2025. It is now read-only.

Commit 7e94af4

Working example in kind

Signed-off-by: Jing Chen <[email protected]>

1 parent e1e09ca commit 7e94af4
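
For context, here is one way to exercise this example end to end in a kind cluster. This is a minimal sketch, not taken from the commit itself: the chart path (./helm), release name, and namespace are assumptions.

# Create a local kind cluster (no GPUs, so use the vLLM-simulator values).
kind create cluster --name llm-d-demo

# Install the chart with the simulator example from this commit.
helm install llm-d-sim ./helm \
  --namespace llm-d --create-namespace \
  -f helm/examples/values-vllm-sim.yaml

# Once the pods are Ready, forward the inference gateway locally so the
# curl request added at the bottom of values-vllm-sim.yaml can reach it.
# The service name and port are guesses based on the routing.gateway values.
kubectl -n llm-d port-forward svc/llm-d-inference-gateway 8000:8000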

File tree

5 files changed: +59 -35 lines changed


helm/examples/output-facebook.yaml

Lines changed: 10 additions & 0 deletions

@@ -111,10 +111,15 @@ spec:
           value: DEBUG
         - name: HF_HOME
           value: /model-cache
+
        resources:
          limits:
+            cpu: "16"
+            memory: 16Gi
            nvidia.com/gpu: "1"
          requests:
+            cpu: "16"
+            memory: 16Gi
            nvidia.com/gpu: "1"
        volumeMounts:
        - name: model-storage
@@ -280,10 +285,15 @@ spec:
              fieldPath: status.podIP
        - name: VLLM_LOGGING_LEVEL
          value: DEBUG
+
        resources:
          limits:
+            cpu: "16"
+            memory: 16Gi
            nvidia.com/gpu: "1"
          requests:
+            cpu: "16"
+            memory: 16Gi
            nvidia.com/gpu: "1"
      volumes:
      - name: model-storage
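
The change above pins explicit CPU and memory requests/limits alongside the GPU request. One quick way to verify they took effect after deploying is shown below; the namespace and label selector are hypothetical, not taken from this commit:

# Hypothetical namespace/selector; adjust to the actual release labels.
kubectl -n llm-d get pods -l app.kubernetes.io/name=llm-d-modelservice \
  -o jsonpath='{range .items[*]}{.metadata.name}{": "}{.spec.containers[0].resources}{"\n"}{end}'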

helm/examples/output-vllm-sim.yaml

Lines changed: 12 additions & 6 deletions

@@ -92,13 +92,16 @@ spec:
        env:
        - name: HF_HOME
          value: /model-cache
+
+        resources:
+          limits:
+            {}
+          requests:
+            {}
        volumeMounts:
        - name: model-storage
          mountPath: /model-cache
      volumes:
-      - name: model-storage
-        emptyDir:
-          sizeLimit: 5Mi
 ---
 # Source: llm-d-modelservice/templates/epp-deployment.yaml
 apiVersion: apps/v1
@@ -223,13 +226,16 @@ spec:
        env:
        - name: HF_HOME
          value: /model-cache
+
+        resources:
+          limits:
+            {}
+          requests:
+            {}
        volumeMounts:
        - name: model-storage
          mountPath: /model-cache
      volumes:
-      - name: model-storage
-        emptyDir:
-          sizeLimit: 5Mi
 ---
 # Source: llm-d-modelservice/templates/routing.yaml
 apiVersion: gateway.networking.k8s.io/v1
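
These output-*.yaml files look like captured chart renders. Assuming that is how they are maintained, they can be regenerated after a values change with helm template; the release name here is a guess:

helm template llm-d-modelservice ./helm \
  -f helm/examples/values-vllm-sim.yaml \
  > helm/examples/output-vllm-sim.yaml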

helm/examples/values-facebook.yaml

Lines changed: 0 additions & 1 deletion

@@ -19,7 +19,6 @@ routing:
 
 modelArtifacts:
   uri: "hf://facebook/opt-125m"
-  size: 5Mi
 
 # describe decode pods
 decode:

helm/examples/values-vllm-sim.yaml

Lines changed: 14 additions & 5 deletions

@@ -1,4 +1,4 @@
-# This values.yaml file creates the resources for facebook/opt-125m
+# This values.yaml file creates the resources for random
 
 multinode: false # If true, creates LWS instead of deployments
 inferencePool: true
@@ -7,7 +7,7 @@ httpRoute: true
 
 routing:
   # This is the model name for the OpenAI request
-  modelName: facebook/opt-125m
+  modelName: random
   servicePort: 8000 # Sidecar listens on this port for requests. If there's no sidecar, the request goes here
   proxy:
     image: ghcr.io/llm-d/llm-d-routing-sidecar:0.0.6
@@ -18,7 +18,7 @@ routing:
     name: llm-d-inference-gateway
 
 modelArtifacts:
-  uri: "ht://facebook/opt-125m"
+  uri: "hf://random"
   size: 5Mi
 
 # describe decode pods
@@ -29,7 +29,7 @@ decode:
   image: "ghcr.io/llm-d/llm-d-inference-sim:0.0.4"
   args:
     - "--model"
-    - "facebook/opt-125m"
+    - "random"
    - "--port"
    - "8200" # targetPort
  ports:
@@ -43,7 +43,7 @@ prefill:
   image: "ghcr.io/llm-d/llm-d-inference-sim:0.0.4"
   args:
     - "--model"
-    - "facebook/opt-125m"
+    - "random"
    - "--port"
    - "8000" # servicePort
  ports:
@@ -67,3 +67,12 @@ endpointPicker:
   autoscaling:
     enabled: false
   replicas: 1
+
+
+curl http://localhost:8000/v1/completions -vvv \
+  -H "Content-Type: application/json" \
+  -H "x-model-name: facebook/opt-125m" \
+  -d '{
+    "model": "facebook/opt-125m",
+    "prompt": "Hello, "
+  }'
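
The curl appended above assumes the gateway is already reachable on localhost:8000, e.g. via the port-forward sketched earlier. Assuming the simulator returns an OpenAI-compatible completions payload, the generated text can be pulled out with jq:

# Same request, extracting just the completion text (OpenAI response shape).
curl -s http://localhost:8000/v1/completions \
  -H "Content-Type: application/json" \
  -H "x-model-name: facebook/opt-125m" \
  -d '{"model": "facebook/opt-125m", "prompt": "Hello, "}' \
  | jq -r '.choices[0].text'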

helm/templates/epp-deployment.yaml

Lines changed: 23 additions & 23 deletions

@@ -81,27 +81,27 @@ spec:
        - containerPort: 9090
          name: metrics
          protocol: TCP
+        {{- if (not .Values.endpointPicker.disableReadinessProbe) }}
+        readinessProbe:
+          grpc:
+            port: 9003
+            service: envoy.service.ext_proc.v3.ExternalProcessor
+          initialDelaySeconds: 5
+          timeoutSeconds: 1
+          periodSeconds: 10
+          successThreshold: 1
+          failureThreshold: 3
+        {{- end }}
+        {{- if (not .Values.endpointPicker.disableLivenessProbe) }}
+        livenessProbe:
+          grpc:
+            port: 9003
+            service: envoy.service.ext_proc.v3.ExternalProcessor
+          initialDelaySeconds: 5
+          timeoutSeconds: 1
+          periodSeconds: 10
+          successThreshold: 1
+          failureThreshold: 3
+        {{- end }}
      serviceAccount: {{ include "llm-d-modelservice.eppServiceAccountName" . }}
-      serviceAccountName: {{ include "llm-d-modelservice.eppServiceAccountName" . }}
-      {{- if (not .Values.endpointPicker.disableReadinessProbe) }}
-      readinessProbe:
-        grpc:
-          port: 9003
-          service: envoy.service.ext_proc.v3.ExternalProcessor
-        initialDelaySeconds: 5
-        timeoutSeconds: 1
-        periodSeconds: 10
-        successThreshold: 1
-        failureThreshold: 3
-      {{- end }}
-      {{- if (not .Values.endpointPicker.disableLivenessProbe) }}
-      livenessProbe:
-        grpc:
-          port: 9003
-          service: envoy.service.ext_proc.v3.ExternalProcessor
-        initialDelaySeconds: 5
-        timeoutSeconds: 1
-        periodSeconds: 10
-        successThreshold: 1
-        failureThreshold: 3
-      {{- end }}
+      serviceAccountName: {{ include "llm-d-modelservice.eppServiceAccountName" . }}