Skip to content
This repository was archived by the owner on Jul 24, 2025. It is now read-only.

Commit 3cd169b

Browse files
committed
Routing resources
Signed-off-by: Jing Chen <[email protected]>
1 parent 65a5498 commit 3cd169b

File tree

6 files changed

+146
-76
lines changed

6 files changed

+146
-76
lines changed

helm/templates/epp-deployment.yaml

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
# Endpoint-picker (EPP) Deployment for the model service.
# Reconstructed from the diff; NOTE(review): the original listed
# imagePullPolicy twice (duplicate mapping key — invalid YAML, last-wins on
# most parsers) and placed the probes after pod-level serviceAccount keys;
# both are fixed below. Probes belong to the container per the PodSpec schema.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ .Values.modelServiceName }}-epp
  labels:
    llm-d.ai/epp: {{ .Values.modelServiceName }}-epp
  namespace: {{ .Release.Namespace }}
spec:
  replicas: 1
  selector:
    matchLabels:
      llm-d.ai/epp: {{ .Values.modelServiceName }}-epp
  template:
    metadata:
      labels:
        llm-d.ai/epp: {{ .Values.modelServiceName }}-epp
    spec:
      # serviceAccountName supersedes the deprecated serviceAccount alias;
      # the original set both to the same value, so only one is kept.
      serviceAccountName: {{ .Values.modelServiceName }}-sa
      containers:
        - name: epp
          # Allow the image to be overridden via values; fall back to the
          # pinned scheduler release.
          {{- if .Values.endpointPicker.image }}
          image: {{ .Values.endpointPicker.image }}
          {{- else }}
          image: ghcr.io/llm-d/llm-d-inference-scheduler:0.0.3
          {{- end }}
          imagePullPolicy: Always
          args:
            - --poolName
            - {{ .Values.modelServiceName }}-inference-pool
            - --poolNamespace
            - {{ .Release.Namespace }}
            - -v
            - "4"
            - --zap-encoder
            - json
            - --grpcPort
            - "9002"
            - --grpcHealthPort
            - "9003"
          env:
            # Scorer toggles and weights; quoted so the container receives
            # string values rather than YAML booleans/ints.
            - name: ENABLE_KVCACHE_AWARE_SCORER
              value: "false"
            - name: ENABLE_LOAD_AWARE_SCORER
              value: "true"
            - name: ENABLE_PREFIX_AWARE_SCORER
              value: "true"
            - name: ENABLE_SESSION_AWARE_SCORER
              value: "false"
            - name: KVCACHE_AWARE_SCORER_WEIGHT
              value: "1"
            # Intentionally empty (scorer disabled above); made explicit —
            # a bare env entry defaults to "" anyway.
            - name: KVCACHE_INDEXER_REDIS_ADDR
              value: ""
            - name: LOAD_AWARE_SCORER_WEIGHT
              value: "1"
            - name: PD_ENABLED
              value: "false"
            - name: PD_PROMPT_LEN_THRESHOLD
              value: "10"
            - name: PREFILL_ENABLE_KVCACHE_AWARE_SCORER
              value: "false"
            - name: PREFILL_ENABLE_LOAD_AWARE_SCORER
              value: "false"
            - name: PREFILL_ENABLE_PREFIX_AWARE_SCORER
              value: "false"
            - name: PREFILL_ENABLE_SESSION_AWARE_SCORER
              value: "false"
            - name: PREFILL_KVCACHE_AWARE_SCORER_WEIGHT
              value: "1"
            - name: PREFILL_KVCACHE_INDEXER_REDIS_ADDR
              value: ""
            - name: PREFILL_LOAD_AWARE_SCORER_WEIGHT
              value: "1"
            - name: PREFILL_PREFIX_AWARE_SCORER_WEIGHT
              value: "1"
            - name: PREFILL_SESSION_AWARE_SCORER_WEIGHT
              value: "1"
            - name: PREFIX_AWARE_SCORER_WEIGHT
              value: "2"
            - name: SESSION_AWARE_SCORER_WEIGHT
              value: "1"
          ports:
            - containerPort: 9002
              name: grpc
              protocol: TCP
            - containerPort: 9003
              name: grpc-health
              protocol: TCP
            - containerPort: 9090
              name: metrics
              protocol: TCP
          # Probe the ext_proc gRPC health service on the dedicated health port.
          readinessProbe:
            grpc:
              port: 9003
              service: envoy.service.ext_proc.v3.ExternalProcessor
            initialDelaySeconds: 5
            timeoutSeconds: 1
            periodSeconds: 10
            successThreshold: 1
            failureThreshold: 3
          livenessProbe:
            grpc:
              port: 9003
              service: envoy.service.ext_proc.v3.ExternalProcessor
            initialDelaySeconds: 5
            timeoutSeconds: 1
            periodSeconds: 10
            successThreshold: 1
            failureThreshold: 3

helm/templates/hpa.yaml

Lines changed: 0 additions & 32 deletions
This file was deleted.

helm/templates/inferencemodel.yaml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
# InferenceModel: registers this chart's model with its InferencePool.
# Rendered only when .Values.inferenceModel is truthy.
{{- if .Values.inferenceModel }}
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferenceModel
metadata:
  labels:
    llm-d.ai/inferenceServing: "true"
    llm-d.ai/model: {{ .Values.modelServiceName }}
  name: {{ .Values.modelServiceName }}
  namespace: {{ .Release.Namespace }}
spec:
  # Was hard-coded to ibm-granite/granite-3.3-2b-base, which contradicted
  # this chart's values (routing.modelName: facebook/opt-125m in
  # values-msvc.yaml). Take the OpenAI-facing model name from values instead.
  modelName: {{ .Values.routing.modelName }}
  poolRef:
    group: inference.networking.x-k8s.io
    kind: InferencePool
    name: {{ .Values.modelServiceName }}-inference-pool
{{- end }}

helm/templates/inferencepool.yaml

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
# InferencePool: groups this model service's serving pods and delegates
# endpoint selection to the EPP service referenced below.
# Rendered only when .Values.inferencePool is truthy.
{{- if .Values.inferencePool }}
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferencePool
metadata:
  name: {{ .Values.modelServiceName }}-inference-pool
  namespace: {{ .Release.Namespace }}
spec:
  # Select the pods that serve this model (labels applied elsewhere in
  # this chart).
  selector:
    llm-d.ai/inferenceServing: "true"
    llm-d.ai/model: {{ .Values.modelServiceName }}
  # Port on the selected pods that accepts inference traffic.
  targetPortNumber: 8000
  # Endpoint-picker extension; with FailClose, requests are rejected when
  # the picker is unreachable rather than routed unpicked.
  extensionRef:
    failureMode: FailClose
    group: ""
    kind: Service
    name: {{ .Values.modelServiceName }}-epp-service
{{- end }}

helm/templates/ingress.yaml

Lines changed: 0 additions & 43 deletions
This file was deleted.

helm/values-msvc.yaml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
# TODO
22
# decoupleScaling: false
33

4+
modelServiceName: facebook-opt-125m # DNS compliant
45
lws: false # If true, creates LWS instead of deployments
56
inferencePool: true
67
inferenceModel: true
78
httpRoute: true
8-
99
routing:
1010
# This is the model name for the OpenAI request
1111
modelName: facebook/opt-125m
@@ -22,6 +22,10 @@ modelArtifacts:
2222
artficat: facebook/opt-125m
2323
authSecretName: "hf-secret"
2424
size: 5Mi
25+
gatewayRefs:
26+
- group: gateway.networking.k8s.io
27+
kind: Gateway
28+
name: inference-gateway-kgateway
2529

2630
# describe decode pods
2731
decode:

0 commit comments

Comments
 (0)