Skip to content
This repository was archived by the owner on Jul 24, 2025. It is now read-only.

Commit 3cd169b

Browse files
committed
Routing resources
Signed-off-by: Jing Chen <[email protected]>
1 parent 65a5498 commit 3cd169b

File tree

6 files changed

+146
-76
lines changed

6 files changed

+146
-76
lines changed

helm/templates/epp-deployment.yaml

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
# Endpoint-picker (EPP) Deployment for the model service.
# Reconstructed from the diff; NOTE(review): the original listed
# imagePullPolicy twice (duplicate mapping key — invalid YAML, last-wins on
# most parsers) and placed the probes after pod-level serviceAccount keys;
# both are fixed below. Probes belong to the container per the PodSpec schema.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ .Values.modelServiceName }}-epp
  labels:
    llm-d.ai/epp: {{ .Values.modelServiceName }}-epp
  namespace: {{ .Release.Namespace }}
spec:
  replicas: 1
  selector:
    matchLabels:
      llm-d.ai/epp: {{ .Values.modelServiceName }}-epp
  template:
    metadata:
      labels:
        llm-d.ai/epp: {{ .Values.modelServiceName }}-epp
    spec:
      # serviceAccountName supersedes the deprecated serviceAccount alias;
      # the original set both to the same value, so only one is kept.
      serviceAccountName: {{ .Values.modelServiceName }}-sa
      containers:
        - name: epp
          # Allow the image to be overridden via values; fall back to the
          # pinned scheduler release.
          {{- if .Values.endpointPicker.image }}
          image: {{ .Values.endpointPicker.image }}
          {{- else }}
          image: ghcr.io/llm-d/llm-d-inference-scheduler:0.0.3
          {{- end }}
          imagePullPolicy: Always
          args:
            - --poolName
            - {{ .Values.modelServiceName }}-inference-pool
            - --poolNamespace
            - {{ .Release.Namespace }}
            - -v
            - "4"
            - --zap-encoder
            - json
            - --grpcPort
            - "9002"
            - --grpcHealthPort
            - "9003"
          env:
            # Scorer toggles and weights; quoted so the container receives
            # string values rather than YAML booleans/ints.
            - name: ENABLE_KVCACHE_AWARE_SCORER
              value: "false"
            - name: ENABLE_LOAD_AWARE_SCORER
              value: "true"
            - name: ENABLE_PREFIX_AWARE_SCORER
              value: "true"
            - name: ENABLE_SESSION_AWARE_SCORER
              value: "false"
            - name: KVCACHE_AWARE_SCORER_WEIGHT
              value: "1"
            # Intentionally empty (scorer disabled above); made explicit —
            # a bare env entry defaults to "" anyway.
            - name: KVCACHE_INDEXER_REDIS_ADDR
              value: ""
            - name: LOAD_AWARE_SCORER_WEIGHT
              value: "1"
            - name: PD_ENABLED
              value: "false"
            - name: PD_PROMPT_LEN_THRESHOLD
              value: "10"
            - name: PREFILL_ENABLE_KVCACHE_AWARE_SCORER
              value: "false"
            - name: PREFILL_ENABLE_LOAD_AWARE_SCORER
              value: "false"
            - name: PREFILL_ENABLE_PREFIX_AWARE_SCORER
              value: "false"
            - name: PREFILL_ENABLE_SESSION_AWARE_SCORER
              value: "false"
            - name: PREFILL_KVCACHE_AWARE_SCORER_WEIGHT
              value: "1"
            - name: PREFILL_KVCACHE_INDEXER_REDIS_ADDR
              value: ""
            - name: PREFILL_LOAD_AWARE_SCORER_WEIGHT
              value: "1"
            - name: PREFILL_PREFIX_AWARE_SCORER_WEIGHT
              value: "1"
            - name: PREFILL_SESSION_AWARE_SCORER_WEIGHT
              value: "1"
            - name: PREFIX_AWARE_SCORER_WEIGHT
              value: "2"
            - name: SESSION_AWARE_SCORER_WEIGHT
              value: "1"
          ports:
            - containerPort: 9002
              name: grpc
              protocol: TCP
            - containerPort: 9003
              name: grpc-health
              protocol: TCP
            - containerPort: 9090
              name: metrics
              protocol: TCP
          # Probe the ext_proc gRPC health service on the dedicated health port.
          readinessProbe:
            grpc:
              port: 9003
              service: envoy.service.ext_proc.v3.ExternalProcessor
            initialDelaySeconds: 5
            timeoutSeconds: 1
            periodSeconds: 10
            successThreshold: 1
            failureThreshold: 3
          livenessProbe:
            grpc:
              port: 9003
              service: envoy.service.ext_proc.v3.ExternalProcessor
            initialDelaySeconds: 5
            timeoutSeconds: 1
            periodSeconds: 10
            successThreshold: 1
            failureThreshold: 3

helm/templates/hpa.yaml

Lines changed: 0 additions & 32 deletions
This file was deleted.

helm/templates/inferencemodel.yaml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
# InferenceModel: registers this chart's model with its InferencePool.
# Rendered only when .Values.inferenceModel is truthy.
{{- if .Values.inferenceModel }}
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferenceModel
metadata:
  labels:
    llm-d.ai/inferenceServing: "true"
    llm-d.ai/model: {{ .Values.modelServiceName }}
  name: {{ .Values.modelServiceName }}
  namespace: {{ .Release.Namespace }}
spec:
  # Was hard-coded to ibm-granite/granite-3.3-2b-base, which contradicted
  # this chart's values (routing.modelName: facebook/opt-125m in
  # values-msvc.yaml). Take the OpenAI-facing model name from values instead.
  modelName: {{ .Values.routing.modelName }}
  poolRef:
    group: inference.networking.x-k8s.io
    kind: InferencePool
    name: {{ .Values.modelServiceName }}-inference-pool
{{- end }}

helm/templates/inferencepool.yaml

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
# InferencePool: groups this model service's serving pods and delegates
# endpoint selection to the EPP service referenced below.
# Rendered only when .Values.inferencePool is truthy.
{{- if .Values.inferencePool }}
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferencePool
metadata:
  name: {{ .Values.modelServiceName }}-inference-pool
  namespace: {{ .Release.Namespace }}
spec:
  # Select the pods that serve this model (labels applied elsewhere in
  # this chart).
  selector:
    llm-d.ai/inferenceServing: "true"
    llm-d.ai/model: {{ .Values.modelServiceName }}
  # Port on the selected pods that accepts inference traffic.
  targetPortNumber: 8000
  # Endpoint-picker extension; with FailClose, requests are rejected when
  # the picker is unreachable rather than routed unpicked.
  extensionRef:
    failureMode: FailClose
    group: ""
    kind: Service
    name: {{ .Values.modelServiceName }}-epp-service
{{- end }}

helm/templates/ingress.yaml

Lines changed: 0 additions & 43 deletions
This file was deleted.

helm/values-msvc.yaml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
# TODO
22
# decoupleScaling: false
33

4+
modelServiceName: facebook-opt-125m # DNS compliant
45
lws: false # If true, creates LWS instead of deployments
56
inferencePool: true
67
inferenceModel: true
78
httpRoute: true
8-
99
routing:
1010
# This is the model name for the OpenAI request
1111
modelName: facebook/opt-125m
@@ -22,6 +22,10 @@ modelArtifacts:
2222
artficat: facebook/opt-125m
2323
authSecretName: "hf-secret"
2424
size: 5Mi
25+
gatewayRefs:
26+
- group: gateway.networking.k8s.io
27+
kind: Gateway
28+
name: inference-gateway-kgateway
2529

2630
# describe decode pods
2731
decode:

0 commit comments

Comments
 (0)