|
7 | 7 | # containsElement(), wait_deployment_available_nonfatal(), detect_inference_pool_api_group(). |
8 | 8 | # |
9 | 9 |
|
# Deploy second model infrastructure for multi-model/limiter testing.
# Creates, in the $LLMD_NS namespace:
#   1. a second InferencePool selecting pods labeled llm-d.ai/model-pool=model-2,
#   2. the pool's EPP (endpoint-picker) Deployment + Service,
#   3. a second modelservice Deployment + Service backed by llm-d-inference-sim,
#   4. an InferenceModel mapping $MODEL_ID_2 to the pool (only if the CRD exists),
#   5. a PodMonitor so Prometheus scrapes the second model's metrics.
#
# Globals read: MODEL_ID_2, LLMD_NS, TTFT_AVERAGE_LATENCY_MS, ITL_AVERAGE_LATENCY_MS
# Uses helpers defined elsewhere in this file: log_info, log_warning, log_success,
# wait_deployment_available_nonfatal.
deploy_second_model_infrastructure() {
  log_info "Deploying second model infrastructure for multi-model testing..."
  log_info "Second model: $MODEL_ID_2"

  local POOL_NAME_2="gaie-sim-2"
  local MS_NAME_2="ms-sim-2"
  local MODEL_LABEL_2="model-2"
  # Sanitize model name for use in Kubernetes labels (replace / with -).
  # Parameter expansion instead of `echo | tr`: no subshell, and it does not
  # mask a command's exit status inside `local` (ShellCheck SC2155).
  local MODEL_ID_2_SANITIZED="${MODEL_ID_2//\//-}"

  # Create second InferencePool with a selector distinct from the first pool's,
  # so only the second model's decode pods are picked as endpoints.
  log_info "Creating second InferencePool: $POOL_NAME_2"
  cat <<EOF | kubectl apply -n "$LLMD_NS" -f -
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferencePool
metadata:
  name: $POOL_NAME_2
spec:
  targetPortNumber: 8000
  selector:
    llm-d.ai/model-pool: "$MODEL_LABEL_2"
  extensionRef:
    name: ${POOL_NAME_2}-epp
EOF

  # Create the EPP (endpoint picker) deployment + service the pool's
  # extensionRef points at. Reuses the first pool's service account
  # (gaie-sim-sa) — presumably created earlier in this script; the RBAC it
  # carries applies to any EPP in the namespace.
  log_info "Creating EPP deployment for second pool"
  cat <<EOF | kubectl apply -n "$LLMD_NS" -f -
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ${POOL_NAME_2}-epp
spec:
  replicas: 1
  selector:
    matchLabels:
      app: ${POOL_NAME_2}-epp
  template:
    metadata:
      labels:
        app: ${POOL_NAME_2}-epp
    spec:
      serviceAccountName: gaie-sim-sa
      containers:
      - name: epp
        image: ghcr.io/llm-d/llm-d-inference-scheduler:v0.3.2
        imagePullPolicy: Always
        args:
        - --poolName=$POOL_NAME_2
        - --poolNamespace=$LLMD_NS
        - --extProcPort=9002
        - --grpcHealthPort=9003
        ports:
        - containerPort: 9002
          name: grpc
        - containerPort: 9003
          name: grpc-health
        - containerPort: 9090
          name: metrics
        readinessProbe:
          grpc:
            port: 9003
          initialDelaySeconds: 5
          periodSeconds: 10
        livenessProbe:
          grpc:
            port: 9003
          initialDelaySeconds: 15
          periodSeconds: 20
---
apiVersion: v1
kind: Service
metadata:
  name: ${POOL_NAME_2}-epp
spec:
  selector:
    app: ${POOL_NAME_2}-epp
  ports:
  - name: grpc
    port: 9002
    targetPort: 9002
  - name: grpc-health
    port: 9003
    targetPort: 9003
  - name: metrics
    port: 9090
    targetPort: 9090
EOF

  # Nonfatal wait: a slow image pull should not abort the whole deployment;
  # the message tells the operator where to look instead.
  log_info "Waiting for second EPP deployment to be ready..."
  wait_deployment_available_nonfatal \
    "$LLMD_NS" \
    "${POOL_NAME_2}-epp" \
    "120s" \
    "Second EPP deployment not ready yet - check 'kubectl get pods -n $LLMD_NS -l app=${POOL_NAME_2}-epp'"

  # Create second modelservice deployment (using llm-d-inference-sim).
  # The model-pool label must match the InferencePool selector above; the
  # sanitized model label lets per-model queries select these pods.
  log_info "Creating second modelservice deployment: $MS_NAME_2"
  cat <<EOF | kubectl apply -n "$LLMD_NS" -f -
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ${MS_NAME_2}-decode
spec:
  replicas: 2
  selector:
    matchLabels:
      app: ${MS_NAME_2}-decode
      llm-d.ai/model-pool: "$MODEL_LABEL_2"
  template:
    metadata:
      labels:
        app: ${MS_NAME_2}-decode
        llm-d.ai/model-pool: "$MODEL_LABEL_2"
        llm-d.ai/model: "${MODEL_ID_2_SANITIZED}"
    spec:
      containers:
      - name: vllm
        image: ghcr.io/llm-d/llm-d-inference-sim:v0.5.1
        imagePullPolicy: Always
        args:
        - --model=$MODEL_ID_2
        - --time-to-first-token=$TTFT_AVERAGE_LATENCY_MS
        - --inter-token-latency=$ITL_AVERAGE_LATENCY_MS
        - --enable-kvcache
        - --kv-cache-size=1024
        - --block-size=16
        ports:
        - containerPort: 8000
          name: http
        - containerPort: 8200
          name: metrics
        env:
        - name: POD_NAME
          valueFrom:
            fieldRef:
              fieldPath: metadata.name
        - name: POD_NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: metadata.namespace
        readinessProbe:
          httpGet:
            path: /health
            port: 8000
          periodSeconds: 5
---
apiVersion: v1
kind: Service
metadata:
  name: ${MS_NAME_2}-decode
  labels:
    llm-d.ai/model-pool: "$MODEL_LABEL_2"
spec:
  selector:
    app: ${MS_NAME_2}-decode
  ports:
  - name: http
    port: 8000
    targetPort: 8000
  - name: metrics
    port: 8200
    targetPort: 8200
EOF

  # Create InferenceModel for second model (maps model name to pool).
  # Note: the InferenceModel CRD may not be installed in all environments,
  # so probe for it first and degrade to a warning rather than failing apply.
  if kubectl get crd inferencemodels.inference.networking.x-k8s.io &>/dev/null; then
    log_info "Creating InferenceModel for second model"
    cat <<EOF | kubectl apply -n "$LLMD_NS" -f -
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferenceModel
metadata:
  name: ${MS_NAME_2}-model
spec:
  modelName: $MODEL_ID_2
  criticality: Critical
  poolRef:
    name: $POOL_NAME_2
  targetModels:
  - name: $MODEL_ID_2
    weight: 100
EOF
  else
    log_warning "InferenceModel CRD not available - skipping InferenceModel creation for second model"
    log_warning "Model routing may need to be configured manually or via HTTPRoute"
  fi

  # Create PodMonitor for second model metrics; the release label is what
  # the kube-prometheus-stack operator's monitor selector matches on.
  log_info "Creating PodMonitor for second model"
  cat <<EOF | kubectl apply -n "$LLMD_NS" -f -
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: ${MS_NAME_2}-podmonitor
  labels:
    release: kube-prometheus-stack
spec:
  selector:
    matchLabels:
      app: ${MS_NAME_2}-decode
  podMetricsEndpoints:
  - port: metrics
    path: /metrics
    interval: 15s
EOF

  # Wait for the second model deployment to be ready (nonfatal, same as above).
  log_info "Waiting for second model deployment to be ready..."
  wait_deployment_available_nonfatal \
    "$LLMD_NS" \
    "${MS_NAME_2}-decode" \
    "120s" \
    "Second model deployment not ready yet - check 'kubectl get pods -n $LLMD_NS'"

  log_success "Second model infrastructure deployed successfully"
}
230 | | - |
231 | 10 | deploy_llm_d_infrastructure() { |
232 | 11 | log_info "Deploying llm-d infrastructure..." |
233 | 12 |
|
@@ -556,11 +335,6 @@ deploy_llm_d_infrastructure() { |
556 | 335 | fi |
557 | 336 | fi |
558 | 337 |
|
559 | | - # Deploy second model infrastructure for multi-model testing (limiter e2e tests) |
560 | | - if [ "$MULTI_MODEL_TESTING" == "true" ]; then |
561 | | - deploy_second_model_infrastructure |
562 | | - fi |
563 | | - |
564 | 338 | cd "$WVA_PROJECT" |
565 | 339 | log_success "llm-d infrastructure deployment complete" |
566 | 340 | } |
0 commit comments