@@ -43,6 +43,18 @@ VALUES_FILE=${VALUES_FILE:-"$WVA_PROJECT/charts/workload-variant-autoscaler/valu
4343# Controller instance identifier for multi-controller isolation (optional)
4444# When set, adds controller_instance label to metrics and HPA selectors
4545CONTROLLER_INSTANCE=${CONTROLLER_INSTANCE:- " " }
# InferencePool API group.
#   inference.networking.k8s.io   -> v1       (GA API, the new default)
#   inference.networking.x-k8s.io -> v1alpha2 (legacy experimental API)
# POOL_VERSION is derived from POOL_GROUP so manifests can interpolate
# ${POOL_GROUP}/${POOL_VERSION} as the apiVersion.
POOL_GROUP=${POOL_GROUP:-"inference.networking.k8s.io"}
if [ "$POOL_GROUP" = "inference.networking.k8s.io" ]; then
    POOL_VERSION="v1"
elif [ "$POOL_GROUP" = "inference.networking.x-k8s.io" ]; then
    POOL_VERSION="v1alpha2"
else
    log_error "Unknown POOL_GROUP: $POOL_GROUP (expected inference.networking.k8s.io or inference.networking.x-k8s.io)"
    exit 1
fi
4658
4759# llm-d Configuration
4860LLM_D_OWNER=${LLM_D_OWNER:- " llm-d" }
@@ -612,18 +624,39 @@ deploy_second_model_infrastructure() {
612624
613625 # Create second InferencePool with different selector
614626 log_info " Creating second InferencePool: $POOL_NAME_2 "
615- cat << EOF | kubectl apply -n $LLMD_NS -f -
616- apiVersion: inference.networking.x-k8s.io/v1alpha2
627+ if [ " $POOL_VERSION " = " v1" ]; then
628+ cat << EOF | kubectl apply -n $LLMD_NS -f -
629+ apiVersion: ${POOL_GROUP} /${POOL_VERSION}
630+ kind: InferencePool
631+ metadata:
632+ name: $POOL_NAME_2
633+ spec:
634+ selector:
635+ matchLabels:
636+ llm-d.ai/model-pool: "$MODEL_LABEL_2 "
637+ llm-d.ai/inference-serving: "true"
638+ targetPorts:
639+ - number: 8000
640+ endpointPickerRef:
641+ name: ${POOL_NAME_2} -epp
642+ port:
643+ number: 9002
644+ EOF
645+ else
646+ cat << EOF | kubectl apply -n $LLMD_NS -f -
647+ apiVersion: ${POOL_GROUP} /${POOL_VERSION}
617648kind: InferencePool
618649metadata:
619650 name: $POOL_NAME_2
620651spec:
621652 targetPortNumber: 8000
622653 selector:
623654 llm-d.ai/model-pool: "$MODEL_LABEL_2 "
655+ llm-d.ai/inference-serving: "true"
624656 extensionRef:
625657 name: ${POOL_NAME_2} -epp
626658EOF
659+ fi
627660
628661 # Create EPP deployment for second pool
629662 log_info " Creating EPP deployment for second pool"
@@ -707,12 +740,14 @@ spec:
707740 matchLabels:
708741 app: ${MS_NAME_2} -decode
709742 llm-d.ai/model-pool: "$MODEL_LABEL_2 "
743+ llm-d.ai/inference-serving: "true"
710744 template:
711745 metadata:
712746 labels:
713747 app: ${MS_NAME_2} -decode
714748 llm-d.ai/model-pool: "$MODEL_LABEL_2 "
715749 llm-d.ai/model: "${MODEL_ID_2_SANITIZED} "
750+ llm-d.ai/inference-serving: "true"
716751 spec:
717752 containers:
718753 - name: vllm
@@ -764,7 +799,7 @@ spec:
764799EOF
765800
766801 # Create InferenceModel for second model (maps model name to pool)
767- # Note : InferenceModel CRD may not be available in all environments
802+ # TODO : InferenceModel only exists in inference.networking.x-k8s.io/v1alpha2, should use InfereceModelRewrite instead
768803 if kubectl get crd inferencemodels.inference.networking.x-k8s.io & > /dev/null; then
769804 log_info " Creating InferenceModel for second model"
770805 cat << EOF | kubectl apply -n $LLMD_NS -f -
@@ -833,7 +868,11 @@ deploy_llm_d_infrastructure() {
833868
834869 if [ ! -d " $LLM_D_PROJECT " ]; then
835870 log_info " Cloning $LLM_D_PROJECT repository (release: $LLM_D_RELEASE )"
836- git clone -b $LLM_D_RELEASE -- https://github.com/$LLM_D_OWNER /$LLM_D_PROJECT .git $LLM_D_PROJECT & > /dev/null
871+ if ! git clone -b " $LLM_D_RELEASE " -- https://github.com/$LLM_D_OWNER /$LLM_D_PROJECT .git " $LLM_D_PROJECT " 2>&1 | grep -v " Cloning into" ; then
872+ log_error " Failed to clone $LLM_D_PROJECT repository (release: $LLM_D_RELEASE )"
873+ return 1
874+ fi
875+ log_success " Successfully cloned $LLM_D_PROJECT repository (release: $LLM_D_RELEASE )"
837876 fi
838877
839878 # Check for HF_TOKEN (use dummy for emulated deployments)
@@ -1073,6 +1112,7 @@ deploy_llm_d_infrastructure() {
10731112
10741113 # Deploy InferenceObjective for GIE queuing when flow control is enabled (scale-from-zero / e2e).
10751114 # Enables gateway-level queuing so inference_extension_flow_control_queue_size is populated.
1115+ # Note: InferenceObjective only exists in inference.networking.x-k8s.io v1alpha2
10761116 if [ " $ENABLE_SCALE_TO_ZERO " == " true" ] || [ " $E2E_TESTS_ENABLED " == " true" ]; then
10771117 if kubectl get crd inferenceobjectives.inference.networking.x-k8s.io & > /dev/null; then
10781118 local infobj_file=" ${WVA_PROJECT} /deploy/inference-objective-e2e.yaml"
0 commit comments