llm-d
diff --git a/Makefile b/Makefile
Lines changed: 1 addition & 1 deletion
diff --git a/cmd/pd-sidecar/main.go b/cmd/pd-sidecar/main.go
Lines changed: 26 additions & 8 deletions
diff --git a/deploy/components/crds-gie/kustomization.yaml b/deploy/components/crds-gie/kustomization.yaml
Lines changed: 1 addition & 1 deletion
diff --git a/deploy/components/inference-gateway/deployments.yaml b/deploy/components/inference-gateway/deployments.yaml
Lines changed: 0 additions & 2 deletions
diff --git a/deploy/components/inference-gateway/httproutes.yaml b/deploy/components/inference-gateway/httproutes.yaml
Lines changed: 1 addition & 1 deletion
diff --git a/deploy/components/inference-gateway/inference-pools.yaml b/deploy/components/inference-gateway/inference-pools.yaml
Lines changed: 9 additions & 4 deletions
diff --git a/deploy/components/inference-gateway/rbac.yaml b/deploy/components/inference-gateway/rbac.yaml
Lines changed: 8 additions & 0 deletions
diff --git a/deploy/components/istio-control-plane/rbac.yaml b/deploy/components/istio-control-plane/rbac.yaml
Lines changed: 15 additions & 0 deletions
diff --git a/deploy/components/vllm-sim-pd/deployments.yaml b/deploy/components/vllm-sim-pd/deployments.yaml
Lines changed: 52 additions & 1 deletion
diff --git a/deploy/components/vllm-sim/deployments.yaml b/deploy/components/vllm-sim/deployments.yaml
Lines changed: 66 additions & 3 deletions
@@ -112,7 +112,7 @@ test-integration: download-tokenizer install-dependencies ## Run integration tes
 	go test -ldflags="$(LDFLAGS)" -v -tags=integration_tests ./test/integration/
 
 .PHONY: test-e2e
-test-e2e: image-build sidecar-image-build ## Run end-to-end tests against a new kind cluster
+test-e2e: image-build ## Run end-to-end tests against a new kind cluster
 	@printf "\033[33;1m==== Running End to End Tests ====\033[0m\n"
 	./test/scripts/run_e2e.sh
 
 
@@ -16,6 +16,7 @@ limitations under the License.
 package main
 
 import (
+	"crypto/tls"
 	"flag"
 	"net/url"
 	"os"
@@ -30,6 +31,7 @@ import (
 func main() {
 	port := flag.String("port", "8000", "the port the sidecar is listening on")
 	vLLMPort := flag.String("vllm-port", "8001", "the port vLLM is listening on")
+	vLLMDataParallelSize := flag.Int("data-parallel-size", 1, "the vLLM DATA-PARALLEL-SIZE value")
 	connector := flag.String("connector", "nixlv2", "the P/D connector being used. Either nixl, nixlv2 or lmcache")
 	prefillerUseTLS := flag.Bool("prefiller-use-tls", false, "whether to use TLS when sending requests to prefillers")
 	decoderUseTLS := flag.Bool("decoder-use-tls", false, "whether to use TLS when sending requests to the decoder")
@@ -86,23 +88,39 @@ func main() {
 		return
 	}
 
+	var cert *tls.Certificate
+	if *secureProxy {
+		var tempCert tls.Certificate
+		if *certPath != "" {
+			tempCert, err = tls.LoadX509KeyPair(*certPath+"/tls.crt", *certPath+"/tls.key")
+		} else {
+			tempCert, err = proxy.CreateSelfSignedTLSCertificate()
+		}
+		if err != nil {
+			logger.Error(err, "failed to create TLS certificate")
+			return
+		}
+		cert = &tempCert
+	}
+
 	config := proxy.Config{
 		Connector:                   *connector,
 		PrefillerUseTLS:             *prefillerUseTLS,
-		SecureProxy:                 *secureProxy,
-		CertPath:                    *certPath,
 		PrefillerInsecureSkipVerify: *prefillerInsecureSkipVerify,
 		DecoderInsecureSkipVerify:   *decoderInsecureSkipVerify,
-		EnableSSRFProtection:        *enableSSRFProtection,
-		InferencePoolNamespace:      *inferencePoolNamespace,
-		InferencePoolName:           *inferencePoolName,
+		DataParallelSize:            *vLLMDataParallelSize,
 	}
 
-	proxy, err := proxy.NewProxy(*port, targetURL, config)
+	// Create SSRF protection validator
+	validator, err := proxy.NewAllowlistValidator(*enableSSRFProtection, *inferencePoolNamespace, *inferencePoolName)
 	if err != nil {
-		logger.Error(err, "Failed to create proxy")
+		logger.Error(err, "failed to create SSRF protection validator")
+		return
 	}
-	if err := proxy.Start(ctx); err != nil {
+
+	proxyServer := proxy.NewProxy(*port, targetURL, config)
+
+	if err := proxyServer.Start(ctx, cert, validator); err != nil {
 		logger.Error(err, "failed to start proxy server")
 	}
 }
@@ -10,4 +10,4 @@ apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 
 resources:
-- https://github.com/kubernetes-sigs/gateway-api-inference-extension/config/crd?ref=v1.0.0
+- https://github.com/kubernetes-sigs/gateway-api-inference-extension/config/crd?ref=v1.1.0-rc.1
@@ -23,8 +23,6 @@ spec:
         args:
         - --pool-name
         - "${POOL_NAME}"
-        - "--pool-group"
-        - "inference.networking.x-k8s.io"
         - --v
         - "4"
         - --zap-encoder
 
@@ -11,7 +11,7 @@ spec:
         type: PathPrefix
         value: /
     backendRefs:
-    - group: inference.networking.x-k8s.io
+    - group: inference.networking.k8s.io
       kind: InferencePool
       name: ${POOL_NAME}
       port: 8000
 
@@ -1,10 +1,15 @@
-apiVersion: inference.networking.x-k8s.io/v1alpha2
+apiVersion: inference.networking.k8s.io/v1
 kind: InferencePool
 metadata:
   name: ${POOL_NAME}
 spec:
-  targetPortNumber: 8000
   selector:
-    app: ${POOL_NAME}
-  extensionRef:
+    matchLabels:
+      app: ${POOL_NAME}
+  endpointPickerRef:
     name: ${EPP_NAME}
+    kind: Service
+    port:
+      number: 9002
+  targetPorts:
+  - number: ${TARGET_PORTS}
@@ -12,6 +12,14 @@ rules:
   - "get"
   - "watch"
   - "list"
+- apiGroups:
+  - "inference.networking.k8s.io"
+  resources:
+  - "inferencepools"
+  verbs:
+  - "get"
+  - "watch"
+  - "list"
 - apiGroups:
   - ""
   resources:
 
@@ -330,6 +330,21 @@ rules:
   verbs:
   - update
   - patch
+- apiGroups:
+  - "inference.networking.k8s.io"
+  resources:
+  - "inferencepools"
+  verbs:
+  - "get"
+  - "watch"
+  - "list"
+- apiGroups:
+  - "inference.networking.k8s.io"
+  resources:
+  - inferencepools/status
+  verbs:
+  - update
+  - patch
 - apiGroups:
   - ""
   resources:
 
@@ -55,21 +55,72 @@ spec:
         - "--port=8000"
         - "--vllm-port=8200"
         - "--connector=lmcache"
+        - "--secure-proxy=false"
+        - "--data-parallel-size=${VLLM_DATA_PARALLEL_SIZE}"
         ports:
-        - containerPort: 8000
+        - name: sidecar-http
+          containerPort: 8000
+          protocol: TCP
+        - name: sidecar-rank1
+          containerPort: 8001
+          protocol: TCP
+        - name: sidecar-rank2
+          containerPort: 8002
+          protocol: TCP
+        - name: sidecar-rank3
+          containerPort: 8003
+          protocol: TCP
+        - name: sidecar-rank4
+          containerPort: 8004
+          protocol: TCP
+        - name: sidecar-rank5
+          containerPort: 8005
+          protocol: TCP
+        - name: sidecar-rank6
+          containerPort: 8006
+          protocol: TCP
+        - name: sidecar-rank7
+          containerPort: 8007
           protocol: TCP
         restartPolicy: Always
+        env:
+        - name: POD_IP
+          valueFrom:
+            fieldRef:
+              fieldPath: status.podIP
       containers:
       - name: vllm
         image: ghcr.io/llm-d/llm-d-inference-sim:latest
         imagePullPolicy: IfNotPresent
         args:
         - "--port=8200"
         - "--model=${MODEL_NAME}"
+        - "--data-parallel-size=${VLLM_DATA_PARALLEL_SIZE}"
         ports:
         - name: http
           containerPort: 8200
           protocol: TCP
+        - name: rank1
+          containerPort: 8201
+          protocol: TCP
+        - name: rank2
+          containerPort: 8202
+          protocol: TCP
+        - name: rank3
+          containerPort: 8203
+          protocol: TCP
+        - name: rank4
+          containerPort: 8204
+          protocol: TCP
+        - name: rank5
+          containerPort: 8205
+          protocol: TCP
+        - name: rank6
+          containerPort: 8206
+          protocol: TCP
+        - name: rank7
+          containerPort: 8207
+          protocol: TCP
         env:
         - name: PORT
           value: "8200"
@@ -14,26 +14,89 @@ spec:
       labels:
         app: ${POOL_NAME}
     spec:
+      initContainers:
+      - name: routing-sidecar
+        image: ghcr.io/llm-d/llm-d-routing-sidecar:latest
+        imagePullPolicy: IfNotPresent
+        args:
+        - "--port=8000"
+        - "--vllm-port=8200"
+        - "--connector=lmcache"
+        - "--secure-proxy=false"
+        - "--data-parallel-size=${VLLM_DATA_PARALLEL_SIZE}"
+        ports:
+        - name: sidecar-http
+          containerPort: 8000
+          protocol: TCP
+        - name: sidecar-rank1
+          containerPort: 8001
+          protocol: TCP
+        - name: sidecar-rank2
+          containerPort: 8002
+          protocol: TCP
+        - name: sidecar-rank3
+          containerPort: 8003
+          protocol: TCP
+        - name: sidecar-rank4
+          containerPort: 8004
+          protocol: TCP
+        - name: sidecar-rank5
+          containerPort: 8005
+          protocol: TCP
+        - name: sidecar-rank6
+          containerPort: 8006
+          protocol: TCP
+        - name: sidecar-rank7
+          containerPort: 8007
+          protocol: TCP
+        restartPolicy: Always
+        env:
+        - name: POD_IP
+          valueFrom:
+            fieldRef:
+              fieldPath: status.podIP
       containers:
       - name: vllm
         image: ghcr.io/llm-d/llm-d-inference-sim:latest
         imagePullPolicy: IfNotPresent
         args:
-        - "--port=8000"
+        - "--port=8200"
         - "--model=${MODEL_NAME}"
         - "--enable-kvcache=${KV_CACHE_ENABLED}"
         - "--kv-cache-size=1024"
         - "--block-size=16"
         - "--zmq-endpoint=tcp://${EPP_NAME}.default.svc.cluster.local:5557"
         - "--event-batch-size=16"
         - "--tokenizers-cache-dir=/tokenizer-cache"
+        - "--data-parallel-size=${VLLM_DATA_PARALLEL_SIZE}"
         ports:
         - name: http
-          containerPort: 8000
+          containerPort: 8200
+          protocol: TCP
+        - name: rank1
+          containerPort: 8201
+          protocol: TCP
+        - name: rank2
+          containerPort: 8202
+          protocol: TCP
+        - name: rank3
+          containerPort: 8203
+          protocol: TCP
+        - name: rank4
+          containerPort: 8204
+          protocol: TCP
+        - name: rank5
+          containerPort: 8205
+          protocol: TCP
+        - name: rank6
+          containerPort: 8206
+          protocol: TCP
+        - name: rank7
+          containerPort: 8207
           protocol: TCP
         env:
         - name: PORT
-          value: "8000"
+          value: "8200"
         - name: PYTHONHASHSEED
           value: "42"
         volumeMounts: