waltforme
diff --git a/‎.github/workflows/launcher-based-e2e-test.yml‎
Lines changed: 105 additions & 0 deletions b/‎.github/workflows/launcher-based-e2e-test.yml‎
Lines changed: 105 additions & 0 deletions
diff --git a/‎pkg/controller/common/interface.go‎
Lines changed: 5 additions & 0 deletions b/‎pkg/controller/common/interface.go‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎pkg/controller/dual-pods/controller.go‎
Lines changed: 28 additions & 3 deletions b/‎pkg/controller/dual-pods/controller.go‎
Lines changed: 28 additions & 3 deletions
@@ -0,0 +1,105 @@
+# Tests launcher-based server-providing pods in a `kind` cluster
+name: Launcher-Based E2E Test
+
+on:
+  # Pushes trigger this workflow only when at least one changed file matches a path below.
+  push:
+    paths:
+      - ".github/workflows/launcher-based-e2e-test.yml"
+      - Makefile
+      - cmd/dual-pods-controller/**
+      - cmd/test-requester/**
+      - cmd/launcher-populator/**
+      - inference_server/launcher/**
+      - dockerfiles/Dockerfile.launcher.benchmark
+      - pkg/**
+      - test/e2e/run-launcher-based.sh
+      - test/e2e/mkobjs.sh
+  # Pull requests targeting main trigger this workflow; no path filter is applied here.
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  # Prints GitHub context values and runner environment variables to the job log.
+  debug:
+    name: print relevant info
+    runs-on: ubuntu-latest
+    steps:
+      # Most values are interpolated from the `github` context; the head ref and the
+      # action ref are read from the runner's environment variables instead.
+      # NOTE(review): presumably the head ref is taken from the environment to avoid
+      # interpolating a PR-controlled branch name into the script; confirm.
+      - run: |
+          echo "github.actor=${{ github.actor }}"
+          echo "github.action_ref=${{ github.action_ref }}"
+          echo "github.event_name=${{ github.event_name }}"
+          echo "github.head_ref=$GITHUB_HEAD_REF"
+          echo "github.ref=${{ github.ref }}"
+          echo "github.ref_name=${{ github.ref_name }}"
+          echo "github.repository=${{ github.repository }}"
+          echo "github.repository_owner=${{ github.repository_owner }}"
+          echo "github.triggering_actor=${{ github.triggering_actor }}"
+          echo "GITHUB_ACTION_REF=$GITHUB_ACTION_REF"
+
+  # Runs the launcher-based end-to-end test script, then dumps cluster state
+  # (Pods, ReplicaSets, ConfigMaps, custom resources, logs) to the job log for diagnosis.
+  run-launcher-test:
+    runs-on: ubuntu-22.04-arm
+    steps:
+      # Check out first so that the following steps, in particular the dependency
+      # cache of setup-go, can see the repository's files (e.g. go.sum).
+      - name: Checkout code
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+
+      - uses: actions/setup-go@v5
+        with:
+          go-version: '1.24.2'
+
+      - name: Install ko
+        uses: ko-build/setup-ko@v0.8
+        with:
+          version: v0.15.2
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # v3.11.1
+
+      - name: Run launcher-based E2E test
+        run: test/e2e/run-launcher-based.sh
+
+      # Every step below uses `if: always()` so that it runs even when an earlier
+      # step, such as the test itself, has failed.
+      - name: show all pods
+        if: always()
+        run: kubectl get pods -A -o wide
+
+      - name: show test pods with labels
+        if: always()
+        run: kubectl get pods -L dual-pods.llm-d.ai/dual,dual-pods.llm-d.ai/sleeping,dual-pods.llm-d.ai/launcher-config-name
+
+      - name: show ReplicaSets
+        if: always()
+        run: kubectl get rs -A
+
+      - name: show dual-pods controller log
+        if: always()
+        run: kubectl logs deploy/dpctlr
+
+      # Falls back to a message instead of failing the step when the logs cannot be fetched.
+      - name: show launcher-populator log
+        if: always()
+        run: kubectl logs deploy/launcher-populator || echo "launcher-populator not deployed"
+
+      - name: show GPU allocations
+        if: always()
+        run: kubectl get cm gpu-allocs -o yaml
+
+      - name: show GPU map
+        if: always()
+        run: kubectl get cm gpu-map -o yaml
+
+      - name: show InferenceServerConfigs
+        if: always()
+        run: kubectl get inferenceserverconfigs -o yaml
+
+      - name: show LauncherConfigs
+        if: always()
+        run: kubectl get launcherconfigs -o yaml
+
+      - name: show YAML of test pods
+        if: always()
+        run: kubectl get pods -o yaml
+
+      # Prints the logs of every Pod that carries the launcher-config-name label;
+      # a failure to fetch one Pod's logs does not stop the loop.
+      - name: show launcher pod logs
+        if: always()
+        run: |
+          for pod in $(kubectl get pods -l dual-pods.llm-d.ai/launcher-config-name -o name); do
+            echo "=== Logs for $pod ==="
+            kubectl logs "$pod" || echo "Failed to get logs for $pod"
+          done
@@ -34,4 +34,9 @@ const (
 	// that is relevant to identify the launcher-based server-providing Pod, mainly the
 	// corresponding LauncherConfig object's PodTemplate that the server-providing Pod uses.
 	LauncherConfigHashAnnotationKey = "dual-pods.llm-d.ai/launcher-config-hash"
+
+	// LauncherServicePort is the port number on which the launcher exposes its HTTP service
+	// for the management of vLLM instances.
+	// This is a contract between the controllers and the launcher implementation.
+	LauncherServicePort = 8001
 )
@@ -42,6 +42,7 @@ import (
 
 	fmav1alpha1 "github.com/llm-d-incubation/llm-d-fast-model-actuation/api/fma/v1alpha1"
 	"github.com/llm-d-incubation/llm-d-fast-model-actuation/pkg/api"
+	ctlrcommon "github.com/llm-d-incubation/llm-d-fast-model-actuation/pkg/controller/common"
 	genctlr "github.com/llm-d-incubation/llm-d-fast-model-actuation/pkg/controller/generic"
 	"github.com/llm-d-incubation/llm-d-fast-model-actuation/pkg/controller/utils"
 	fmainformers "github.com/llm-d-incubation/llm-d-fast-model-actuation/pkg/generated/informers/externalversions"
@@ -103,7 +104,7 @@ func GPUIndexFunc(obj any) ([]string, error) {
 	if len(pod.Annotations[nominalHashAnnotationKey]) == 0 || pod.Spec.NodeName == "" {
 		return []string{}, nil
 	}
-	isIdx, _, err := utils.GetInferenceServerPort(pod)
+	isIdx, _, err := utils.GetInferenceServerPort(pod, false)
 	if err != nil {
 		return []string{}, nil
 	}
@@ -175,6 +176,7 @@ func (config ControllerConfig) NewController(
 	ctl.gpuMap.Store(&map[string]GpuLocation{})
 	err := ctl.podInformer.AddIndexers(cache.Indexers{
 		inferenceServerConfigIndexName: inferenceServerConfigIndexFunc,
+		launcherConfigHashIndexName:    launcherConfigHashIndexFunc,
 		requesterIndexName:             requesterIndexFunc,
 		nominalHashIndexName:           nominalHashIndexFunc,
 		GPUIndexName:                   GPUIndexFunc})
@@ -303,7 +305,6 @@ type serverData struct {
 	RequesterDeleteRequested bool
 }
 
-// nolint
 type launcherData struct {
 	// Instances is a map,
 	// where key is an instance's ID which is the instance's nominal hash,
@@ -313,7 +314,6 @@ type launcherData struct {
 	// Accurate indicates whether the set of nominal hashes in Instances is accurate.
 	Accurate bool
 }
-
 type queueItem interface {
 	// process returns (err error, retry bool).
 	// There will be a retry iff `retry`, error logged if `err != nil`.
@@ -371,6 +371,17 @@ func inferenceServerConfigIndexFunc(obj any) ([]string, error) {
 	return []string{inferenceServerConfigName}, nil
 }
 
+// launcherConfigHashIndexName is the name under which launcherConfigHashIndexFunc
+// is registered as an indexer on the Pod informer.
+const launcherConfigHashIndexName = "launcherconfighash"
+
+// launcherConfigHashIndexFunc indexes a Pod by the value of its launcher-config-hash
+// annotation. A Pod whose annotation is missing or empty gets no index entries.
+// obj must be a *corev1.Pod; any other type makes the type assertion panic.
+func launcherConfigHashIndexFunc(obj any) ([]string, error) {
+	pod := obj.(*corev1.Pod)
+	if hash := pod.Annotations[ctlrcommon.LauncherConfigHashAnnotationKey]; hash != "" {
+		return []string{hash}, nil
+	}
+	return []string{}, nil
+}
+
 const requesterIndexName = "requester"
 
 func requesterIndexFunc(obj any) ([]string, error) {
@@ -627,6 +638,7 @@ func (ctl *controller) getNodeData(nodeName string) *nodeData {
 		ans = &nodeData{
 			Items:            sets.New[itemOnNode](),
 			InferenceServers: make(map[apitypes.UID]*serverData),
+			Launchers:        make(map[string]*launcherData),
 		}
 		ctl.nodeNameToData[nodeName] = ans
 	}
@@ -660,6 +672,19 @@ func (ctl *controller) getServerData(nodeDat *nodeData, reqName string, reqUID a
 	return ans
 }
 
+// getLauncherData returns the launcherData that nodeDat holds for the launcher Pod
+// named launcherPodName. If there is none yet, it creates one with an empty
+// Instances map, stores it in nodeDat.Launchers, and returns it.
+// The lookup and the insertion both happen while holding ctl.mutex.
+func (ctl *controller) getLauncherData(nodeDat *nodeData, launcherPodName string) *launcherData {
+	ctl.mutex.Lock()
+	defer ctl.mutex.Unlock()
+	if existing := nodeDat.Launchers[launcherPodName]; existing != nil {
+		return existing
+	}
+	created := &launcherData{Instances: make(map[string]time.Time)}
+	nodeDat.Launchers[launcherPodName] = created
+	return created
+}
+
 func (ctl *controller) clearServerData(nodeDat *nodeData, uid apitypes.UID) {
 	ctl.mutex.Lock()
 	defer ctl.mutex.Unlock()