Skip to content

Commit 105a60b

Browse files
committed
Improve debugging
Signed-off-by: Mike Spreitzer <mspreitz@us.ibm.com>
1 parent 05a411c commit 105a60b

File tree

2 files changed

+63
-24
lines changed

2 files changed

+63
-24
lines changed

pkg/controller/dual-pods/inference-server.go

Lines changed: 28 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -106,10 +106,13 @@ func (item launcherPodItem) process(ctx context.Context, ctl *controller, nodeDa
106106

107107
func (item infSvrItem) process(urCtx context.Context, ctl *controller, nodeDat *nodeData) (error, bool) {
108108
logger := klog.FromContext(urCtx).WithValues("serverUID", item.UID, "requesterName", item.RequesterName)
109+
serverDat := ctl.getServerData(nodeDat, item.RequesterName, item.UID)
110+
if serverDat.InstanceID != "" {
111+
logger = logger.WithValues("instanceID", serverDat.InstanceID)
112+
}
109113
ctx := klog.NewContext(urCtx, logger)
110114
requesterRV := "(non existent)"
111115
providerRV := "(non existent)"
112-
serverDat := ctl.getServerData(nodeDat, item.RequesterName, item.UID)
113116
var requesterDeletionTimestamp, providerDeletionTimestamp *string
114117
var requesterRCS, providerRCS *reducedContainerState
115118

@@ -356,11 +359,10 @@ func (item infSvrItem) process(urCtx context.Context, ctl *controller, nodeDat *
356359
serverDat.Sleeping = &sleeping
357360
}
358361
if *(serverDat.Sleeping) {
359-
err = ctl.wakeSleeper(ctx, serverDat, requestingPod, providingPod, serverPort)
362+
err = ctl.wakeSleeper(ctx, serverDat, requestingPod, providingPod, serverPort, "discovered-bound")
360363
if err != nil {
361364
return err, true
362365
}
363-
logger.V(2).Info("Woke discovered-bound inference server")
364366
}
365367
if err := ctl.ensureSleepingLabel(ctx, providingPod, *(serverDat.Sleeping)); err != nil {
366368
return err, true
@@ -433,7 +435,7 @@ func (item infSvrItem) process(urCtx context.Context, ctl *controller, nodeDat *
433435
logger.V(2).Info("Unexpected: multiple sleeping Pods match; using the first", "requesterName", requestingPod.Name)
434436
}
435437
providingPod = sleepingAnys[0].(*corev1.Pod)
436-
return ctl.bind(ctx, serverDat, requestingPod, providingPod, false, -1)
438+
return ctl.bind(ctx, serverDat, requestingPod, providingPod, nil, -1)
437439
}
438440
// What remains is to make a new server-providing Pod --- if the sleeper budget allows.
439441

@@ -531,7 +533,7 @@ func (item infSvrItem) process(urCtx context.Context, ctl *controller, nodeDat *
531533
}
532534
launcherDat.Instances[iscHash] = time.Now()
533535
// TODO(waltforme): the bind method may need more revision to fully handle launcher-based server providing Pods
534-
return ctl.bind(ctx, serverDat, requestingPod, launcherPod, true, int16(isc.Spec.ModelServerConfig.Port))
536+
return ctl.bind(ctx, serverDat, requestingPod, launcherPod, &iscHash, int16(isc.Spec.ModelServerConfig.Port))
535537
} else {
536538
// Slower path: create new instance in launcher with capacity
537539
logger.V(5).Info("Creating new vLLM instance", "iscHash", iscHash)
@@ -545,7 +547,7 @@ func (item infSvrItem) process(urCtx context.Context, ctl *controller, nodeDat *
545547
)
546548
launcherDat.Instances[iscHash] = time.Now()
547549
// TODO(waltforme): the bind method may need more revision to fully handle launcher-based server providing Pods
548-
return ctl.bind(ctx, serverDat, requestingPod, launcherPod, true, int16(isc.Spec.ModelServerConfig.Port))
550+
return ctl.bind(ctx, serverDat, requestingPod, launcherPod, &iscHash, int16(isc.Spec.ModelServerConfig.Port))
549551
}
550552
}
551553
}
@@ -693,11 +695,12 @@ func (ctl *controller) configInferenceServer(isc *fmav1alpha1.InferenceServerCon
693695

694696
func (ctl *controller) wakeupInstance(ctx context.Context, lClient *LauncherClient, instanceID string, instancePort int32) error {
695697
logger := klog.FromContext(ctx)
696-
err := doPost("http://" + lClient.baseURL.Hostname() + ":" + strconv.Itoa(int(instancePort)) + "/wake_up")
698+
endpoint := lClient.baseURL.Hostname() + ":" + strconv.Itoa(int(instancePort))
699+
err := doPost("http://" + endpoint + "/wake_up")
697700
if err != nil {
698-
return fmt.Errorf("failed to wake up vLLM instance %q: %w", instanceID, err)
701+
return fmt.Errorf("failed to wake up vLLM instance %q (at %s): %w", instanceID, endpoint, err)
699702
}
700-
logger.V(2).Info("Woke up vLLM instance", "instanceID", instanceID)
703+
logger.V(2).Info("Woke up vLLM instance", "instanceID", instanceID, "endpoint", endpoint)
701704
return nil
702705
}
703706

@@ -794,7 +797,8 @@ func (ctl *controller) enforceSleeperBudget(ctx context.Context, serverDat *serv
794797
}
795798

796799
// Note: instPort is used only for launcher-based server-providing Pods.
797-
func (ctl *controller) bind(ctx context.Context, serverDat *serverData, requestingPod, providingPod *corev1.Pod, launcherBased bool, instPort int16) (error, bool) {
800+
// instanceID is non-nil iff launcher-based
801+
func (ctl *controller) bind(ctx context.Context, serverDat *serverData, requestingPod, providingPod *corev1.Pod, instanceID *string, instPort int16) (error, bool) {
798802
logger := klog.FromContext(ctx)
799803
providingPod = providingPod.DeepCopy()
800804
providingPod.Annotations[requesterAnnotationKey] = string(requestingPod.UID) + " " + requestingPod.Name
@@ -807,8 +811,12 @@ func (ctl *controller) bind(ctx context.Context, serverDat *serverData, requesti
807811
if err != nil {
808812
return fmt.Errorf("failed to bind server-providing Pod %s: %w", providingPod.Name, err), true
809813
}
814+
launcherBased := instanceID != nil
810815
serverDat.ProvidingPodName = providingPod.Name
811-
logger.V(2).Info("Bound server-providing Pod", "name", providingPod.Name, "node", requestingPod.Spec.NodeName, "gpus", serverDat.GPUIDsStr, "newResourceVersion", echo.ResourceVersion)
816+
if launcherBased {
817+
serverDat.InstanceID = *instanceID
818+
}
819+
logger.V(2).Info("Bound server-providing Pod", "name", providingPod.Name, "node", requestingPod.Spec.NodeName, "gpus", serverDat.GPUIDsStr, "newResourceVersion", echo.ResourceVersion, "instanceID", serverDat.InstanceID)
812820
var serverPort int16
813821
if launcherBased {
814822
serverPort = instPort
@@ -824,25 +832,27 @@ func (ctl *controller) bind(ctx context.Context, serverDat *serverData, requesti
824832
if launcherBased {
825833
serverDat.ServerPort = serverPort
826834
}
827-
err = ctl.wakeSleeper(ctx, serverDat, requestingPod, providingPod, serverPort)
835+
err = ctl.wakeSleeper(ctx, serverDat, requestingPod, providingPod, serverPort, "freshly-bound")
828836
if err != nil {
829837
return err, true
830838
}
831-
logger.V(2).Info("Woke freshly-bound inference server", "providingPod", providingPod.Name)
832839
return ctl.ensureReqState(ctx, requestingPod, serverDat, !slices.Contains(requestingPod.Finalizers, requesterFinalizer), false)
833840
}
834841

835-
func (ctl *controller) wakeSleeper(ctx context.Context, serverDat *serverData, requestingPod, providingPod *corev1.Pod, serverPort int16) error {
842+
func (ctl *controller) wakeSleeper(ctx context.Context, serverDat *serverData, requestingPod, providingPod *corev1.Pod, serverPort int16, description string) error {
836843
if ctl.debugAccelMemory {
837844
if err := ctl.accelMemoryIsLowEnough(ctx, requestingPod, serverDat); err != nil {
838845
return err
839846
}
840847
}
841-
wakeURL := fmt.Sprintf("http://%s:%d/wake_up", providingPod.Status.PodIP, serverPort)
848+
endpoint := fmt.Sprintf("%s:%d", providingPod.Status.PodIP, serverPort)
849+
wakeURL := "http://" + endpoint + "/wake_up"
842850
err := doPost(wakeURL)
843851
if err != nil {
844852
return err
845853
}
854+
logger := klog.FromContext(ctx)
855+
logger.V(2).Info("Woke inference server", "endpoint", endpoint, "description", description)
846856
if err := ctl.ensureSleepingLabel(ctx, providingPod, false); err != nil {
847857
return err
848858
}
@@ -949,7 +959,8 @@ func (ctl *controller) ensureUnbound(ctx context.Context, serverDat *serverData,
949959
}
950960
}
951961
}
952-
sleepURL := fmt.Sprintf("http://%s:%d/sleep", providingPod.Status.PodIP, serverPort)
962+
endpoint := fmt.Sprintf("%s:%d", providingPod.Status.PodIP, serverPort)
963+
sleepURL := "http://" + endpoint + "/sleep"
953964
resp, err := http.Post(sleepURL, "", nil)
954965
if err != nil {
955966
return fmt.Errorf("failed to put provider %q to sleep, POST %s got error: %w", serverDat.ProvidingPodName, sleepURL, err)
@@ -958,7 +969,7 @@ func (ctl *controller) ensureUnbound(ctx context.Context, serverDat *serverData,
958969
return fmt.Errorf("failed to put provider %q to sleep, POST %s returned status %d", serverDat.ProvidingPodName, sleepURL, sc)
959970
}
960971
serverDat.Sleeping = ptr.To(true)
961-
logger.V(2).Info("Put inference server to sleep")
972+
logger.V(2).Info("Put inference server to sleep", "endpoint", endpoint)
962973
}
963974
providingPod = providingPod.DeepCopy()
964975
var aChange, fChange bool

test/e2e/run-launcher-e2e.sh

Lines changed: 35 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -43,12 +43,21 @@ cheer() {
4343
echo
4444
}
4545

46+
intro_case() {
47+
echo
48+
echo "====== Test case: $* ======"
49+
echo
50+
}
51+
4652
expect() {
4753
local elapsed=0
4854
local start=$(date)
4955
local limit=${POLL_LIMIT_SECS}
56+
echo "Expecting $1" >&2
5057
while true; do
51-
kubectl get pods -n "$NS" -L dual-pods.llm-d.ai/dual,dual-pods.llm-d.ai/sleeping
58+
if (( elapsed < 7 || elapsed+7 > POLL_LIMIT_SECS )); then
59+
kubectl get pods -n "$NS" -L dual-pods.llm-d.ai/dual,dual-pods.llm-d.ai/sleeping
60+
fi
5261
if eval "$1"; then return; fi
5362
if (( elapsed > limit )); then
5463
echo "Did not become true (from $start to $(date)): $1" >&2
@@ -63,7 +72,7 @@ expect() {
6372
# Create test objects
6473
# ---------------------------------------------------------------------------
6574

66-
: Basic Launcher Pod Creation
75+
intro_case Basic Launcher Pod Creation
6776

6877
objs=$("$MKOBJS_SCRIPT" -n "$NS")
6978
isc=$(echo $objs | awk '{print $1}')
@@ -87,14 +96,27 @@ expect "[ \$(kubectl get pods -n $NS -l dual-pods.llm-d.ai/launcher-config-name=
8796
echo "At least $READY_TARGET launcher pod(s) are Ready"
8897
kubectl get pods -n "$NS" -l dual-pods.llm-d.ai/launcher-config-name=$lc -o wide
8998

99+
trap 'echo "
100+
reqlb=${reqlb:-}
101+
reqlb2=${reqlb2:-}
102+
reqlb3=${reqlb3:-}
103+
reqlb4=${reqlb4:-}
104+
launcherlb=${launcherlb:-}
105+
launcherlb2=${launcherlb2:-}
106+
launcherlb3=${launcherlb3:-}
107+
launcherlb4=${launcherlb4:-}
108+
"' EXIT
109+
90110
# Expect requester pod to be created
91111
expect "kubectl get pods -n $NS -o name -l app=dp-example,instance=$instlb | wc -l | grep -w 1"
92112

93113
export reqlb=$(kubectl get pods -n "$NS" -o name -l app=dp-example,instance=$instlb | sed s%pod/%%)
114+
echo "Server-requesting Pod is $reqlb"
94115

95116
# Wait for launcher-to-requester binding, then capture the launcher name
96117
expect "kubectl get pods -n $NS -o name -l dual-pods.llm-d.ai/dual=$reqlb | wc -l | grep -w 1"
97118
export launcherlb=$(kubectl get pods -n "$NS" -o name -l dual-pods.llm-d.ai/dual=$reqlb | sed s%pod/%%)
119+
echo "Launcher Pod is $launcherlb"
98120

99121
# Verify requester is bound to launcher (bidirectional check)
100122
expect '[ "$(kubectl get pod -n '"$NS"' $reqlb -o jsonpath={.metadata.labels.dual-pods\\.llm-d\\.ai/dual})" == "$launcherlb" ]'
@@ -112,6 +134,7 @@ cheer Successful launcher-based pod creation
112134

113135
POLICIES_ENABLED="${POLICIES_ENABLED:-false}"
114136
if [ "$POLICIES_ENABLED" = true ]; then
137+
intro_case Admission policy enforcement
115138
if ! test/e2e/validate.sh; then
116139
echo "ERROR: CEL policy tests failed!" >&2
117140
exit 1
@@ -123,7 +146,7 @@ fi
123146
# Instance Wake-up Fast Path
124147
# ---------------------------------------------------------------------------
125148

126-
: Instance Wake-up Fast Path
149+
intro_case Instance Wake-up Fast Path
127150

128151
# Scale requester to 0 (instance should sleep in launcher)
129152
kubectl scale rs $rslb -n "$NS" --replicas=0
@@ -142,6 +165,7 @@ kubectl scale rs $rslb -n "$NS" --replicas=1
142165
expect "kubectl get pods -n $NS -o name -l app=dp-example,instance=$instlb | wc -l | grep -w 1"
143166

144167
reqlb2=$(kubectl get pods -n "$NS" -o name -l app=dp-example,instance=$instlb | sed s%pod/%%)
168+
echo "Server-requesting Pod2 is $reqlb2"
145169

146170
# Should still be using the same launcher pod
147171
expect "kubectl get pods -n $NS -o name -l dual-pods.llm-d.ai/dual=$reqlb2 | wc -l | grep -w 1"
@@ -162,7 +186,7 @@ cheer Successful instance wake-up fast path
162186
# Multiple Instances Share One Launcher
163187
# ---------------------------------------------------------------------------
164188

165-
: Multiple Instances Share One Launcher
189+
intro_case Multiple Instances Share One Launcher
166190

167191
# Scale requester to 0 again
168192
kubectl scale rs $rslb -n "$NS" --replicas=0
@@ -184,6 +208,7 @@ kubectl scale rs $rslb -n "$NS" --replicas=1
184208
expect "kubectl get pods -n $NS -o name -l app=dp-example,instance=$instlb | wc -l | grep -w 1"
185209

186210
reqlb3=$(kubectl get pods -n "$NS" -o name -l app=dp-example,instance=$instlb | sed s%pod/%%)
211+
echo "Server-requesting Pod3 is $reqlb3"
187212

188213
# Should still be using the same launcher pod
189214
expect "kubectl get pods -n $NS -o name -l dual-pods.llm-d.ai/dual=$reqlb3 | wc -l | grep -w 1"
@@ -204,7 +229,7 @@ cheer Successful multiple instances sharing one launcher
204229
# Switch Instances In One Launcher
205230
# ---------------------------------------------------------------------------
206231

207-
: Switch Instances In One Launcher
232+
intro_case Switch Instances In One Launcher
208233

209234
# Scale requester to 0 again
210235
kubectl scale rs $rslb -n "$NS" --replicas=0
@@ -226,6 +251,7 @@ kubectl scale rs $rslb -n "$NS" --replicas=1
226251
expect "kubectl get pods -n $NS -o name -l app=dp-example,instance=$instlb | wc -l | grep -w 1"
227252

228253
reqlb4=$(kubectl get pods -n "$NS" -o name -l app=dp-example,instance=$instlb | sed s%pod/%%)
254+
echo "Server-requesting Pod4 is $reqlb4"
229255

230256
# Should still be using the same launcher pod
231257
expect "kubectl get pods -n $NS -o name -l dual-pods.llm-d.ai/dual=$reqlb4 | wc -l | grep -w 1"
@@ -246,7 +272,7 @@ cheer Successful switching instances in one launcher
246272
# Controller Restart State Recovery
247273
# ---------------------------------------------------------------------------
248274

249-
: Controller Restart State Recovery
275+
intro_case Controller Restart State Recovery
250276

251277
# This test verifies that the controller can rebuild its internal state after restart
252278
# by syncing launcher instances from unbound launcher pods
@@ -258,6 +284,7 @@ expect "kubectl get pods -n $NS -o name -l app=dp-example,instance=$instlb | wc
258284

259285
# Verify launcher set is unchanged and target launcher is unbound
260286
launcher_count_pre_restart=$(kubectl get pods -n "$NS" -o name -l dual-pods.llm-d.ai/launcher-config-name=$lc | wc -l)
287+
echo launcher_count_pre_restart = $launcher_count_pre_restart
261288
kubectl get pods -n "$NS" -o name -l dual-pods.llm-d.ai/launcher-config-name=$lc | grep -x "pod/$launcherlb"
262289
expect '[ "$(kubectl get pod -n '"$NS"' $launcherlb -o jsonpath={.metadata.labels.dual-pods\\.llm-d\\.ai/dual})" == "" ]'
263290

@@ -309,7 +336,7 @@ cheer Successful controller restart state recovery
309336
# Unbound Launcher Deletion Cleanup
310337
# ---------------------------------------------------------------------------
311338

312-
: Unbound Launcher Deletion Cleanup
339+
intro_case Unbound Launcher Deletion Cleanup
313340

314341
# This test verifies that deleting an unbound launcher does not leave the controller
315342
# stuck with stale instance state.
@@ -327,6 +354,7 @@ kubectl scale rs $rslb -n "$NS" --replicas=1
327354

328355
expect "kubectl get pods -n $NS -o name -l app=dp-example,instance=$instlb | wc -l | grep -w 1"
329356
reqlb_after_delete=$(kubectl get pods -n "$NS" -o name -l app=dp-example,instance=$instlb | sed s%pod/%%)
357+
echo "Server-requesting Pod after delete = $reqlb_after_delete"
330358
expect "kubectl get pods -n $NS -o name -l dual-pods.llm-d.ai/dual=$reqlb_after_delete | wc -l | grep -w 1"
331359
launcherlb_after_delete=$(kubectl get pods -n "$NS" -o name -l dual-pods.llm-d.ai/dual=$reqlb_after_delete | sed s%pod/%%)
332360
[ "$launcherlb_after_delete" != "$launcherlb" ]

0 commit comments

Comments
 (0)