Skip to content

Commit 040aa2b

Browse files
Merge pull request llm-d-incubation#217 from osswangxining/polish-launcher-pod-template
Polish launcher pod template
2 parents c307ae8 + 779effd commit 040aa2b

File tree

2 files changed

+110
-4
lines changed

2 files changed

+110
-4
lines changed
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
package generic
2+
3+
const NominalHashAnnotationKey = "dual-pods.llm-d.ai/nominal"

pkg/controller/launcher-populator/populator.go

Lines changed: 107 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,20 @@ package launcherpopulator
1818

1919
import (
2020
"context"
21+
"crypto/sha256"
22+
"encoding/base64"
23+
"encoding/json"
2124
"fmt"
2225

26+
"github.com/llm-d-incubation/llm-d-fast-model-actuation/pkg/api"
27+
dualpods "github.com/llm-d-incubation/llm-d-fast-model-actuation/pkg/controller/dual-pods"
28+
"github.com/llm-d-incubation/llm-d-fast-model-actuation/pkg/controller/utils"
2329
corev1 "k8s.io/api/core/v1"
2430
apierrors "k8s.io/apimachinery/pkg/api/errors"
31+
"k8s.io/apimachinery/pkg/api/resource"
2532
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2633
"k8s.io/apimachinery/pkg/labels"
34+
"k8s.io/apimachinery/pkg/util/intstr"
2735
"k8s.io/utils/ptr"
2836

2937
corev1preinformers "k8s.io/client-go/informers/core/v1"
@@ -348,7 +356,10 @@ func (ctl *controller) createLaunchers(ctx context.Context, node corev1.Node, ke
348356

349357
// Create the specified number of launcher pods
350358
for i := 0; i < count; i++ {
351-
pod := ctl.buildPodFromTemplate(launcherConfig.Spec.PodTemplate, key)
359+
pod, err := ctl.buildPodFromTemplate(launcherConfig.Spec.PodTemplate, key)
360+
if err != nil {
361+
return fmt.Errorf("failed to build launcher pod: %w", err)
362+
}
352363
pod.GenerateName = fmt.Sprintf("launcher-%s-", launcherConfig.Name)
353364
// Set owner reference pointing to LauncherConfig
354365
ownerRef := *metav1.NewControllerRef(launcherConfig, fmav1alpha1.SchemeGroupVersion.WithKind("LauncherConfig"))
@@ -380,10 +391,10 @@ func (ctl *controller) deleteExcessLaunchers(ctx context.Context, launchers []co
380391
}
381392

382393
// buildPodFromTemplate creates a pod from a template and assigns it to a node
383-
func (ctl *controller) buildPodFromTemplate(template corev1.PodTemplateSpec, key NodeLauncherKey) *corev1.Pod {
394+
func (ctl *controller) buildPodFromTemplate(template corev1.PodTemplateSpec, key NodeLauncherKey) (*corev1.Pod, error) {
384395
pod := &corev1.Pod{
385396
ObjectMeta: template.ObjectMeta,
386-
Spec: template.Spec,
397+
Spec: *utils.DeIndividualize(template.Spec.DeepCopy()),
387398
}
388399
pod.Namespace = ctl.namespace
389400
// Ensure labels are set
@@ -394,7 +405,99 @@ func (ctl *controller) buildPodFromTemplate(template corev1.PodTemplateSpec, key
394405
pod.Labels[LauncherGeneratedByLabelKey] = LauncherGeneratedByLabelValue
395406
pod.Labels[LauncherConfigNameLabelKey] = key.LauncherConfigName
396407
pod.Labels[NodeNameLabelKey] = key.NodeName
408+
pod.Labels[api.SleepingLabelName] = "false"
409+
410+
hasher := sha256.New()
411+
modifiedJSON, _ := json.Marshal(pod)
412+
hasher.Write(modifiedJSON)
413+
hasher.Write([]byte(";gpus="))
414+
hasher.Write([]byte("all")) //@TODO will be refined
415+
hasher.Write([]byte(";node="))
416+
hasher.Write([]byte(key.NodeName))
417+
var modifiedHash [sha256.Size]byte
418+
modifiedHashSl := hasher.Sum(modifiedHash[:0])
419+
nominalHash := base64.RawStdEncoding.EncodeToString(modifiedHashSl)
420+
421+
if pod.Annotations == nil {
422+
pod.Annotations = make(map[string]string)
423+
}
424+
pod.Annotations = dualpods.MapSet(pod.Annotations, genctlr.NominalHashAnnotationKey, nominalHash)
425+
426+
cIdx, serverPort, err := utils.GetInferenceServerPort(pod)
427+
if err != nil {
428+
return nil, err
429+
}
430+
container := &pod.Spec.Containers[cIdx]
431+
432+
// Configure required environment variables
433+
configureRequiredEnvVars(container)
434+
435+
// Set fixed liveness probe
436+
container.LivenessProbe = &corev1.Probe{
437+
ProbeHandler: corev1.ProbeHandler{
438+
HTTPGet: &corev1.HTTPGetAction{
439+
Path: "/health",
440+
Port: intstr.FromInt(int(serverPort)),
441+
Scheme: corev1.URISchemeHTTP,
442+
},
443+
},
444+
InitialDelaySeconds: 10,
445+
PeriodSeconds: 20,
446+
TimeoutSeconds: 1,
447+
SuccessThreshold: 1,
448+
FailureThreshold: 3,
449+
}
450+
451+
// Remove nvidia.com/gpu from resource limits
452+
removeGPUResourceLimits(container)
453+
454+
// Remove nvidia.com/gpu from Pod-level resource overhead
455+
if pod.Spec.Overhead != nil {
456+
delete(pod.Spec.Overhead, corev1.ResourceName("nvidia.com/gpu"))
457+
}
458+
397459
// Assign to specific node
398460
pod.Spec.NodeName = key.NodeName
399-
return pod
461+
return pod, nil
462+
}
463+
464+
// configureRequiredEnvVars adds or updates required environment variables
465+
func configureRequiredEnvVars(container *corev1.Container) {
466+
envVars := map[string]string{
467+
"PYTHONPATH": "/app",
468+
"NVIDIA_VISIBLE_DEVICES": "all",
469+
"NVIDIA_DRIVER_CAPABILITIES": "compute,utility",
470+
"VLLM_SERVER_DEV_MODE": "1",
471+
}
472+
473+
// Create a mapping of existing environment variables for easy lookup
474+
existingEnv := make(map[string]*corev1.EnvVar)
475+
for i := range container.Env {
476+
envVar := &container.Env[i]
477+
existingEnv[envVar.Name] = envVar
478+
}
479+
480+
// Add or update required environment variables
481+
for envName, envValue := range envVars {
482+
if envVar, exists := existingEnv[envName]; exists {
483+
// If it already exists, update its value
484+
envVar.Value = envValue
485+
} else {
486+
// If it doesn't exist, add a new environment variable
487+
container.Env = append(container.Env, corev1.EnvVar{
488+
Name: envName,
489+
Value: envValue,
490+
})
491+
}
492+
}
493+
}
494+
495+
// removeGPUResourceLimits removes nvidia.com/gpu from container resource limits and requests
496+
func removeGPUResourceLimits(container *corev1.Container) {
497+
if container.Resources.Limits != nil {
498+
container.Resources.Limits[corev1.ResourceName("nvidia.com/gpu")] = resource.MustParse("0")
499+
}
500+
if container.Resources.Requests != nil {
501+
container.Resources.Requests[corev1.ResourceName("nvidia.com/gpu")] = resource.MustParse("0")
502+
}
400503
}

0 commit comments

Comments
 (0)