@@ -18,12 +18,20 @@ package launcherpopulator
1818
1919import (
2020 "context"
21+ "crypto/sha256"
22+ "encoding/base64"
23+ "encoding/json"
2124 "fmt"
2225
26+ "github.com/llm-d-incubation/llm-d-fast-model-actuation/pkg/api"
27+ dualpods "github.com/llm-d-incubation/llm-d-fast-model-actuation/pkg/controller/dual-pods"
28+ "github.com/llm-d-incubation/llm-d-fast-model-actuation/pkg/controller/utils"
2329 corev1 "k8s.io/api/core/v1"
2430 apierrors "k8s.io/apimachinery/pkg/api/errors"
31+ "k8s.io/apimachinery/pkg/api/resource"
2532 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2633 "k8s.io/apimachinery/pkg/labels"
34+ "k8s.io/apimachinery/pkg/util/intstr"
2735 "k8s.io/utils/ptr"
2836
2937 corev1preinformers "k8s.io/client-go/informers/core/v1"
@@ -348,7 +356,10 @@ func (ctl *controller) createLaunchers(ctx context.Context, node corev1.Node, ke
348356
349357 // Create the specified number of launcher pods
350358 for i := 0 ; i < count ; i ++ {
351- pod := ctl .buildPodFromTemplate (launcherConfig .Spec .PodTemplate , key )
359+ pod , err := ctl .buildPodFromTemplate (launcherConfig .Spec .PodTemplate , key )
360+ if err != nil {
361+ return fmt .Errorf ("failed to build launcher pod: %w" , err )
362+ }
352363 pod .GenerateName = fmt .Sprintf ("launcher-%s-" , launcherConfig .Name )
353364 // Set owner reference pointing to LauncherConfig
354365 ownerRef := * metav1 .NewControllerRef (launcherConfig , fmav1alpha1 .SchemeGroupVersion .WithKind ("LauncherConfig" ))
@@ -380,10 +391,10 @@ func (ctl *controller) deleteExcessLaunchers(ctx context.Context, launchers []co
380391}
381392
382393// buildPodFromTemplate creates a pod from a template and assigns it to a node
383- func (ctl * controller ) buildPodFromTemplate (template corev1.PodTemplateSpec , key NodeLauncherKey ) * corev1.Pod {
394+ func (ctl * controller ) buildPodFromTemplate (template corev1.PodTemplateSpec , key NodeLauncherKey ) ( * corev1.Pod , error ) {
384395 pod := & corev1.Pod {
385396 ObjectMeta : template .ObjectMeta ,
386- Spec : template .Spec ,
397+ Spec : * utils . DeIndividualize ( template .Spec . DeepCopy ()) ,
387398 }
388399 pod .Namespace = ctl .namespace
389400 // Ensure labels are set
@@ -394,7 +405,99 @@ func (ctl *controller) buildPodFromTemplate(template corev1.PodTemplateSpec, key
394405 pod .Labels [LauncherGeneratedByLabelKey ] = LauncherGeneratedByLabelValue
395406 pod .Labels [LauncherConfigNameLabelKey ] = key .LauncherConfigName
396407 pod .Labels [NodeNameLabelKey ] = key .NodeName
408+ pod .Labels [api .SleepingLabelName ] = "false"
409+
410+ hasher := sha256 .New ()
411+ modifiedJSON , _ := json .Marshal (pod )
412+ hasher .Write (modifiedJSON )
413+ hasher .Write ([]byte (";gpus=" ))
414+ hasher .Write ([]byte ("all" )) //@TODO will be refined
415+ hasher .Write ([]byte (";node=" ))
416+ hasher .Write ([]byte (key .NodeName ))
417+ var modifiedHash [sha256 .Size ]byte
418+ modifiedHashSl := hasher .Sum (modifiedHash [:0 ])
419+ nominalHash := base64 .RawStdEncoding .EncodeToString (modifiedHashSl )
420+
421+ if pod .Annotations == nil {
422+ pod .Annotations = make (map [string ]string )
423+ }
424+ pod .Annotations = dualpods .MapSet (pod .Annotations , genctlr .NominalHashAnnotationKey , nominalHash )
425+
426+ cIdx , serverPort , err := utils .GetInferenceServerPort (pod )
427+ if err != nil {
428+ return nil , err
429+ }
430+ container := & pod .Spec .Containers [cIdx ]
431+
432+ // Configure required environment variables
433+ configureRequiredEnvVars (container )
434+
435+ // Set fixed liveness probe
436+ container .LivenessProbe = & corev1.Probe {
437+ ProbeHandler : corev1.ProbeHandler {
438+ HTTPGet : & corev1.HTTPGetAction {
439+ Path : "/health" ,
440+ Port : intstr .FromInt (int (serverPort )),
441+ Scheme : corev1 .URISchemeHTTP ,
442+ },
443+ },
444+ InitialDelaySeconds : 10 ,
445+ PeriodSeconds : 20 ,
446+ TimeoutSeconds : 1 ,
447+ SuccessThreshold : 1 ,
448+ FailureThreshold : 3 ,
449+ }
450+
451+ // Remove nvidia.com/gpu from resource limits
452+ removeGPUResourceLimits (container )
453+
454+ // Remove nvidia.com/gpu from Pod-level resource overhead
455+ if pod .Spec .Overhead != nil {
456+ delete (pod .Spec .Overhead , corev1 .ResourceName ("nvidia.com/gpu" ))
457+ }
458+
397459 // Assign to specific node
398460 pod .Spec .NodeName = key .NodeName
399- return pod
461+ return pod , nil
462+ }
463+
464+ // configureRequiredEnvVars adds or updates required environment variables
465+ func configureRequiredEnvVars (container * corev1.Container ) {
466+ envVars := map [string ]string {
467+ "PYTHONPATH" : "/app" ,
468+ "NVIDIA_VISIBLE_DEVICES" : "all" ,
469+ "NVIDIA_DRIVER_CAPABILITIES" : "compute,utility" ,
470+ "VLLM_SERVER_DEV_MODE" : "1" ,
471+ }
472+
473+ // Create a mapping of existing environment variables for easy lookup
474+ existingEnv := make (map [string ]* corev1.EnvVar )
475+ for i := range container .Env {
476+ envVar := & container .Env [i ]
477+ existingEnv [envVar .Name ] = envVar
478+ }
479+
480+ // Add or update required environment variables
481+ for envName , envValue := range envVars {
482+ if envVar , exists := existingEnv [envName ]; exists {
483+ // If it already exists, update its value
484+ envVar .Value = envValue
485+ } else {
486+ // If it doesn't exist, add a new environment variable
487+ container .Env = append (container .Env , corev1.EnvVar {
488+ Name : envName ,
489+ Value : envValue ,
490+ })
491+ }
492+ }
493+ }
494+
495+ // removeGPUResourceLimits removes nvidia.com/gpu from container resource limits and requests
496+ func removeGPUResourceLimits (container * corev1.Container ) {
497+ if container .Resources .Limits != nil {
498+ container .Resources .Limits [corev1 .ResourceName ("nvidia.com/gpu" )] = resource .MustParse ("0" )
499+ }
500+ if container .Resources .Requests != nil {
501+ container .Resources .Requests [corev1 .ResourceName ("nvidia.com/gpu" )] = resource .MustParse ("0" )
502+ }
400503}
0 commit comments