Skip to content

Commit 122494e

Browse files
Merge pull request llm-d-incubation#193 from MikeSpreitzer/tweak-policy
🌱 Tweak launcher pod population policy
2 parents c55285c + d261d4f commit 122494e

25 files changed

+668
-889
lines changed

api/fma/v1alpha1/launcherpoolpolicy_types.go

Lines changed: 0 additions & 186 deletions
This file was deleted.
Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
/*
2+
Copyright 2025 The llm-d Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package v1alpha1
18+
19+
import (
20+
corev1 "k8s.io/api/core/v1"
21+
"k8s.io/apimachinery/pkg/api/resource"
22+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
23+
)
24+
25+
// LauncherPopulationPolicy defines the policy for proactive creation of launcher Pods
26+
// for a given type of Node.
27+
// Here a "type" of Node is the set of Nodes described by an EnhancedNodeSelector
// (see LauncherPopulationPolicySpec).
28+
// All the LauncherPopulationPolicy objects together define a map,
29+
// from (Node, LauncherConfig) to count.
30+
// Call this map `PopulationPolicy`.
31+
// When multiple CountForLauncher apply to the same (Node, LauncherConfig) pair
32+
// the maximum of their counts is what appears in `PopulationPolicy`.
33+
// When no CountForLauncher applies to a given (Node, LauncherConfig),
34+
// `PopulationPolicy` implicitly maps that pair to zero.
35+
//
36+
// The collective meaning of all the LauncherPopulationPolicy objects
37+
// and all the server-requesting Pods is that for a given (Node, LauncherConfig)
38+
// the number of launchers that should exist is the larger of
39+
// (a) what `PopulationPolicy` says for that pair, and
40+
// (b) the number needed to satisfy the server-requesting Pods.
41+
//
42+
// +genclient
43+
// +kubebuilder:object:root=true
44+
// +kubebuilder:subresource:status
45+
// +kubebuilder:resource:shortName=lpp
46+
type LauncherPopulationPolicy struct {
47+
metav1.TypeMeta `json:",inline"`
48+
metav1.ObjectMeta `json:"metadata,omitempty"`
49+
50+
// Spec is the desired policy for one type of Node.
Spec LauncherPopulationPolicySpec `json:"spec,omitempty"`
51+
// Status carries the observed generation and any errors found in the spec.
Status LauncherPopulationPolicyStatus `json:"status,omitempty"`
52+
}
53+
54+
// LauncherPopulationPolicySpec defines policy for one type of Node:
// which Nodes are targeted and how many launchers each LauncherConfig
// should have on them.
55+
type LauncherPopulationPolicySpec struct {
56+
// EnhancedNodeSelector describes the hardware characteristics of target nodes.
57+
//
58+
// It combines label-based matching with conditions on the node's
59+
// allocatable resources.
60+
// For example:
61+
//
//	enhancedNodeSelector:
62+
//	  # 1. Label selector (compatible with existing metav1.LabelSelector)
63+
//	  labelSelector:
64+
//	    matchLabels:
65+
//	      nvidia.com/gpu.family: ada-lovelace
66+
//	    matchExpressions:
67+
//	      - key: node.kubernetes.io/instance-type
68+
//	        operator: In
69+
//	        values: ["gx3-48x240x2l40s", "gx3-96x480x4l40s"]
70+
//
71+
//	  # 2. Resource condition selector (new capability)
72+
//	  allocatableResources:
73+
//	    cpu: {min: "16", max: "64"}
74+
//	    memory: {min: 128Gi, max: 512Gi}
75+
//	    "nvidia.com/gpu": {min: 8, max: 8}
//
76+
// +required
77+
EnhancedNodeSelector EnhancedNodeSelector `json:"enhancedNodeSelector"`
78+
79+
// CountForLauncher declares the desired number of launchers on the
80+
// relevant Node, for various LauncherConfigs.
81+
// +required
82+
CountForLauncher []CountForLauncher `json:"countForLauncher"`
83+
}
84+
85+
// EnhancedNodeSelector defines a node selector made of a label selector
// plus optional ranges on the node's allocatable resources.
86+
type EnhancedNodeSelector struct {
87+
// LabelSelector defines the label selector for a node.
88+
// +required
89+
LabelSelector metav1.LabelSelector `json:"labelSelector"`
90+
// AllocatableResources defines the required ranges for resources of a node;
// see ResourceRanges for which resources these are.
91+
// +optional
92+
AllocatableResources ResourceRanges `json:"allocatableResources,omitempty"`
93+
}
94+
95+
// ResourceRanges defines the required range for some resources,
// keyed by resource name.
96+
// These are the same sort of resources that appear in the `.status.allocatable`
97+
// of a Node object.
98+
type ResourceRanges map[corev1.ResourceName]ResourceRange
99+
100+
// ResourceRange defines a range by inclusive minimum and maximum quantity values.
// Each bound is optional; an omitted bound leaves that side of the range open.
101+
type ResourceRange struct {
102+
// Min specifies the minimum quantity required,
103+
// or is `null` to signal that there is no lower bound.
104+
// +optional
105+
Min *resource.Quantity `json:"min,omitempty"`
106+
107+
// Max specifies the maximum quantity allowed,
108+
// or is `null` to signal that there is no upper bound.
109+
// +optional
110+
Max *resource.Quantity `json:"max,omitempty"`
111+
}
112+
113+
// CountForLauncher pairs a LauncherConfig, identified by name, with the
// number of launchers desired for it on a relevant Node.
// See LauncherPopulationPolicy for how counts from several policies combine.
type CountForLauncher struct {
114+
// LauncherConfigName is the name of the LauncherConfig this policy applies to.
115+
// +required
116+
LauncherConfigName string `json:"launcherConfigName"`
117+
118+
// LauncherCount is the total number of launcher pods to maintain
// for that LauncherConfig on a relevant Node.
119+
// +required
120+
LauncherCount int32 `json:"launcherCount"`
121+
}
122+
123+
// LauncherPopulationPolicyStatus reports what the controller has observed
// about a LauncherPopulationPolicy: the generation it last processed and
// any problems found in that generation's desired state.
type LauncherPopulationPolicyStatus struct {
124+
// `observedGeneration` is the `metadata.generation` last seen by the controller.
125+
// +optional
126+
ObservedGeneration int64 `json:"observedGeneration,omitempty"`
127+
128+
// `errors` reports problems seen in the desired state of this object;
129+
// in particular, in the version reported by `observedGeneration`.
130+
// +optional
131+
Errors []string `json:"errors,omitempty"`
132+
// TODO: add further status fields if needed (e.g., current idle pod counts).
133+
}
134+
135+
// LauncherPopulationPolicyList contains a list of LauncherPopulationPolicy resources.
136+
// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
137+
type LauncherPopulationPolicyList struct {
138+
metav1.TypeMeta `json:",inline"`
139+
metav1.ListMeta `json:"metadata,omitempty"`
140+
// Items holds the LauncherPopulationPolicy objects in this list.
Items []LauncherPopulationPolicy `json:"items"`
141+
}

0 commit comments

Comments
 (0)