fix: Fix topology spread constraints with zonal volume #1907
Changes from all commits
```diff
@@ -254,11 +254,11 @@ func (p *Provisioner) NewScheduler(
 		instanceTypes[np.Name] = its
 	}
 
-	// inject topology constraints
-	pods = p.injectVolumeTopologyRequirements(ctx, pods)
+	// Link volume requirements to pods
+	podsVolumeRequirements := p.convertToPodVolumeRequirements(ctx, pods)
 
 	// Calculate cluster topology
-	topology, err := scheduler.NewTopology(ctx, p.kubeClient, p.cluster, stateNodes, nodePools, instanceTypes, pods)
+	topology, err := scheduler.NewTopology(ctx, p.kubeClient, p.cluster, stateNodes, nodePools, instanceTypes, pods, podsVolumeRequirements)
 	if err != nil {
 		return nil, fmt.Errorf("tracking topology counts, %w", err)
 	}
@@ -464,13 +464,13 @@ func validateKarpenterManagedLabelCanExist(p *corev1.Pod) error {
 	return nil
 }
 
-func (p *Provisioner) injectVolumeTopologyRequirements(ctx context.Context, pods []*corev1.Pod) []*corev1.Pod {
-	var schedulablePods []*corev1.Pod
+func (p *Provisioner) convertToPodVolumeRequirements(ctx context.Context, pods []*corev1.Pod) map[*corev1.Pod][]corev1.NodeSelectorRequirement {
+	var schedulablePods = make(map[*corev1.Pod][]corev1.NodeSelectorRequirement)
```
Member:
nit: this project uses this style for map initialization. Also, I think this name is more representative of what we're storing.
Suggested change
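The body of the suggested change isn't shown above. A minimal sketch of the style the reviewer is likely pointing at, assuming a map-literal initialization and a name that describes the stored mapping (both are assumptions, not the actual suggestion):

```go
// Sketch only: initialize with a map literal rather than make(), and name the
// variable after what it stores (a pod -> volume-requirements mapping).
podVolumeRequirements := map[*corev1.Pod][]corev1.NodeSelectorRequirement{}
```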
```diff
 	for _, pod := range pods {
-		if err := p.volumeTopology.Inject(ctx, pod); err != nil {
+		if requirements, err := p.volumeTopology.GetVolumeRequirements(ctx, pod); err != nil {
 			log.FromContext(ctx).WithValues("Pod", klog.KObj(pod)).Error(err, "failed getting volume topology requirements")
 		} else {
-			schedulablePods = append(schedulablePods, pod)
+			schedulablePods[pod] = requirements
 		}
 	}
 	return schedulablePods
```
```diff
@@ -108,7 +108,7 @@ func NewNodeClaim(
 	}
 }
 
-func (n *NodeClaim) Add(ctx context.Context, pod *corev1.Pod, podData *PodData) error {
+func (n *NodeClaim) Add(ctx context.Context, pod *corev1.Pod, podData *PodData, volumeRequirements []corev1.NodeSelectorRequirement) error {
 	// Check Taints
 	if err := scheduling.Taints(n.Spec.Taints).ToleratesPod(pod); err != nil {
 		return err
@@ -137,6 +137,13 @@ func (n *NodeClaim) Add(ctx context.Context, pod *corev1.Pod, podData *PodData)
 	}
 	nodeClaimRequirements.Add(topologyRequirements.Values()...)
 
+	podVolumeRequirements := scheduling.NewNodeSelectorRequirements(volumeRequirements...)
+	// Check Pod Volume Requirements
+	if err = nodeClaimRequirements.Compatible(podVolumeRequirements, scheduling.AllowUndefinedWellKnownLabels); err != nil {
+		return err
```
Member:
We should wrap this error, this will be propagated out to the user if we fail to schedule the pod.
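A minimal sketch of the wrapping the reviewer is asking for, mirroring the `fmt.Errorf("..., %w", err)` style already used in this diff (the exact message is an assumption):

```go
// Assumed wording; wrapping preserves the underlying error while telling the
// user which check failed when the pod cannot be scheduled.
if err = nodeClaimRequirements.Compatible(podVolumeRequirements, scheduling.AllowUndefinedWellKnownLabels); err != nil {
	return fmt.Errorf("incompatible with pod volume requirements, %w", err)
}
```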
```diff
+	}
+	nodeClaimRequirements.Add(podVolumeRequirements.Values()...)
 
 	// Check instance type combinations
 	requests := resources.Merge(n.Spec.Resources.Requests, podData.Requests)
```
```diff
@@ -60,6 +60,9 @@ type Topology struct {
 	excludedPods          sets.Set[string]
 	cluster               *state.Cluster
 	stateNodes            []*state.StateNode
+	// podVolumeRequirements links volume requirements to pods. This is used so we
+	// can track the volume requirements in simulate scheduler
+	podVolumeRequirements map[*corev1.Pod][]corev1.NodeSelectorRequirement
```
Member:
I would use the pod's UID as the key here rather than a pointer to the pod object. We still use the pod object as a key elsewhere in the project, but we've moved to pod UID here (see
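A minimal sketch of the UID-keyed shape the reviewer is describing (other struct fields are elided, and the lookup helper is illustrative, not part of the PR):

```go
import (
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/types"
)

// Illustrative only: key the mapping by the pod's UID instead of a *corev1.Pod pointer.
type Topology struct {
	// ... other fields elided ...
	podVolumeRequirements map[types.UID][]corev1.NodeSelectorRequirement
}

// Lookups then go through pod.UID rather than the pod pointer itself.
func (t *Topology) volumeRequirementsFor(pod *corev1.Pod) []corev1.NodeSelectorRequirement {
	return t.podVolumeRequirements[pod.UID]
}
```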
```diff
 }
 
 func NewTopology(
@@ -70,6 +73,9 @@ func NewTopology(
 	nodePools []*v1.NodePool,
 	instanceTypes map[string][]*cloudprovider.InstanceType,
 	pods []*corev1.Pod,
+	// podVolumeRequirements links volume requirements to pods. This is used so we
+	// can track the volume requirements in simulate scheduler
+	podsVolumeRequirements map[*corev1.Pod][]corev1.NodeSelectorRequirement,
 ) (*Topology, error) {
 	t := &Topology{
 		kubeClient: kubeClient,
@@ -79,17 +85,18 @@
 		topologyGroups:        map[uint64]*TopologyGroup{},
 		inverseTopologyGroups: map[uint64]*TopologyGroup{},
 		excludedPods:          sets.New[string](),
+		podVolumeRequirements: podsVolumeRequirements,
 	}
 
 	// these are the pods that we intend to schedule, so if they are currently in the cluster we shouldn't count them for
 	// topology purposes
-	for _, p := range pods {
+	for p := range podsVolumeRequirements {
```
Member:
Why are we iterating over the pods stored as keys in
```diff
 		t.excludedPods.Insert(string(p.UID))
 	}
 
 	errs := t.updateInverseAffinities(ctx)
-	for i := range pods {
-		errs = multierr.Append(errs, t.Update(ctx, pods[i]))
+	for p := range podsVolumeRequirements {
```
Member:
Same comment here - we should still be using

Author:
@jmdeal Since we need to know whether the pod has
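One way to reconcile the two positions, sketched under the assumption of a UID-keyed map as suggested above (names are illustrative): keep ranging over `pods`, and use the comma-ok form of the lookup to learn whether a given pod carries volume requirements.

```go
// Illustrative sketch: range over the pods being scheduled and consult the
// UID-keyed map to decide whether a pod has zonal volume requirements.
for _, p := range pods {
	if _, hasVolumeRequirements := podVolumeRequirements[p.UID]; hasVolumeRequirements {
		// this pod is constrained by a zonal volume; its topology domains may need special handling
	}
	errs = multierr.Append(errs, t.Update(ctx, p))
}
```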
```diff
+		errs = multierr.Append(errs, t.Update(ctx, p))
 	}
 	if errs != nil {
 		return nil, errs
@@ -228,7 +235,7 @@ func (t *Topology) AddRequirements(p *corev1.Pod, taints []corev1.Taint, podRequ
 	if nodeRequirements.Has(topology.Key) {
 		nodeDomains = nodeRequirements.Get(topology.Key)
 	}
-	domains := topology.Get(p, podDomains, nodeDomains)
+	domains := topology.Get(p, podDomains, nodeDomains, len(t.podVolumeRequirements[p]) != 0)
 	if domains.Len() == 0 {
 		return nil, topologyError{
 			topology: topology,
@@ -299,7 +306,7 @@ func (t *Topology) updateInverseAntiAffinity(ctx context.Context, pod *corev1.Po
 		return err
 	}
 
-	tg := NewTopologyGroup(TopologyTypePodAntiAffinity, term.TopologyKey, pod, namespaces, term.LabelSelector, math.MaxInt32, nil, nil, nil, t.domainGroups[term.TopologyKey])
+	tg := NewTopologyGroup(TopologyTypePodAntiAffinity, term.TopologyKey, pod, namespaces, term.LabelSelector, math.MaxInt32, nil, nil, nil, t.domainGroups[term.TopologyKey], t.cluster)
 
 	hash := tg.Hash()
 	if existing, ok := t.inverseTopologyGroups[hash]; !ok {
@@ -442,6 +449,7 @@ func (t *Topology) newForTopologies(p *corev1.Pod) []*TopologyGroup {
 			tsc.NodeTaintsPolicy,
 			tsc.NodeAffinityPolicy,
 			t.domainGroups[tsc.TopologyKey],
+			t.cluster,
 		))
 	}
 	return topologyGroups
@@ -479,7 +487,7 @@ func (t *Topology) newForAffinities(ctx context.Context, p *corev1.Pod) ([]*Topo
 			if err != nil {
 				return nil, err
 			}
-			topologyGroups = append(topologyGroups, NewTopologyGroup(topologyType, term.TopologyKey, p, namespaces, term.LabelSelector, math.MaxInt32, nil, nil, nil, t.domainGroups[term.TopologyKey]))
+			topologyGroups = append(topologyGroups, NewTopologyGroup(topologyType, term.TopologyKey, p, namespaces, term.LabelSelector, math.MaxInt32, nil, nil, nil, t.domainGroups[term.TopologyKey], t.cluster))
 		}
 	}
 	return topologyGroups, nil
```
Review comment:
We're not really converting anything here, right? We're just creating a mapping between pods and their volume requirements. I think something along these lines is more accurate.
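The concrete wording of the suggestion isn't shown; a hypothetical rename in the direction the comment points, with the body kept the same as `convertToPodVolumeRequirements` in the diff above (the name itself is an assumption):

```go
// Hypothetical name only; the logic is unchanged from the PR's implementation.
func (p *Provisioner) podVolumeRequirements(ctx context.Context, pods []*corev1.Pod) map[*corev1.Pod][]corev1.NodeSelectorRequirement {
	requirements := map[*corev1.Pod][]corev1.NodeSelectorRequirement{}
	for _, pod := range pods {
		if reqs, err := p.volumeTopology.GetVolumeRequirements(ctx, pod); err != nil {
			log.FromContext(ctx).WithValues("Pod", klog.KObj(pod)).Error(err, "failed getting volume topology requirements")
		} else {
			requirements[pod] = reqs
		}
	}
	return requirements
}
```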