Skip to content
Closed
Show file tree
Hide file tree
Changes from 53 commits
Commits
Show all changes
57 commits
Select commit Hold shift + click to select a range
b66b446
Update skewer version on master branch
rakechill Feb 11, 2025
72665b3
Undo previous changes made by go mod vendor
rakechill Feb 20, 2025
ed62128
Update only skewer with go get dep@ver
rakechill Feb 20, 2025
c76a771
Support CPU Startup Boost in VPA
laoj2 Feb 24, 2025
4b98746
Add AEP ID
laoj2 Feb 24, 2025
2aba671
Fixes histograms becoming empty after loaded from checkpoints
plkokanov Mar 4, 2025
18ed036
Convert ClusterState to interface
voelzmo Mar 5, 2025
481f8db
Store InitContainers in PodState
voelzmo Mar 5, 2025
3c7689f
Drop MetricSamples for InitContainers
voelzmo Mar 5, 2025
33871fa
Address review feedback
voelzmo Mar 12, 2025
bd363cd
Address comments and wrap lines
laoj2 Mar 12, 2025
0f5fe42
Add adrianmoisey to VPA approvers
adrianmoisey Mar 17, 2025
4400ed9
Merge pull request #7935 from adrianmoisey/add-adrian-as-approver
k8s-ci-robot Mar 17, 2025
29d9088
change our release process to bump the version in the main branch imm…
Mar 17, 2025
f04fd5b
Merge pull request #7939 from raywainman/vpa-version-bump
k8s-ci-robot Mar 17, 2025
003e6cd
make DecreaseTargetSize more accurate for clusterapi
elmiko Mar 5, 2025
9043687
Fix typo
voelzmo Mar 18, 2025
e347836
Address comments + add a feature enablement/rollback section
laoj2 Mar 18, 2025
9937f8f
Merge pull request #7929 from elmiko/issue-7928-decrease-size-fix
k8s-ci-robot Mar 18, 2025
9a5e3d9
Allow using scheduled pods as samples in proactive scale up
norbertcyran Mar 19, 2025
105429c
Fix log for node filtering in static autoscaler
ystryuchkov Mar 19, 2025
71d3595
improve failed machine detection in clusterapi
elmiko Mar 19, 2025
1f65569
Merge pull request #7950 from elmiko/improve-failed-machine-detection
k8s-ci-robot Mar 19, 2025
4aa4657
capi: node and provider ID accounting funcs
jackfrancis Mar 19, 2025
7b5e101
s/nodeHasValidProviderID/isProviderIDNormalized
jackfrancis Mar 19, 2025
dc57f7c
Merge pull request #7952 from jackfrancis/capi-providerID-nodeHasVali…
k8s-ci-robot Mar 19, 2025
5268053
Update default value for scaleDownDelayAfterDelete (#7957)
YahiaBadr Mar 20, 2025
10bb546
Merge pull request #7944 from norbertcyran/proactive-scale-up-sample-…
k8s-ci-robot Mar 20, 2025
455d290
Address more comments in AEP
laoj2 Mar 20, 2025
990ab04
Merge pull request #7949 from ystryuchkov/master
k8s-ci-robot Mar 21, 2025
2bbe859
Fix cool down status condition to trigger scale down
abdelrahman882 Mar 20, 2025
15295c7
Merge pull request #7863 from laoj2/cpu-boost-kep
k8s-ci-robot Mar 23, 2025
a226478
pricing changes: updated z3 pricing information
DigitalVeer Mar 24, 2025
696af98
Add time based drainability rule for non-pdb-assigned system pods
abdelrahman882 Mar 12, 2025
f90590b
Merge pull request #7914 from abdelrahman882/bsp
k8s-ci-robot Mar 24, 2025
72c2f93
Describe why the additional bucket handling is necessary
plkokanov Mar 24, 2025
52cd68a
Merge pull request #7954 from abdelrahman882/FixScaledownCoolDown
k8s-ci-robot Mar 24, 2025
8892f21
Replace PodResizing with PodResizeInProgress condition in AEP-4016
laoj2 Mar 24, 2025
aa1d413
Merge pull request #7970 from laoj2/fix-pod-condition
k8s-ci-robot Mar 25, 2025
4a233bf
Address review comments
plkokanov Mar 25, 2025
e95e35c
Merge pull request #7965 from DigitalVeer/master
k8s-ci-robot Mar 25, 2025
6330997
Merge pull request #7826 from Azure/rakechill/update-skewer-version-m…
k8s-ci-robot Mar 26, 2025
94ae175
Merge pull request #7886 from plkokanov/fix/empty-histogram-after-loa…
k8s-ci-robot Mar 26, 2025
0c52255
Emit event on successful async scale-up
pmendelski Mar 24, 2025
8251159
Merge pull request #7891 from voelzmo/enh/drop-metrics-for-init-conta…
k8s-ci-robot Mar 26, 2025
5e1fc19
refactor findScalableResourceProviderIDs in clusterapi
elmiko Mar 25, 2025
2ca5b44
Merge pull request #7977 from elmiko/refactor-findscalableproviderids
k8s-ci-robot Mar 26, 2025
e713b51
feat: add missing field zeroOrMaxNodeScaling and ignoreDaemonSetsUti…
jincong8973 Mar 25, 2025
7b69964
Merge pull request #7973 from jincong8973/master
k8s-ci-robot Mar 27, 2025
db597b1
Merge pull request #7966 from pmendelski/htnap-events-for-tpu
k8s-ci-robot Mar 27, 2025
63ed537
Add docs for in-place updates
omerap12 Apr 1, 2025
50a1035
Add AEP link and refine intro for VPA in-place updates documentation
omerap12 Apr 2, 2025
a34a352
add feature state
omerap12 Apr 2, 2025
e3fbeff
Update vertical-pod-autoscaler/docs/features.md
omerap12 Apr 2, 2025
3f2845c
Update vertical-pod-autoscaler/docs/features.md
omerap12 Apr 2, 2025
5b5ae39
Update vertical-pod-autoscaler/docs/features.md
omerap12 Apr 3, 2025
125209d
fixed wrong statement
omerap12 Apr 6, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cluster-autoscaler/cloudprovider/azure/azure_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ import (
"github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime"
"github.com/Azure/azure-sdk-for-go/sdk/azidentity"
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v4"
"github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2019-07-01/compute"
"github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2022-08-01/compute"
"github.com/Azure/go-autorest/autorest"
"github.com/Azure/go-autorest/autorest/azure"
"github.com/Azure/go-autorest/autorest/azure/auth"
Expand Down
106 changes: 88 additions & 18 deletions cluster-autoscaler/cloudprovider/clusterapi/clusterapi_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ const (
resourceNameMachineSet = "machinesets"
resourceNameMachineDeployment = "machinedeployments"
resourceNameMachinePool = "machinepools"
deletingMachinePrefix = "deleting-machine-"
failedMachinePrefix = "failed-machine-"
pendingMachinePrefix = "pending-machine-"
machineTemplateKind = "MachineTemplate"
Expand Down Expand Up @@ -314,6 +315,9 @@ func (c *machineController) findMachineByProviderID(providerID normalizedProvide
return u.DeepCopy(), nil
}

if isDeletingMachineProviderID(providerID) {
return c.findMachine(machineKeyFromDeletingMachineProviderID(providerID))
}
if isFailedMachineProviderID(providerID) {
return c.findMachine(machineKeyFromFailedProviderID(providerID))
}
Expand All @@ -339,6 +343,24 @@ func (c *machineController) findMachineByProviderID(providerID normalizedProvide
return c.findMachine(path.Join(ns, machineID))
}

// createDeletingMachineNormalizedProviderID builds the placeholder provider ID
// for a machine that is being deleted. The result is deletingMachinePrefix
// followed by the namespace and the name, joined by an underscore.
func createDeletingMachineNormalizedProviderID(namespace, name string) string {
	// Layout: <prefix><namespace>_<name>.
	const layout = "%s%s_%s"
	return fmt.Sprintf(layout, deletingMachinePrefix, namespace, name)
}

// isDeletingMachineProviderID reports whether providerID begins with
// deletingMachinePrefix.
func isDeletingMachineProviderID(providerID normalizedProviderID) bool {
	rawID := string(providerID)
	return strings.HasPrefix(rawID, deletingMachinePrefix)
}

// machineKeyFromDeletingMachineProviderID strips deletingMachinePrefix from
// providerID and turns the first underscore of the remainder into a slash,
// yielding a "namespace/name" style key. If the remainder contains no
// underscore it is returned as is.
func machineKeyFromDeletingMachineProviderID(providerID normalizedProviderID) string {
	remainder := strings.TrimPrefix(string(providerID), deletingMachinePrefix)
	// Only the first underscore separates namespace from name; any later
	// underscores belong to the name and are left alone.
	namespace, name, hasSeparator := strings.Cut(remainder, "_")
	if !hasSeparator {
		return remainder
	}
	return namespace + "/" + name
}

// createPendingMachineProviderID builds the placeholder provider ID for a
// machine that is pending. The result is pendingMachinePrefix followed by the
// namespace and the name, joined by an underscore.
func createPendingMachineProviderID(namespace, name string) string {
	// Layout: <prefix><namespace>_<name>.
	const layout = "%s%s_%s"
	return fmt.Sprintf(layout, pendingMachinePrefix, namespace, name)
}

// isPendingMachineProviderID reports whether providerID begins with
// pendingMachinePrefix.
func isPendingMachineProviderID(providerID normalizedProviderID) bool {
	rawID := string(providerID)
	return strings.HasPrefix(rawID, pendingMachinePrefix)
}
Expand All @@ -348,6 +370,10 @@ func machineKeyFromPendingMachineProviderID(providerID normalizedProviderID) str
return strings.Replace(namespaceName, "_", "/", 1)
}

// createFailedMachineNormalizedProviderID builds the placeholder provider ID
// for a machine that has failed. The result is failedMachinePrefix followed by
// the namespace and the name, joined by an underscore.
func createFailedMachineNormalizedProviderID(namespace, name string) string {
	// Layout: <prefix><namespace>_<name>.
	const layout = "%s%s_%s"
	return fmt.Sprintf(layout, failedMachinePrefix, namespace, name)
}

// isFailedMachineProviderID reports whether providerID begins with
// failedMachinePrefix.
func isFailedMachineProviderID(providerID normalizedProviderID) bool {
	rawID := string(providerID)
	return strings.HasPrefix(rawID, failedMachinePrefix)
}
Expand All @@ -357,6 +383,15 @@ func machineKeyFromFailedProviderID(providerID normalizedProviderID) string {
return strings.Replace(namespaceName, "_", "/", 1)
}

// isProviderIDNormalized reports whether providerID is free of the markers the
// CAS CAPI provider puts on provider IDs. It returns false when the ID is
// recognised as a deleting, pending, or failed machine ID, and true otherwise.
func isProviderIDNormalized(providerID normalizedProviderID) bool {
	// The checks run in order and stop at the first one that matches.
	switch {
	case isDeletingMachineProviderID(providerID),
		isPendingMachineProviderID(providerID),
		isFailedMachineProviderID(providerID):
		return false
	default:
		return true
	}
}

// findNodeByNodeName finds the Node object keyed by name. Returns
// nil if it cannot be found. A DeepCopy() of the object is returned
// on success.
Expand Down Expand Up @@ -603,47 +638,80 @@ func (c *machineController) findScalableResourceProviderIDs(scalableResource *un
}

for _, machine := range machines {
providerID, found, err := unstructured.NestedString(machine.UnstructuredContent(), "spec", "providerID")
if err != nil {
return nil, err
}

if found {
if providerID != "" {
providerIDs = append(providerIDs, providerID)
continue
}
}

klog.Warningf("Machine %q has no providerID", machine.GetName())

// Failed Machines
// In some cases it is possible for a machine to have acquired a provider ID from the infrastructure and
// then become failed later. We want to ensure that a failed machine is not counted towards the total
// number of nodes in the cluster, for this reason we will detect a failed machine first, regardless
// of provider ID, and give it a normalized provider ID with failure message prepended.
failureMessage, found, err := unstructured.NestedString(machine.UnstructuredContent(), "status", "failureMessage")
if err != nil {
return nil, err
}

if found {
klog.V(4).Infof("Status.FailureMessage of machine %q is %q", machine.GetName(), failureMessage)
// Provide a fake ID to allow the autoscaler to track machines that will never
// Provide a normalized ID to allow the autoscaler to track machines that will never
// become nodes and mark the nodegroup unhealthy after maxNodeProvisionTime.
// Fake ID needs to be recognised later and converted into a machine key.
// Use an underscore as a separator between namespace and name as it is not a
// valid character within a namespace name.
providerIDs = append(providerIDs, fmt.Sprintf("%s%s_%s", failedMachinePrefix, machine.GetNamespace(), machine.GetName()))
klog.V(4).Infof("Status.FailureMessage of machine %q is %q", machine.GetName(), failureMessage)
providerIDs = append(providerIDs, createFailedMachineNormalizedProviderID(machine.GetNamespace(), machine.GetName()))
continue
}

// Deleting Machines
// Machines that are in deleting state should be identified so that in scenarios where the core
// autoscaler would like to adjust the size of a node group, we can give a proper count and
// be able to filter machines in that state, regardless of whether they are still active nodes in the cluster.
// We give these machines normalized provider IDs to aid in the filtering process.
if !machine.GetDeletionTimestamp().IsZero() {
klog.V(4).Infof("Machine %q has a non-zero deletion timestamp", machine.GetName())
providerIDs = append(providerIDs, createDeletingMachineNormalizedProviderID(machine.GetNamespace(), machine.GetName()))
continue
}

// Pending Machines
// Machines that do not yet have an associated node reference are considered to be pending. These
// nodes need to be filtered so that in a case where a machine is not becoming a node, or the instance
// lifecycle has changed during provisioning (eg spot instance going away), or the core autoscaler has
// decided that the node is not needed.
// Look for a node reference in the status, a machine without a node reference, and that is also not
// in failed or deleting state, has not yet become a node, and should be marked as pending.
// We give these machines normalized provider IDs to aid in the filtering process.
_, found, err = unstructured.NestedFieldCopy(machine.UnstructuredContent(), "status", "nodeRef")
if err != nil {
return nil, err
}

if !found {
klog.V(4).Infof("Status.NodeRef of machine %q is currently nil", machine.GetName())
providerIDs = append(providerIDs, fmt.Sprintf("%s%s_%s", pendingMachinePrefix, machine.GetNamespace(), machine.GetName()))
providerIDs = append(providerIDs, createPendingMachineProviderID(machine.GetNamespace(), machine.GetName()))
continue
}

// Running Machines
// We have filtered out the machines in failed, deleting, and pending states. We now check the provider
// ID and potentially the node reference details. It is ok for a machine not to have a provider ID as
// not all CAPI providers implement this field, but a machine in running state should have a valid
// node reference. If a provider ID is present, we add that to the list as we know it is not failed,
// deleting, or pending. If an empty provider ID is present, we check the node details to ensure that
// the machine references a valid node.
providerID, found, err := unstructured.NestedString(machine.UnstructuredContent(), "spec", "providerID")
if err != nil {
return nil, err
}

if found {
if providerID != "" {
// Machine has a provider ID, add it to the list
providerIDs = append(providerIDs, providerID)
continue
}
}

klog.Warningf("Machine %q has no providerID", machine.GetName())

// Begin checking to determine if the node reference is valid
nodeRefKind, found, err := unstructured.NestedString(machine.UnstructuredContent(), "status", "nodeRef", "kind")
if err != nil {
return nil, err
Expand All @@ -665,6 +733,8 @@ func (c *machineController) findScalableResourceProviderIDs(scalableResource *un
return nil, fmt.Errorf("unknown node %q", nodeRefName)
}

// A node has been found that corresponds to this machine, since we know that this machine has
// an empty provider ID, we add the provider ID from the node to the list.
if node != nil {
providerIDs = append(providerIDs, node.Spec.ProviderID)
}
Expand Down
Loading
Loading