Changes from all commits · 23 commits
9dc8e0f
feat: added consumedCounters into pool util calculation
MenD32 May 23, 2025
9a450a1
feat: added consumedCounters into pool util calculation
MenD32 May 23, 2025
ff1d30f
tests: added test to check partionable devices are calculated correctly
MenD32 May 23, 2025
d0f230e
tests: added test to check partionable devices are calculated correctly
MenD32 May 23, 2025
e92825e
tests: added test to check partionable devices are calculated correctly
MenD32 May 23, 2025
862482f
Merge branch 'master' into feat/partitionable-devices-support
MenD32 Jun 4, 2025
bb5c8d9
fix: bumped crypto and net
MenD32 Jun 10, 2025
f9f397a
Merge branch 'master' into feat/partitionable-devices-support
MenD32 Jun 11, 2025
4453bf2
Add Capacity Buffer controller logic
abdelrahman882 Sep 11, 2025
6e4f48b
feat: added flag to set deletion candidate taint TTL
MenD32 Sep 12, 2025
85a0d94
Add rapid release channel to GKE cluster creation command
laoj2 Sep 17, 2025
b09676c
change kwok nodegroup annotation key
drmorr0 Sep 17, 2025
212869b
merge: merging from main
MenD32 Sep 22, 2025
4fa5202
feat: added flag to set deletion candidate taint TTL
MenD32 Sep 12, 2025
9f2c8db
Merge branch 'master' into feat/partitionable-devices-support
MenD32 Sep 22, 2025
3956443
fix: updated resourceapi to v1
MenD32 Oct 2, 2025
c61b8ad
feat: Partionable Devices Support
MenD32 Nov 7, 2025
c2633bf
fix: added weighting to summation in order to consider a mix of parti…
MenD32 Nov 21, 2025
2ed4602
fix: added weighting to summation in order to consider a mix of parti…
MenD32 Nov 21, 2025
fb70fa4
merged with master
MenD32 Dec 11, 2025
1b0021f
fix(style): capitalized variable inside function
MenD32 Dec 30, 2025
98841cd
docs: added documentation to calculatePoolUtil
MenD32 Dec 30, 2025
c849d52
tests: split resourceSlices with counters and resourceSlices without …
MenD32 Dec 30, 2025
142 changes: 137 additions & 5 deletions cluster-autoscaler/simulator/dynamicresources/utils/utilization.go
@@ -21,6 +21,7 @@ import (

v1 "k8s.io/api/core/v1"
resourceapi "k8s.io/api/resource/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/autoscaler/cluster-autoscaler/simulator/framework"
"k8s.io/utils/ptr"
)
@@ -44,7 +45,7 @@ func CalculateDynamicResourceUtilization(nodeInfo *framework.NodeInfo) (map[stri
poolDevices := getAllDevices(currentSlices)
allocatedDeviceNames := allocatedDevices[driverName][poolName]
unallocated, allocated := splitDevicesByAllocation(poolDevices, allocatedDeviceNames)
result[driverName][poolName] = calculatePoolUtil(unallocated, allocated)
result[driverName][poolName] = calculatePoolUtil(unallocated, allocated, currentSlices)
}
}
return result, nil
@@ -70,10 +71,141 @@ func HighestDynamicResourceUtilization(nodeInfo *framework.NodeInfo) (v1.Resourc
return highestResourceName, highestUtil, nil
}

func calculatePoolUtil(unallocated, allocated []resourceapi.Device) float64 {
numAllocated := float64(len(allocated))
numUnallocated := float64(len(unallocated))
return numAllocated / (numAllocated + numUnallocated)
// calculatePoolUtil calculates the utilization of a ResourceSlice pool, accounting for both partitionable (shared counter) and atomic (non-partitionable) devices.
// The calculation consists of three steps:
//
// 1. Partitionable (Shared) Utilization:
// Identifies the single highest utilization ratio across all shared counters
// in the entire pool. This ratio represents the most constrained resource within the pool.
// For example, if a GPU pool has shared counters for memory and compute cycles, with the memory counter at 80% utilization and the compute-cycles counter at 50%,
// the partitionable utilization for the pool is 80%.
//
// 2. Atomic (Non-partitionable) Utilization:
// Calculated as the simple ratio of allocated devices to total devices for all devices
// that do not support shared counters.
//
// 3. Final Weighted Average:
// The result is a weighted average of the two types based on their population count in the pool.
// This ensures that in mixed pools, the fullness of one resource type doesn't disproportionately
// mask or amplify the state of the other.
//
// Example (3 total devices: 2 atomic, 1 partitionable):
// - 2 atomic allocated, partitionable at 0% util:
// Result: (1.0 * 2/3) + (0.0 * 1/3) = 66.6%
// - 0 atomic allocated, partitionable at 100% util:
// Result: (0.0 * 2/3) + (1.0 * 1/3) = 33.3%
// - 1 atomic allocated, partitionable at 50% util:
// Result: (0.5 * 2/3) + (0.5 * 1/3) = 50%
func calculatePoolUtil(unallocated, allocated []resourceapi.Device, resourceSlices []*resourceapi.ResourceSlice) float64 {
Review comment (Contributor):
Can we add a comment explaining how we calculate utilization? Previously it was very easy to reason about its behaviour; right now it's not obvious without thoroughly reading it.

Reply (Author):
Added
totalConsumedCounters := map[string]map[string]resource.Quantity{}
for _, resourceSlice := range resourceSlices {
for _, sharedCounter := range resourceSlice.Spec.SharedCounters {
if _, ok := totalConsumedCounters[sharedCounter.Name]; !ok {
totalConsumedCounters[sharedCounter.Name] = map[string]resource.Quantity{}
}
for counter, value := range sharedCounter.Counters {
totalConsumedCounters[sharedCounter.Name][counter] = value.Value
}
}
}
allocatedConsumedCounters := calculateConsumedCounters(allocated)

// not all devices are partitionable, so fall back to the allocation ratio for the non-partitionable (atomic) devices
allocatedDevicesWithoutCounters := 0
devicesWithoutCounters := 0

for _, device := range allocated {
if device.ConsumesCounters == nil {
devicesWithoutCounters++
allocatedDevicesWithoutCounters++
}
}
for _, device := range unallocated {
if device.ConsumesCounters == nil {
devicesWithoutCounters++
}
}

// we want to find the counter that is most utilized, since it is the "bottleneck" of the pool
var partitionableUtilization float64 = 0
var atomicDevicesUtilization float64 = 0
if devicesWithoutCounters != 0 {
atomicDevicesUtilization = float64(allocatedDevicesWithoutCounters) / float64(devicesWithoutCounters)
}
if len(totalConsumedCounters) == 0 {
return atomicDevicesUtilization
}
for counterSet, counters := range totalConsumedCounters {
for counterName, totalValue := range counters {
Review comment (Contributor):
nit: is this easier to follow?

			if totalValue.IsZero() {
				continue
			}

(rather than checking for !totalValue.IsZero() two nested iterations later)

Reply (Author):
yeah, this is wayyy cleaner. I'll change it
if totalValue.IsZero() {
continue
}
if allocatedSet, exists := allocatedConsumedCounters[counterSet]; exists {
if allocatedValue, exists := allocatedSet[counterName]; exists {
utilization := float64(allocatedValue.Value()) / float64(totalValue.Value())
if utilization > partitionableUtilization {
partitionableUtilization = utilization
}
}
}
}
}
uniquePartitionableDevicesCount := float64(getUniquePartitionableDevicesCount(allocated))
totalUniqueDevices := uniquePartitionableDevicesCount + float64(devicesWithoutCounters)
partitionableDevicesUtilizationWeight := uniquePartitionableDevicesCount / totalUniqueDevices
nonPartitionableDevicesUtilizationWeight := 1 - partitionableDevicesUtilizationWeight
// when a pool has both atomic and partitionable devices, we take a weighted average of their utilizations, since the two populations are mutually exclusive (a partitionable device can't be allocated as an atomic device and vice versa).
return partitionableUtilization*partitionableDevicesUtilizationWeight + atomicDevicesUtilization*nonPartitionableDevicesUtilizationWeight
}
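
For illustration, here is a hypothetical package-internal, test-style sketch of the doc comment's mixed-pool example (calculatePoolUtil is unexported, so this only compiles inside the package; the test name and device names are made up, the literals use only the resourceapi v1 fields the function reads, and the "math" and "testing" imports are assumed on top of the file's own):

	func TestCalculatePoolUtilMixedPool(t *testing.T) {
		// Two atomic devices (one allocated) plus one allocated partitionable
		// device consuming 8Gi of its counter set's 16Gi memory counter (50%).
		atomicAllocated := resourceapi.Device{Name: "atomic-0"}
		atomicFree := resourceapi.Device{Name: "atomic-1"}
		partition := resourceapi.Device{
			Name: "gpu-0-half",
			ConsumesCounters: []resourceapi.DeviceCounterConsumption{{
				CounterSet: "gpu-0",
				Counters:   map[string]resourceapi.Counter{"memory": {Value: resource.MustParse("8Gi")}},
			}},
		}
		slices := []*resourceapi.ResourceSlice{{
			Spec: resourceapi.ResourceSliceSpec{
				SharedCounters: []resourceapi.CounterSet{{
					Name:     "gpu-0",
					Counters: map[string]resourceapi.Counter{"memory": {Value: resource.MustParse("16Gi")}},
				}},
			},
		}}
		// Expected: atomic util 1/2 with weight 2/3, partitionable util 1/2 with weight 1/3 -> 0.5.
		got := calculatePoolUtil([]resourceapi.Device{atomicFree}, []resourceapi.Device{atomicAllocated, partition}, slices)
		if math.Abs(got-0.5) > 1e-9 {
			t.Errorf("calculatePoolUtil = %v, want 0.5", got)
		}
	}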

// calculateConsumedCounters calculates the total counters consumed by a list of devices
func calculateConsumedCounters(devices []resourceapi.Device) map[string]map[string]resource.Quantity {
countersConsumed := map[string]map[string]resource.Quantity{}
for _, device := range devices {
if device.ConsumesCounters == nil {
continue
}
for _, consumedCounter := range device.ConsumesCounters {
if _, ok := countersConsumed[consumedCounter.CounterSet]; !ok {
countersConsumed[consumedCounter.CounterSet] = map[string]resource.Quantity{}
}
for counter, value := range consumedCounter.Counters {
if _, ok := countersConsumed[consumedCounter.CounterSet][counter]; !ok {
countersConsumed[consumedCounter.CounterSet][counter] = resource.Quantity{}
}
v := countersConsumed[consumedCounter.CounterSet][counter]
v.Add(value.Value)
countersConsumed[consumedCounter.CounterSet][counter] = v
}
}
}
return countersConsumed
}
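
As a minimal usage sketch (hypothetical device names; "fmt" assumed in addition to the file's imports): two partitions of the same physical GPU that each consume 4Gi of the shared "gpu-0" memory counter are summed into a single 8Gi total for that counter set:

	partitions := []resourceapi.Device{
		{Name: "gpu-0-part-a", ConsumesCounters: []resourceapi.DeviceCounterConsumption{{
			CounterSet: "gpu-0",
			Counters:   map[string]resourceapi.Counter{"memory": {Value: resource.MustParse("4Gi")}},
		}}},
		{Name: "gpu-0-part-b", ConsumesCounters: []resourceapi.DeviceCounterConsumption{{
			CounterSet: "gpu-0",
			Counters:   map[string]resourceapi.Counter{"memory": {Value: resource.MustParse("4Gi")}},
		}}},
	}
	consumed := calculateConsumedCounters(partitions)
	total := consumed["gpu-0"]["memory"]
	fmt.Printf("gpu-0/memory consumed: %d bytes\n", total.Value()) // 8589934592, i.e. 8Gi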

// getUniquePartitionableDevicesCount returns the number of unique partitionable devices in the provided list.
// a single physical device can be represented by multiple partitioned devices with different names and properties; for utilization purposes we want to count the hardware, not the partitions.
func getUniquePartitionableDevicesCount(devices []resourceapi.Device) int {
deviceCount := 0
var counted bool
consumedCounters := map[string]bool{}
for _, device := range devices {
// the assumption here is that a partitionable device consumes actual hardware resources, which are represented by its consumed counters.
// if a device consumes multiple counters from the same counter set, it is counted only once so that devices with multiple counters are not overcounted; the underlying assumption is that a device always consumes some of every resource on the hardware (e.g. a GPU DRA request cannot use VRAM without using GPU cycles and vice versa).
if device.ConsumesCounters != nil {
counted = false
for _, consumedCounter := range device.ConsumesCounters {
if _, exists := consumedCounters[consumedCounter.CounterSet]; !exists {
consumedCounters[consumedCounter.CounterSet] = true
} else {
counted = true
}
}
if !counted {
deviceCount++
}
}
}
return deviceCount
}
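
Reusing the hypothetical partitions slice from the sketch above: both partitions draw from the same "gpu-0" counter set, so they deduplicate to a single physical device:

	n := getUniquePartitionableDevicesCount(partitions)
	fmt.Println(n) // 1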

func splitDevicesByAllocation(devices []resourceapi.Device, allocatedNames []string) (unallocated, allocated []resourceapi.Device) {