@@ -141,6 +141,95 @@ func (a *cpuAccumulator) sort(ids []int, getCPUs func(ids ...int) machine.CPUSet
141141 })
142142}
143143
144+ // getBestMatchCPUsNeededL3Cache returns the L3 cache ID that best matches the number of CPUs needed.
145+ // It directly selects the L3 cache with the closest match to the required number of CPUs,
146+ // preferring caches with CPU count equal to or slightly greater than the requirement.
147+ func (a * cpuAccumulator ) getBestMatchCPUsNeededL3Cache () (int , bool ) {
148+ l3Caches := a .cpuDetails .L3Caches ().ToSliceInt ()
149+ if len (l3Caches ) == 0 {
150+ return 0 , false
151+ }
152+
153+ var bestL3CacheID int
154+ bestMatchFound := false
155+ var bestMatchDiff int = - 1 // -1 indicates no match found yet
156+
157+ for _ , l3CacheID := range l3Caches {
158+ cpusInL3Cache := a .cpuDetails .CPUsInL3Caches (l3CacheID )
159+ cpuCount := cpusInL3Cache .Size ()
160+
161+ // Exact match - return immediately
162+ if cpuCount == a .numCPUsNeeded {
163+ return l3CacheID , true
164+ }
165+
166+ // For caches with more CPUs than needed, prefer the one with the smallest excess
167+ if cpuCount > a .numCPUsNeeded {
168+ diff := cpuCount - a .numCPUsNeeded
169+ if ! bestMatchFound || diff < bestMatchDiff {
170+ bestL3CacheID = l3CacheID
171+ bestMatchDiff = diff
172+ bestMatchFound = true
173+ }
174+ }
175+ }
176+
177+ // If we found a cache with more CPUs than needed, return it
178+ if bestMatchFound {
179+ return bestL3CacheID , true
180+ }
181+
182+ // If no cache with more CPUs was found, find the one with the most CPUs
183+ // (closest match when all caches have fewer CPUs than needed)
184+ for _ , l3CacheID := range l3Caches {
185+ cpusInL3Cache := a .cpuDetails .CPUsInL3Caches (l3CacheID )
186+ cpuCount := cpusInL3Cache .Size ()
187+
188+ if ! bestMatchFound || cpuCount > bestMatchDiff {
189+ bestL3CacheID = l3CacheID
190+ bestMatchDiff = cpuCount
191+ bestMatchFound = true
192+ }
193+ }
194+
195+ return bestL3CacheID , bestMatchFound
196+ }
197+
198+ // tryAlignL3Caches handles remaining CPU allocation with L3 cache topology awareness.
199+ //
200+ // This method implements fine-grained CPU allocation based on L3 cache topology.
201+ // When the requested CPU count doesn't align with complete L3 cache sizes,
202+ // it intelligently selects the most suitable L3 cache to minimize cache contention
203+ // and maximize memory locality for the workload.
204+ //
205+ // Algorithm:
206+ // 1. Directly selects the L3 cache that best matches the remaining CPU requirement
207+ // 2. If remaining need >= cache size: allocate entire cache and recurse
208+ // 3. If remaining need < cache size: restrict allocation to this cache only
209+ func (a * cpuAccumulator ) tryAlignL3Caches () {
210+ l3Cache , found := a .getBestMatchCPUsNeededL3Cache ()
211+ if ! found {
212+ return
213+ }
214+
215+ cpusInL3Cache := a .cpuDetails .CPUsInL3Caches (l3Cache )
216+ if a .numCPUsNeeded >= cpusInL3Cache .Size () {
217+ // Cache is smaller than remaining need - take entire cache for efficiency
218+ klog .V (4 ).InfoS ("tryAlignL3Caches: claiming entire L3 cache (partial)" , "l3Cache" , l3Cache , "cacheSize" , cpusInL3Cache .Size (), "remainingNeed" , a .numCPUsNeeded )
219+ a .take (cpusInL3Cache )
220+ if a .isSatisfied () {
221+ return
222+ }
223+ // Continue with remaining allocation from other caches
224+ a .tryAlignL3Caches ()
225+ } else {
226+ // Cache is larger than remaining need - restrict to this cache for optimal locality
227+ // This ensures all allocated CPUs share the same L3 cache, minimizing memory latency
228+ klog .V (4 ).InfoS ("tryAlignL3Caches: restricting allocation to L3 cache" , "l3Cache" , l3Cache , "cacheSize" , cpusInL3Cache .Size (), "remainingNeed" , a .numCPUsNeeded )
229+ a .cpuDetails = a .cpuDetails .KeepOnly (cpusInL3Cache )
230+ }
231+ }
232+
144233// Sort all sockets with free CPUs using the sort() algorithm defined above.
145234func (a * cpuAccumulator ) sortAvailableSockets () []int {
146235 sockets := a .cpuDetails .Sockets ().ToSliceNoSortInt ()
@@ -224,42 +313,94 @@ func (a *cpuAccumulator) isFailed() bool {
224313 return a .numCPUsNeeded > a .cpuDetails .CPUs ().Size ()
225314}
226315
227- // TakeByTopology tries to allocate those required cpus in the same socket or cores
316+ // TakeByTopology implements a topology-aware CPU allocation strategy that prioritizes
317+ // hardware locality and cache efficiency for optimal workload performance.
318+ //
319+ // This function implements a multi-tier allocation strategy designed to minimize
320+ // cross-socket communication and maximize cache utilization. The allocation follows
321+ // a hierarchical approach from largest to smallest topology units.
322+ //
323+ // Parameters:
324+ // - info: Machine topology information including NUMA, socket, core, and cache hierarchy
325+ // - availableCPUs: Set of CPUs available for allocation
326+ // - cpuRequirement: Number of CPUs needed for the workload
327+ // - alignByL3Caches: Whether to consider L3 cache topology in allocation decisions
328+ //
329+ // Returns:
330+ // - CPUSet: The allocated set of CPUs with optimal topology placement
331+ // - error: Error if allocation fails due to insufficient resources
332+ //
333+ // Allocation Strategy (Topology-Aware Best-Fit):
334+ //
335+ // Phase 1: Socket-Level Allocation (Highest Locality)
336+ // - Attempts to allocate entire CPU sockets when the requirement matches or exceeds socket size
337+ // - Provides maximum memory bandwidth and minimal cross-socket latency
338+ //
339+ // Phase 2: L3 Cache-Aware Allocation (Conditional)
340+ // - Activated when alignByL3Caches is true
341+ // - Prioritizes allocation within shared L3 cache domains to minimize cache contention
342+ // - Uses tryAlignL3Caches() for intelligent cache-aligned distribution
343+ //
344+ // Phase 3: Core-Level Allocation (Medium Locality)
345+ // - Allocates complete CPU cores to avoid hyperthreading contention
346+ // - Preferred for workloads sensitive to thread interference
347+ //
348+ // Phase 4: Thread-Level Allocation (Fine-Grained)
349+ // - Allocates individual hyperthreads from partially utilized cores
350+ // - Prefers cores on sockets already allocated to maintain NUMA affinity
228351func TakeByTopology (info * machine.KatalystMachineInfo , availableCPUs machine.CPUSet ,
229- cpuRequirement int ,
352+ cpuRequirement int , alignByL3Caches bool ,
230353) (machine.CPUSet , error ) {
354+ // Initialize accumulator with topology-aware state
231355 acc := newCPUAccumulator (info , availableCPUs , cpuRequirement )
356+
357+ // Fast-path: Handle edge cases immediately
232358 if acc .isSatisfied () {
359+ // Zero CPU requirement - return empty set immediately
233360 return acc .result .Clone (), nil
234361 }
235362 if acc .isFailed () {
236- return machine .NewCPUSet (), fmt .Errorf ("not enough cpus available to satisfy request" )
363+ // Insufficient resources - fail fast with descriptive error
364+ return machine .NewCPUSet (), fmt .Errorf ("insufficient CPUs: requested %d, available %d" ,
365+ cpuRequirement , availableCPUs .Size ())
237366 }
238367
239- // Algorithm: topology-aware best-fit
240- // 1. Acquire whole sockets, if available and the container requires at
241- // least a socket's-worth of CPUs.
368+ // Phase 1: Socket-level allocation for maximum locality
369+ // This phase attempts to allocate entire CPU sockets when beneficial
242370 acc .takeFullSockets ()
243371 if acc .isSatisfied () {
372+ klog .V (4 ).InfoS ("TakeByTopology: allocated at socket level" , "allocated" , acc .result .Size ())
244373 return acc .result .Clone (), nil
245374 }
246375
247- // 2. Acquire whole cores, if available and the container requires at least
248- // a core's-worth of CPUs.
376+ // Phase 2: L3 cache topology optimization (if enabled)
377+ // This phase considers cache topology to minimize memory latency
378+ if alignByL3Caches {
379+ acc .tryAlignL3Caches ()
380+ if acc .isSatisfied () {
381+ klog .V (4 ).InfoS ("TakeByTopology: allocated with L3 cache alignment" , "allocated" , acc .result .Size ())
382+ return acc .result .Clone (), nil
383+ }
384+ }
385+
386+ // Phase 3: Core-level allocation to avoid HT contention
387+ // Allocates complete cores for workloads sensitive to thread interference
249388 acc .takeFullCores ()
250389 if acc .isSatisfied () {
390+ klog .V (4 ).InfoS ("TakeByTopology: allocated at core level" , "allocated" , acc .result .Size ())
251391 return acc .result .Clone (), nil
252392 }
253393
254- // 3. Acquire single threads, preferring to fill partially-allocated cores
255- // on the same sockets as the whole cores we have already taken in this
256- // allocation.
394+ // Phase 4: Thread-level allocation for remaining needs
395+ // Allocates individual threads from partially utilized cores
257396 acc .takeRemainingCPUs ()
258397 if acc .isSatisfied () {
398+ klog .V (4 ).InfoS ("TakeByTopology: allocated at thread level" , "allocated" , acc .result .Size ())
259399 return acc .result .Clone (), nil
260400 }
261401
262- return machine .NewCPUSet (), fmt .Errorf ("failed to allocate cpus" )
402+ // Exhaustive allocation failed - no combination satisfies requirement
403+ return machine .NewCPUSet (), fmt .Errorf ("topology-aware allocation failed: requested %d CPUs, exhausted all allocation strategies" , cpuRequirement )
263404}
264405
265406// TakeByNUMABalance tries to make the allocated cpu spread on different
0 commit comments