
Commit f001c07

avagin authored and gvisor-bot committed
platform/kvm: prioritize less-recently-used vCPUs for stealing
To minimize vCPU stealing between threads, prioritize stealing vCPUs that have been least recently used. This avoids taking a vCPU from a thread that is likely to use it again soon, reducing the overall number of steal operations.

PiperOrigin-RevId: 738607537
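The core idea, independent of the KVM details, is to keep vCPUs on a list ordered by recency of use and to steal from the least-recently-used end. Below is a minimal, self-contained Go sketch of that ordering policy using container/list; it is an illustration only, not the actual implementation (which, as the diff below shows, uses a generated intrusive vCPUList, atomic flags, and a threshold-triggered resort under m.mu).

```go
package main

import (
	"container/list"
	"fmt"
)

// vcpu is a stand-in for the real vCPU; it keeps a handle to its list
// element so it can be moved to the back in O(1) when it is used.
type vcpu struct {
	id   int
	elem *list.Element
}

// pool keeps vCPUs ordered by recency: front = least recently used,
// back = most recently used.
type pool struct {
	lru *list.List
}

func newPool(n int) *pool {
	p := &pool{lru: list.New()}
	for i := 0; i < n; i++ {
		c := &vcpu{id: i}
		c.elem = p.lru.PushBack(c)
	}
	return p
}

// get takes the least-recently-used vCPU (the front of the list) and
// marks it most recently used by moving it to the back, so a vCPU that
// was just running on another thread is the last candidate for theft.
func (p *pool) get() *vcpu {
	c := p.lru.Front().Value.(*vcpu)
	p.lru.MoveToBack(c.elem)
	return c
}

func main() {
	p := newPool(3)
	for i := 0; i < 5; i++ {
		fmt.Println("stole vCPU", p.get().id)
	}
}
```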
1 parent c16d3fd commit f001c07

File tree

4 files changed: +95 −41


pkg/sentry/platform/kvm/BUILD

Lines changed: 13 additions & 0 deletions
@@ -17,6 +17,18 @@ go_template_instance(
     },
 )
 
+go_template_instance(
+    name = "vcpu_list",
+    out = "vcpu_list.go",
+    package = "kvm",
+    prefix = "vCPU",
+    template = "//pkg/ilist:generic_list",
+    types = {
+        "Element": "*vCPU",
+        "Linker": "*vCPU",
+    },
+)
+
 config_setting(
     name = "debug_build",
     values = {
@@ -67,6 +79,7 @@ go_library(
         "physical_map_amd64.go",
         "physical_map_arm64.go",
         "seccomp_mmap_unsafe.go",
+        "vcpu_list.go",
         "virtual_map.go",
     ],
     visibility = ["//pkg/sentry:internal"],

pkg/sentry/platform/kvm/machine.go

Lines changed: 82 additions & 2 deletions
@@ -71,6 +71,17 @@ type machine struct {
 	// vCPUsByID are the machine vCPUs, can be indexed by the vCPU's ID.
 	vCPUsByID []*vCPU
 
+	// vCPUList is a list of vCPUs, ordered by most-recently-used.
+	// The most recently used vCPUs are at the end of the list.
+	vCPUList vCPUList
+
+	// numRecentVCPUs tracks the number of vCPUs considered recently used.
+	numRecentVCPUs atomicbitops.Int32
+
+	// recentVCPUThreshold is the maximum number of vCPUs to track as
+	// recently used before triggering a reordering of vCPUList.
+	recentVCPUThreshold int32
+
 	// usedVCPUs is the number of vCPUs that have been used from the
 	// vCPUsByID pool.
 	usedVCPUs int
@@ -213,6 +224,9 @@ type vCPU struct {
 
 	// dieState holds state related to vCPU death.
 	dieState dieState
+
+	recentlyUsed atomicbitops.Bool
+	vCPUEntry
 }
 
 type dieState struct {
@@ -241,6 +255,7 @@ func (m *machine) createVCPU(id int) *vCPU {
 	}
 	c.CPU.Init(&m.kernel, c.id, c)
 	m.vCPUsByID[c.id] = c
+	m.vCPUList.PushFront(c)
 
 	// Ensure the signal mask is correct.
 	if err := c.setSignalMask(); err != nil {
@@ -532,6 +547,10 @@ func (m *machine) Get() *vCPU {
 	runtime.UnlockOSThread()
 	m.mu.Lock()
 
+	if m.numRecentVCPUs.Load() > m.recentVCPUThreshold {
+		m.resortRecentlyUsedListLocked()
+	}
+
 	for {
 		runtime.LockOSThread()
 		tid = hosttid.Current()
@@ -557,10 +576,12 @@ func (m *machine) Get() *vCPU {
 		}
 
 		// Scan for an available vCPU.
-		for origTID, c := range m.vCPUsByTID {
+		for c := m.vCPUList.Front(); c != nil; c = c.Next() {
+			origTID := c.tid.Load()
 			if c.state.CompareAndSwap(vCPUReady, vCPUUser) {
 				delete(m.vCPUsByTID, origTID)
 				m.vCPUsByTID[tid] = c
+				c.setRecentlyUsed(true)
 				m.mu.Unlock()
 				c.loadSegments(tid)
 				getVCPUCounter.Increment(&getVCPUAcquisitionUnused)
@@ -569,7 +590,7 @@ func (m *machine) Get() *vCPU {
 		}
 
 		// Scan for something not in user mode.
-		for origTID, c := range m.vCPUsByTID {
+		for c := m.vCPUList.Front(); c != nil; c = c.Next() {
 			if !c.state.CompareAndSwap(vCPUGuest, vCPUGuest|vCPUWaiter) {
 				continue
 			}
@@ -587,8 +608,10 @@ func (m *machine) Get() *vCPU {
 			}
 
 			// Steal the vCPU.
+			origTID := c.tid.Load()
 			delete(m.vCPUsByTID, origTID)
 			m.vCPUsByTID[tid] = c
+			c.setRecentlyUsed(true)
 			m.mu.Unlock()
 			c.loadSegments(tid)
 			getVCPUCounter.Increment(&getVCPUAcquisitionStolen)
@@ -636,13 +659,59 @@ func (m *machine) dropPageTables(pt *pagetables.PageTables) {
 	}
 }
 
+// getMaxVCPU computes the maximum number of vCPUs.
+func (m *machine) getMaxVCPU() {
+	maxVCPUs, errno := hostsyscall.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS)
+	if errno != 0 {
+		m.maxVCPUs = _KVM_NR_VCPUS
+	} else {
+		m.maxVCPUs = int(maxVCPUs)
+	}
+
+	// The goal here is to avoid vCPU contention for reasonable workloads.
+	// But "reasonable" isn't defined well in this case. Let's say that CPU
+	// overcommit with factor 2 is still acceptable. We allocate a set of
+	// vCPUs for each Go runtime processor (P) and two sets of vCPUs to run
+	// user code.
+	rCPUs := runtime.GOMAXPROCS(0)
+	if 3*rCPUs < m.maxVCPUs {
+		m.maxVCPUs = 3 * rCPUs
+	}
+	m.recentVCPUThreshold = int32(m.maxVCPUs * 2 / 3)
+}
+
+// resortRecentlyUsedListLocked reorders m.vCPUList so that the most
+// recently used vCPUs are located at the back. It also clears the
+// `vCPU.recentlyUsed` flag on the vCPUs it moves.
+//
+// Precondition: callers must hold m.mu for writing.
+func (m *machine) resortRecentlyUsedListLocked() {
+	var activeList vCPUList
+	cur := m.vCPUList.Front()
+	next := cur.Next()
+	for {
+		if cur.recentlyUsed.Load() {
+			m.vCPUList.Remove(cur)
+			activeList.PushBack(cur)
+			cur.setRecentlyUsed(false)
+		}
+		cur = next
+		if cur == nil {
+			break
+		}
+		next = cur.Next()
+	}
+	m.vCPUList.PushBackList(&activeList)
+}
+
 // lock marks the vCPU as in user mode.
 //
 // This should only be called directly when known to be safe, i.e. when
 // the vCPU is owned by the current TID with no chance of theft.
 //
 //go:nosplit
 func (c *vCPU) lock() {
+	c.setRecentlyUsed(true)
 	atomicbitops.OrUint32(&c.state, vCPUUser)
 }
 
@@ -697,6 +766,17 @@ func (c *vCPU) NotifyInterrupt() {
 // pid is used below in bounce.
 var pid = unix.Getpid()
 
+func (c *vCPU) setRecentlyUsed(v bool) {
+	old := c.recentlyUsed.Swap(v)
+	if v != old {
+		if v {
+			c.machine.numRecentVCPUs.Add(1)
+		} else {
+			c.machine.numRecentVCPUs.Add(-1)
+		}
+	}
+}
+
 // bounce forces a return to the kernel or to host mode.
 //
 // This effectively unwinds the state machine.
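To make the new sizing concrete with a hypothetical configuration: on a host where KVM_CHECK_EXTENSION reports a large _KVM_CAP_MAX_VCPUS and runtime.GOMAXPROCS(0) returns 8, getMaxVCPU caps m.maxVCPUs at 3*8 = 24 and sets recentVCPUThreshold to 24*2/3 = 16. Get() then calls resortRecentlyUsedListLocked only once more than 16 vCPUs are flagged recently used, so the list is reordered in batches rather than on every acquisition, and the front of vCPUList keeps pointing at the vCPUs least likely to be wanted back by their previous threads.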

pkg/sentry/platform/kvm/machine_amd64.go

Lines changed: 0 additions & 21 deletions
@@ -21,7 +21,6 @@ import (
 	"fmt"
 	"math/big"
 	"reflect"
-	"runtime"
 
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -495,26 +494,6 @@ func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) {
 	}
 }
 
-// getMaxVCPU get max vCPU number
-func (m *machine) getMaxVCPU() {
-	maxVCPUs, errno := hostsyscall.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS)
-	if errno != 0 {
-		m.maxVCPUs = _KVM_NR_VCPUS
-	} else {
-		m.maxVCPUs = int(maxVCPUs)
-	}
-
-	// The goal here is to avoid vCPU contentions for reasonable workloads.
-	// But "reasonable" isn't defined well in this case. Let's say that CPU
-	// overcommit with factor 2 is still acceptable. We allocate a set of
-	// vCPU for each goruntime processor (P) and two sets of vCPUs to run
-	// user code.
-	rCPUs := runtime.GOMAXPROCS(0)
-	if 3*rCPUs < m.maxVCPUs {
-		m.maxVCPUs = 3 * rCPUs
-	}
-}
-
 func archPhysicalRegions(physicalRegions []physicalRegion) []physicalRegion {
 	return physicalRegions
 }

pkg/sentry/platform/kvm/machine_arm64.go

Lines changed: 0 additions & 18 deletions
@@ -19,12 +19,10 @@ package kvm
 
 import (
 	"fmt"
-	"runtime"
 
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/hostarch"
-	"gvisor.dev/gvisor/pkg/hostsyscall"
 	"gvisor.dev/gvisor/pkg/ring0"
 	"gvisor.dev/gvisor/pkg/ring0/pagetables"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
@@ -182,19 +180,3 @@ func (c *vCPU) fault(signal int32, info *linux.SignalInfo) (hostarch.AccessType,
 
 	return accessType, platform.ErrContextSignal
 }
-
-// getMaxVCPU get max vCPU number
-func (m *machine) getMaxVCPU() {
-	rmaxVCPUs := runtime.NumCPU()
-	smaxVCPUs, errno := hostsyscall.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS)
-	// compare the max vcpu number from runtime and syscall, use smaller one.
-	if errno != 0 {
-		m.maxVCPUs = rmaxVCPUs
-	} else {
-		if rmaxVCPUs < int(smaxVCPUs) {
-			m.maxVCPUs = rmaxVCPUs
-		} else {
-			m.maxVCPUs = int(smaxVCPUs)
-		}
-	}
-}
