diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD
index 2aac1a1ef7..6f7ec1d807 100644
--- a/pkg/sentry/platform/kvm/BUILD
+++ b/pkg/sentry/platform/kvm/BUILD
@@ -17,6 +17,18 @@ go_template_instance(
     },
 )
 
+go_template_instance(
+    name = "vcpu_list",
+    out = "vcpu_list.go",
+    package = "kvm",
+    prefix = "vCPU",
+    template = "//pkg/ilist:generic_list",
+    types = {
+        "Element": "*vCPU",
+        "Linker": "*vCPU",
+    },
+)
+
 config_setting(
     name = "debug_build",
     values = {
@@ -67,6 +79,7 @@ go_library(
         "physical_map_amd64.go",
         "physical_map_arm64.go",
         "seccomp_mmap_unsafe.go",
+        "vcpu_list.go",
         "virtual_map.go",
     ],
     visibility = ["//pkg/sentry:internal"],
diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
index a89dea1706..d1b3be9dc2 100644
--- a/pkg/sentry/platform/kvm/machine.go
+++ b/pkg/sentry/platform/kvm/machine.go
@@ -71,6 +71,17 @@ type machine struct {
 	// vCPUsByID are the machine vCPUs, can be indexed by the vCPU's ID.
 	vCPUsByID []*vCPU
 
+	// vCPUList is a list of vCPUs, ordered by most-recently-used.
+	// The most recently used vCPUs are at the end of the list.
+	vCPUList vCPUList
+
+	// numRecentVCPUs tracks the number of vCPUs considered recently used.
+	numRecentVCPUs atomicbitops.Int32
+
+	// recentVCPUThreshold is the maximum number of vCPUs to track as
+	// recently used before triggering a reordering of vCPUList.
+	recentVCPUThreshold int32
+
 	// usedVCPUs is the number of vCPUs that have been used from the
 	// vCPUsByID pool.
 	usedVCPUs int
@@ -213,6 +224,9 @@ type vCPU struct {
 
 	// dieState holds state related to vCPU death.
 	dieState dieState
+
+	recentlyUsed atomicbitops.Bool
+	vCPUEntry
 }
 
 type dieState struct {
@@ -241,6 +255,7 @@ func (m *machine) createVCPU(id int) *vCPU {
 	}
 	c.CPU.Init(&m.kernel, c.id, c)
 	m.vCPUsByID[c.id] = c
+	m.vCPUList.PushFront(c)
 
 	// Ensure the signal mask is correct.
 	if err := c.setSignalMask(); err != nil {
@@ -532,6 +547,10 @@ func (m *machine) Get() *vCPU {
 	runtime.UnlockOSThread()
 	m.mu.Lock()
 
+	if m.numRecentVCPUs.Load() > m.recentVCPUThreshold {
+		m.resortRecentlyUsedListLocked()
+	}
+
 	for {
 		runtime.LockOSThread()
 		tid = hosttid.Current()
@@ -557,10 +576,12 @@ func (m *machine) Get() *vCPU {
 		}
 
 		// Scan for an available vCPU.
-		for origTID, c := range m.vCPUsByTID {
+		for c := m.vCPUList.Front(); c != nil; c = c.Next() {
+			origTID := c.tid.Load()
 			if c.state.CompareAndSwap(vCPUReady, vCPUUser) {
 				delete(m.vCPUsByTID, origTID)
 				m.vCPUsByTID[tid] = c
+				c.setRecentlyUsed(true)
 				m.mu.Unlock()
 				c.loadSegments(tid)
 				getVCPUCounter.Increment(&getVCPUAcquisitionUnused)
@@ -569,7 +590,7 @@ func (m *machine) Get() *vCPU {
 		}
 
 		// Scan for something not in user mode.
-		for origTID, c := range m.vCPUsByTID {
+		for c := m.vCPUList.Front(); c != nil; c = c.Next() {
 			if !c.state.CompareAndSwap(vCPUGuest, vCPUGuest|vCPUWaiter) {
 				continue
 			}
@@ -587,8 +608,10 @@ func (m *machine) Get() *vCPU {
 			}
 
 			// Steal the vCPU.
+			origTID := c.tid.Load()
 			delete(m.vCPUsByTID, origTID)
 			m.vCPUsByTID[tid] = c
+			c.setRecentlyUsed(true)
 			m.mu.Unlock()
 			c.loadSegments(tid)
 			getVCPUCounter.Increment(&getVCPUAcquisitionStolen)
@@ -636,6 +659,51 @@ func (m *machine) dropPageTables(pt *pagetables.PageTables) {
 	}
 }
 
+// getMaxVCPU determines the maximum number of vCPUs and the resort threshold.
+func (m *machine) getMaxVCPU() {
+	maxVCPUs, errno := hostsyscall.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS)
+	if errno != 0 {
+		m.maxVCPUs = _KVM_NR_VCPUS
+	} else {
+		m.maxVCPUs = int(maxVCPUs)
+	}
+
+	// The goal here is to avoid vCPU contention for reasonable workloads.
+	// But "reasonable" isn't defined well in this case. Let's say that CPU
+	// overcommit with factor 2 is still acceptable. We allocate a set of
+	// vCPUs for each Go runtime processor (P) and two sets of vCPUs to run
+	// user code.
+	rCPUs := runtime.GOMAXPROCS(0)
+	if 3*rCPUs < m.maxVCPUs {
+		m.maxVCPUs = 3 * rCPUs
+	}
+	m.recentVCPUThreshold = int32(m.maxVCPUs * 2 / 3)
+}
+
+// resortRecentlyUsedListLocked reorders m.vCPUList so that the most
+// recently used vCPUs are located at the back. It also resets the
+// `vCPU.recentlyUsed` flag for all vCPUs.
+//
+// Precondition: callers must hold m.mu for writing.
+func (m *machine) resortRecentlyUsedListLocked() {
+	var activeList vCPUList
+	cur := m.vCPUList.Front()
+	next := cur.Next()
+	for {
+		if cur.recentlyUsed.Load() {
+			m.vCPUList.Remove(cur)
+			activeList.PushBack(cur)
+			cur.setRecentlyUsed(false)
+		}
+		cur = next
+		if cur == nil {
+			break
+		}
+		next = cur.Next()
+	}
+	m.vCPUList.PushBackList(&activeList)
+}
+
 // lock marks the vCPU as in user mode.
 //
 // This should only be called directly when known to be safe, i.e. when
@@ -643,6 +711,7 @@ func (m *machine) dropPageTables(pt *pagetables.PageTables) {
 //
 //go:nosplit
 func (c *vCPU) lock() {
+	c.setRecentlyUsed(true)
 	atomicbitops.OrUint32(&c.state, vCPUUser)
 }
 
@@ -697,6 +766,17 @@ func (c *vCPU) NotifyInterrupt() {
 // pid is used below in bounce.
 var pid = unix.Getpid()
 
+func (c *vCPU) setRecentlyUsed(v bool) {
+	old := c.recentlyUsed.Swap(v)
+	if v != old {
+		if v {
+			c.machine.numRecentVCPUs.Add(1)
+		} else {
+			c.machine.numRecentVCPUs.Add(-1)
+		}
+	}
+}
+
 // bounce forces a return to the kernel or to host mode.
 //
 // This effectively unwinds the state machine.
diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go
index 342013d90b..a4925a45e8 100644
--- a/pkg/sentry/platform/kvm/machine_amd64.go
+++ b/pkg/sentry/platform/kvm/machine_amd64.go
@@ -21,7 +21,6 @@ import (
 	"fmt"
 	"math/big"
 	"reflect"
-	"runtime"
 
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -495,26 +494,6 @@ func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) {
 	}
 }
 
-// getMaxVCPU get max vCPU number
-func (m *machine) getMaxVCPU() {
-	maxVCPUs, errno := hostsyscall.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS)
-	if errno != 0 {
-		m.maxVCPUs = _KVM_NR_VCPUS
-	} else {
-		m.maxVCPUs = int(maxVCPUs)
-	}
-
-	// The goal here is to avoid vCPU contentions for reasonable workloads.
-	// But "reasonable" isn't defined well in this case. Let's say that CPU
-	// overcommit with factor 2 is still acceptable. We allocate a set of
-	// vCPU for each goruntime processor (P) and two sets of vCPUs to run
-	// user code.
-	rCPUs := runtime.GOMAXPROCS(0)
-	if 3*rCPUs < m.maxVCPUs {
-		m.maxVCPUs = 3 * rCPUs
-	}
-}
-
 func archPhysicalRegions(physicalRegions []physicalRegion) []physicalRegion {
 	return physicalRegions
 }
diff --git a/pkg/sentry/platform/kvm/machine_arm64.go b/pkg/sentry/platform/kvm/machine_arm64.go
index bcc7fb7760..101039bd29 100644
--- a/pkg/sentry/platform/kvm/machine_arm64.go
+++ b/pkg/sentry/platform/kvm/machine_arm64.go
@@ -19,12 +19,10 @@ package kvm
 
 import (
 	"fmt"
-	"runtime"
 
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/hostarch"
-	"gvisor.dev/gvisor/pkg/hostsyscall"
 	"gvisor.dev/gvisor/pkg/ring0"
 	"gvisor.dev/gvisor/pkg/ring0/pagetables"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
@@ -182,19 +180,3 @@ func (c *vCPU) fault(signal int32, info *linux.SignalInfo) (hostarch.AccessType,
 
 	return accessType, platform.ErrContextSignal
 }
-
-// getMaxVCPU get max vCPU number
-func (m *machine) getMaxVCPU() {
-	rmaxVCPUs := runtime.NumCPU()
-	smaxVCPUs, errno := hostsyscall.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS)
-	// compare the max vcpu number from runtime and syscall, use smaller one.
-	if errno != 0 {
-		m.maxVCPUs = rmaxVCPUs
-	} else {
-		if rmaxVCPUs < int(smaxVCPUs) {
-			m.maxVCPUs = rmaxVCPUs
-		} else {
-			m.maxVCPUs = int(smaxVCPUs)
-		}
-	}
-}
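
Reviewer note: the core of this change is resortRecentlyUsedListLocked, a single pass that splices every recently-used vCPU out into a temporary list (preserving relative order), clears its flag, and appends that list at the back, so that Get()'s front-to-back scans visit cold vCPUs before hot ones. The standalone sketch below models the same partition with Go's container/list instead of gVisor's generated intrusive vCPUList; fakeVCPU and resort are illustrative names only and are not part of the gVisor tree.

package main

import (
	"container/list"
	"fmt"
	"sync/atomic"
)

// fakeVCPU stands in for kvm.vCPU; only the recentlyUsed flag matters here.
type fakeVCPU struct {
	id           int
	recentlyUsed atomic.Bool
}

// resort mirrors resortRecentlyUsedListLocked: splice recently used entries
// into a temporary list in traversal order, clear their flags, then append
// the temporary list at the back so cold entries stay at the front.
func resort(l *list.List) {
	active := list.New()
	for e := l.Front(); e != nil; {
		next := e.Next() // grab Next before Remove detaches e
		v := e.Value.(*fakeVCPU)
		if v.recentlyUsed.Load() {
			l.Remove(e)
			active.PushBack(v)
			v.recentlyUsed.Store(false)
		}
		e = next
	}
	l.PushBackList(active)
}

func main() {
	l := list.New()
	for i := 0; i < 6; i++ {
		l.PushBack(&fakeVCPU{id: i})
	}
	// Pretend vCPUs 0 and 5 were locked since the last resort.
	l.Front().Value.(*fakeVCPU).recentlyUsed.Store(true)
	l.Back().Value.(*fakeVCPU).recentlyUsed.Store(true)

	resort(l)
	for e := l.Front(); e != nil; e = e.Next() {
		fmt.Print(e.Value.(*fakeVCPU).id, " ")
	}
	fmt.Println() // prints: 1 2 3 4 0 5
}

The patch instantiates //pkg/ilist:generic_list for *vCPU (with vCPUEntry embedded in vCPU) rather than using a container/list-style structure, which keeps Remove O(1) without per-element allocations or separate element handles.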
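Reviewer note: the resort is triggered from Get() once numRecentVCPUs exceeds recentVCPUThreshold, which getMaxVCPU now derives from the vCPU budget: at most 3*GOMAXPROCS vCPUs (one set per runtime P plus two sets for user code), capped by what KVM reports, with the threshold set to two thirds of that. A minimal sketch of just that arithmetic follows; the KVM_CHECK_EXTENSION ioctl is replaced by an assumed constant (kvmCap) since this example does not open /dev/kvm.

package main

import (
	"fmt"
	"runtime"
)

func main() {
	// Assumed value for the KVM_CAP_MAX_VCPUS query; the real code issues
	// KVM_CHECK_EXTENSION and falls back to _KVM_NR_VCPUS on error.
	const kvmCap = 512
	maxVCPUs := kvmCap

	// One set of vCPUs per runtime P plus two sets for user code,
	// capped by the KVM limit: at most 3*GOMAXPROCS vCPUs.
	rCPUs := runtime.GOMAXPROCS(0)
	if 3*rCPUs < maxVCPUs {
		maxVCPUs = 3 * rCPUs
	}

	// Resort the MRU list once more than two thirds of the vCPUs
	// have been marked recently used.
	recentVCPUThreshold := int32(maxVCPUs * 2 / 3)
	fmt.Printf("GOMAXPROCS=%d maxVCPUs=%d recentVCPUThreshold=%d\n",
		rCPUs, maxVCPUs, recentVCPUThreshold)
}

For example, with GOMAXPROCS=8 and a large KVM cap this yields maxVCPUs=24 and recentVCPUThreshold=16, so a reordering only happens after two thirds of the pool has been touched since the last resort.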