Skip to content

kvm: replace machine.availableCond with futexes #11568

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions pkg/sentry/platform/kvm/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ go_test(
"//pkg/sentry/platform",
"//pkg/sentry/platform/kvm/testutil",
"//pkg/sentry/time",
"//pkg/sync",
"@org_golang_x_sys//unix:go_default_library",
],
)
Expand Down Expand Up @@ -164,6 +165,7 @@ go_test(
"//pkg/sentry/platform",
"//pkg/sentry/platform/kvm/testutil",
"//pkg/sentry/time",
"//pkg/sync",
"@org_golang_x_sys//unix:go_default_library",
],
)
6 changes: 5 additions & 1 deletion pkg/sentry/platform/kvm/bluepill_unsafe.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,11 @@ func bluepillGuestExit(c *vCPU, context unsafe.Pointer) {

// Return to the vCPUReady state; notify any waiters.
user := c.state.Load() & vCPUUser
switch c.state.Swap(user) {
oldState := c.state.Swap(user)
if user == 0 {
c.machine.availableNotify()
}
switch oldState {
case user | vCPUGuest: // Expected case.
case user | vCPUGuest | vCPUWaiter:
c.notify()
Expand Down
14 changes: 12 additions & 2 deletions pkg/sentry/platform/kvm/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,16 @@
package kvm

// Config sets configuration options for each platform instance.
type Config struct {
	// MaxVCPUs is the maximum number of vCPUs the platform instance will
	// create. If MaxVCPUs is 0, the platform will choose a reasonable default.
	MaxVCPUs int
}

func (*machine) applyConfig(config *Config) error { return nil }
func (m *machine) applyConfig(config *Config) error {
if config.MaxVCPUs < 0 {
return fmt.Errorf("invalid Config.MaxVCPUs: %d", config.MaxVCPUs)

Check failure on line 29 in pkg/sentry/platform/kvm/config.go

View workflow job for this annotation

GitHub Actions / generate

undefined: fmt
}
m.maxVCPUs = config.MaxVCPUs
return nil
}
58 changes: 58 additions & 0 deletions pkg/sentry/platform/kvm/kvm_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sentry/platform/kvm/testutil"
ktime "gvisor.dev/gvisor/pkg/sentry/time"
"gvisor.dev/gvisor/pkg/sync"
)

// dummyFPState is initialized in TestMain.
Expand Down Expand Up @@ -480,6 +481,63 @@ func TestKernelVDSO(t *testing.T) {
})
}

// Regression test for b/404271139.
func TestSingleVCPU(t *testing.T) {
	// Create the machine.
	deviceFile, err := OpenDevice("")
	if err != nil {
		t.Fatalf("error opening device file: %v", err)
	}
	// Limit the platform to a single vCPU so that the two goroutines below
	// must repeatedly hand the same vCPU back and forth.
	k, err := New(deviceFile, Config{
		MaxVCPUs: 1,
	})
	if err != nil {
		t.Fatalf("error creating KVM instance: %v", err)
	}
	defer k.machine.Destroy()

	// Ping-pong the single vCPU between two goroutines. The test passes if
	// this does not deadlock.
	stopC := make(chan struct{})
	var doneWG sync.WaitGroup
	defer func() {
		close(stopC)
		doneWG.Wait()
	}()
	// Each wake channel has capacity 1 so the hand-off send below never
	// blocks, even if the peer has not yet reached its receive.
	var wakeC [2]chan struct{}
	for i := range wakeC {
		wakeC[i] = make(chan struct{}, 1)
	}
	for i := range wakeC {
		doneWG.Add(1)
		go func(i int) {
			defer doneWG.Done()
			for {
				// Multiple ready channels in a select statement are chosen
				// from randomly, so have a separate non-blocking receive from
				// stopC first to ensure that it's honored in deterministic
				// time.
				select {
				case <-stopC:
					return
				default:
				}
				select {
				case <-stopC:
					return
				case <-wakeC[i]:
					// Acquire the (only) vCPU, enter guest mode, wake the
					// peer while still holding the vCPU, then release it.
					c := k.machine.Get()
					bluepill(c)
					wakeC[1-i] <- struct{}{}
					k.machine.Put(c)
				}
			}
		}(i)
	}
	// Start the ping-pong and let it run for a second; a deadlock here
	// would hang the test.
	wakeC[0] <- struct{}{}
	time.Sleep(time.Second)
}

func BenchmarkApplicationSyscall(b *testing.B) {
var (
i int // Iteration includes machine.Get() / machine.Put().
Expand Down
95 changes: 50 additions & 45 deletions pkg/sentry/platform/kvm/machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,12 @@ type machine struct {
// mu protects vCPUs.
mu sync.RWMutex

// available is notified when vCPUs are available.
available sync.Cond
// availableWaiters is the number of goroutines waiting for a vCPU to
// become ready.
availableWaiters atomicbitops.Int32

// availableSeq is incremented whenever a vCPU becomes ready.
availableSeq atomicbitops.Uint32

// vCPUsByTID are the machine vCPUs.
//
Expand Down Expand Up @@ -270,7 +274,6 @@ var forceMappingEntireAddressSpace = false
func newMachine(vm int, config *Config) (*machine, error) {
// Create the machine.
m := &machine{fd: vm}
m.available.L = &m.mu

if err := m.applyConfig(config); err != nil {
panic(fmt.Sprintf("error setting config parameters: %s", err))
Expand Down Expand Up @@ -532,6 +535,7 @@ func (m *machine) Get() *vCPU {
runtime.UnlockOSThread()
m.mu.Lock()

waiter := false
for {
runtime.LockOSThread()
tid = hosttid.Current()
Expand All @@ -540,6 +544,9 @@ func (m *machine) Get() *vCPU {
if c := m.vCPUsByTID[tid]; c != nil {
c.lock()
m.mu.Unlock()
if waiter {
m.availableWaiters.Add(-1)
}
getVCPUCounter.Increment(&getVCPUAcquisitionReused)
return c
}
Expand All @@ -551,68 +558,62 @@ func (m *machine) Get() *vCPU {
c.lock()
m.vCPUsByTID[tid] = c
m.mu.Unlock()
if waiter {
m.availableWaiters.Add(-1)
}
c.loadSegments(tid)
getVCPUCounter.Increment(&getVCPUAcquisitionUnused)
return c
}

// Scan for an available vCPU.
if !waiter {
// Scan for an available vCPU, best-effort.
for origTID, c := range m.vCPUsByTID {
if c.state.CompareAndSwap(vCPUReady, vCPUUser) {
delete(m.vCPUsByTID, origTID)
m.vCPUsByTID[tid] = c
m.mu.Unlock()
c.loadSegments(tid)
getVCPUCounter.Increment(&getVCPUAcquisitionUnused)
return c
}
}

// Indicate that we are waiting for a vCPU.
waiter = true
m.availableWaiters.Add(1)
}

// Scan for an available vCPU, with m.availableWaiters != 0.
epoch := m.availableSeq.Load()
for origTID, c := range m.vCPUsByTID {
if c.state.CompareAndSwap(vCPUReady, vCPUUser) {
delete(m.vCPUsByTID, origTID)
m.vCPUsByTID[tid] = c
m.mu.Unlock()
m.availableWaiters.Add(-1)
c.loadSegments(tid)
getVCPUCounter.Increment(&getVCPUAcquisitionUnused)
return c
}
}

// Scan for something not in user mode.
for origTID, c := range m.vCPUsByTID {
if !c.state.CompareAndSwap(vCPUGuest, vCPUGuest|vCPUWaiter) {
continue
}

// The vCPU is not able to transition to
// vCPUGuest|vCPUWaiter or to vCPUUser because that
// transition requires holding the machine mutex, as we
// do now. There is no path to register a waiter on
// just the vCPUReady state.
for {
c.waitUntilNot(vCPUGuest | vCPUWaiter)
if c.state.CompareAndSwap(vCPUReady, vCPUUser) {
break
}
}

// Steal the vCPU.
delete(m.vCPUsByTID, origTID)
m.vCPUsByTID[tid] = c
m.mu.Unlock()
c.loadSegments(tid)
getVCPUCounter.Increment(&getVCPUAcquisitionStolen)
return c
}

// Everything is executing in user mode. Wait until something is
// available. As with m.mu.Lock() above, unlock the OS thread while we
// do this to avoid spawning additional system threads. Note that
// signaling the condition variable will have the extra effect of
// kicking the vCPUs out of guest mode if that's where they were.
// All vCPUs are already in guest mode. Wait until a vCPU becomes
// available. m.availableWait() blocks in the host, but unlocking the
// OS thread still makes waking up less expensive if sysmon steals our
// P while we're blocked.
runtime.UnlockOSThread()
m.available.Wait()
m.availableWait(epoch)
}
}

// Put puts the current vCPU back, making it available to other goroutines.
// If unlocking transitioned the vCPU to the vCPUReady state, a goroutine
// waiting in Get is notified.
func (m *machine) Put(c *vCPU) {
	ready := c.unlock()
	runtime.UnlockOSThread()
	if ready {
		// The vCPU became ready; wake any waiter in Get.
		m.availableNotify()
	}
}

// newDirtySet returns a new dirty set.
Expand Down Expand Up @@ -646,15 +647,16 @@ func (c *vCPU) lock() {
atomicbitops.OrUint32(&c.state, vCPUUser)
}

// unlock clears the vCPUUser bit.
// unlock clears the vCPUUser bit. It returns true if it transitions the vCPU
// state to vCPUReady.
//
//go:nosplit
func (c *vCPU) unlock() {
func (c *vCPU) unlock() bool {
origState := atomicbitops.CompareAndSwapUint32(&c.state, vCPUUser|vCPUGuest, vCPUGuest)
if origState == vCPUUser|vCPUGuest {
// Happy path: no exits are forced, and we can continue
// executing on our merry way with a single atomic access.
return
return false
}

// Clear the lock.
Expand All @@ -668,6 +670,7 @@ func (c *vCPU) unlock() {
switch origState {
case vCPUUser:
// Normal state.
return true
case vCPUUser | vCPUGuest | vCPUWaiter:
// Force a transition: this must trigger a notification when we
// return from guest mode. We must clear vCPUWaiter here
Expand All @@ -677,11 +680,13 @@ func (c *vCPU) unlock() {
// syscall in this period, BounceToKernel will hang.
atomicbitops.AndUint32(&c.state, ^vCPUWaiter)
c.notify()
return false
case vCPUUser | vCPUWaiter:
// Waiting for the lock to be released; the responsibility is
// on us to notify the waiter and clear the associated bit.
atomicbitops.AndUint32(&c.state, ^vCPUWaiter)
c.notify()
return true
default:
panic("invalid state")
}
Expand Down
25 changes: 13 additions & 12 deletions pkg/sentry/platform/kvm/machine_amd64.go
Original file line number Diff line number Diff line change
Expand Up @@ -497,21 +497,22 @@ func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) {

// getMaxVCPU get max vCPU number
func (m *machine) getMaxVCPU() {
if m.maxVCPUs == 0 {
// The goal here is to avoid vCPU contentions for reasonable workloads.
// But "reasonable" isn't defined well in this case. Let's say that CPU
// overcommit with factor 2 is still acceptable. We allocate a set of
// vCPU for each goruntime processor (P) and two sets of vCPUs to run
// user code.
m.maxVCPUs = 3 * runtime.GOMAXPROCS(0)
}

// Apply KVM limit.
maxVCPUs, errno := hostsyscall.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS)
if errno != 0 {
m.maxVCPUs = _KVM_NR_VCPUS
} else {
m.maxVCPUs = int(maxVCPUs)
maxVCPUs = _KVM_NR_VCPUS
}

// The goal here is to avoid vCPU contentions for reasonable workloads.
// But "reasonable" isn't defined well in this case. Let's say that CPU
// overcommit with factor 2 is still acceptable. We allocate a set of
// vCPU for each goruntime processor (P) and two sets of vCPUs to run
// user code.
rCPUs := runtime.GOMAXPROCS(0)
if 3*rCPUs < m.maxVCPUs {
m.maxVCPUs = 3 * rCPUs
if m.maxVCPUs > int(maxVCPUs) {
m.maxVCPUs = int(maxVCPUs)
}
}

Expand Down
18 changes: 7 additions & 11 deletions pkg/sentry/platform/kvm/machine_arm64.go
Original file line number Diff line number Diff line change
Expand Up @@ -185,16 +185,12 @@ func (c *vCPU) fault(signal int32, info *linux.SignalInfo) (hostarch.AccessType,

// getMaxVCPU get max vCPU number
func (m *machine) getMaxVCPU() {
rmaxVCPUs := runtime.NumCPU()
smaxVCPUs, errno := hostsyscall.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS)
// compare the max vcpu number from runtime and syscall, use smaller one.
if errno != 0 {
m.maxVCPUs = rmaxVCPUs
} else {
if rmaxVCPUs < int(smaxVCPUs) {
m.maxVCPUs = rmaxVCPUs
} else {
m.maxVCPUs = int(smaxVCPUs)
}
if m.maxVCPUs == 0 {
m.maxVCPUs = runtime.NumCPU()
}

// Apply KVM limit.
if maxVCPUs, errno := hostsyscall.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS); errno == 0 && m.maxVCPUs > int(maxVCPUs) {
m.maxVCPUs = int(maxVCPUs)
}
}
Loading
Loading