Skip to content

kvm: replace machine.availableCond with futexes #11568

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions pkg/sentry/platform/kvm/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ go_test(
"//pkg/sentry/platform",
"//pkg/sentry/platform/kvm/testutil",
"//pkg/sentry/time",
"//pkg/sync",
"@org_golang_x_sys//unix:go_default_library",
],
)
Expand Down Expand Up @@ -164,6 +165,7 @@ go_test(
"//pkg/sentry/platform",
"//pkg/sentry/platform/kvm/testutil",
"//pkg/sentry/time",
"//pkg/sync",
"@org_golang_x_sys//unix:go_default_library",
],
)
6 changes: 5 additions & 1 deletion pkg/sentry/platform/kvm/bluepill_unsafe.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,11 @@ func bluepillGuestExit(c *vCPU, context unsafe.Pointer) {

// Return to the vCPUReady state; notify any waiters.
user := c.state.Load() & vCPUUser
switch c.state.Swap(user) {
oldState := c.state.Swap(user)
if user == 0 {
c.machine.availableNotify()
}
switch oldState {
case user | vCPUGuest: // Expected case.
case user | vCPUGuest | vCPUWaiter:
c.notify()
Expand Down
14 changes: 12 additions & 2 deletions pkg/sentry/platform/kvm/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,16 @@
package kvm

// Config sets configuration options for each platform instance.
type Config struct {
	// MaxVCPUs is the maximum number of vCPUs the platform instance will
	// create. If MaxVCPUs is 0, the platform will choose a reasonable default.
	MaxVCPUs int
}

func (*machine) applyConfig(config *Config) error { return nil }
func (m *machine) applyConfig(config *Config) error {
if config.MaxVCPUs < 0 {
return fmt.Errorf("invalid Config.MaxVCPUs: %d", config.MaxVCPUs)

Check failure on line 29 in pkg/sentry/platform/kvm/config.go

View workflow job for this annotation

GitHub Actions / generate

undefined: fmt
}
m.maxVCPUs = config.MaxVCPUs
return nil
}
58 changes: 58 additions & 0 deletions pkg/sentry/platform/kvm/kvm_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sentry/platform/kvm/testutil"
ktime "gvisor.dev/gvisor/pkg/sentry/time"
"gvisor.dev/gvisor/pkg/sync"
)

// dummyFPState is initialized in TestMain.
Expand Down Expand Up @@ -480,6 +481,63 @@ func TestKernelVDSO(t *testing.T) {
})
}

// Regression test for b/404271139.
func TestSingleVCPU(t *testing.T) {
	// Create the machine.
	deviceFile, err := OpenDevice("")
	if err != nil {
		t.Fatalf("error opening device file: %v", err)
	}
	// Limit the platform to a single vCPU so that the two goroutines below
	// must repeatedly hand the same vCPU back and forth.
	k, err := New(deviceFile, Config{
		MaxVCPUs: 1,
	})
	if err != nil {
		t.Fatalf("error creating KVM instance: %v", err)
	}
	defer k.machine.Destroy()

	// Ping-pong the single vCPU between two goroutines. The test passes if
	// this does not deadlock.
	stopC := make(chan struct{})
	var doneWG sync.WaitGroup
	defer func() {
		close(stopC)
		doneWG.Wait()
	}()
	// Each wake channel has capacity 1 so the hand-off send below never
	// blocks, even if the peer has not yet reached its receive.
	var wakeC [2]chan struct{}
	for i := range wakeC {
		wakeC[i] = make(chan struct{}, 1)
	}
	for i := range wakeC {
		doneWG.Add(1)
		go func(i int) {
			defer doneWG.Done()
			for {
				// Multiple ready channels in a select statement are chosen
				// from randomly, so have a separate non-blocking receive from
				// stopC first to ensure that it's honored in deterministic
				// time.
				select {
				case <-stopC:
					return
				default:
				}
				select {
				case <-stopC:
					return
				case <-wakeC[i]:
					// Acquire the (only) vCPU, enter guest mode, wake the
					// peer while still holding the vCPU, then release it.
					c := k.machine.Get()
					bluepill(c)
					wakeC[1-i] <- struct{}{}
					k.machine.Put(c)
				}
			}
		}(i)
	}
	// Start the ping-pong and let it run for a second; a deadlock here
	// would hang the test.
	wakeC[0] <- struct{}{}
	time.Sleep(time.Second)
}

func BenchmarkApplicationSyscall(b *testing.B) {
var (
i int // Iteration includes machine.Get() / machine.Put().
Expand Down
95 changes: 50 additions & 45 deletions pkg/sentry/platform/kvm/machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,12 @@ type machine struct {
// mu protects vCPUs.
mu sync.RWMutex

// available is notified when vCPUs are available.
available sync.Cond
// availableWaiters is the number of goroutines waiting for a vCPU to
// become ready.
availableWaiters atomicbitops.Int32

// availableSeq is incremented whenever a vCPU becomes ready.
availableSeq atomicbitops.Uint32

// vCPUsByTID are the machine vCPUs.
//
Expand Down Expand Up @@ -270,7 +274,6 @@ var forceMappingEntireAddressSpace = false
func newMachine(vm int, config *Config) (*machine, error) {
// Create the machine.
m := &machine{fd: vm}
m.available.L = &m.mu

if err := m.applyConfig(config); err != nil {
panic(fmt.Sprintf("error setting config parameters: %s", err))
Expand Down Expand Up @@ -532,6 +535,7 @@ func (m *machine) Get() *vCPU {
runtime.UnlockOSThread()
m.mu.Lock()

waiter := false
for {
runtime.LockOSThread()
tid = hosttid.Current()
Expand All @@ -540,6 +544,9 @@ func (m *machine) Get() *vCPU {
if c := m.vCPUsByTID[tid]; c != nil {
c.lock()
m.mu.Unlock()
if waiter {
m.availableWaiters.Add(-1)
}
getVCPUCounter.Increment(&getVCPUAcquisitionReused)
return c
}
Expand All @@ -551,68 +558,62 @@ func (m *machine) Get() *vCPU {
c.lock()
m.vCPUsByTID[tid] = c
m.mu.Unlock()
if waiter {
m.availableWaiters.Add(-1)
}
c.loadSegments(tid)
getVCPUCounter.Increment(&getVCPUAcquisitionUnused)
return c
}

// Scan for an available vCPU.
if !waiter {
// Scan for an available vCPU, best-effort.
for origTID, c := range m.vCPUsByTID {
if c.state.CompareAndSwap(vCPUReady, vCPUUser) {
delete(m.vCPUsByTID, origTID)
m.vCPUsByTID[tid] = c
m.mu.Unlock()
c.loadSegments(tid)
getVCPUCounter.Increment(&getVCPUAcquisitionUnused)
return c
}
}

// Indicate that we are waiting for a vCPU.
waiter = true
m.availableWaiters.Add(1)
}

// Scan for an available vCPU, with m.availableWaiters != 0.
epoch := m.availableSeq.Load()
for origTID, c := range m.vCPUsByTID {
if c.state.CompareAndSwap(vCPUReady, vCPUUser) {
delete(m.vCPUsByTID, origTID)
m.vCPUsByTID[tid] = c
m.mu.Unlock()
m.availableWaiters.Add(-1)
c.loadSegments(tid)
getVCPUCounter.Increment(&getVCPUAcquisitionUnused)
return c
}
}

// Scan for something not in user mode.
for origTID, c := range m.vCPUsByTID {
if !c.state.CompareAndSwap(vCPUGuest, vCPUGuest|vCPUWaiter) {
continue
}

// The vCPU is not able to transition to
// vCPUGuest|vCPUWaiter or to vCPUUser because that
// transition requires holding the machine mutex, as we
// do now. There is no path to register a waiter on
// just the vCPUReady state.
for {
c.waitUntilNot(vCPUGuest | vCPUWaiter)
if c.state.CompareAndSwap(vCPUReady, vCPUUser) {
break
}
}

// Steal the vCPU.
delete(m.vCPUsByTID, origTID)
m.vCPUsByTID[tid] = c
m.mu.Unlock()
c.loadSegments(tid)
getVCPUCounter.Increment(&getVCPUAcquisitionStolen)
return c
}

// Everything is executing in user mode. Wait until something is
// available. As with m.mu.Lock() above, unlock the OS thread while we
// do this to avoid spawning additional system threads. Note that
// signaling the condition variable will have the extra effect of
// kicking the vCPUs out of guest mode if that's where they were.
// All vCPUs are already in guest mode. Wait until a vCPU becomes
// available. m.availableWait() blocks in the host, but unlocking the
// OS thread still makes waking up less expensive if sysmon steals our
// P while we're blocked.
runtime.UnlockOSThread()
m.available.Wait()
m.availableWait(epoch)
}
}

// Put puts the current vCPU back, making it available to other goroutines.
// If unlocking transitioned the vCPU to the vCPUReady state, a goroutine
// waiting in Get is notified.
func (m *machine) Put(c *vCPU) {
	ready := c.unlock()
	runtime.UnlockOSThread()
	if ready {
		// The vCPU became ready; wake any waiter in Get.
		m.availableNotify()
	}
}

// newDirtySet returns a new dirty set.
Expand Down Expand Up @@ -646,15 +647,16 @@ func (c *vCPU) lock() {
atomicbitops.OrUint32(&c.state, vCPUUser)
}

// unlock clears the vCPUUser bit.
// unlock clears the vCPUUser bit. It returns true if it transitions the vCPU
// state to vCPUReady.
//
//go:nosplit
func (c *vCPU) unlock() {
func (c *vCPU) unlock() bool {
origState := atomicbitops.CompareAndSwapUint32(&c.state, vCPUUser|vCPUGuest, vCPUGuest)
if origState == vCPUUser|vCPUGuest {
// Happy path: no exits are forced, and we can continue
// executing on our merry way with a single atomic access.
return
return false
}

// Clear the lock.
Expand All @@ -668,6 +670,7 @@ func (c *vCPU) unlock() {
switch origState {
case vCPUUser:
// Normal state.
return true
case vCPUUser | vCPUGuest | vCPUWaiter:
// Force a transition: this must trigger a notification when we
// return from guest mode. We must clear vCPUWaiter here
Expand All @@ -677,11 +680,13 @@ func (c *vCPU) unlock() {
// syscall in this period, BounceToKernel will hang.
atomicbitops.AndUint32(&c.state, ^vCPUWaiter)
c.notify()
return false
case vCPUUser | vCPUWaiter:
// Waiting for the lock to be released; the responsibility is
// on us to notify the waiter and clear the associated bit.
atomicbitops.AndUint32(&c.state, ^vCPUWaiter)
c.notify()
return true
default:
panic("invalid state")
}
Expand Down
25 changes: 13 additions & 12 deletions pkg/sentry/platform/kvm/machine_amd64.go
Original file line number Diff line number Diff line change
Expand Up @@ -497,21 +497,22 @@ func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) {

// getMaxVCPU get max vCPU number
func (m *machine) getMaxVCPU() {
if m.maxVCPUs == 0 {
// The goal here is to avoid vCPU contentions for reasonable workloads.
// But "reasonable" isn't defined well in this case. Let's say that CPU
// overcommit with factor 2 is still acceptable. We allocate a set of
// vCPU for each goruntime processor (P) and two sets of vCPUs to run
// user code.
m.maxVCPUs = 3 * runtime.GOMAXPROCS(0)
}

// Apply KVM limit.
maxVCPUs, errno := hostsyscall.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS)
if errno != 0 {
m.maxVCPUs = _KVM_NR_VCPUS
} else {
m.maxVCPUs = int(maxVCPUs)
maxVCPUs = _KVM_NR_VCPUS
}

// The goal here is to avoid vCPU contentions for reasonable workloads.
// But "reasonable" isn't defined well in this case. Let's say that CPU
// overcommit with factor 2 is still acceptable. We allocate a set of
// vCPU for each goruntime processor (P) and two sets of vCPUs to run
// user code.
rCPUs := runtime.GOMAXPROCS(0)
if 3*rCPUs < m.maxVCPUs {
m.maxVCPUs = 3 * rCPUs
if m.maxVCPUs > int(maxVCPUs) {
m.maxVCPUs = int(maxVCPUs)
}
}

Expand Down
18 changes: 7 additions & 11 deletions pkg/sentry/platform/kvm/machine_arm64.go
Original file line number Diff line number Diff line change
Expand Up @@ -185,16 +185,12 @@ func (c *vCPU) fault(signal int32, info *linux.SignalInfo) (hostarch.AccessType,

// getMaxVCPU get max vCPU number
func (m *machine) getMaxVCPU() {
rmaxVCPUs := runtime.NumCPU()
smaxVCPUs, errno := hostsyscall.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS)
// compare the max vcpu number from runtime and syscall, use smaller one.
if errno != 0 {
m.maxVCPUs = rmaxVCPUs
} else {
if rmaxVCPUs < int(smaxVCPUs) {
m.maxVCPUs = rmaxVCPUs
} else {
m.maxVCPUs = int(smaxVCPUs)
}
if m.maxVCPUs == 0 {
m.maxVCPUs = runtime.NumCPU()
}

// Apply KVM limit.
if maxVCPUs, errno := hostsyscall.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS); errno == 0 && m.maxVCPUs > int(maxVCPUs) {
m.maxVCPUs = int(maxVCPUs)
}
}
Loading
Loading