Skip to content

Commit 4cbae8b

Browse files
authored
Merge pull request #28991 from l0rd/fix-machine-start-signal-handling
Fix signal handling during machine start on macOS
2 parents 5c8a6d5 + 12ea195 commit 4cbae8b

7 files changed

Lines changed: 282 additions & 96 deletions

File tree

pkg/machine/apple/apple.go

Lines changed: 67 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"net"
1010
"os"
1111
"os/exec"
12+
"os/signal"
1213
"syscall"
1314
"time"
1415

@@ -72,17 +73,14 @@ func StartGenericAppleVM(mc *vmconfigs.MachineConfig, cmdBinary string, bootload
7273
return nil, nil, err
7374
}
7475
// Set user networking with gvproxy
75-
7676
gvproxySocket, err := mc.GVProxySocket()
7777
if err != nil {
7878
return nil, nil, err
7979
}
80-
8180
// Wait on gvproxy to be running and aware
8281
if err := sockets.WaitForSocketWithBackoffs(gvProxyMaxBackoffAttempts, gvProxyWaitBackoff, gvproxySocket.GetPath(), "gvproxy"); err != nil {
8382
return nil, nil, err
8483
}
85-
8684
netDevice.SetUnixSocketPath(gvproxySocket.GetPath())
8785

8886
// create a one-time virtual machine for starting because we dont want all this information in the
@@ -244,24 +242,62 @@ func StartGenericAppleVM(mc *vmconfigs.MachineConfig, cmdBinary string, bootload
244242
return nil, nil, err
245243
}
246244

247-
returnFunc := func() error {
245+
// Start a goroutine that will evenutally propagate a
246+
// terminate signal to the VM process.
247+
// If the user decides to abort `podman machine start`
248+
// while the VM is starting, we want the VM to be stopped
249+
// too. Or the machine will be left in an inconsistent
250+
// state. Note that the goroutine will wait until the
251+
// the main goroutine completes.
252+
term := make(chan os.Signal, 1)
253+
signal.Notify(term, os.Interrupt, syscall.SIGTERM)
254+
// Get the PID first because cmd.Process will
255+
// be released in the main thread
256+
pid := cmd.Process.Pid
257+
go func() {
258+
<-term
259+
logrus.Debugf("Termination signal forwarded to the VM process (PID: %d)\n", pid)
260+
p, err := os.FindProcess(pid)
261+
if err != nil {
262+
logrus.Errorf("Failed to find process %d: %v", pid, err)
263+
return
264+
}
265+
err = p.Signal(os.Interrupt)
266+
if err != nil {
267+
logrus.Errorf("Termination signal received, but terminating the VM process (PID: %d) failed: %v", pid, err)
268+
return
269+
}
270+
// Wait and release the resources associated with the process
271+
if _, err := p.Wait(); err != nil {
272+
logrus.Debugf("Failed waiting for the process after terminating it: %v", err)
273+
}
274+
}()
275+
276+
// waitForReadyFunc is the callback that the caller of StartVM()
277+
// uses to block its execution until the the VM is ready or an error
278+
// occurs
279+
waitForReadyFunc := func() error {
248280
processErrChan := make(chan error)
249281
ctx, cancel := context.WithCancel(context.Background())
250282
defer cancel()
283+
// The VM readiness will be communicated on readyChan. In the
284+
// meantime, the following goroutine checks every 500ms that the VM
285+
// process is running. If it's not, returns an error on processErrChan.
251286
go func() {
252287
defer close(processErrChan)
288+
ticker := time.NewTicker(500 * time.Millisecond)
289+
defer ticker.Stop()
253290
for {
254-
select {
255-
case <-ctx.Done():
256-
return
257-
default:
258-
}
259291
if err := CheckProcessRunning(cmdBinary, cmd.Process.Pid); err != nil {
260292
processErrChan <- err
261293
return
262294
}
263-
// lets poll status every half second
264-
time.Sleep(500 * time.Millisecond)
295+
select {
296+
case <-ctx.Done():
297+
logrus.Debug("VM waitForReadyFunc goroutine: ctx done")
298+
return
299+
case <-ticker.C:
300+
}
265301
}
266302
}()
267303

@@ -279,7 +315,26 @@ func StartGenericAppleVM(mc *vmconfigs.MachineConfig, cmdBinary string, bootload
279315
}
280316
return nil
281317
}
282-
return cmd.Process.Release, returnFunc, nil
318+
319+
// releaseCmdFunc is the callback that the caller of StartVM()
320+
// uses to release the resources associated with cmd.Process.
321+
// It's important to execute it after waitForReadyFunc completes,
322+
// otherwise cmd.Process methods won't work anymore.
323+
relCmdFunc := func() error {
324+
if err := cmd.Process.Release(); err != nil {
325+
logrus.Errorf("error releasing VM Start command associated resources: %v", err)
326+
}
327+
if ignitionSocket != nil {
328+
if err := ignitionSocket.Delete(); err != nil {
329+
logrus.Errorf("unable to delete ignition socket: %v", err)
330+
}
331+
}
332+
if err := readySocket.Delete(); err != nil {
333+
logrus.Errorf("unable to delete ready socket: %v", err)
334+
}
335+
return nil
336+
}
337+
return relCmdFunc, waitForReadyFunc, nil
283338
}
284339

285340
// CheckProcessRunning checks non blocking if the pid exited

pkg/machine/cleanup.go

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
package machine
22

33
import (
4+
"fmt"
45
"os"
56
"os/signal"
7+
"slices"
68
"sync"
79
"syscall"
810

@@ -22,7 +24,7 @@ func (c *CleanupCallback) CleanIfErr(err *error) {
2224
c.clean()
2325
}
2426

25-
func (c *CleanupCallback) CleanOnSignal() {
27+
func (c *CleanupCallback) CleanOnSignal(quiet bool) {
2628
ch := make(chan os.Signal, 1)
2729
signal.Notify(ch, os.Interrupt, syscall.SIGTERM)
2830

@@ -31,28 +33,41 @@ func (c *CleanupCallback) CleanOnSignal() {
3133
return
3234
}
3335

36+
if !quiet {
37+
fmt.Println("Received a terminate signal")
38+
}
3439
c.clean()
40+
if !quiet {
41+
fmt.Println("Machine command rollback completed")
42+
}
3543
os.Exit(1)
3644
}
3745

3846
func (c *CleanupCallback) clean() {
47+
// When a term signal is received the cleanup can be invoked
48+
// concurrently in 2 goroutines:
49+
//
50+
// - signal flow: a termination signal is received and the
51+
// goroutine where CleanOnSignal() is running is
52+
// unblocked and starts invoking the callbacks
53+
// - error flow: an error is returned in the main goroutine, after
54+
// the signal is received, and CleanIfErr() is invoked,
55+
// but c.Funcs has been set to nil and therefore no
56+
// callbacks is exec in this goroutine
57+
//
58+
// When this is the case the second goroutine should be blocked until
59+
// the first goroutine comples the cleanup. c.Funcs is also set to nil
60+
// so that cleanup doesn't happen twice.
3961
c.mu.Lock()
40-
// Claim exclusive usage by copy and resetting to nil
4162
funcs := c.Funcs
4263
c.Funcs = nil
43-
c.mu.Unlock()
44-
45-
// Already claimed or none set
46-
if funcs == nil {
47-
return
48-
}
49-
50-
// Cleanup functions can now exclusively be run
51-
for _, cleanfunc := range funcs {
64+
// Cleanup functions invoked in reverse registration order
65+
for _, cleanfunc := range slices.Backward(funcs) {
5266
if err := cleanfunc(); err != nil {
5367
logrus.Error(err)
5468
}
5569
}
70+
c.mu.Unlock()
5671
}
5772

5873
func CleanUp() CleanupCallback {

pkg/machine/e2e/start_test.go

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,10 @@ import (
55
"fmt"
66
"net"
77
"net/url"
8+
"os/exec"
89
"strconv"
910
"sync"
11+
"syscall"
1012
"time"
1113

1214
jsoniter "github.com/json-iterator/go"
@@ -349,6 +351,65 @@ var _ = Describe("podman machine start", func() {
349351
Expect(sshCertFile).To(Exit(0))
350352
Expect(sshCertFile.outputToString()).To(Equal(certFileName))
351353
})
354+
It("start interrupted by SIGTERM while waiting for VM start", func() {
355+
if !isVmtype(define.AppleHvVirt) && !isVmtype(define.LibKrun) {
356+
Skip("SIGTERM interruption is supported on macOS only")
357+
}
358+
// Use the provider binary to pgrep the VM process
359+
var vmProcess string
360+
switch testProvider.VMType() {
361+
case define.AppleHvVirt:
362+
vmProcess = "vfkit"
363+
case define.LibKrun:
364+
vmProcess = "krunkit"
365+
default:
366+
}
367+
368+
i := new(initMachine)
369+
initSession, err := mb.setCmd(i.withImage(mb.imagePath)).run()
370+
Expect(err).ToNot(HaveOccurred())
371+
Expect(initSession).To(Exit(0))
372+
373+
s := new(startMachine)
374+
startSession, err := mb.setCmd(s).runWithoutWait()
375+
Expect(err).ToNot(HaveOccurred())
376+
377+
// Wait 45s for the VM process spawned by `podman machine start`
378+
Eventually(func() error {
379+
_, err := exec.Command("pgrep", vmProcess).Output()
380+
return err
381+
}, 45*time.Second, 500*time.Millisecond).Should(Succeed())
382+
383+
// Send a term signal now (podman machine start is waiting for
384+
// the VM process readiness)
385+
startSession.Signal(syscall.SIGTERM)
386+
387+
// Wait 30s for the podman machine start process to return (no deadlock)
388+
Eventually(startSession, 30*time.Second).Should(Exit())
389+
Expect(startSession.ExitCode()).ToNot(Equal(0))
390+
391+
// Wait 30s for the VM process to return (SIGTERM has been forwarded)
392+
Eventually(func() error {
393+
_, err := exec.Command("pgrep", vmProcess).Output()
394+
return err
395+
}, 30*time.Second, 500*time.Millisecond).ShouldNot(Succeed())
396+
397+
// Verify machine state is Stopped
398+
inspect := new(inspectMachine)
399+
inspectSession, err := mb.setCmd(inspect.withFormat("{{.State}}")).run()
400+
Expect(err).ToNot(HaveOccurred())
401+
Expect(inspectSession).To(Exit(0))
402+
Expect(inspectSession.outputToString()).To(Equal(define.Stopped))
403+
404+
// Verify no orphan gvproxy
405+
_, err = pgrep("gvproxy")
406+
Expect(err).To(HaveOccurred(), "gvproxy should not be running after SIGTERM cleanup")
407+
408+
// Restart without interrupting and confirm that completes without error
409+
startSession, err = mb.setCmd(s).run()
410+
Expect(err).ToNot(HaveOccurred())
411+
Expect(startSession).To(Exit(0))
412+
})
352413
})
353414

354415
func mapToPort(uris []string) ([]string, error) {

pkg/machine/gvproxy_unix.go

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ package machine
55
import (
66
"errors"
77
"fmt"
8+
"os"
89
"syscall"
910
"time"
1011

@@ -42,10 +43,10 @@ func backoffForProcess(p *psutil.Process) error {
4243
// double the time
4344
sleepInterval += sleepInterval
4445
}
45-
return fmt.Errorf("process %d has not ended", p.Pid)
46+
return fmt.Errorf("process has not ended (PID %d)", p.Pid)
4647
}
4748

48-
// / waitOnProcess takes a pid and sends a sigterm to it. it then waits for the
49+
// waitOnProcess takes a pid and sends a sigterm to it. it then waits for the
4950
// process to not exist. if the sigterm does not end the process after an interval,
5051
// then sigkill is sent. it also waits for the process to exit after the sigkill too.
5152
func waitOnProcess(processID int) error {
@@ -64,7 +65,35 @@ func waitOnProcess(processID int) error {
6465
return nil
6566
}
6667

67-
if err := p.Kill(); err != nil {
68+
// Start a goroutine that waits until the gvproxy process completes.
69+
// This is necessary to reaps the process and so that Process.IsRunning()
70+
// in backoffForProcess() returns false. Otherwise the process will
71+
// be defunct and backoffForProcess fails because Process.IsRunning()
72+
// returns true
73+
go func() {
74+
gv, err := os.FindProcess(processID)
75+
if err != nil {
76+
logrus.Errorf("failed to find process %d: %v", processID, err)
77+
return
78+
}
79+
if _, err = gv.Wait(); err != nil {
80+
logrus.Debugf("gvproxy exited: %v", err)
81+
}
82+
}()
83+
84+
if err = p.Terminate(); err != nil {
85+
if errors.Is(err, syscall.ESRCH) {
86+
logrus.Debugf("Gvproxy already dead, exiting cleanly")
87+
return nil
88+
}
89+
return err
90+
}
91+
92+
if err = backoffForProcess(p); err == nil {
93+
return nil
94+
}
95+
96+
if err = p.Kill(); err != nil {
6897
if errors.Is(err, syscall.ESRCH) {
6998
logrus.Debugf("Gvproxy already dead, exiting cleanly")
7099
return nil

pkg/machine/hyperv/stubber.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ func (h HyperVStubber) CreateVM(_ define.CreateVMOpts, mc *vmconfigs.MachineConf
6464
callbackFuncs := machine.CleanUp()
6565
defer callbackFuncs.CleanIfErr(&err)
6666
callbackFuncs.Add(createErrorLogCallback(&err))
67-
go callbackFuncs.CleanOnSignal()
67+
go callbackFuncs.CleanOnSignal(false)
6868

6969
hwConfig := hypervctl.HardwareConfig{
7070
CPUs: uint16(mc.Resources.CPUs),
@@ -220,7 +220,7 @@ func (h HyperVStubber) MountVolumesToVM(mc *vmconfigs.MachineConfig, _ bool) err
220220
)
221221
callbackFuncs := machine.CleanUp()
222222
defer callbackFuncs.CleanIfErr(&err)
223-
go callbackFuncs.CleanOnSignal()
223+
go callbackFuncs.CleanOnSignal(true)
224224

225225
if len(mc.Mounts) == 0 {
226226
return nil
@@ -568,7 +568,7 @@ func (h HyperVStubber) StartVM(mc *vmconfigs.MachineConfig) (func() error, func(
568568
callbackFuncs := machine.CleanUp()
569569
defer callbackFuncs.CleanIfErr(&err)
570570
callbackFuncs.Add(createErrorLogCallback(&err))
571-
go callbackFuncs.CleanOnSignal()
571+
go callbackFuncs.CleanOnSignal(true)
572572

573573
if mc.IsFirstBoot() {
574574
// this is added because if the machine does not start

0 commit comments

Comments
 (0)