diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD
index 79eed9f55b..94cd792f22 100644
--- a/pkg/sentry/control/BUILD
+++ b/pkg/sentry/control/BUILD
@@ -6,7 +6,7 @@ package(
 )
 
 proto_library(
-    name = "control",
+    name = "control_api",
     srcs = ["control.proto"],
     visibility = ["//visibility:public"],
     deps = [
@@ -27,14 +27,16 @@ go_library(
        "pprof.go",
        "proc.go",
        "state.go",
+       "state_impl.go",
        "usage.go",
    ],
    visibility = [
        "//:sandbox",
    ],
    deps = [
-       ":control_go_proto",
+       ":control_api_go_proto",
        "//pkg/abi/linux",
+       "//pkg/cleanup",
        "//pkg/context",
        "//pkg/eventchannel",
        "//pkg/fd",
@@ -43,8 +45,12 @@ go_library(
        "//pkg/metric",
        "//pkg/metric:metric_go_proto",
        "//pkg/prometheus",
+       "//pkg/sentry/devices/memdev",
+       "//pkg/sentry/devices/nvproxy",
+       "//pkg/sentry/fdcollector",
        "//pkg/sentry/fdimport",
        "//pkg/sentry/fsimpl/host",
+       "//pkg/sentry/fsimpl/pipefs",
        "//pkg/sentry/fsimpl/user",
        "//pkg/sentry/fsmetric",
        "//pkg/sentry/kernel",
@@ -59,6 +65,7 @@ go_library(
        "//pkg/sentry/watchdog",
        "//pkg/sync",
        "//pkg/tcpip/link/sniffer",
+       "//pkg/timing",
        "//pkg/urpc",
        "//pkg/usermem",
        "@org_golang_google_protobuf//types/known/timestamppb",
diff --git a/pkg/sentry/control/lifecycle.go b/pkg/sentry/control/lifecycle.go
index 8ccb3d9810..84d1e592da 100644
--- a/pkg/sentry/control/lifecycle.go
+++ b/pkg/sentry/control/lifecycle.go
@@ -24,7 +24,7 @@ import (
 	"gvisor.dev/gvisor/pkg/eventchannel"
 	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/log"
-	pb "gvisor.dev/gvisor/pkg/sentry/control/control_go_proto"
+	pb "gvisor.dev/gvisor/pkg/sentry/control/control_api_go_proto"
 	"gvisor.dev/gvisor/pkg/sentry/fdimport"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/user"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
diff --git a/pkg/sentry/control/state.go b/pkg/sentry/control/state.go
index b3b284f0cd..1be1c3a253 100644
--- a/pkg/sentry/control/state.go
+++ b/pkg/sentry/control/state.go
@@ -17,14 +17,41 @@ package control
 import (
 	"errors"
 	"fmt"
+	"strings"
+	"time"
 
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/cleanup"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/fdcollector"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/pipefs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/limits"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/state"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sentry/watchdog"
+	"gvisor.dev/gvisor/pkg/timing"
 	"gvisor.dev/gvisor/pkg/urpc"
 )
 
+// SaveRestoreExecMode is the mode for the save/restore binary.
+type SaveRestoreExecMode string
+
+const (
+	// DefaultSaveRestoreExecTimeout is the default timeout for the save/restore
+	// binary.
+	DefaultSaveRestoreExecTimeout = 10 * time.Minute
+	// SaveRestoreExecSave is the save mode for the save/restore binary.
+	SaveRestoreExecSave SaveRestoreExecMode = "save"
+	// SaveRestoreExecRestore is the restore mode for the save/restore binary.
+	SaveRestoreExecRestore SaveRestoreExecMode = "restore"
+	// SaveRestoreExecResume is the resume mode for the save/restore binary.
+	SaveRestoreExecResume SaveRestoreExecMode = "resume"
+
+	saveRestoreExecEnvVar = "GVISOR_SAVE_RESTORE_AUTO_EXEC_MODE"
+)
+
 // ErrInvalidFiles is returned when the urpc call to Save does not include an
 // appropriate file payload (e.g. there is no output file!).
 var ErrInvalidFiles = errors.New("exactly one file must be provided")
@@ -59,6 +86,18 @@ type SaveOpts struct {
 	// Resume indicates if the sandbox process should continue running
 	// after checkpointing.
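+	// When Resume is true and a save/restore binary is configured, Save
+	// additionally invokes the binary in "resume" mode (via PostResume)
+	// after a successful save.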
 	Resume bool
+
+	// SaveRestoreExecArgv is the argv of the save/restore binary split by spaces.
+	// The first element is the path to the binary.
+	SaveRestoreExecArgv string
+
+	// SaveRestoreExecTimeout is the timeout for waiting for the save/restore
+	// binary.
+	SaveRestoreExecTimeout time.Duration
+
+	// SaveRestoreExecContainerID is the ID of the container that the
+	// save/restore binary executes in.
+	SaveRestoreExecContainerID string
 }
 
 // Save saves the running system.
@@ -97,5 +136,197 @@ func (s *State) Save(o *SaveOpts, _ *struct{}) error {
 		}
 		defer saveOpts.PagesFile.Close()
 	}
-	return saveOpts.Save(s.Kernel.SupervisorContext(), s.Kernel, s.Watchdog)
+	if err := PreSave(s.Kernel, o); err != nil {
+		return err
+	}
+	if err := saveOpts.Save(s.Kernel.SupervisorContext(), s.Kernel, s.Watchdog); err != nil {
+		return err
+	}
+	if o.Resume {
+		err = PostResume(s.Kernel, nil)
+	}
+	return err
+}
+
+// PreSave is called before saving the kernel.
+func PreSave(k *kernel.Kernel, o *SaveOpts) error {
+	if o.SaveRestoreExecArgv != "" {
+		saveRestoreExecArgv := strings.Split(o.SaveRestoreExecArgv, " ")
+		if err := ConfigureSaveRestoreExec(k, saveRestoreExecArgv, o.SaveRestoreExecTimeout, o.SaveRestoreExecContainerID); err != nil {
+			return fmt.Errorf("failed to configure save/restore binary: %w", err)
+		}
+		if err := SaveRestoreExec(k, SaveRestoreExecSave); err != nil {
+			return fmt.Errorf("failed to exec save/restore binary: %w", err)
+		}
+	}
+	return preSaveImpl(k, o)
+}
+
+// PostResume is called after resuming the kernel.
+//
+// Precondition: The kernel should be running.
+func PostResume(k *kernel.Kernel, timeline *timing.Timeline) error {
+	if k.IsPaused() {
+		// The kernel is still paused (double-pause can happen with Docker, which
+		// calls pause first and then the checkpoint command). The final resume
+		// command will invoke the save/restore binary if necessary.
+		return nil
+	}
+	if k.TaskSet().IsExiting() {
+		// This can occur when the kernel is saved with control.SaveOpts.Resume=false.
+		// We cannot invoke the save/restore binary on such a kernel.
+		return nil
+	}
+	if err := SaveRestoreExec(k, SaveRestoreExecResume); err != nil {
+		return fmt.Errorf("failed to wait for save/restore binary: %w", err)
+	}
+	return postResumeImpl(k, timeline)
+}
+
+// PostRestore is called after restoring the kernel.
+//
+// Precondition: The kernel should be running.
+func PostRestore(k *kernel.Kernel, timeline *timing.Timeline) error {
+	if k.IsPaused() {
+		// The kernel is still paused (double-pause can happen with Docker, which
+		// calls pause first and then the checkpoint command). The final resume
+		// command will invoke the save/restore binary if necessary.
+		return nil
+	}
+	if k.TaskSet().IsExiting() {
+		// This can occur when the kernel is saved with control.SaveOpts.Resume=false.
+		// We cannot invoke the save/restore binary on such a kernel.
+		return nil
+	}
+	if err := SaveRestoreExec(k, SaveRestoreExecRestore); err != nil {
+		return fmt.Errorf("failed to wait for save/restore binary: %w", err)
+	}
+	return postRestoreImpl(k, timeline)
+}
+
+// SaveRestoreExec creates a new process that executes the save/restore
+// binary specified by k.SaveRestoreExecConfig and waits for it to finish.
+//
+// Precondition: The kernel should be running; k.SaveRestoreExecConfig should
+// be set up with an argv, otherwise this function is a no-op.
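+//
+// The spawned binary can tell the three invocations apart by reading the
+// GVISOR_SAVE_RESTORE_AUTO_EXEC_MODE environment variable. A minimal,
+// hypothetical binary might look like:
+//
+//	func main() {
+//		switch os.Getenv("GVISOR_SAVE_RESTORE_AUTO_EXEC_MODE") {
+//		case "save":
+//			// Quiesce the application before checkpointing.
+//		case "restore", "resume":
+//			// Re-establish external state (connections, leases, GPUs).
+//		}
+//	}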
+func SaveRestoreExec(k *kernel.Kernel, mode SaveRestoreExecMode) error {
+	if k.SaveRestoreExecConfig == nil {
+		return nil
+	}
+
+	leader := k.SaveRestoreExecConfig.LeaderTask
+	argv := k.SaveRestoreExecConfig.Argv
+	timeout := k.SaveRestoreExecConfig.Timeout
+	sctx := k.SupervisorContext()
+	contID := leader.ContainerID()
+	mntns := leader.MountNamespace()
+	if mntns == nil || !mntns.TryIncRef() {
+		log.Warningf("PID %d in container %q has exited, skipping save/restore binary execution for it", leader.ThreadGroup().ID(), contID)
+		return nil
+	}
+	mntns.IncRef()
+	root := mntns.Root(sctx)
+	cu := cleanup.Make(func() {
+		root.DecRef(sctx)
+	})
+	defer cu.Clean()
+	ctx := vfs.WithRoot(sctx, root)
+	cu.Add(func() {
+		mntns.DecRef(ctx)
+	})
+
+	fdTable := k.NewFDTable()
+	cu.Add(func() {
+		fdTable.DecRef(sctx)
+	})
+	var execOut *fdcollector.Agent
+	rfd, wfd, err := pipefs.NewConnectedPipeFDs(ctx, k.PipeMount(), 0 /* flags */)
+	if err != nil {
+		log.Warningf("Failed to create stdout/stderr pipe for %s: %v", argv[0], err)
+	} else {
+		if _, err := fdTable.NewFDAt(ctx, 1, wfd, kernel.FDFlags{}); err != nil {
+			log.Warningf("Failed to make pipe stdout for %s: %v", argv[0], err)
+		}
+		if _, err := fdTable.NewFDAt(ctx, 2, wfd, kernel.FDFlags{}); err != nil {
+			log.Warningf("Failed to make pipe stderr for %s: %v", argv[0], err)
+		}
+		wfd.DecRef(ctx)
+		execOut = fdcollector.NewAgent(ctx, rfd, argv[0]) // transfers ownership of rfd
+		cu.Add(execOut.Stop)
+	}
+	// TODO(b/419041893): Support running the save/restore binary with container
+	// env vars without relying on the Saver().
+	var envv []string
+	if k.Saver() != nil {
+		envv = k.Saver().SpecEnviron(contID)
+	}
+
+	proc := Proc{
+		Kernel: k,
+	}
+	execArgs := ExecArgs{
+		Filename:       argv[0],
+		Argv:           argv,
+		Envv:           append(envv, fmt.Sprintf("%s=%s", saveRestoreExecEnvVar, mode)),
+		ContainerID:    contID,
+		MountNamespace: mntns,
+		PIDNamespace:   k.RootPIDNamespace(),
+		Limits:         limits.NewLimitSet(),
+		FDTable:        fdTable,
+	}
+	tg, _, _, err := ExecAsync(&proc, &execArgs)
+	if err != nil {
+		return fmt.Errorf("failed to exec save/restore binary: %w", err)
+	}
+
+	// The channel is buffered so that the waiter goroutine can exit even if
+	// the timeout below fires first.
+	waitC := make(chan struct{}, 1)
+	go func() {
+		tg.WaitExited()
+		waitC <- struct{}{}
+	}()
+	select {
+	case <-waitC:
+		if tg.ExitStatus() != 0 {
+			return fmt.Errorf("%v exited with non-zero status %d", argv[0], tg.ExitStatus())
+		}
+	case <-time.After(timeout):
+		tg.SendSignal(&linux.SignalInfo{Signo: int32(linux.SIGKILL)})
+		return fmt.Errorf("%s timed out after %v", argv[0], timeout)
+	}
+	if execOut != nil {
+		log.Debugf("save/restore binary %s output: %s", argv[0], execOut.String())
+	}
+	return nil
+}
+
+// ConfigureSaveRestoreExec sets the configuration for the save/restore binary.
+// If containerID is empty, the global init process will be used for the
+// save/restore binary's leader task.
+func ConfigureSaveRestoreExec(k *kernel.Kernel, argv []string, timeout time.Duration, containerID string) error {
+	if k.SaveRestoreExecConfig != nil {
+		return fmt.Errorf("save/restore binary is already configured")
+	}
+	k.SaveRestoreExecConfig = &kernel.SaveRestoreExecConfig{
+		Argv:    argv,
+		Timeout: timeout,
+	}
+
+	var leader *kernel.Task
+	if containerID != "" {
+		for _, tg := range k.RootPIDNamespace().ThreadGroups() {
+			// Find all processes with no parent (root of execution).
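+			// The chosen leader also determines the mount namespace and the
+			// container environment that SaveRestoreExec uses when spawning
+			// the binary.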
+			if tg.Leader().Parent() == nil {
+				cid := tg.Leader().ContainerID()
+				if cid == containerID {
+					leader = tg.Leader()
+					break
+				}
+			}
+		}
+		if leader == nil {
+			return fmt.Errorf("failed to find process associated with container %s", containerID)
+		}
+	} else {
+		leader = k.GlobalInit().Leader()
+	}
+	k.SaveRestoreExecConfig.LeaderTask = leader
+	return nil
 }
diff --git a/pkg/sentry/control/state_impl.go b/pkg/sentry/control/state_impl.go
new file mode 100644
index 0000000000..346c2248f4
--- /dev/null
+++ b/pkg/sentry/control/state_impl.go
@@ -0,0 +1,35 @@
+// Copyright 2025 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build !false
+// +build !false
+
+package control
+
+import (
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/timing"
+)
+
+func preSaveImpl(k *kernel.Kernel, o *SaveOpts) error {
+	return nil
+}
+
+func postRestoreImpl(k *kernel.Kernel, _ *timing.Timeline) error {
+	return nil
+}
+
+func postResumeImpl(k *kernel.Kernel, _ *timing.Timeline) error {
+	return nil
+}
diff --git a/pkg/sentry/fdcollector/BUILD b/pkg/sentry/fdcollector/BUILD
new file mode 100644
index 0000000000..5d2216eb82
--- /dev/null
+++ b/pkg/sentry/fdcollector/BUILD
@@ -0,0 +1,21 @@
+load("//tools:defs.bzl", "go_library")
+
+package(
+    default_applicable_licenses = ["//:license"],
+    licenses = ["notice"],
+)
+
+go_library(
+    name = "fdcollector",
+    srcs = ["fdcollector.go"],
+    visibility = ["//:sandbox"],
+    deps = [
+        "//pkg/context",
+        "//pkg/errors/linuxerr",
+        "//pkg/log",
+        "//pkg/sentry/vfs",
+        "//pkg/sync",
+        "//pkg/usermem",
+        "//pkg/waiter",
+    ],
+)
diff --git a/pkg/sentry/fdcollector/fdcollector.go b/pkg/sentry/fdcollector/fdcollector.go
new file mode 100644
index 0000000000..ae34403865
--- /dev/null
+++ b/pkg/sentry/fdcollector/fdcollector.go
@@ -0,0 +1,109 @@
+// Copyright 2022 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package fdcollector provides a goroutine that reads from a
+// vfs.FileDescription (which may block) into a bytes.Buffer.
+package fdcollector
+
+import (
+	"bytes"
+	"io"
+
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/errors/linuxerr"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// Agent represents a goroutine that reads from a vfs.FileDescription
+// (which may block) into a bytes.Buffer.
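+//
+// A typical (hypothetical) use, mirroring the one in pkg/sentry/control,
+// pipes a child process's stdout/stderr into an Agent:
+//
+//	agent := fdcollector.NewAgent(ctx, rfd, "my-binary") // takes ownership of rfd
+//	// ... wait for the writer to finish ...
+//	agent.Stop()
+//	log.Debugf("output: %s", agent.String())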
+type Agent struct {
+	ctx   context.Context
+	rfd   *vfs.FileDescription
+	desc  string
+	stopC chan struct{}
+	mu    sync.Mutex
+	buf   bytes.Buffer
+}
+
+// NewAgent creates a new Agent and starts the goroutine that reads from rfd.
+// It takes ownership of rfd.
+func NewAgent(ctx context.Context, rfd *vfs.FileDescription, desc string) *Agent {
+	c := &Agent{
+		ctx:   ctx,
+		rfd:   rfd,
+		desc:  desc,
+		stopC: make(chan struct{}),
+	}
+	go c.run()
+	return c
+}
+
+// run reads from the vfs.FileDescription into c.buf. It returns when the
+// vfs.FileDescription reaches EOF, an error occurs, or Stop is called.
+func (c *Agent) run() {
+	defer c.rfd.DecRef(c.ctx)
+
+	var buf [4096]byte // arbitrary size
+	dst := usermem.BytesIOSequence(buf[:])
+	e, ch := waiter.NewChannelEntry(waiter.EventIn)
+	if err := c.rfd.EventRegister(&e); err != nil {
+		log.Warningf("Error registering for events from %s: %v", c.desc, err)
+		return
+	}
+	defer c.rfd.EventUnregister(&e)
+	for {
+		n, err := c.rfd.Read(c.ctx, dst, vfs.ReadOptions{})
+		if n != 0 {
+			c.mu.Lock()
+			c.buf.Write(buf[:n])
+			c.mu.Unlock()
+		}
+		if err != nil {
+			switch err {
+			case linuxerr.ErrWouldBlock:
+				select {
+				case <-ch:
+					continue
+				case <-c.stopC:
+					return
+				}
+			case io.EOF:
+				log.Debugf("Finished reading output from %s", c.desc)
+				return
+			default:
+				log.Warningf("Error reading output from %s: %v", c.desc, err)
+				return
+			}
+		}
+	}
+}
+
+// Stop stops the goroutine that reads from the vfs.FileDescription.
+func (c *Agent) Stop() {
+	close(c.stopC)
+}
+
+// String returns the output collected so far.
+func (c *Agent) String() string {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	// Note that the conversion to string is significant since it copies
+	// c.buf.Bytes(), which may be modified after c.mu.Unlock(). If you change
+	// this function to return []byte for some reason, c.buf.Bytes() needs to
+	// be cloned instead.
+	return string(c.buf.Bytes())
+}
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index f2d2359128..6e4383f403 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -340,6 +340,8 @@ go_library(
        "//pkg/safemem",
        "//pkg/secio",
        "//pkg/sentry/arch",
+       "//pkg/sentry/devices/nvproxy/nvconf",
+       "//pkg/sentry/fdcollector",
        "//pkg/sentry/fsimpl/kernfs",
        "//pkg/sentry/fsimpl/lock",
        "//pkg/sentry/fsimpl/mqfs",
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 7e936f46c0..83c63eed9f 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -53,6 +53,7 @@ import (
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/devices/nvproxy/nvconf"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/nsfs"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/pipefs"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs"
@@ -123,6 +124,24 @@ type CgroupMount struct {
 	Mount *vfs.Mount
 }
 
+// SaveRestoreExecConfig contains the configuration for the save/restore binary.
+//
+// +stateify savable
+type SaveRestoreExecConfig struct {
+	// Argv is the argv to the save/restore binary. The binary path is expected to
+	// be argv[0]. The specified binary is executed with an environment variable
+	// (GVISOR_SAVE_RESTORE_AUTO_EXEC_MODE) set to "save" before the kernel is
+	// saved, "restore" after the kernel is restored and restarted, and "resume"
+	// after the kernel is saved and resumed.
+	Argv []string
+	// Timeout is the timeout for the save/restore binary. If the binary fails to
+	// exit within this timeout, the save/restore operation will fail.
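+	// On expiry, SaveRestoreExec sends SIGKILL to the binary's thread group
+	// and returns an error.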
+	Timeout time.Duration
+	// LeaderTask is the task in the kernel that the save/restore binary will run
+	// under.
+	LeaderTask *Task
+}
+
 // Kernel represents an emulated Linux kernel. It must be initialized by calling
 // Init() or LoadFrom().
 //
@@ -369,6 +388,14 @@ type Kernel struct {
 
 	// UnixSocketOpts stores configuration options for management of unix sockets.
 	UnixSocketOpts transport.UnixSocketOpts
+
+	// SaveRestoreExecConfig stores configuration options for the save/restore
+	// exec binary.
+	SaveRestoreExecConfig *SaveRestoreExecConfig
+
+	// NvidiaDriverVersion is the NVIDIA driver version configured for this
+	// sandbox.
+	NvidiaDriverVersion nvconf.DriverVersion
 }
 
 // InitKernelArgs holds arguments to Init.
diff --git a/pkg/state/statefile/statefile.go b/pkg/state/statefile/statefile.go
index 3182c51220..109044b6b3 100644
--- a/pkg/state/statefile/statefile.go
+++ b/pkg/state/statefile/statefile.go
@@ -115,6 +115,18 @@ type Options struct {
 	// Resume indicates if the sandbox process should continue running
 	// after checkpointing.
 	Resume bool
+
+	// SaveRestoreExecArgv is the argv of the save/restore binary split by spaces.
+	// The first element is the path to the binary.
+	SaveRestoreExecArgv string
+
+	// SaveRestoreExecTimeout is the timeout for waiting for the save/restore
+	// binary.
+	SaveRestoreExecTimeout time.Duration
+
+	// SaveRestoreExecContainerID is the ID of the container that the
+	// save/restore binary executes in.
+	SaveRestoreExecContainerID string
 }
 
 // WriteToMetadata save options to the metadata storage. Method returns the
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index 1558f2a76c..8383709f2e 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -725,7 +725,7 @@ func (cm *containerManager) Pause(_, _ *struct{}) error {
 
 // Resume resumes all tasks.
 func (cm *containerManager) Resume(_, _ *struct{}) error {
 	cm.l.k.Unpause()
-	return postResumeImpl(cm.l, nil)
+	return control.PostResume(cm.l.k, nil)
 }
 
 // Wait waits for the init process in the given container.
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 2993ba7b2e..ab2fcc5ef8 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -149,10 +149,6 @@ type containerInfo struct {
 
 	// nvidiaUVMDevMajor is the device major number used for nvidia-uvm.
 	nvidiaUVMDevMajor uint32
-
-	// nvidiaDriverVersion is the NVIDIA driver ABI version to use for
-	// communicating with NVIDIA devices on the host.
-	nvidiaDriverVersion nvconf.DriverVersion
 }
 
 type loaderState int
@@ -472,12 +468,11 @@ func New(args Args) (*Loader, error) {
 	containerName := l.registerContainer(args.Spec, args.ID)
 	l.root = containerInfo{
-		cid:                 args.ID,
-		containerName:       containerName,
-		conf:                args.Conf,
-		spec:                args.Spec,
-		goferMountConfs:     args.GoferMountConfs,
-		nvidiaDriverVersion: args.NvidiaDriverVersion,
+		cid:             args.ID,
+		containerName:   containerName,
+		conf:            args.Conf,
+		spec:            args.Spec,
+		goferMountConfs: args.GoferMountConfs,
 	}
 
 	// Make host FDs stable between invocations. Host FDs must map to the exact
@@ -532,7 +527,10 @@ func New(args Args) (*Loader, error) {
 			log.Warningf("Application cudaMallocManaged() is flaky on -platform=kvm, see gvisor.dev/docs/user_guide/gpu/#platforms")
 		}
 	}
-	l.k = &kernel.Kernel{Platform: p}
+	l.k = &kernel.Kernel{
+		Platform:            p,
+		NvidiaDriverVersion: args.NvidiaDriverVersion,
+	}
 
 	// Create memory file.
 	mf, err := createMemoryFile(args.Conf.AppHugePages, args.HostTHP)
@@ -1099,16 +1097,15 @@ func (l *Loader) startSubcontainer(spec *specs.Spec, conf *config.Config, cid st
 	containerName := l.registerContainerLocked(spec, cid)
 	l.k.RegisterContainerName(cid, containerName)
 	info := &containerInfo{
-		cid:                 cid,
-		containerName:       containerName,
-		conf:                conf,
-		spec:                spec,
-		goferFDs:            goferFDs,
-		devGoferFD:          devGoferFD,
-		goferFilestoreFDs:   goferFilestoreFDs,
-		goferMountConfs:     goferMountConfs,
-		nvidiaUVMDevMajor:   l.root.nvidiaUVMDevMajor,
-		nvidiaDriverVersion: l.root.nvidiaDriverVersion,
+		cid:               cid,
+		containerName:     containerName,
+		conf:              conf,
+		spec:              spec,
+		goferFDs:          goferFDs,
+		devGoferFD:        devGoferFD,
+		goferFilestoreFDs: goferFilestoreFDs,
+		goferMountConfs:   goferMountConfs,
+		nvidiaUVMDevMajor: l.root.nvidiaUVMDevMajor,
 	}
 	var err error
 	info.procArgs, err = createProcessArgs(cid, spec, conf, creds, l.k, pidns)
diff --git a/runsc/boot/restore.go b/runsc/boot/restore.go
index e052fc24c4..7fbcbe76b7 100644
--- a/runsc/boot/restore.go
+++ b/runsc/boot/restore.go
@@ -29,6 +29,7 @@ import (
 	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/control"
+	"gvisor.dev/gvisor/pkg/sentry/devices/nvproxy/nvconf"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/host"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -208,7 +209,9 @@ func (r *restorer) restore(l *Loader) error {
 	l.watchdog.Start()
 
 	// Release the kernel and replace it with a new one that will be restored into.
+	var oldNvidiaDriverVersion nvconf.DriverVersion
 	if l.k != nil {
+		oldNvidiaDriverVersion = l.k.NvidiaDriverVersion
 		l.k.Release()
 	}
 	l.k = &kernel.Kernel{
@@ -277,6 +280,9 @@ func (r *restorer) restore(l *Loader) error {
 		return fmt.Errorf("failed to load kernel: %w", err)
 	}
 	r.timer.Reached("kernel loaded")
+	if oldNvidiaDriverVersion.Major() > 0 && !l.k.NvidiaDriverVersion.Equals(oldNvidiaDriverVersion) {
+		return fmt.Errorf("nvidia driver version changed during restore: was %v, now %v", oldNvidiaDriverVersion, l.k.NvidiaDriverVersion)
+	}
 
 	if r.asyncMFLoader != nil {
 		if r.background {
@@ -362,7 +368,7 @@ func (r *restorer) restore(l *Loader) error {
 	go func() {
 		defer postRestoreThread.End()
 		postRestoreThread.Reached("scheduled")
-		if err := postRestoreImpl(l, postRestoreThread); err != nil {
+		if err := control.PostRestore(l.k, postRestoreThread); err != nil {
 			log.Warningf("Killing the sandbox after post restore work failed: %v", err)
 			l.k.Kill(linux.WaitStatusTerminationSignal(linux.SIGKILL))
 			return
@@ -428,10 +434,6 @@ func (l *Loader) save(o *control.SaveOpts) (err error) {
 	}
 	o.Metadata[ContainerSpecsKey] = specsStr
 
-	if err := preSaveImpl(l, o); err != nil {
-		return err
-	}
-
 	state := control.State{
 		Kernel:   l.k,
 		Watchdog: l.watchdog,
 	}
 	if err := state.Save(o, nil); err != nil {
 		return err
 	}
-
-	if o.Resume {
-		if err := postResumeImpl(l, nil); err != nil {
-			return err
-		}
-	}
 	return nil
 }
diff --git a/runsc/boot/restore_impl.go b/runsc/boot/restore_impl.go
index e18fe822c4..0f552e8420 100644
--- a/runsc/boot/restore_impl.go
+++ b/runsc/boot/restore_impl.go
@@ -19,26 +19,10 @@ package boot
 
 import (
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 
-	"gvisor.dev/gvisor/pkg/sentry/control"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/proc"
-	"gvisor.dev/gvisor/pkg/timing"
 	"gvisor.dev/gvisor/runsc/config"
 )
 
-func preSaveImpl(*Loader, *control.SaveOpts) error {
-	return nil
-}
-
-// Precondition: The kernel should be running.
-func postRestoreImpl(*Loader, *timing.Timeline) error {
-	return nil
-}
-
-// Precondition: The kernel should be running.
-func postResumeImpl(*Loader, *timing.Timeline) error {
-	return nil
-}
-
 func newProcInternalData(conf *config.Config, _ *specs.Spec) *proc.InternalData {
 	return &proc.InternalData{
 		GVisorMarkerFile: conf.GVisorMarkerFile,
diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go
index cf7aa06bee..727597ff39 100644
--- a/runsc/boot/vfs.go
+++ b/runsc/boot/vfs.go
@@ -37,6 +37,7 @@ import (
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/devices/memdev"
 	"gvisor.dev/gvisor/pkg/sentry/devices/nvproxy"
+	"gvisor.dev/gvisor/pkg/sentry/devices/nvproxy/nvconf"
 	"gvisor.dev/gvisor/pkg/sentry/devices/tpuproxy"
 	"gvisor.dev/gvisor/pkg/sentry/devices/tpuproxy/vfio"
 	"gvisor.dev/gvisor/pkg/sentry/devices/ttydev"
@@ -154,7 +155,7 @@ func registerFilesystems(k *kernel.Kernel, info *containerInfo) error {
 		return fmt.Errorf("registering fusedev: %w", err)
 	}
 
-	if err := nvproxyRegisterDevices(info, vfsObj); err != nil {
+	if err := nvproxyRegisterDevices(info, vfsObj, k.NvidiaDriverVersion); err != nil {
 		return err
 	}
 
@@ -1476,7 +1477,7 @@ func createDeviceFile(ctx context.Context, creds *auth.Credentials, info *contai
 	return dev.CreateDeviceFile(ctx, vfsObj, creds, root, devSpec.Path, major, minor, mode, devSpec.UID, devSpec.GID)
 }
 
-func nvproxyRegisterDevices(info *containerInfo, vfsObj *vfs.VirtualFilesystem) error {
+func nvproxyRegisterDevices(info *containerInfo, vfsObj *vfs.VirtualFilesystem, nvidiaDriverVersion nvconf.DriverVersion) error {
 	if !specutils.NVProxyEnabled(info.spec, info.conf) {
 		return nil
 	}
@@ -1488,7 +1489,7 @@ func nvproxyRegisterDevices(info *containerInfo, vfsObj *vfs.VirtualFilesystem)
 	if err != nil {
 		return fmt.Errorf("reserving device major number for nvidia-uvm: %w", err)
 	}
-	if err := nvproxy.Register(vfsObj, info.nvidiaDriverVersion, driverCaps, uvmDevMajor, true /* useDevGofer */); err != nil {
+	if err := nvproxy.Register(vfsObj, nvidiaDriverVersion, driverCaps, uvmDevMajor, true /* useDevGofer */); err != nil {
 		return fmt.Errorf("registering nvproxy driver: %w", err)
 	}
 	info.nvidiaUVMDevMajor = uvmDevMajor
diff --git a/runsc/cmd/checkpoint.go b/runsc/cmd/checkpoint.go
index a865bd72f4..c7460c6d2b 100644
--- a/runsc/cmd/checkpoint.go
+++ b/runsc/cmd/checkpoint.go
@@ -18,8 +18,10 @@ import (
 	"context"
 	"fmt"
 	"os"
+	"time"
 
 	"github.com/google/subcommands"
+	"gvisor.dev/gvisor/pkg/sentry/control"
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/state/statefile"
 	"gvisor.dev/gvisor/runsc/cmd/util"
@@ -34,6 +36,8 @@ type Checkpoint struct {
 	leaveRunning              bool
 	compression               CheckpointCompression
 	excludeCommittedZeroPages bool
+	saveRestoreExecArgv       string
+	saveRestoreExecTimeout    time.Duration
 
 	// direct indicates whether O_DIRECT should be used for writing the
 	// checkpoint pages file. It bypasses the kernel page cache. It is beneficial
@@ -65,6 +69,8 @@ func (c *Checkpoint) SetFlags(f *flag.FlagSet) {
 	f.Var(newCheckpointCompressionValue(statefile.CompressionLevelDefault, &c.compression), "compression", "compress checkpoint image on disk. Values: none|flate-best-speed.")
 	f.BoolVar(&c.excludeCommittedZeroPages, "exclude-committed-zero-pages", false, "exclude committed zero-filled pages from checkpoint")
 	f.BoolVar(&c.direct, "direct", false, "use O_DIRECT for writing checkpoint pages file")
+	f.StringVar(&c.saveRestoreExecArgv, "save-restore-exec-argv", "", "argv (split by spaces) for a save/restore binary that's automatically executed in the sandbox before saving and after restoring. If the execution fails, the save/restore process will fail.")
+	f.DurationVar(&c.saveRestoreExecTimeout, "save-restore-exec-timeout", control.DefaultSaveRestoreExecTimeout, "timeout for the binary pointed to by save-restore-exec-argv.")
 
 	// Unimplemented flags necessary for compatibility with docker.
 	var wp string
@@ -95,7 +101,10 @@ func (c *Checkpoint) Execute(_ context.Context, f *flag.FlagSet, args ...any) su
 	}
 
 	sOpts := statefile.Options{
-		Compression: c.compression.Level(),
+		Compression:                c.compression.Level(),
+		SaveRestoreExecArgv:        c.saveRestoreExecArgv,
+		SaveRestoreExecTimeout:     c.saveRestoreExecTimeout,
+		SaveRestoreExecContainerID: id,
 	}
 	mfOpts := pgalloc.SaveOpts{
 		ExcludeCommittedZeroPages: c.excludeCommittedZeroPages,
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index 6a4db8b596..a9d525e4d3 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -1469,8 +1469,11 @@ func (s *Sandbox) Checkpoint(cid string, imagePath string, direct bool, sfOpts s
 		FilePayload: urpc.FilePayload{
 			Files: files,
 		},
-		HavePagesFile: len(files) > 1,
-		Resume:        sfOpts.Resume,
+		HavePagesFile:              len(files) > 1,
+		Resume:                     sfOpts.Resume,
+		SaveRestoreExecArgv:        sfOpts.SaveRestoreExecArgv,
+		SaveRestoreExecTimeout:     sfOpts.SaveRestoreExecTimeout,
+		SaveRestoreExecContainerID: sfOpts.SaveRestoreExecContainerID,
 	}
 
 	if err := s.call(boot.ContMgrCheckpoint, &opt, nil); err != nil {