Skip to content

Add support for executing a binary before saving and after restoring. #11697

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 29, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions pkg/sentry/control/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ package(
)

proto_library(
name = "control",
name = "control_api",
srcs = ["control.proto"],
visibility = ["//visibility:public"],
deps = [
Expand All @@ -27,14 +27,16 @@ go_library(
"pprof.go",
"proc.go",
"state.go",
"state_impl.go",
"usage.go",
],
visibility = [
"//:sandbox",
],
deps = [
":control_go_proto",
":control_api_go_proto",
"//pkg/abi/linux",
"//pkg/cleanup",
"//pkg/context",
"//pkg/eventchannel",
"//pkg/fd",
Expand All @@ -43,8 +45,12 @@ go_library(
"//pkg/metric",
"//pkg/metric:metric_go_proto",
"//pkg/prometheus",
"//pkg/sentry/devices/memdev",
"//pkg/sentry/devices/nvproxy",
"//pkg/sentry/fdcollector",
"//pkg/sentry/fdimport",
"//pkg/sentry/fsimpl/host",
"//pkg/sentry/fsimpl/pipefs",
"//pkg/sentry/fsimpl/user",
"//pkg/sentry/fsmetric",
"//pkg/sentry/kernel",
Expand All @@ -59,6 +65,7 @@ go_library(
"//pkg/sentry/watchdog",
"//pkg/sync",
"//pkg/tcpip/link/sniffer",
"//pkg/timing",
"//pkg/urpc",
"//pkg/usermem",
"@org_golang_google_protobuf//types/known/timestamppb",
Expand Down
2 changes: 1 addition & 1 deletion pkg/sentry/control/lifecycle.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ import (
"gvisor.dev/gvisor/pkg/eventchannel"
"gvisor.dev/gvisor/pkg/fd"
"gvisor.dev/gvisor/pkg/log"
pb "gvisor.dev/gvisor/pkg/sentry/control/control_go_proto"
pb "gvisor.dev/gvisor/pkg/sentry/control/control_api_go_proto"
"gvisor.dev/gvisor/pkg/sentry/fdimport"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/user"
"gvisor.dev/gvisor/pkg/sentry/kernel"
Expand Down
233 changes: 232 additions & 1 deletion pkg/sentry/control/state.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,41 @@ package control
import (
"errors"
"fmt"
"strings"
"time"

"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/cleanup"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/fdcollector"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/pipefs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/limits"
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
"gvisor.dev/gvisor/pkg/sentry/state"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sentry/watchdog"
"gvisor.dev/gvisor/pkg/timing"
"gvisor.dev/gvisor/pkg/urpc"
)

// SaveRestoreExecMode names the phase (save, restore or resume) for which the
// save/restore binary is being run. SaveRestoreExec passes the value to the
// binary through the environment variable named by saveRestoreExecEnvVar.
type SaveRestoreExecMode string

const (
// DefaultSaveRestoreExecTimeout is the default timeout for the save/restore
// binary.
// NOTE(review): nothing in this file applies it automatically; callers are
// presumably expected to pass it explicitly — confirm.
DefaultSaveRestoreExecTimeout = 10 * time.Minute
// SaveRestoreExecSave is the mode used when the save/restore binary is run
// from PreSave, i.e. before the kernel is saved.
SaveRestoreExecSave SaveRestoreExecMode = "save"
// SaveRestoreExecRestore is the mode used when the save/restore binary is
// run from PostRestore, i.e. after the kernel has been restored.
SaveRestoreExecRestore SaveRestoreExecMode = "restore"
// SaveRestoreExecResume is the mode used when the save/restore binary is
// run from PostResume, i.e. after the kernel has been resumed.
SaveRestoreExecResume SaveRestoreExecMode = "resume"

// saveRestoreExecEnvVar is the name of the environment variable that
// SaveRestoreExec appends to the binary's environment; its value is the
// SaveRestoreExecMode the binary is being run for.
saveRestoreExecEnvVar = "GVISOR_SAVE_RESTORE_AUTO_EXEC_MODE"
)

// ErrInvalidFiles is returned when the urpc call to Save does not carry an
// appropriate file payload (for example, when no output file is attached).
var ErrInvalidFiles = errors.New("exactly one file must be provided")
Expand Down Expand Up @@ -59,6 +86,18 @@ type SaveOpts struct {
// Resume indicates if the sandbox process should continue running
// after checkpointing.
Resume bool

// SaveRestoreExecArgv is the argv of the save/restore binary split by spaces.
// The first element is the path to the binary.
SaveRestoreExecArgv string

// SaveRestoreExecTimeout is the timeout for waiting for the save/restore
// binary.
SaveRestoreExecTimeout time.Duration

// SaveRestoreExecContainerID is the ID of the container that the
// save/restore binary executes in.
SaveRestoreExecContainerID string
}

// Save saves the running system.
Expand Down Expand Up @@ -97,5 +136,197 @@ func (s *State) Save(o *SaveOpts, _ *struct{}) error {
}
defer saveOpts.PagesFile.Close()
}
return saveOpts.Save(s.Kernel.SupervisorContext(), s.Kernel, s.Watchdog)
if err := PreSave(s.Kernel, o); err != nil {
return err
}
if err := saveOpts.Save(s.Kernel.SupervisorContext(), s.Kernel, s.Watchdog); err != nil {
return err
}
if o.Resume {
err = PostResume(s.Kernel, nil)
}
return err
}

// PreSave runs the pre-checkpoint hooks and is called before saving the
// kernel.
//
// If o.SaveRestoreExecArgv is non-empty it is split on whitespace into an
// argv, registered on k with ConfigureSaveRestoreExec, and the binary is then
// run in SaveRestoreExecSave mode. A failure in any of those steps is returned
// (wrapped) and preSaveImpl is not reached. Otherwise the result of
// preSaveImpl is returned.
func PreSave(k *kernel.Kernel, o *SaveOpts) error {
	if o.SaveRestoreExecArgv != "" {
		// strings.Fields rather than strings.Split(" "): leading, trailing or
		// repeated whitespace must not produce empty argv elements, and in
		// particular must not produce an empty argv[0].
		argv := strings.Fields(o.SaveRestoreExecArgv)
		if len(argv) == 0 {
			return fmt.Errorf("failed to configure save/restore binary: argv %q contains no arguments", o.SaveRestoreExecArgv)
		}
		if err := ConfigureSaveRestoreExec(k, argv, o.SaveRestoreExecTimeout, o.SaveRestoreExecContainerID); err != nil {
			return fmt.Errorf("failed to configure save/restore binary: %w", err)
		}
		if err := SaveRestoreExec(k, SaveRestoreExecSave); err != nil {
			return fmt.Errorf("failed to exec save/restore binary: %w", err)
		}
	}
	return preSaveImpl(k, o)
}

// PostResume runs the post-resume hooks: the save/restore binary in
// SaveRestoreExecResume mode, followed by postResumeImpl.
//
// It is a no-op (returning nil) while the kernel is still paused or once its
// task set is exiting.
//
// Precondition: The kernel should be running.
func PostResume(k *kernel.Kernel, timeline *timing.Timeline) error {
	switch {
	case k.IsPaused():
		// Still paused: a double-pause can happen with Docker, which calls
		// pause first and then the checkpoint command. The final resume
		// command will invoke the save/restore binary if necessary.
		return nil
	case k.TaskSet().IsExiting():
		// Happens when the kernel was saved with
		// control.SaveOpts.Resume=false. The save/restore binary cannot be
		// invoked on such a kernel.
		return nil
	}

	execErr := SaveRestoreExec(k, SaveRestoreExecResume)
	if execErr != nil {
		return fmt.Errorf("failed to wait for save/restore binary: %w", execErr)
	}
	return postResumeImpl(k, timeline)
}

// PostRestore runs the post-restore hooks: the save/restore binary in
// SaveRestoreExecRestore mode, followed by postRestoreImpl.
//
// It is a no-op (returning nil) in two situations:
//   - the kernel is still paused (double-pause can happen with Docker, which
//     calls pause first and then the checkpoint command); the final resume
//     command will invoke the save/restore binary if necessary;
//   - the task set is exiting, which can occur when the kernel is saved with
//     control.SaveOpts.Resume=false; the save/restore binary cannot be
//     invoked on such a kernel.
//
// Precondition: The kernel should be running.
func PostRestore(k *kernel.Kernel, timeline *timing.Timeline) error {
	// Short-circuit evaluation keeps the IsExiting check from running while
	// the kernel is paused.
	if k.IsPaused() || k.TaskSet().IsExiting() {
		return nil
	}

	execErr := SaveRestoreExec(k, SaveRestoreExecRestore)
	if execErr == nil {
		return postRestoreImpl(k, timeline)
	}
	return fmt.Errorf("failed to wait for save/restore binary: %w", execErr)
}

// SaveRestoreExec creates a new process that executes the save/restore
// binary specified by k.SaveRestoreExecConfig and waits for it to finish.
//
// The binary is started with the configured leader task's container ID and
// mount namespace, in the root PID namespace, with a fresh FD table whose
// stdout and stderr are connected (best effort) to a pipe so that its output
// can be logged. mode is handed to the binary through the
// GVISOR_SAVE_RESTORE_AUTO_EXEC_MODE environment variable.
//
// It returns nil without running anything when no config or an empty argv is
// set, or when the leader's mount namespace is no longer available. It
// returns an error when the config has no leader task, the exec fails, the
// binary exits with a non-zero status, or it does not exit within the
// configured timeout (in which case it is sent SIGKILL).
//
// Precondition: The kernel should be running.
func SaveRestoreExec(k *kernel.Kernel, mode SaveRestoreExecMode) error {
	cfg := k.SaveRestoreExecConfig
	if cfg == nil || len(cfg.Argv) == 0 {
		// Nothing configured (or no binary to run): no-op. The argv check also
		// protects the argv[0] accesses below.
		return nil
	}

	leader := cfg.LeaderTask
	if leader == nil {
		return fmt.Errorf("save/restore binary has no leader task configured")
	}
	argv := cfg.Argv
	timeout := cfg.Timeout
	sctx := k.SupervisorContext()
	contID := leader.ContainerID()
	mntns := leader.MountNamespace()
	if mntns == nil || !mntns.TryIncRef() {
		log.Warningf("PID %d in container %q has exited, skipping save/restore binary for it", leader.ThreadGroup().ID(), contID)
		return nil
	}
	// NOTE(review): TryIncRef above already acquired a reference and only one
	// DecRef is registered below, so this second reference looks leaked unless
	// ExecAsync consumes one. Confirm ExecAsync's ownership contract before
	// removing it.
	mntns.IncRef()
	root := mntns.Root(sctx)
	cu := cleanup.Make(func() {
		root.DecRef(sctx)
	})
	defer cu.Clean()
	ctx := vfs.WithRoot(sctx, root)
	cu.Add(func() {
		mntns.DecRef(ctx)
	})

	fdTable := k.NewFDTable()
	cu.Add(func() {
		fdTable.DecRef(sctx)
	})

	// Capturing the binary's output is best effort: failures are logged and
	// the binary still runs, in which case execOut stays nil.
	var execOut *fdcollector.Agent
	rfd, wfd, err := pipefs.NewConnectedPipeFDs(ctx, k.PipeMount(), 0 /* flags */)
	if err != nil {
		log.Warningf("Failed to create stdout/stderr pipe for %s: %v", argv[0], err)
	} else {
		if _, err := fdTable.NewFDAt(ctx, 1, wfd, kernel.FDFlags{}); err != nil {
			log.Warningf("Failed to make pipe stdout for %s: %v", argv[0], err)
		}
		if _, err := fdTable.NewFDAt(ctx, 2, wfd, kernel.FDFlags{}); err != nil {
			log.Warningf("Failed to make pipe stderr for %s: %v", argv[0], err)
		}
		wfd.DecRef(ctx)
		execOut = fdcollector.NewAgent(ctx, rfd, argv[0]) // transfers ownership of rfd
		cu.Add(execOut.Stop)
	}

	// TODO(b/419041893): Support running the save/restore binary with container
	// env vars without relying on the Saver().
	var envv []string
	if k.Saver() != nil {
		envv = k.Saver().SpecEnviron(contID)
	}
	// Cap the slice before appending so the append always copies and can never
	// write into spare capacity of a backing array owned by the Saver.
	envv = append(envv[:len(envv):len(envv)], fmt.Sprintf("%s=%s", saveRestoreExecEnvVar, mode))

	proc := Proc{
		Kernel: k,
	}
	execArgs := ExecArgs{
		Filename:       argv[0],
		Argv:           argv,
		Envv:           envv,
		ContainerID:    contID,
		MountNamespace: mntns,
		PIDNamespace:   k.RootPIDNamespace(),
		Limits:         limits.NewLimitSet(),
		FDTable:        fdTable,
	}
	tg, _, _, err := ExecAsync(&proc, &execArgs)
	if err != nil {
		return fmt.Errorf("failed to exec save/restore binary: %w", err)
	}

	// The waiter signals by closing the channel instead of sending on it, so
	// it can always terminate even when nobody is receiving any more (i.e.
	// after the timeout branch below has returned).
	waitC := make(chan struct{})
	go func() {
		defer close(waitC)
		tg.WaitExited()
	}()
	timer := time.NewTimer(timeout)
	defer timer.Stop()
	select {
	case <-waitC:
		if status := tg.ExitStatus(); status != 0 {
			return fmt.Errorf("%v exited with non-zero status %d", argv[0], status)
		}
	case <-timer.C:
		tg.SendSignal(&linux.SignalInfo{Signo: int32(linux.SIGKILL)})
		return fmt.Errorf("%s timed out after %v", argv[0], timeout)
	}
	// execOut is nil when the output pipe could not be created.
	if execOut != nil {
		log.Debugf("save/restore binary %s output: %s", argv[0], execOut.String())
	}
	return nil
}

// ConfigureSaveRestoreExec sets the configuration for the save/restore binary.
//
// argv is the binary's argument vector and timeout bounds how long
// SaveRestoreExec waits for it. If containerID is empty, the global init
// process will be used for the save/restore binary's leader task; otherwise
// the leader is the first parentless process in the root PID namespace whose
// container ID equals containerID.
//
// It returns an error if a configuration is already set on k, or if no
// process is found for containerID. k.SaveRestoreExecConfig is only assigned
// on success, so a failed call never leaves a half-initialized config (one
// without a leader task) behind.
func ConfigureSaveRestoreExec(k *kernel.Kernel, argv []string, timeout time.Duration, containerID string) error {
	if k.SaveRestoreExecConfig != nil {
		return fmt.Errorf("save/restore binary is already set")
	}

	var leader *kernel.Task
	if containerID == "" {
		leader = k.GlobalInit().Leader()
	} else {
		for _, tg := range k.RootPIDNamespace().ThreadGroups() {
			// Only consider processes with no parent (root of execution).
			if tg.Leader().Parent() != nil {
				continue
			}
			if tg.Leader().ContainerID() == containerID {
				leader = tg.Leader()
				break
			}
		}
		if leader == nil {
			return fmt.Errorf("failed to find process associated with container %s", containerID)
		}
	}

	k.SaveRestoreExecConfig = &kernel.SaveRestoreExecConfig{
		Argv:       argv,
		Timeout:    timeout,
		LeaderTask: leader,
	}
	return nil
}
35 changes: 35 additions & 0 deletions pkg/sentry/control/state_impl.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
// Copyright 2025 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build !false
// +build !false

package control

import (
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/timing"
)

// preSaveImpl is the hook that PreSave calls last, after any configured
// save/restore binary has been run. This implementation does nothing and
// always returns nil.
func preSaveImpl(k *kernel.Kernel, o *SaveOpts) error {
return nil
}

// postRestoreImpl is the hook that PostRestore calls last, after the
// save/restore binary has been run in restore mode. This implementation does
// nothing and always returns nil; the timeline argument is ignored.
func postRestoreImpl(k *kernel.Kernel, _ *timing.Timeline) error {
return nil
}

// postResumeImpl is the hook that PostResume calls last, after the
// save/restore binary has been run in resume mode. This implementation does
// nothing and always returns nil; the timeline argument is ignored.
func postResumeImpl(k *kernel.Kernel, _ *timing.Timeline) error {
return nil
}
21 changes: 21 additions & 0 deletions pkg/sentry/fdcollector/BUILD
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
load("//tools:defs.bzl", "go_library")

package(
default_applicable_licenses = ["//:license"],
licenses = ["notice"],
)

# Go library target for the fdcollector package, built from fdcollector.go
# and visible only to the //:sandbox package group.
go_library(
name = "fdcollector",
srcs = ["fdcollector.go"],
visibility = ["//:sandbox"],
deps = [
"//pkg/context",
"//pkg/errors/linuxerr",
"//pkg/log",
"//pkg/sentry/vfs",
"//pkg/sync",
"//pkg/usermem",
"//pkg/waiter",
],
)
Loading
Loading