diff --git a/libcontainer/mount_linux.go b/libcontainer/mount_linux.go index 06df6cbbf60..a537ee965e5 100644 --- a/libcontainer/mount_linux.go +++ b/libcontainer/mount_linux.go @@ -8,9 +8,11 @@ import ( "strconv" "strings" + securejoin "github.com/cyphar/filepath-securejoin" "github.com/sirupsen/logrus" "golang.org/x/sys/unix" + devices "github.com/opencontainers/cgroups/devices/config" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/internal/userns" "github.com/opencontainers/runc/libcontainer/utils" @@ -342,3 +344,29 @@ func mountFd(nsHandles *userns.Handles, m *configs.Mount) (_ *mountSource, retEr file: mountFile, }, nil } + +// Use mknod to create devices in the initial user namespace to pass devices with 0660 permissions +func usernsMknod(rootfs string, node *devices.Device) (*mountSource, error) { + var err error + + destPath, err := securejoin.SecureJoin(rootfs, node.Path) + if err != nil { + return nil, err + } + + err = createDeviceNode(rootfs, node, false) + if err != nil { + return nil, err + } + + mountFile, err := os.OpenFile(destPath, unix.O_PATH|unix.O_CLOEXEC, 0) + if err != nil { + return nil, err + } + + sourceType := mountSourcePlain + return &mountSource{ + Type: sourceType, + file: mountFile, + }, nil +} diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index 439c7c7341d..01b126d31c4 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -18,6 +18,7 @@ import ( "syscall" "time" + securejoin "github.com/cyphar/filepath-securejoin" "github.com/sirupsen/logrus" "golang.org/x/sys/unix" @@ -25,6 +26,7 @@ import ( "github.com/opencontainers/cgroups" "github.com/opencontainers/cgroups/fs2" + "github.com/opencontainers/runc/internal/sys" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/intelrdt" "github.com/opencontainers/runc/libcontainer/internal/userns" @@ -729,7 +731,45 @@ func (p *initProcess) goCreateMountSources(ctx context.Context) (mountSourceRequ if !ok { break loop } - src, err := mountFd(nsHandles, m) + var src *mountSource + var err error + if m.Device == "usernsMknod" { + // Create device in initial user ns + for _, device := range p.config.Config.Devices { + if device.Path == m.Source { + src, err = usernsMknod(p.config.Config.Rootfs, device) + break + } + } + if src == nil && err == nil { + err = fmt.Errorf("can not find device node") + } + } else if m.Device != "tmpfs" { + src, err = mountFd(nsHandles, m) + } else { + // Mount tmpfs in initial user ns and chown it + entry := mountEntry{Mount: m} + mountConfig := &mountConfig{ + root: p.config.Config.Rootfs, + label: p.config.Config.MountLabel, + rootlessCgroups: p.config.Config.RootlessCgroups, + cgroupns: p.config.Config.Namespaces.Contains(configs.NEWCGROUP), + } + err := mountToRootfs(mountConfig, entry) + if err == nil { + destPath, _ := securejoin.SecureJoin(mountConfig.root, m.Destination) + mountFile, err := os.OpenFile(destPath, unix.O_PATH|unix.O_CLOEXEC, 0) + uid, _ := p.config.Config.HostRootUID() + gid, _ := p.config.Config.HostRootGID() + err = sys.FchownFile(mountFile, uid, gid) + if err == nil { + src = &mountSource{ + file: mountFile, + Type: mountSourcePlain, + } + } + } + } logrus.Debugf("mount source thread: handling request for %q: %v %v", m.Source, src, err) responseCh <- response{ src: src, diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go index 1ba386ed015..b54d66af258 100644 --- a/libcontainer/rootfs_linux.go +++ b/libcontainer/rootfs_linux.go @@ -14,13 +14,13 @@ import ( "github.com/cyphar/filepath-securejoin/pathrs-lite/procfs" "github.com/moby/sys/mountinfo" - "github.com/moby/sys/userns" "github.com/mrunalp/fileutils" "github.com/opencontainers/runtime-spec/specs-go" "github.com/opencontainers/selinux/go-selinux/label" "github.com/sirupsen/logrus" "golang.org/x/sys/unix" + "github.com/moby/sys/userns" "github.com/opencontainers/cgroups" devices "github.com/opencontainers/cgroups/devices/config" "github.com/opencontainers/cgroups/fs2" @@ -113,6 +113,9 @@ func prepareRootfs(pipe *syncSocket, iConfig *initConfig) (err error) { rootlessCgroups: config.RootlessCgroups, cgroupns: config.Namespaces.Contains(configs.NEWCGROUP), } + + useMknodInInitialUserNS := (userns.RunningInUserNS() || config.Namespaces.Contains(configs.NEWUSER)) && !config.RootlessEUID + for _, m := range config.Mounts { entry := mountEntry{Mount: m} // Figure out whether we need to request runc to give us an @@ -120,7 +123,7 @@ func prepareRootfs(pipe *syncSocket, iConfig *initConfig) (err error) { // necessary. For bind-mounts, this is only necessary if we cannot // resolve the parent mount (this is only hit if you are running in a // userns -- but for rootless the host-side thread can't help). - wantSourceFile := m.IsIDMapped() + wantSourceFile := m.IsIDMapped() || useMknodInInitialUserNS && pathrs.LexicallyCleanPath(m.Destination) == "/dev" if m.IsBind() && !config.RootlessEUID { if _, err := os.Stat(m.Source); err != nil { wantSourceFile = true @@ -143,7 +146,7 @@ func prepareRootfs(pipe *syncSocket, iConfig *initConfig) (err error) { // that while m.Source might contain symlinks, the (*os.File).Name // is based on the path provided to os.OpenFile, not what it // resolves to. So this should never happen. - if sync.File.Name() != m.Source { + if sync.File.Name() != m.Source && (!useMknodInInitialUserNS || m.Destination != "/dev") { return fmt.Errorf("returned mountfd for %q doesn't match requested mount configuration: mountfd path is %q", m.Source, sync.File.Name()) } // Unmarshal the procMountFd argument (the file is sync.File). @@ -160,6 +163,9 @@ func prepareRootfs(pipe *syncSocket, iConfig *initConfig) (err error) { src.file = sync.File entry.srcFile = src } + if useMknodInInitialUserNS && pathrs.LexicallyCleanPath(m.Destination) == "/dev" { + continue + } if err := mountToRootfs(mountConfig, entry); err != nil { return fmt.Errorf("error mounting %q to rootfs at %q: %w", m.Source, m.Destination, err) } @@ -167,9 +173,46 @@ func prepareRootfs(pipe *syncSocket, iConfig *initConfig) (err error) { setupDev := needsSetupDev(config) if setupDev { - if err := createDevices(config); err != nil { - return fmt.Errorf("error creating device nodes: %w", err) + if !useMknodInInitialUserNS { + if err := createDevices(config, config.RootlessEUID); err != nil { + return fmt.Errorf("error creating device nodes: %w", err) + } + } else { + for _, node := range config.Devices { + if pathrs.LexicallyCleanPath(node.Path) == "/dev/ptmx" { + continue + } + + // We need to bind /dev/null to restore masked pathes. + if pathrs.LexicallyCleanPath(node.Path) == "/dev/null" { + if err := createDeviceNode(config.Rootfs, node, true); err != nil { + return fmt.Errorf("error creating device nodes: %w", err) + } + } + + m := &configs.Mount{ + Source: node.Path, + Destination: node.Path, + Device: "usernsMknod", + } + + // Request a source file from the host. + if err := writeSyncArg(pipe, procMountPlease, m); err != nil { + return fmt.Errorf("failed to request mountfd for %q: %w", m.Source, err) + } + + sync, err := readSyncFull(pipe, procMountFd) + if err != nil { + return fmt.Errorf("mountfd request for %q failed: %w", m.Source, err) + } + + if sync.File == nil { + return fmt.Errorf("mountfd request for %q: response missing attached fd", m.Source) + } + defer sync.File.Close() + } } + if err := setupPtmx(config); err != nil { return fmt.Errorf("error setting up ptmx: %w", err) } @@ -935,8 +978,7 @@ func reOpenDevNull() error { } // Create the device nodes in the container. -func createDevices(config *configs.Config) error { - useBindMount := userns.RunningInUserNS() || config.Namespaces.Contains(configs.NEWUSER) +func createDevices(config *configs.Config, useBindMount bool) error { for _, node := range config.Devices { // The /dev/ptmx device is setup by setupPtmx() diff --git a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go index 5713460e486..f5980050c56 100644 --- a/libcontainer/specconv/spec_linux.go +++ b/libcontainer/specconv/spec_linux.go @@ -1116,17 +1116,15 @@ func setupUserNamespace(spec *specs.Spec, config *configs.Config) error { "gid_map": gidMap, }).Debugf("config uses path-based userns configuration -- current uid and gid mappings cached") } - rootUID, err := config.HostRootUID() - if err != nil { - return err - } - rootGID, err := config.HostRootGID() - if err != nil { - return err - } for _, node := range config.Devices { - node.Uid = uint32(rootUID) - node.Gid = uint32(rootGID) + hostUID, err := config.HostUID(int(node.Uid)) + if err == nil { + node.Uid = uint32(hostUID) + } + hostGID, err := config.HostGID(int(node.Gid)) + if err == nil { + node.Gid = uint32(hostGID) + } } return nil } diff --git a/tests/integration/userns.bats b/tests/integration/userns.bats index 1f2e83400e8..9059543c01b 100644 --- a/tests/integration/userns.bats +++ b/tests/integration/userns.bats @@ -280,3 +280,69 @@ function teardown() { # is deleted during the namespace cleanup. run ! ip link del dummy0 } + +@test "checkpoint userns with the new device" { + requires criu root + + update_config ' .linux.devices += [{"path": "/dev/another", "type": "c", "major": 1, "minor": 9, "fileMode": 432}]' + + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + testcontainer test_busybox running + + runc exec test_busybox head -n 1 /dev/another + [ "$status" -eq 0 ] + + for _ in $(seq 2); do + runc checkpoint --work-path ./work-dir test_busybox + [ "$status" -eq 0 ] + + testcontainer test_busybox checkpointed + + # we need to chown images because child process can try to read them. + chown -R 100000:200000 ./checkpoint + + runc restore -d --work-path ./work-dir --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + testcontainer test_busybox running + + runc exec test_busybox head -n 1 /dev/another + [ "$status" -eq 0 ] + done +} + +@test "checkpoint userns with the new device and non-root user" { + requires criu root + + update_config ' .process.user.uid = 42 + | .process.user.gid = 42 + | .linux.devices += [{"path": "/dev/another", "type": "c", "major": 1, "minor": 9, "fileMode": 432, "uid": 42, "gid": 42}]' + + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + testcontainer test_busybox running + + runc exec test_busybox head -n 1 /dev/another + [ "$status" -eq 0 ] + + for _ in $(seq 2); do + runc checkpoint --work-path ./work-dir test_busybox + [ "$status" -eq 0 ] + + testcontainer test_busybox checkpointed + + # we need to chown images because child process can try to read them. + chown -R 100000:200000 ./checkpoint + + runc restore -d --work-path ./work-dir --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + testcontainer test_busybox running + + runc exec test_busybox head -n 1 /dev/another + [ "$status" -eq 0 ] + done +} \ No newline at end of file