Skip to content

Commit acc4aa3

Browse files
shayonj authored and gvisor-bot committed
setns: support user namespaces
User namespace entries under `/proc/[pid]/ns` currently render as fake namespace symlinks. They look like the other namespace files, but opening them does not produce an `nsfs` file that `setns(2)` can use. Rootless container tools such as `buildah` and `podman` rely on that file when they re-enter the pause process's user namespace, so the second lifecycle command fails with `EINVAL`.

Make `UserNamespace` implement `vfs.Namespace` and give each user namespace an `nsfs` inode when it is created. `/proc/[pid]/ns/user` now uses the regular namespace symlink path, so opening it returns a joinable namespace file instead of a fake link target.

`Setns` now accepts `CLONE_NEWUSER` from both `nsfd`s and `pidfd`s. It follows the Linux restrictions for user namespace joins by rejecting the caller's current user namespace, requiring `CAP_SYS_ADMIN` in the target user namespace, rejecting multithreaded callers, and rejecting callers with `fs` state shared outside the thread group. The capability checks for any other namespaces in the same `setns` call use the credentials the caller would have after joining the user namespace.

Add a syscall regression test that creates a child user namespace, opens `/proc/<pid>/ns/user`, and verifies that `setns(CLONE_NEWUSER)` succeeds.

Fixes #13314

FUTURE_COPYBARA_INTEGRATE_REVIEW=#13323 from shayonj:issue-13314-userns-setns 8060b5f

PiperOrigin-RevId: 925507008
1 parent 5fcf887 commit acc4aa3

11 files changed

Lines changed: 325 additions & 19 deletions

File tree

pkg/sentry/fsimpl/proc/task.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ func (fs *filesystem) newTaskInode(ctx context.Context, task *kernel.Task, pidns
7979
"net": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWNET),
8080
"mnt": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWNS),
8181
"pid": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWPID),
82-
"user": fs.newFakeNamespaceSymlink(ctx, task, fs.NextIno(), "user"),
82+
"user": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWUSER),
8383
"ipc": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWIPC),
8484
"uts": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWUTS),
8585
}),

pkg/sentry/fsimpl/proc/task_files.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1461,6 +1461,12 @@ func (s *namespaceSymlink) getInode(t *kernel.Task) *nsfs.Inode {
14611461
return pidns.GetInode()
14621462
}
14631463
return nil
1464+
case linux.CLONE_NEWUSER:
1465+
inode, _ := t.UserNamespace().TryGetInode().(*nsfs.Inode)
1466+
if inode == nil {
1467+
return nil
1468+
}
1469+
return inode
14641470
default:
14651471
panic("unknown namespace")
14661472
}

pkg/sentry/fsimpl/testutil/kernel.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,7 @@ func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup, mntns
158158
UserCounters: k.GetUserCounters(creds.RealKUID),
159159
}
160160
config.NetworkNamespace.IncRef()
161+
config.Credentials.UserNamespace.IncRef()
161162
t, err := k.TaskSet().NewTask(ctx, config)
162163
if err != nil {
163164
config.ThreadGroup.Release(ctx)

pkg/sentry/kernel/auth/BUILD

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ go_library(
9999
"//pkg/errors/linuxerr",
100100
"//pkg/log",
101101
"//pkg/rand",
102+
"//pkg/refs",
102103
"//pkg/sentry/seccheck",
103104
"//pkg/sentry/seccheck/points:points_go_proto",
104105
"//pkg/sync",

pkg/sentry/kernel/auth/user_namespace.go

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"gvisor.dev/gvisor/pkg/abi/linux"
2121
"gvisor.dev/gvisor/pkg/context"
2222
"gvisor.dev/gvisor/pkg/errors/linuxerr"
23+
"gvisor.dev/gvisor/pkg/refs"
2324
)
2425

2526
// A UserNamespace represents a user namespace. See user_namespaces(7) for
@@ -60,6 +61,10 @@ type UserNamespace struct {
6061

6162
// setgroupsAllowed mirrors USERNS_SETGROUPS_ALLOWED in Linux. Protected by mu.
6263
setgroupsAllowed bool
64+
65+
// inode is the nsfs inode associated with this namespace. This is stored as
66+
// refs.TryRefCounter instead of *nsfs.Inode because nsfs imports auth.
67+
inode refs.TryRefCounter
6368
}
6469

6570
// NewRootUserNamespace returns a UserNamespace that is appropriate for a
@@ -99,6 +104,56 @@ func (ns *UserNamespace) Root() *UserNamespace {
99104
return ns
100105
}
101106

107+
// Type implements vfs.Namespace.Type.
108+
func (ns *UserNamespace) Type() string {
109+
return "user"
110+
}
111+
112+
// Destroy implements vfs.Namespace.Destroy.
113+
func (ns *UserNamespace) Destroy(ctx context.Context) {}
114+
115+
// UserNamespace implements vfs.Namespace.UserNamespace.
116+
func (ns *UserNamespace) UserNamespace() *UserNamespace {
117+
return ns
118+
}
119+
120+
// SetInode sets the nsfs inode associated with ns. The initial ref on inode is
121+
// the task or kernel ref for a newly-created user namespace, so those callers
122+
// don't need a separate IncRef.
123+
func (ns *UserNamespace) SetInode(inode refs.TryRefCounter) {
124+
ns.mu.Lock()
125+
defer ns.mu.Unlock()
126+
ns.inode = inode
127+
}
128+
129+
// IncRef increments ns's inode refcount.
130+
func (ns *UserNamespace) IncRef() {
131+
ns.mu.Lock()
132+
defer ns.mu.Unlock()
133+
if ns.inode != nil {
134+
ns.inode.IncRef()
135+
}
136+
}
137+
138+
// TryGetInode returns ns's inode with an incremented refcount.
139+
func (ns *UserNamespace) TryGetInode() refs.TryRefCounter {
140+
ns.mu.Lock()
141+
defer ns.mu.Unlock()
142+
if ns.inode == nil || !ns.inode.TryIncRef() {
143+
return nil
144+
}
145+
return ns.inode
146+
}
147+
148+
// DecRef decrements ns's inode refcount.
149+
func (ns *UserNamespace) DecRef(ctx context.Context) {
150+
ns.mu.Lock()
151+
defer ns.mu.Unlock()
152+
if ns.inode != nil {
153+
ns.inode.DecRef(ctx)
154+
}
155+
}
156+
102157
// "The kernel imposes (since version 3.11) a limit of 32 nested levels of user
103158
// namespaces." - user_namespaces(7)
104159
const maxUserNamespaceDepth = 32

pkg/sentry/kernel/kernel.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -558,6 +558,7 @@ func (k *Kernel) Init(args InitKernelArgs) error {
558558
}
559559
defer nsfsFilesystem.DecRef(ctx)
560560
k.nsfsMount = k.vfs.NewDisconnectedMount(nsfsFilesystem, nil, &vfs.MountOptions{})
561+
k.rootUserNamespace.SetInode(nsfs.NewInode(ctx, k.nsfsMount, k.rootUserNamespace))
561562
k.rootNetworkNamespace.SetInode(nsfs.NewInode(ctx, k.nsfsMount, k.rootNetworkNamespace))
562563
k.rootIPCNamespace.SetInode(nsfs.NewInode(ctx, k.nsfsMount, k.rootIPCNamespace))
563564
k.rootUTSNamespace.SetInode(nsfs.NewInode(ctx, k.nsfsMount, k.rootUTSNamespace))
@@ -1330,6 +1331,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
13301331
config.UTSNamespace.IncRef()
13311332
config.IPCNamespace.IncRef()
13321333
config.NetworkNamespace.IncRef()
1334+
config.Credentials.UserNamespace.IncRef()
13331335
refcountCu.Release() // refs(mntns, fsContext) are transferred to NewTask()
13341336
t, err := k.tasks.NewTask(ctx, config)
13351337
if err != nil {
@@ -2105,6 +2107,7 @@ func (k *Kernel) Release() {
21052107
k.cleaupDevGofers()
21062108
k.mf.Destroy()
21072109
k.RootPIDNamespace().DecRef(ctx)
2110+
k.rootUserNamespace.DecRef(ctx)
21082111
}
21092112

21102113
// PopulateNewCgroupHierarchy moves all tasks into a newly created cgroup

pkg/sentry/kernel/task_clone.go

Lines changed: 74 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
2424
"gvisor.dev/gvisor/pkg/sentry/fsimpl/nsfs"
2525
"gvisor.dev/gvisor/pkg/sentry/inet"
26+
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
2627
"gvisor.dev/gvisor/pkg/sentry/seccheck"
2728
pb "gvisor.dev/gvisor/pkg/sentry/seccheck/points/points_go_proto"
2829
"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -124,6 +125,8 @@ func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) {
124125
// user_namespaces(7)
125126
creds := t.Credentials()
126127
userns := creds.UserNamespace
128+
cu := cleanup.Make(func() {})
129+
defer cu.Clean()
127130
if args.Flags&linux.CLONE_NEWUSER != 0 {
128131
var err error
129132
// "EPERM (since Linux 3.9): CLONE_NEWUSER was specified in flags and
@@ -138,14 +141,17 @@ func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) {
138141
if err != nil {
139142
return 0, nil, err
140143
}
144+
userns.SetInode(nsfs.NewInode(t, t.k.nsfsMount, userns))
145+
} else {
146+
userns.IncRef()
141147
}
148+
cu.Add(func() {
149+
userns.DecRef(t)
150+
})
142151
if args.Flags&(linux.CLONE_NEWPID|linux.CLONE_NEWNET|linux.CLONE_NEWUTS|linux.CLONE_NEWIPC) != 0 && !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, userns) {
143152
return 0, nil, linuxerr.EPERM
144153
}
145154

146-
cu := cleanup.Make(func() {})
147-
defer cu.Clean()
148-
149155
utsns := t.utsns
150156
if args.Flags&linux.CLONE_NEWUTS != 0 {
151157
// Note that this must happen after NewUserNamespace so we get
@@ -285,6 +291,10 @@ func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) {
285291
if uc.uid != creds.RealKUID {
286292
uc = t.k.GetUserCounters(creds.RealKUID)
287293
}
294+
childCreds := creds
295+
if userns != creds.UserNamespace {
296+
childCreds = creds.ForkIntoUserNamespace(userns)
297+
}
288298

289299
cfg := &TaskConfig{
290300
Kernel: t.k,
@@ -293,7 +303,7 @@ func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) {
293303
TaskImage: image,
294304
FSContext: fsContext,
295305
FDTable: fdTable,
296-
Credentials: creds,
306+
Credentials: childCreds,
297307
NoNewPrivs: t.GetNoNewPrivs(),
298308
Niceness: t.Niceness(),
299309
NetworkNamespace: netns,
@@ -341,10 +351,6 @@ func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) {
341351
nt.SetSignalStack(t.SignalStack())
342352
}
343353

344-
if userns != creds.UserNamespace {
345-
nt.creds.Store(creds.ForkIntoUserNamespace(userns))
346-
}
347-
348354
// This has to happen last, because e.g. ptraceClone may send a SIGSTOP to
349355
// nt that it must receive before its task goroutine starts running.
350356
tid := nt.k.tasks.Root.IDOfTask(nt)
@@ -509,6 +515,7 @@ type namespaceSet struct {
509515
utsNS *UTSNamespace
510516
ipcNS *IPCNamespace
511517
mountNS *vfs.MountNamespace
518+
userNS *auth.UserNamespace
512519

513520
fsContext *FSContext
514521
}
@@ -529,14 +536,17 @@ func (nss *namespaceSet) release(t *Task) {
529536
if nss.mountNS != nil {
530537
nss.mountNS.DecRef(t)
531538
}
539+
if nss.userNS != nil {
540+
nss.userNS.DecRef(t)
541+
}
532542

533543
if nss.fsContext != nil {
534544
nss.fsContext.DecRef(t)
535545
}
536546
}
537547

538548
func (nss *namespaceSet) initFromTask(t *Task, target *Task, flags int32) error {
539-
supported := uint32(linux.CLONE_NEWPID | linux.CLONE_NEWNET | linux.CLONE_NEWUTS | linux.CLONE_NEWIPC | linux.CLONE_NEWNS)
549+
supported := uint32(linux.CLONE_NEWPID | linux.CLONE_NEWNET | linux.CLONE_NEWUTS | linux.CLONE_NEWIPC | linux.CLONE_NEWNS | linux.CLONE_NEWUSER)
540550
if (uint32(flags) & ^supported) != 0 || flags == 0 {
541551
return linuxerr.EINVAL
542552
}
@@ -567,6 +577,15 @@ func (nss *namespaceSet) initFromTask(t *Task, target *Task, flags int32) error
567577
target.mu.Lock()
568578
defer target.mu.Unlock()
569579

580+
if flags&linux.CLONE_NEWUSER != 0 {
581+
// User namespaces are stored in credentials, which outlive the other
582+
// namespace fields cleared during task exit.
583+
if target.ExitState() >= TaskExitInitiated {
584+
return linuxerr.ESRCH
585+
}
586+
nss.userNS = target.Credentials().UserNamespace
587+
nss.userNS.IncRef()
588+
}
570589
if flags&linux.CLONE_NEWNET != 0 {
571590
nss.netNS = target.netns
572591
if nss.netNS == nil {
@@ -631,6 +650,12 @@ func (nss *namespaceSet) initFromNS(ns vfs.Namespace, flags int32) error {
631650
}
632651
nss.mountNS = ns
633652
ns.IncRef()
653+
case *auth.UserNamespace:
654+
if flags != 0 && flags != linux.CLONE_NEWUSER {
655+
return linuxerr.EINVAL
656+
}
657+
nss.userNS = ns
658+
ns.IncRef()
634659
default:
635660
return linuxerr.EINVAL
636661
}
@@ -667,8 +692,29 @@ func (t *Task) Setns(fd *vfs.FileDescription, flags int32) error {
667692
return err
668693
}
669694

695+
creds := t.Credentials()
696+
checkCreds := creds
697+
if nss.userNS != nil {
698+
if nss.userNS == creds.UserNamespace {
699+
return linuxerr.EINVAL
700+
}
701+
t.tg.signalHandlers.mu.Lock()
702+
if t.tg.tasksCount != 1 {
703+
t.tg.signalHandlers.mu.Unlock()
704+
return linuxerr.EINVAL
705+
}
706+
t.tg.signalHandlers.mu.Unlock()
707+
if t.FSContext().ReadRefs() != 1 {
708+
return linuxerr.EINVAL
709+
}
710+
if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.userNS) {
711+
return linuxerr.EPERM
712+
}
713+
checkCreds = creds.ForkIntoUserNamespace(nss.userNS)
714+
}
715+
670716
if nss.childPIDNS != nil {
671-
if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.childPIDNS.UserNamespace()) || !t.HasSelfCapability(linux.CAP_SYS_ADMIN) {
717+
if !checkCreds.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.childPIDNS.UserNamespace()) || !checkCreds.HasSelfCapability(linux.CAP_SYS_ADMIN) {
672718
return linuxerr.EPERM
673719
}
674720
// Allow setting the current or a child pid namespace.
@@ -685,25 +731,25 @@ func (t *Task) Setns(fd *vfs.FileDescription, flags int32) error {
685731
}
686732

687733
if nss.netNS != nil {
688-
if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.netNS.UserNamespace()) || !t.HasSelfCapability(linux.CAP_SYS_ADMIN) {
734+
if !checkCreds.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.netNS.UserNamespace()) || !checkCreds.HasSelfCapability(linux.CAP_SYS_ADMIN) {
689735
return linuxerr.EPERM
690736
}
691737
}
692738

693739
if nss.utsNS != nil {
694-
if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.utsNS.UserNamespace()) || !t.HasSelfCapability(linux.CAP_SYS_ADMIN) {
740+
if !checkCreds.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.utsNS.UserNamespace()) || !checkCreds.HasSelfCapability(linux.CAP_SYS_ADMIN) {
695741
return linuxerr.EPERM
696742
}
697743
}
698744

699745
if nss.ipcNS != nil {
700-
if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.ipcNS.UserNamespace()) || !t.HasSelfCapability(linux.CAP_SYS_ADMIN) {
746+
if !checkCreds.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.ipcNS.UserNamespace()) || !checkCreds.HasSelfCapability(linux.CAP_SYS_ADMIN) {
701747
return linuxerr.EPERM
702748
}
703749
}
704750

705751
if nss.mountNS != nil {
706-
if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.mountNS.UserNamespace()) || !t.HasSelfCapability(linux.CAP_SYS_CHROOT) || !t.HasSelfCapability(linux.CAP_SYS_ADMIN) {
752+
if !checkCreds.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.mountNS.UserNamespace()) || !checkCreds.HasSelfCapability(linux.CAP_SYS_CHROOT) || !checkCreds.HasSelfCapability(linux.CAP_SYS_ADMIN) {
707753
return linuxerr.EPERM
708754
}
709755
oldFSContext := t.FSContext()
@@ -723,6 +769,10 @@ func (t *Task) Setns(fd *vfs.FileDescription, flags int32) error {
723769
// Swap to new namespaces.
724770
// Store replaced resources in nss so that they're cleaned up by the deferred function.
725771
t.mu.Lock()
772+
if nss.userNS != nil {
773+
t.creds.Store(checkCreds)
774+
nss.userNS = creds.UserNamespace
775+
}
726776
if nss.childPIDNS != nil {
727777
t.childPIDNamespace, nss.childPIDNS = nss.childPIDNS, t.childPIDNamespace
728778
}
@@ -786,10 +836,12 @@ func (t *Task) Unshare(flags int32) error {
786836

787837
// Prepare new execution context.
788838
creds := t.Credentials()
839+
originalUserNS := creds.UserNamespace
789840
var (
790841
newFSContext *FSContext
791842
newFDTable *FDTable
792843
newCreds bool
844+
newUserNS *auth.UserNamespace
793845
newChildPIDNS *PIDNamespace
794846
newNetNS *inet.Namespace
795847
newUTSNS *UTSNamespace
@@ -803,6 +855,9 @@ func (t *Task) Unshare(flags int32) error {
803855
if newFDTable != nil {
804856
newFDTable.DecRef(t)
805857
}
858+
if newUserNS != nil {
859+
newUserNS.DecRef(t)
860+
}
806861
if newNetNS != nil {
807862
newNetNS.DecRef(t)
808863
}
@@ -827,10 +882,11 @@ func (t *Task) Unshare(flags int32) error {
827882
return linuxerr.EPERM
828883
}
829884
var err error
830-
newUserNS, err := creds.NewChildUserNamespace()
885+
newUserNS, err = creds.NewChildUserNamespace()
831886
if err != nil {
832887
return err
833888
}
889+
newUserNS.SetInode(nsfs.NewInode(t, t.k.nsfsMount, newUserNS))
834890
creds = t.Credentials().ForkIntoUserNamespace(newUserNS)
835891
newCreds = true
836892
}
@@ -869,11 +925,12 @@ func (t *Task) Unshare(flags int32) error {
869925

870926
// Switch to new execution context. Store replaced resources in new* so
871927
// that they're cleaned up by the deferred function.
928+
t.mu.Lock()
929+
defer t.mu.Unlock()
872930
if newCreds {
873931
t.creds.Store(creds)
932+
newUserNS = originalUserNS
874933
}
875-
t.mu.Lock()
876-
defer t.mu.Unlock()
877934
if newFSContext != nil {
878935
oldFSContext := t.FSContext()
879936
// unshareFromTask() lowers the old fs context's ref count, but its for us to

0 commit comments

Comments
 (0)