@@ -23,6 +23,7 @@ import (
2323 "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
2424 "gvisor.dev/gvisor/pkg/sentry/fsimpl/nsfs"
2525 "gvisor.dev/gvisor/pkg/sentry/inet"
26+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
2627 "gvisor.dev/gvisor/pkg/sentry/seccheck"
2728 pb "gvisor.dev/gvisor/pkg/sentry/seccheck/points/points_go_proto"
2829 "gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -124,6 +125,8 @@ func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) {
124125 // user_namespaces(7)
125126 creds := t .Credentials ()
126127 userns := creds .UserNamespace
128+ cu := cleanup .Make (func () {})
129+ defer cu .Clean ()
127130 if args .Flags & linux .CLONE_NEWUSER != 0 {
128131 var err error
129132 // "EPERM (since Linux 3.9): CLONE_NEWUSER was specified in flags and
@@ -138,14 +141,17 @@ func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) {
138141 if err != nil {
139142 return 0 , nil , err
140143 }
144+ userns .SetInode (nsfs .NewInode (t , t .k .nsfsMount , userns ))
145+ } else {
146+ userns .IncRef ()
141147 }
148+ cu .Add (func () {
149+ userns .DecRef (t )
150+ })
142151 if args .Flags & (linux .CLONE_NEWPID | linux .CLONE_NEWNET | linux .CLONE_NEWUTS | linux .CLONE_NEWIPC ) != 0 && ! creds .HasCapabilityIn (linux .CAP_SYS_ADMIN , userns ) {
143152 return 0 , nil , linuxerr .EPERM
144153 }
145154
146- cu := cleanup .Make (func () {})
147- defer cu .Clean ()
148-
149155 utsns := t .utsns
150156 if args .Flags & linux .CLONE_NEWUTS != 0 {
151157 // Note that this must happen after NewUserNamespace so we get
@@ -285,6 +291,10 @@ func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) {
285291 if uc .uid != creds .RealKUID {
286292 uc = t .k .GetUserCounters (creds .RealKUID )
287293 }
294+ childCreds := creds
295+ if userns != creds .UserNamespace {
296+ childCreds = creds .ForkIntoUserNamespace (userns )
297+ }
288298
289299 cfg := & TaskConfig {
290300 Kernel : t .k ,
@@ -293,7 +303,7 @@ func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) {
293303 TaskImage : image ,
294304 FSContext : fsContext ,
295305 FDTable : fdTable ,
296- Credentials : creds ,
306+ Credentials : childCreds ,
297307 NoNewPrivs : t .GetNoNewPrivs (),
298308 Niceness : t .Niceness (),
299309 NetworkNamespace : netns ,
@@ -341,10 +351,6 @@ func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) {
341351 nt .SetSignalStack (t .SignalStack ())
342352 }
343353
344- if userns != creds .UserNamespace {
345- nt .creds .Store (creds .ForkIntoUserNamespace (userns ))
346- }
347-
348354 // This has to happen last, because e.g. ptraceClone may send a SIGSTOP to
349355 // nt that it must receive before its task goroutine starts running.
350356 tid := nt .k .tasks .Root .IDOfTask (nt )
@@ -509,6 +515,7 @@ type namespaceSet struct {
509515 utsNS * UTSNamespace
510516 ipcNS * IPCNamespace
511517 mountNS * vfs.MountNamespace
518+ userNS * auth.UserNamespace
512519
513520 fsContext * FSContext
514521}
@@ -529,14 +536,17 @@ func (nss *namespaceSet) release(t *Task) {
529536 if nss .mountNS != nil {
530537 nss .mountNS .DecRef (t )
531538 }
539+ if nss .userNS != nil {
540+ nss .userNS .DecRef (t )
541+ }
532542
533543 if nss .fsContext != nil {
534544 nss .fsContext .DecRef (t )
535545 }
536546}
537547
538548func (nss * namespaceSet ) initFromTask (t * Task , target * Task , flags int32 ) error {
539- supported := uint32 (linux .CLONE_NEWPID | linux .CLONE_NEWNET | linux .CLONE_NEWUTS | linux .CLONE_NEWIPC | linux .CLONE_NEWNS )
549+ supported := uint32 (linux .CLONE_NEWPID | linux .CLONE_NEWNET | linux .CLONE_NEWUTS | linux .CLONE_NEWIPC | linux .CLONE_NEWNS | linux . CLONE_NEWUSER )
540550 if (uint32 (flags ) & ^ supported ) != 0 || flags == 0 {
541551 return linuxerr .EINVAL
542552 }
@@ -567,6 +577,15 @@ func (nss *namespaceSet) initFromTask(t *Task, target *Task, flags int32) error
567577 target .mu .Lock ()
568578 defer target .mu .Unlock ()
569579
580+ if flags & linux .CLONE_NEWUSER != 0 {
581+ // User namespaces are stored in credentials, which outlive the other
582+ // namespace fields cleared during task exit.
583+ if target .ExitState () >= TaskExitInitiated {
584+ return linuxerr .ESRCH
585+ }
586+ nss .userNS = target .Credentials ().UserNamespace
587+ nss .userNS .IncRef ()
588+ }
570589 if flags & linux .CLONE_NEWNET != 0 {
571590 nss .netNS = target .netns
572591 if nss .netNS == nil {
@@ -631,6 +650,12 @@ func (nss *namespaceSet) initFromNS(ns vfs.Namespace, flags int32) error {
631650 }
632651 nss .mountNS = ns
633652 ns .IncRef ()
653+ case * auth.UserNamespace :
654+ if flags != 0 && flags != linux .CLONE_NEWUSER {
655+ return linuxerr .EINVAL
656+ }
657+ nss .userNS = ns
658+ ns .IncRef ()
634659 default :
635660 return linuxerr .EINVAL
636661 }
@@ -667,8 +692,29 @@ func (t *Task) Setns(fd *vfs.FileDescription, flags int32) error {
667692 return err
668693 }
669694
695+ creds := t .Credentials ()
696+ checkCreds := creds
697+ if nss .userNS != nil {
698+ if nss .userNS == creds .UserNamespace {
699+ return linuxerr .EINVAL
700+ }
701+ t .tg .signalHandlers .mu .Lock ()
702+ if t .tg .tasksCount != 1 {
703+ t .tg .signalHandlers .mu .Unlock ()
704+ return linuxerr .EINVAL
705+ }
706+ t .tg .signalHandlers .mu .Unlock ()
707+ if t .FSContext ().ReadRefs () != 1 {
708+ return linuxerr .EINVAL
709+ }
710+ if ! creds .HasCapabilityIn (linux .CAP_SYS_ADMIN , nss .userNS ) {
711+ return linuxerr .EPERM
712+ }
713+ checkCreds = creds .ForkIntoUserNamespace (nss .userNS )
714+ }
715+
670716 if nss .childPIDNS != nil {
671- if ! t .HasCapabilityIn (linux .CAP_SYS_ADMIN , nss .childPIDNS .UserNamespace ()) || ! t .HasSelfCapability (linux .CAP_SYS_ADMIN ) {
717+ if ! checkCreds .HasCapabilityIn (linux .CAP_SYS_ADMIN , nss .childPIDNS .UserNamespace ()) || ! checkCreds .HasSelfCapability (linux .CAP_SYS_ADMIN ) {
672718 return linuxerr .EPERM
673719 }
674720 // Allow setting the current or a child pid namespace.
@@ -685,25 +731,25 @@ func (t *Task) Setns(fd *vfs.FileDescription, flags int32) error {
685731 }
686732
687733 if nss .netNS != nil {
688- if ! t .HasCapabilityIn (linux .CAP_SYS_ADMIN , nss .netNS .UserNamespace ()) || ! t .HasSelfCapability (linux .CAP_SYS_ADMIN ) {
734+ if ! checkCreds .HasCapabilityIn (linux .CAP_SYS_ADMIN , nss .netNS .UserNamespace ()) || ! checkCreds .HasSelfCapability (linux .CAP_SYS_ADMIN ) {
689735 return linuxerr .EPERM
690736 }
691737 }
692738
693739 if nss .utsNS != nil {
694- if ! t .HasCapabilityIn (linux .CAP_SYS_ADMIN , nss .utsNS .UserNamespace ()) || ! t .HasSelfCapability (linux .CAP_SYS_ADMIN ) {
740+ if ! checkCreds .HasCapabilityIn (linux .CAP_SYS_ADMIN , nss .utsNS .UserNamespace ()) || ! checkCreds .HasSelfCapability (linux .CAP_SYS_ADMIN ) {
695741 return linuxerr .EPERM
696742 }
697743 }
698744
699745 if nss .ipcNS != nil {
700- if ! t .HasCapabilityIn (linux .CAP_SYS_ADMIN , nss .ipcNS .UserNamespace ()) || ! t .HasSelfCapability (linux .CAP_SYS_ADMIN ) {
746+ if ! checkCreds .HasCapabilityIn (linux .CAP_SYS_ADMIN , nss .ipcNS .UserNamespace ()) || ! checkCreds .HasSelfCapability (linux .CAP_SYS_ADMIN ) {
701747 return linuxerr .EPERM
702748 }
703749 }
704750
705751 if nss .mountNS != nil {
706- if ! t .HasCapabilityIn (linux .CAP_SYS_ADMIN , nss .mountNS .UserNamespace ()) || ! t .HasSelfCapability (linux .CAP_SYS_CHROOT ) || ! t .HasSelfCapability (linux .CAP_SYS_ADMIN ) {
752+ if ! checkCreds .HasCapabilityIn (linux .CAP_SYS_ADMIN , nss .mountNS .UserNamespace ()) || ! checkCreds .HasSelfCapability (linux .CAP_SYS_CHROOT ) || ! checkCreds .HasSelfCapability (linux .CAP_SYS_ADMIN ) {
707753 return linuxerr .EPERM
708754 }
709755 oldFSContext := t .FSContext ()
@@ -723,6 +769,10 @@ func (t *Task) Setns(fd *vfs.FileDescription, flags int32) error {
723769 // Swap to new namespaces.
724770 // Store replaced resources in nss so that they're cleaned up by the deferred function.
725771 t .mu .Lock ()
772+ if nss .userNS != nil {
773+ t .creds .Store (checkCreds )
774+ nss .userNS = creds .UserNamespace
775+ }
726776 if nss .childPIDNS != nil {
727777 t .childPIDNamespace , nss .childPIDNS = nss .childPIDNS , t .childPIDNamespace
728778 }
@@ -786,10 +836,12 @@ func (t *Task) Unshare(flags int32) error {
786836
787837 // Prepare new execution context.
788838 creds := t .Credentials ()
839+ originalUserNS := creds .UserNamespace
789840 var (
790841 newFSContext * FSContext
791842 newFDTable * FDTable
792843 newCreds bool
844+ newUserNS * auth.UserNamespace
793845 newChildPIDNS * PIDNamespace
794846 newNetNS * inet.Namespace
795847 newUTSNS * UTSNamespace
@@ -803,6 +855,9 @@ func (t *Task) Unshare(flags int32) error {
803855 if newFDTable != nil {
804856 newFDTable .DecRef (t )
805857 }
858+ if newUserNS != nil {
859+ newUserNS .DecRef (t )
860+ }
806861 if newNetNS != nil {
807862 newNetNS .DecRef (t )
808863 }
@@ -827,10 +882,11 @@ func (t *Task) Unshare(flags int32) error {
827882 return linuxerr .EPERM
828883 }
829884 var err error
830- newUserNS , err : = creds .NewChildUserNamespace ()
885+ newUserNS , err = creds .NewChildUserNamespace ()
831886 if err != nil {
832887 return err
833888 }
889+ newUserNS .SetInode (nsfs .NewInode (t , t .k .nsfsMount , newUserNS ))
834890 creds = t .Credentials ().ForkIntoUserNamespace (newUserNS )
835891 newCreds = true
836892 }
@@ -869,11 +925,12 @@ func (t *Task) Unshare(flags int32) error {
869925
870926 // Switch to new execution context. Store replaced resources in new* so
871927 // that they're cleaned up by the deferred function.
928+ t .mu .Lock ()
929+ defer t .mu .Unlock ()
872930 if newCreds {
873931 t .creds .Store (creds )
932+ newUserNS = originalUserNS
874933 }
875- t .mu .Lock ()
876- defer t .mu .Unlock ()
877934 if newFSContext != nil {
878935 oldFSContext := t .FSContext ()
878935 // unshareFromTask() lowers the old fs context's ref count, but it's for us to
0 commit comments