Skip to content

Commit a69f91e

Browse files
authored
Merge pull request #36 from dgl/dgl/userns-otherfs
Add support for mounting other filesystems in user namespaces
2 parents aac15ef + 836d609 commit a69f91e

File tree

6 files changed

+239
-6
lines changed

6 files changed

+239
-6
lines changed

README.md

+105-2
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ chmod: /: Bad message
4242
```
4343

4444
## Demo on Kubernetes
45-
Before you install the demo on k8s, please ensure all [the requirements](./docs/install.md) are satisfied
45+
Before you install the demo on k8s, please ensure all [the requirements](./docs/install.md) are satisfied.
4646

4747
This demo shows that the Seccomp Agent can have different behaviour depending on the Kubernetes pod (in this case, the pod's namespace and name).
4848

@@ -82,11 +82,17 @@ apiVersion: v1
8282
kind: Pod
8383
metadata:
8484
name: mynotifypod
85-
# /var/lib/kubelet/seccomp/notify.json
85+
# For older versions of Kubernetes (this annotation was deprecated in
86+
# Kubernetes v1.19 and completely removed in v1.27):
8687
annotations:
8788
seccomp.security.alpha.kubernetes.io/pod: localhost/notify.json
8889
spec:
8990
restartPolicy: Never
91+
securityContext:
92+
# /var/lib/kubelet/seccomp/notify.json
93+
seccompProfile:
94+
type: Localhost
95+
localhostProfile: notify.json
9096
containers:
9197
- name: container1
9298
image: busybox
@@ -108,3 +114,100 @@ proc on /root type proc (rw,relatime)
108114
/ # time -f %E /bin/true
109115
0m 2.00s
110116
```
117+
118+
## Combining with user namespaces
119+
120+
By combining this with Kubernetes's user namespace support it is possible to
121+
allow a user within a user namespace to perform some operations which would
122+
otherwise be limited to host root.
123+
124+
One example is mounting other filesystem types. This is most useful combined
125+
with user namespaces to allow mounting network file systems while a pod is
126+
running. This is far safer than giving the container `privileged` access but
127+
does expose more of the kernel to the pod, so you should consider your security
128+
carefully.
129+
130+
There is a possibility a process could change its user namespace after making
131+
the mount system call, which could result in a confusing state. To fix this, the
132+
seccomp notify policy should use the SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV
133+
flag; however, this is [not yet available in
134+
runc](https://github.com/opencontainers/runc/issues/3860) and requires Linux >=
135+
5.19.
136+
137+
Configure a policy, similar to above, but with the following metadata:
138+
```json
139+
{
140+
"architectures" : [
141+
"SCMP_ARCH_X86",
142+
"SCMP_ARCH_X32"
143+
],
144+
"defaultAction" : "SCMP_ACT_ALLOW",
145+
"listenerPath": "/run/seccomp-agent.socket",
146+
"listenerMetadata": "MOUNT_OTHER_FS_LIST=cifs\nMOUNT_NEED_CAP_ADMIN=true",
147+
"syscalls" : [
148+
{
149+
"action" : "SCMP_ACT_NOTIFY",
150+
"names" : [
151+
"mount"
152+
]
153+
},
154+
{
155+
"action" : "SCMP_ACT_ALLOW",
156+
"names" : [
157+
"umount"
158+
]
159+
}
160+
]
161+
}
162+
```
163+
164+
(Policy cut down for the sake of example; it is recommended to use a full policy that
165+
additionally configures notify for mount and allows umount.)
166+
167+
This has currently been successfully tested with cifs. Other filesystem types
168+
should work; NFS will need NFS client utilities installing within the container
169+
*and* on the host (e.g. to make upcalls work).
170+
171+
* Deploy a pod with the seccomp policy and user namespaces:
172+
```yaml
173+
apiVersion: v1
174+
kind: Pod
175+
metadata:
176+
name: mynotifypod-userns
177+
spec:
178+
restartPolicy: Never
179+
# Needs "UserNamespacesSupport" feature gate currently
180+
hostUsers: false
181+
securityContext:
182+
# /var/lib/kubelet/seccomp/notify.json
183+
seccompProfile:
184+
type: Localhost
185+
localhostProfile: notify.json
186+
containers:
187+
- name: container1
188+
image: alpine
189+
command: ["sh"]
190+
args: ["-c", "sleep infinity"]
191+
securityContext:
192+
capabilities:
193+
# This is safe when combined with hostUsers: false
194+
add: [SYS_ADMIN]
195+
```
196+
197+
* Run commands in the pod:
198+
```shell
199+
$ kubectl exec -it mynotifypod-userns -- /bin/sh
200+
/ # mkdir /mnt
201+
/ # mount -t cifs -o username=user,password=pass '//10.0.0.1/C' /mnt
202+
/ # df -h /mnt
203+
/mnt # df -h /mnt
204+
Filesystem Size Used Available Use% Mounted on
205+
//10.0.0.1/C 95.4G 85.3G 10.1G 89% /mnt
206+
/ # ls /mnt
207+
$Recycle.Bin Documents and Settings Program files
208+
[...]
209+
/ # sed -i 's!^\(nobody.*/\)false!\1sh!' /etc/passwd
210+
/ # su nobody
211+
/ $ mount -t cifs -o username=user,password=pass '//10.0.0.1/C' /mnt
212+
mount: permission denied (are you root?)
213+
```

cmd/seccompagent/seccompagent.go

+14-2
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
// See the License for the specific language governing permissions and
1313
// limitations under the License.
1414

15+
//go:build linux && cgo
1516
// +build linux,cgo
1617

1718
package main
@@ -121,7 +122,7 @@ func main() {
121122
// / # ls /root/self/cmdline
122123
// /root/self/cmdline
123124
allowedFilesystems := map[string]struct{}{"proc": struct{}{}}
124-
r.SyscallHandler["mount"] = handlers.Mount(allowedFilesystems)
125+
r.SyscallHandler["mount"] = handlers.Mount(allowedFilesystems, false /* do not check capabilities */)
125126

126127
// Example:
127128
// # chmod 777 /
@@ -214,8 +215,19 @@ func main() {
214215
if v, ok := metadata["MOUNT_SYSFS"]; ok && v == "true" {
215216
allowedFilesystems["sysfs"] = struct{}{}
216217
}
218+
if v, ok := metadata["MOUNT_OTHER_FS_LIST"]; ok {
219+
for _, fs := range strings.Split(v, ",") {
220+
allowedFilesystems[fs] = struct{}{}
221+
}
222+
}
223+
224+
requireCapsForMount := false
225+
if v, ok := metadata["MOUNT_NEED_CAP_ADMIN"]; ok && v == "true" {
226+
requireCapsForMount = true
227+
}
228+
217229
if len(allowedFilesystems) > 0 {
218-
r.SyscallHandler["mount"] = handlers.Mount(allowedFilesystems)
230+
r.SyscallHandler["mount"] = handlers.Mount(allowedFilesystems, requireCapsForMount)
219231
}
220232
return r
221233
}

go.mod

+2
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,8 @@ require (
6161
k8s.io/klog/v2 v2.70.1 // indirect
6262
k8s.io/kube-openapi v0.0.0-20220803162953-67bda5d908f1 // indirect
6363
k8s.io/utils v0.0.0-20220728103510-ee6ede2d64ed // indirect
64+
kernel.org/pub/linux/libs/security/libcap/cap v1.2.69 // indirect
65+
kernel.org/pub/linux/libs/security/libcap/psx v1.2.69 // indirect
6466
sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2 // indirect
6567
sigs.k8s.io/structured-merge-diff/v4 v4.2.3 // indirect
6668
sigs.k8s.io/yaml v1.3.0 // indirect

go.sum

+4
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,10 @@ k8s.io/kube-openapi v0.0.0-20220803162953-67bda5d908f1 h1:MQ8BAZPZlWk3S9K4a9NCkI
329329
k8s.io/kube-openapi v0.0.0-20220803162953-67bda5d908f1/go.mod h1:C/N6wCaBHeBHkHUesQOQy2/MZqGgMAFPqGsGQLdbZBU=
330330
k8s.io/utils v0.0.0-20220728103510-ee6ede2d64ed h1:jAne/RjBTyawwAy0utX5eqigAwz/lQhTmy+Hr/Cpue4=
331331
k8s.io/utils v0.0.0-20220728103510-ee6ede2d64ed/go.mod h1:jPW/WVKK9YHAvNhRxK0md/EJ228hCsBRufyofKtW8HA=
332+
kernel.org/pub/linux/libs/security/libcap/cap v1.2.69 h1:N0m3tKYbkRMmDobh/47ngz+AWeV7PcfXMDi8xu3Vrag=
333+
kernel.org/pub/linux/libs/security/libcap/cap v1.2.69/go.mod h1:Tk5Ip2TuxaWGpccL7//rAsLRH6RQ/jfqTGxuN/+i/FQ=
334+
kernel.org/pub/linux/libs/security/libcap/psx v1.2.69 h1:IdrOs1ZgwGw5CI+BH6GgVVlOt+LAXoPyh7enr8lfaXs=
335+
kernel.org/pub/linux/libs/security/libcap/psx v1.2.69/go.mod h1:+l6Ee2F59XiJ2I6WR5ObpC1utCQJZ/VLsEbQCD8RG24=
332336
sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2 h1:iXTIw73aPyC+oRdyqqvVJuloN1p0AC/kzH07hu3NE+k=
333337
sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0=
334338
sigs.k8s.io/structured-merge-diff/v4 v4.2.3 h1:PRbqxJClWWYMNV1dhaG4NsibJbArud9kFxnAMREiWFE=

pkg/handlers/mount.go

+70-2
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
// See the License for the specific language governing permissions and
1313
// limitations under the License.
1414

15+
//go:build linux && cgo
1516
// +build linux,cgo
1617

1718
package handlers
@@ -28,6 +29,7 @@ import (
2829
"github.com/kinvolk/seccompagent/pkg/nsenter"
2930
"github.com/kinvolk/seccompagent/pkg/readarg"
3031
"github.com/kinvolk/seccompagent/pkg/registry"
32+
"github.com/kinvolk/seccompagent/pkg/userns"
3133
)
3234

3335
var _ = nsenter.RegisterModule("mount", runMountInNamespaces)
@@ -37,6 +39,8 @@ type mountModuleParams struct {
3739
Source string `json:"source,omitempty"`
3840
Dest string `json:"dest,omitempty"`
3941
Filesystem string `json:"filesystem,omitempty"`
42+
Flags int64 `json:"flags,omitempty"`
43+
Options string `json:"options,omitempty"`
4044
}
4145

4246
func runMountInNamespaces(param []byte) string {
@@ -46,14 +50,14 @@ func runMountInNamespaces(param []byte) string {
4650
return fmt.Sprintf("%d", int(unix.ENOSYS))
4751
}
4852

49-
err = unix.Mount(params.Source, params.Dest, params.Filesystem, 0, "")
53+
err = unix.Mount(params.Source, params.Dest, params.Filesystem, 0, params.Options)
5054
if err != nil {
5155
return fmt.Sprintf("%d", int(err.(unix.Errno)))
5256
}
5357
return "0"
5458
}
5559

56-
func Mount(allowedFilesystems map[string]struct{}) registry.HandlerFunc {
60+
func Mount(allowedFilesystems map[string]struct{}, requireUserNamespaceAdmin bool) registry.HandlerFunc {
5761
return func(fd libseccomp.ScmpFd, req *libseccomp.ScmpNotifReq) (result registry.HandlerResult) {
5862
memFile, err := readarg.OpenMem(req.Pid)
5963
if err != nil {
@@ -96,12 +100,17 @@ func Mount(allowedFilesystems map[string]struct{}) registry.HandlerFunc {
96100
return registry.HandlerResultErrno(unix.EFAULT)
97101
}
98102

103+
// We don't handle flags; we may want to consider allowing a few.
104+
// This is here so the debug logging makes it possible to see flags used.
105+
flags := int64(req.Data.Args[3])
106+
99107
log.WithFields(log.Fields{
100108
"fd": fd,
101109
"pid": req.Pid,
102110
"source": source,
103111
"dest": dest,
104112
"filesystem": filesystem,
113+
"flags": flags,
105114
}).Debug("Mount")
106115

107116
if _, ok := allowedFilesystems[filesystem]; !ok {
@@ -110,11 +119,70 @@ func Mount(allowedFilesystems map[string]struct{}) registry.HandlerFunc {
110119
return registry.HandlerResultContinue()
111120
}
112121

122+
var options string
123+
if req.Data.Args[4] != 0/* NULL */ && filesystem != "sysfs" {
124+
// Get options, we assume because this is specified in
125+
// allowedFilesystems that the data argument to mount(2)
126+
// is a string so this is safe now. We ignore options for sysfs, as it
127+
// doesn't define options.
128+
options, err = readarg.ReadString(memFile, int64(req.Data.Args[4]))
129+
if err != nil {
130+
log.WithFields(log.Fields{
131+
"fd": fd,
132+
"pid": req.Pid,
133+
"arg": 4,
134+
"err": err,
135+
}).Error("Cannot read argument")
136+
return registry.HandlerResultErrno(unix.EFAULT)
137+
}
138+
139+
// Log this at trace level only as it could have user credentials.
140+
log.WithFields(log.Fields{
141+
"fd": fd,
142+
"pid": req.Pid,
143+
"source": source,
144+
"dest": dest,
145+
"filesystem": filesystem,
146+
"flags": flags,
147+
"options": options,
148+
}).Trace("Handle mount")
149+
}
150+
151+
if requireUserNamespaceAdmin {
152+
ok, err := userns.IsPIDAdminCapable(req.Pid)
153+
if err != nil {
154+
log.WithFields(log.Fields{
155+
"fd": fd,
156+
"pid": req.Pid,
157+
"err": err,
158+
}).Error("Cannot check user namespace capabilities")
159+
return registry.HandlerResultErrno(unix.EFAULT)
160+
}
161+
if !ok {
162+
log.WithFields(log.Fields{
163+
"fd": fd,
164+
"pid": req.Pid,
165+
}).Info("Mount attempted without CAP_SYS_ADMIN")
166+
return registry.HandlerResultErrno(unix.EPERM)
167+
}
168+
169+
// Ensure the notification is still valid after checking user namespace capabilities.
170+
if err := libseccomp.NotifIDValid(fd, req.ID); err != nil {
171+
log.WithFields(log.Fields{
172+
"fd": fd,
173+
"req": req,
174+
"err": err,
175+
}).Debug("Notification no longer valid")
176+
return registry.HandlerResultIntr()
177+
}
178+
}
179+
113180
params := mountModuleParams{
114181
Module: "mount",
115182
Source: source,
116183
Dest: dest,
117184
Filesystem: filesystem,
185+
Options: options,
118186
}
119187

120188
mntns, err := nsenter.OpenNamespace(req.Pid, "mnt")

pkg/userns/check.go

+44
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
package userns
2+
3+
import (
4+
"fmt"
5+
6+
"golang.org/x/sys/unix"
7+
"kernel.org/pub/linux/libs/security/libcap/cap"
8+
)
9+
10+
// IsPIDAdminCapable returns true if the PID is considered an admin of a user
11+
// namespace, that is, it's in either in the init user namespace or one created
12+
// by the host root and has CAP_SYS_ADMIN. The protects against a less
13+
// privileged user either mounting a directory over a tree that gives them more
14+
// access (e.g. /etc/sudoers.d) or hiding files.
15+
func IsPIDAdminCapable(pid uint32) (bool, error) {
16+
// We unfortunately need to reimplement some of the kernel's user namespace logic.
17+
// Our goal is to allow a user with CAP_SYS_ADMIN inside the first user
18+
// namespace to call mount(). If the user nests a user namespace below that,
19+
// we don't want to allow that process to call mount.
20+
21+
// This is security sensitive code, however TOCTOU isn't a concern in this case
22+
// as this is designed to be used while blocked on a syscall and the kernel
23+
// does not let multi-threaded processes change their user namespace (see
24+
// setns() and unshare() docs).
25+
fd, err := unix.Open(fmt.Sprintf("/proc/%d/ns/user", pid), unix.O_RDONLY, 0)
26+
if err != nil {
27+
return false, err
28+
}
29+
defer unix.Close(fd)
30+
31+
uid, err := unix.IoctlGetInt(fd, unix.NS_GET_OWNER_UID)
32+
if err != nil {
33+
return false, err
34+
}
35+
if uid != 0 {
36+
return false, err
37+
}
38+
set, err := cap.GetPID(int(pid))
39+
if err != nil {
40+
return false, err
41+
}
42+
43+
return set.GetFlag(cap.Effective, cap.SYS_ADMIN)
44+
}

0 commit comments

Comments
 (0)