From f83c591bcf9dea485acd3f488cba3c67764ed0d1 Mon Sep 17 00:00:00 2001 From: Brooks Davis Date: Fri, 27 Sep 2024 22:22:28 +0100 Subject: [PATCH 01/11] sys_mmap: improve a comment --- sys/vm/vm_mmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sys/vm/vm_mmap.c b/sys/vm/vm_mmap.c index 4cbd76031c1e..a16a288e0879 100644 --- a/sys/vm/vm_mmap.c +++ b/sys/vm/vm_mmap.c @@ -318,7 +318,7 @@ sys_mmap(struct thread *td, struct mmap_args *uap) * derived from the passed capability. In all other cases, the * new capability is derived from the per-thread mmap capability. * - * If MAP_FIXED specified and addr does not meet the above + * If MAP_FIXED is specified and addr does not meet the above * requirements, then MAP_EXCL is implied to prevent changing * page contents without permission. * From 7e2b5ddb769d183205819cfb000a8b020852ffd1 Mon Sep 17 00:00:00 2001 From: Brooks Davis Date: Fri, 27 Sep 2024 22:19:27 +0100 Subject: [PATCH 02/11] cheribsdtest: prefer mmap(MAP_SHARED) to minherit(INHERIT_SHARE) While here, explicitly request PROT_CAP permissions. In purecap compilations of the test suite we store a full capability for faulting addresses and thus need the ability to share them. --- bin/cheribsdtest/cheribsdtest.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/bin/cheribsdtest/cheribsdtest.c b/bin/cheribsdtest/cheribsdtest.c index fb6e5f2fc803..d1190bce9d7f 100644 --- a/bin/cheribsdtest/cheribsdtest.c +++ b/bin/cheribsdtest/cheribsdtest.c @@ -876,12 +876,10 @@ main(int argc, char *argv[]) * failure status. */ assert(sizeof(*ccsp) <= (size_t)getpagesize()); - ccsp = mmap(NULL, getpagesize(), PROT_READ | PROT_WRITE, MAP_ANON, -1, - 0); + ccsp = mmap(NULL, getpagesize(), PROT_READ | PROT_WRITE | PROT_CAP, + MAP_ANON | MAP_SHARED, -1, 0); if (ccsp == MAP_FAILED) err(EX_OSERR, "mmap"); - if (minherit(ccsp, getpagesize(), INHERIT_SHARE) < 0) - err(EX_OSERR, "minherit"); /* * Disable core dumps unless specifically enabled. From b3e309fa3026cd0778364273d5b97c8e9e0357d1 Mon Sep 17 00:00:00 2001 From: Brooks Davis Date: Wed, 28 Aug 2024 18:38:20 +0100 Subject: [PATCH 03/11] cheribsdtest: make vm_shm_open_anon_unix_surprise worse Alter the child side to use MAP_PRIVATE, which can still leak capabilities into the address space, just not back the other way. --- bin/cheribsdtest/cheribsdtest_vm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/cheribsdtest/cheribsdtest_vm.c b/bin/cheribsdtest/cheribsdtest_vm.c index a2db3ef3ac09..fda81f072731 100644 --- a/bin/cheribsdtest/cheribsdtest_vm.c +++ b/bin/cheribsdtest/cheribsdtest_vm.c @@ -280,7 +280,7 @@ CHERIBSDTEST(vm_shm_open_anon_unix_surprise, CHERIBSDTEST_VERIFY2(fd >= 0, "fd read OK"); map = CHERIBSDTEST_CHECK_SYSCALL(mmap(NULL, getpagesize(), - PROT_READ, MAP_SHARED, fd, 0)); + PROT_READ, MAP_PRIVATE, fd, 0)); c = *map; if (verbose) From ad73db85bd1bbca9736f272d5fef1671a406247b Mon Sep 17 00:00:00 2001 From: Brooks Davis Date: Wed, 4 Sep 2024 00:38:35 +0100 Subject: [PATCH 04/11] cheribsdtest: better error handling in vm_shm_open_anon_unix_surprise Differentiate between system calls failing (e.g., due to ABI changes) and tags being transferred. Syscalls wrapped with the cheribsdtest CHECK/VERIFY macros exit with EX_SOFTWARE, while success or tag transfer now exits with 0 or 1 respectively.
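Concretely, the convention can be sketched as follows (an illustrative sketch only; decode_child_status is a hypothetical helper, not part of the change, while the waitpid(2) macros are standard):

#include <sys/wait.h>
#include <sysexits.h>

/*
 * Illustrative decoder for the child exit-status convention: status 0
 * means no tag crossed the address-space boundary, status 1 means a
 * tagged capability transferred, and anything else (e.g., EX_SOFTWARE
 * from the wrapped syscall macros, or death by signal) means the
 * child's setup failed.
 */
static const char *
decode_child_status(int res)
{
	if (WIFEXITED(res) && WEXITSTATUS(res) == 0)
		return ("ok: no tag transferred");
	if (WIFEXITED(res) && WEXITSTATUS(res) == 1)
		return ("fail: tag transferred");
	return ("fail: child setup error");
}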
--- bin/cheribsdtest/cheribsdtest_vm.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bin/cheribsdtest/cheribsdtest_vm.c b/bin/cheribsdtest/cheribsdtest_vm.c index fda81f072731..733788ad1ad8 100644 --- a/bin/cheribsdtest/cheribsdtest_vm.c +++ b/bin/cheribsdtest/cheribsdtest_vm.c @@ -287,7 +287,6 @@ CHERIBSDTEST(vm_shm_open_anon_unix_surprise, fprintf(stderr, "rx cap: %#lp\n", c); tag = cheri_gettag(c); - CHERIBSDTEST_VERIFY2(tag == 0, "tag read"); CHERIBSDTEST_CHECK_SYSCALL(munmap(map, getpagesize())); close(sv[0]); @@ -347,8 +346,10 @@ CHERIBSDTEST(vm_shm_open_anon_unix_surprise, waitpid(pid, &res, 0); if (res == 0) { cheribsdtest_success(); - } else { + } else if (WIFEXITED(res) && WEXITSTATUS(res) == 1) { cheribsdtest_failure_errx("tag transfer succeeded"); + } else { + cheribsdtest_failure_errx("child setup error occurred (this is *unexpected*)"); } } } From a4f5b8feb76118e49653dae4e8b61407b05e5563 Mon Sep 17 00:00:00 2001 From: Brooks Davis Date: Wed, 11 Sep 2024 11:40:26 +0100 Subject: [PATCH 05/11] libsysdecode: add sysdecode_shm_open_flags This uses a new shm_open_flags table which positively enumerates the allowed flags, since the majority of open(2) flags don't apply to shm_open(2) or memfd_create(2). This will permit shm-specific O_ flags which reuse otherwise irrelevant values (e.g., O_DIRECTORY or O_EXEC) to avoid consuming scarce O_ values. --- lib/libsysdecode/flags.c | 19 ++++++++++++++++--- lib/libsysdecode/mktables | 1 + lib/libsysdecode/sysdecode.h | 1 + lib/libsysdecode/sysdecode_mask.3 | 4 ++++ usr.bin/kdump/kdump.c | 4 ++-- usr.bin/truss/syscall.h | 1 + usr.bin/truss/syscalls.c | 7 +++++-- 7 files changed, 30 insertions(+), 7 deletions(-) diff --git a/lib/libsysdecode/flags.c b/lib/libsysdecode/flags.c index cea5f83f3ad6..441e8e2ca884 100644 --- a/lib/libsysdecode/flags.c +++ b/lib/libsysdecode/flags.c @@ -276,8 +276,9 @@ sysdecode_fadvice(int advice) return (lookup_value(fadvisebehav, advice)); } -bool -sysdecode_open_flags(FILE *fp, int flags, int *rem) +static bool +_sysdecode_open_flags(FILE *fp, int flags, int *rem, + struct name_table open_flag_table[]) { bool printed; int mode; @@ -309,12 +310,24 @@ sysdecode_open_flags(FILE *fp, int flags, int *rem) printed = false; } val = (unsigned)flags; - print_mask_part(fp, openflags, &val, &printed); + print_mask_part(fp, open_flag_table, &val, &printed); if (rem != NULL) *rem = val | mode; return (printed); } +bool +sysdecode_open_flags(FILE *fp, int flags, int *rem) +{ + return (_sysdecode_open_flags(fp, flags, rem, openflags)); +} + +bool +sysdecode_shm_open_flags(FILE *fp, int flags, int *rem) +{ + return (_sysdecode_open_flags(fp, flags, rem, shm_open_flags)); +} + bool sysdecode_fcntl_fileflags(FILE *fp, int flags, int *rem) { diff --git a/lib/libsysdecode/mktables b/lib/libsysdecode/mktables index 954c721ca69c..a52b46747f77 100644 --- a/lib/libsysdecode/mktables +++ b/lib/libsysdecode/mktables @@ -127,6 +127,7 @@ gen_table "rlimit" "RLIMIT_[A-Z]+[[:space:]]+[0-9]+" "sys/ gen_table "rusage" "RUSAGE_[A-Z]+[[:space:]]+[-0-9]+" "sys/resource.h" gen_table "schedpolicy" "SCHED_[A-Z]+[[:space:]]+[0-9]+" "sys/sched.h" gen_table "sendfileflags" "SF_[A-Z]+[[:space:]]+[0-9]+" "sys/socket.h" +gen_table "shm_open_flags" "O_ACCMODE|O_CREAT|O_EXCL|O_TRUNC|O_CLOEXEC" "sys/fcntl.h" gen_table "shmatflags" "SHM_[A-Z]+[[:space:]]+[0-9]{6}" "sys/shm.h" gen_table "shutdownhow" "SHUT_[A-Z]+[[:space:]]+[0-9]+" "sys/socket.h" gen_table "sigbuscode" "BUS_[A-Z]+[[:space:]]+[0-9]+" "sys/signal.h" diff --git
a/lib/libsysdecode/sysdecode.h b/lib/libsysdecode/sysdecode.h index 8ab7f5de892f..56c7bb58c270 100644 --- a/lib/libsysdecode/sysdecode.h +++ b/lib/libsysdecode/sysdecode.h @@ -109,6 +109,7 @@ bool sysdecode_sctp_snd_flags(FILE *_fp, int _flags, int *_rem); const char *sysdecode_semctl_cmd(int _cmd); bool sysdecode_semget_flags(FILE *_fp, int _flag, int *_rem); bool sysdecode_sendfile_flags(FILE *_fp, int _flags, int *_rem); +bool sysdecode_shm_open_flags(FILE *_fp, int _flags, int *_rem); bool sysdecode_shmat_flags(FILE *_fp, int _flags, int *_rem); const char *sysdecode_shmctl_cmd(int _cmd); const char *sysdecode_shutdown_how(int _how); diff --git a/lib/libsysdecode/sysdecode_mask.3 b/lib/libsysdecode/sysdecode_mask.3 index 20e9ca63a189..82e90e4d721e 100644 --- a/lib/libsysdecode/sysdecode_mask.3 +++ b/lib/libsysdecode/sysdecode_mask.3 @@ -50,6 +50,7 @@ .Nm sysdecode_rfork_flags , .Nm sysdecode_semget_flags , .Nm sysdecode_sendfile_flags , +.Nm sysdecode_shm_open_flags , .Nm sysdecode_shmat_flags , .Nm sysdecode_sctp_nxt_flags , .Nm sysdecode_sctp_rcv_flags , @@ -119,6 +120,8 @@ .Ft bool .Fn sysdecode_sendfile_flags "FILE *fp" "int flags" "int *rem" .Ft bool +.Fn sysdecode_shm_open_flags "FILE *fp" "int flags" "int *rem" +.Ft bool .Fn sysdecode_shmat_flags "FILE *fp" "int flags" "int *rem" .Ft bool .Fn sysdecode_socket_type "FILE *fp" "int type" "int *rem" @@ -186,6 +189,7 @@ Most of these functions decode an argument passed to a system call: .It Fn sysdecode_rfork_flags Ta Xr rfork 2 Ta Fa flags .It Fn sysdecode_semget_flags Ta Xr semget 2 Ta Fa flags .It Fn sysdecode_sendfile_flags Ta Xr sendfile 2 Ta Fa flags +.It Fn sysdecode_shm_open_flags Ta Xr shm_open 2 Ta Fa flags .It Fn sysdecode_shmat_flags Ta Xr shmat 2 Ta Fa flags .It Fn sysdecode_socket_type Ta Xr socket 2 Ta Fa type .It Fn sysdecode_thr_create_flags Ta Xr thr_create 2 Ta Fa flags diff --git a/usr.bin/kdump/kdump.c b/usr.bin/kdump/kdump.c index 2d9775f3abe7..1842aa098813 100644 --- a/usr.bin/kdump/kdump.c +++ b/usr.bin/kdump/kdump.c @@ -1339,7 +1339,7 @@ ktrsyscall_freebsd(struct ktr_syscall *ktr, register_t **resip, print_number(ip, narg, c); } putchar(','); - print_mask_arg(sysdecode_open_flags, ip[0]); + print_mask_arg(sysdecode_shm_open_flags, ip[0]); putchar(','); decode_filemode(ip[1]); ip += 2; @@ -1354,7 +1354,7 @@ ktrsyscall_freebsd(struct ktr_syscall *ktr, register_t **resip, print_number(ip, narg, c); } putchar(','); - print_mask_arg(sysdecode_open_flags, ip[0]); + print_mask_arg(sysdecode_shm_open_flags, ip[0]); putchar(','); decode_filemode(ip[1]); putchar(','); diff --git a/usr.bin/truss/syscall.h b/usr.bin/truss/syscall.h index 9258c72c0ae3..28a911dfde15 100644 --- a/usr.bin/truss/syscall.h +++ b/usr.bin/truss/syscall.h @@ -121,6 +121,7 @@ enum Argtype { Ptraceop, Sendfileflags, Sendfilehdtr, + ShmOpen, Quotactlcmd, Reboothowto, Resource, diff --git a/usr.bin/truss/syscalls.c b/usr.bin/truss/syscalls.c index b6f1534612ec..30e0b5f80081 100644 --- a/usr.bin/truss/syscalls.c +++ b/usr.bin/truss/syscalls.c @@ -529,9 +529,9 @@ static const struct syscall_decode decoded_syscalls[] = { .args = { { Int, 0 }, { Sockoptlevel, 1 }, { Sockoptname, 2 }, { Ptr | IN, 3 }, { Socklent, 4 } } }, { .name = "shm_open", .ret_type = 1, .nargs = 3, - .args = { { ShmName | IN, 0 }, { Open, 1 }, { Octal, 2 } } }, + .args = { { ShmName | IN, 0 }, { ShmOpen, 1 }, { Octal, 2 } } }, { .name = "shm_open2", .ret_type = 1, .nargs = 5, - .args = { { ShmName | IN, 0 }, { Open, 1 }, { Octal, 2 }, + .args = { { ShmName | IN, 0 }, { ShmOpen, 1 }, 
{ Octal, 2 }, { ShmFlags, 3 }, { Name | IN, 4 } } }, { .name = "shm_rename", .ret_type = 1, .nargs = 3, .args = { { Name | IN, 0 }, { Name | IN, 1 }, { Hex, 2 } } }, @@ -2029,6 +2029,9 @@ print_arg(struct syscall_arg *sc, syscallarg_t *args, syscallarg_t *retval, case Whence: print_integer_arg(sysdecode_whence, fp, args[sc->offset]); break; + case ShmOpen: + print_mask_arg(sysdecode_shm_open_flags, fp, args[sc->offset]); + break; case ShmFlags: print_mask_arg(sysdecode_shmflags, fp, args[sc->offset]); break; From d071b1e39d9ce63fc556543ee899db82bdbdae84 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Fri, 4 Oct 2024 14:54:44 +0000 Subject: [PATCH 06/11] shm: Respect PROT_MAX when creating private mappings We were previously unconditionally adding PROT_WRITE to the maxprot of private mappings (because a private mapping can be written even if the fd is read-only), but this might violate the user's PROT_MAX request. While here, rename cap_maxprot to max_maxprot. This is the intersection of the maximum protections imposed by Capsicum rights on the fd (not really relevant for private mappings) and the user-required maximum protections (which were not being obeyed). In particular, cap_maxprot is a misnomer after the introduction of PROT_MAX. Add some regression test cases. mmap__maxprot_shm fails without this patch. Note: Capsicum's CAP_MMAP_W is a bit ambiguous. Should it be required in order to create writeable private mappings? Currently it is, even though such mappings don't permit writes to the object referenced by the fd. Reported by: brooks Reviewed by: brooks MFC after: 1 month Fixes: c7841c6b8e41 ("Relax restrictions on private mappings of POSIX shm objects.") Differential Revision: https://reviews.freebsd.org/D46741 (cherry picked from commit 33c2c58f0a3db0a6d3996fa14ac7967274678771) --- sys/kern/uipc_shm.c | 8 ++-- tests/sys/posixshm/posixshm_test.c | 29 ++++++++++++- tests/sys/vm/mmap_test.c | 70 +++++++++++++++++++++++++++++- 3 files changed, 101 insertions(+), 6 deletions(-) diff --git a/sys/kern/uipc_shm.c b/sys/kern/uipc_shm.c index 701a5dc46c68..c7e34a3e9472 100644 --- a/sys/kern/uipc_shm.c +++ b/sys/kern/uipc_shm.c @@ -1718,7 +1718,7 @@ shm_mmap_large(struct shmfd *shmfd, vm_map_t map, vm_pointer_t *addr, int shm_mmap(struct file *fp, vm_map_t map, vm_pointer_t *addr, vm_offset_t max_addr, vm_size_t objsize, - vm_prot_t prot, vm_prot_t cap_maxprot, int flags, + vm_prot_t prot, vm_prot_t max_maxprot, int flags, vm_ooffset_t foff, struct thread *td) { struct shmfd *shmfd; @@ -1741,8 +1741,8 @@ shm_mmap(struct file *fp, vm_map_t map, vm_pointer_t *addr, * writeable.
*/ if ((flags & MAP_SHARED) == 0) { - cap_maxprot |= VM_PROT_WRITE; - maxprot |= VM_PROT_WRITE; + if ((max_maxprot & VM_PROT_WRITE) != 0) + maxprot |= VM_PROT_WRITE; writecnt = false; } else { if ((fp->f_flag & FWRITE) != 0 && @@ -1762,7 +1762,7 @@ shm_mmap(struct file *fp, vm_map_t map, vm_pointer_t *addr, goto out; } } - maxprot &= cap_maxprot; + maxprot &= max_maxprot; prot = VM_PROT_ADD_CAP(prot); maxprot = VM_PROT_ADD_CAP(prot); diff --git a/tests/sys/posixshm/posixshm_test.c b/tests/sys/posixshm/posixshm_test.c index d2127aaae5a9..f1641079ea88 100644 --- a/tests/sys/posixshm/posixshm_test.c +++ b/tests/sys/posixshm/posixshm_test.c @@ -1190,6 +1190,33 @@ ATF_TC_BODY(accounting, tc) ATF_REQUIRE(close(fd) == 0); } +ATF_TC_WITHOUT_HEAD(mmap_prot); +ATF_TC_BODY(mmap_prot, tc) +{ + void *p; + int fd, pagesize; + + ATF_REQUIRE((pagesize = getpagesize()) > 0); + + gen_test_path(); + fd = shm_open(test_path, O_RDONLY | O_CREAT, 0644); + ATF_REQUIRE(fd >= 0); + + p = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0); + ATF_REQUIRE(p != MAP_FAILED); + ATF_REQUIRE(munmap(p, pagesize) == 0); + p = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + ATF_REQUIRE_ERRNO(EACCES, p == MAP_FAILED); + p = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); + ATF_REQUIRE(p != MAP_FAILED); + ATF_REQUIRE(munmap(p, pagesize) == 0); + + ATF_REQUIRE_MSG(shm_unlink(test_path) == 0, + "shm_unlink failed; errno=%d", errno); + ATF_REQUIRE_MSG(close(fd) == 0, + "close failed; errno=%d", errno); +} + static int shm_open_large(int psind, int policy, size_t sz) { @@ -1928,7 +1955,6 @@ ATF_TC_BODY(largepage_reopen, tc) ATF_TP_ADD_TCS(tp) { - ATF_TP_ADD_TC(tp, remap_object); ATF_TP_ADD_TC(tp, rename_from_anon); ATF_TP_ADD_TC(tp, rename_bad_path_pointer); @@ -1962,6 +1988,7 @@ ATF_TP_ADD_TCS(tp) ATF_TP_ADD_TC(tp, fallocate); ATF_TP_ADD_TC(tp, fspacectl); ATF_TP_ADD_TC(tp, accounting); + ATF_TP_ADD_TC(tp, mmap_prot); ATF_TP_ADD_TC(tp, largepage_basic); ATF_TP_ADD_TC(tp, largepage_config); ATF_TP_ADD_TC(tp, largepage_mmap); diff --git a/tests/sys/vm/mmap_test.c b/tests/sys/vm/mmap_test.c index e5f4a81a7858..6bc30f73ca95 100644 --- a/tests/sys/vm/mmap_test.c +++ b/tests/sys/vm/mmap_test.c @@ -295,14 +295,82 @@ ATF_TC_BODY(mmap__write_only, tc) munmap(p, pagesize); } -ATF_TP_ADD_TCS(tp) +ATF_TC_WITHOUT_HEAD(mmap__maxprot_basic); +ATF_TC_BODY(mmap__maxprot_basic, tc) +{ + void *p; + int error, pagesize; + + ATF_REQUIRE((pagesize = getpagesize()) > 0); + + p = mmap(NULL, pagesize, PROT_READ | PROT_MAX(PROT_READ), + MAP_ANON, -1, 0); + ATF_REQUIRE(p != MAP_FAILED); + + error = mprotect(p, pagesize, PROT_WRITE); + ATF_REQUIRE_ERRNO(EACCES, error == -1); + error = mprotect(p, pagesize, PROT_READ | PROT_WRITE); + ATF_REQUIRE_ERRNO(EACCES, error == -1); + error = mprotect(p, pagesize, PROT_READ | PROT_EXEC); + ATF_REQUIRE_ERRNO(EACCES, error == -1); + + ATF_REQUIRE(munmap(p, pagesize) == 0); +} + +/* Make sure that PROT_MAX applies as expected to mappings of shm objects */ +ATF_TC_WITHOUT_HEAD(mmap__maxprot_shm); +ATF_TC_BODY(mmap__maxprot_shm, tc) { + void *p; + int error, fd, pagesize; + + ATF_REQUIRE((pagesize = getpagesize()) > 0); + fd = shm_open(SHM_ANON, O_RDWR, 0644); + ATF_REQUIRE(fd >= 0); + + error = ftruncate(fd, pagesize); + ATF_REQUIRE(error == 0); + + p = mmap(NULL, pagesize, PROT_READ | PROT_MAX(PROT_READ), + MAP_PRIVATE, fd, 0); + ATF_REQUIRE(p != MAP_FAILED); + + error = mprotect(p, pagesize, PROT_WRITE); + ATF_REQUIRE_ERRNO(EACCES, error == -1); + error = mprotect(p, pagesize, 
PROT_READ | PROT_WRITE); + ATF_REQUIRE_ERRNO(EACCES, error == -1); + error = mprotect(p, pagesize, PROT_READ | PROT_EXEC); + ATF_REQUIRE_ERRNO(EACCES, error == -1); + + ATF_REQUIRE(munmap(p, pagesize) == 0); + + /* Again, this time with a shared mapping. */ + p = mmap(NULL, pagesize, PROT_READ | PROT_MAX(PROT_READ), + MAP_SHARED, fd, 0); + ATF_REQUIRE(p != MAP_FAILED); + + error = mprotect(p, pagesize, PROT_WRITE); + ATF_REQUIRE_ERRNO(EACCES, error == -1); + error = mprotect(p, pagesize, PROT_READ | PROT_WRITE); + ATF_REQUIRE_ERRNO(EACCES, error == -1); + error = mprotect(p, pagesize, PROT_READ | PROT_EXEC); + ATF_REQUIRE_ERRNO(EACCES, error == -1); + + ATF_REQUIRE(munmap(p, pagesize) == 0); + + ATF_REQUIRE(close(fd) == 0); +} + +ATF_TP_ADD_TCS(tp) +{ ATF_TP_ADD_TC(tp, mmap__map_at_zero); ATF_TP_ADD_TC(tp, mmap__bad_arguments); ATF_TP_ADD_TC(tp, mmap__dev_zero_private); ATF_TP_ADD_TC(tp, mmap__dev_zero_shared); ATF_TP_ADD_TC(tp, mmap__write_only); + ATF_TP_ADD_TC(tp, mmap__maxprot_basic); + ATF_TP_ADD_TC(tp, mmap__maxprot_shm); return (atf_no_error()); } From 577fd3bd5fa320f4af7d77f097b33d791ae169cc Mon Sep 17 00:00:00 2001 From: Brooks Davis Date: Mon, 26 Aug 2024 22:57:01 +0100 Subject: [PATCH 07/11] posix shm: rework capability support In order to preserve provenance and prevent colluding processes from hiding capabilities from the revoker, we must not share capabilities across address spaces. However, doing so is often useful and some existing code makes use of this functionality in relatively safe ways. As a compromise, allow sharing, but only if the programmer makes their intent clear by making the following changes: - Require that PROT_CAP be requested explicitly in mmap to allow load or store of capabilities. - By default, attach each shared memory object to the first address space (vmspace) that opens it and allow only that address space to map the object with PROT_CAP. On fork, remove capability permissions from the mapping. - Add an O_SHARECAP flag for shm_open2 (aka shm_open and memfd_create) which allows sharing capabilities across address spaces, overriding the default.
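To make the new contract concrete, here is a minimal userspace sketch (illustrative only, not part of the patch; it assumes the O_SHARECAP and PROT_CAP definitions introduced here, plus standard FreeBSD SHM_ANON):

#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>
#include <err.h>

int
main(void)
{
	int fd;
	void **map;

	/* Opt in to cross-address-space capability sharing at creation. */
	fd = shm_open(SHM_ANON, O_RDWR | O_SHARECAP, 0600);
	if (fd < 0)
		err(1, "shm_open");
	if (ftruncate(fd, getpagesize()) != 0)
		err(1, "ftruncate");

	/*
	 * PROT_CAP must be requested explicitly; without it, capability
	 * stores fault and loads yield untagged values.
	 */
	map = mmap(NULL, getpagesize(), PROT_READ | PROT_WRITE | PROT_CAP,
	    MAP_SHARED, fd, 0);
	if (map == MAP_FAILED)
		err(1, "mmap");
	*map = &fd;	/* a tagged store is now permitted */
	return (0);
}

Without O_SHARECAP, the same PROT_CAP mapping succeeds only in the address space that first opened the object, and forked children lose capability permissions on the mapping.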
--- bin/cheribsdtest/cheribsdtest_ptrace.c | 5 +- bin/cheribsdtest/cheribsdtest_vm.c | 70 ++++++++++------ lib/libsys/shm_open.2 | 21 +++++ lib/libsysdecode/mktables | 2 +- .../linuxkpi/common/include/linux/list.h | 1 + sys/kern/kern_umtx.c | 2 +- sys/kern/uipc_shm.c | 80 +++++++++++++++++-- sys/sys/fcntl.h | 6 ++ sys/sys/mman.h | 6 +- sys/vm/vm_map.c | 37 ++++++++- sys/vm/vm_map.h | 1 + sys/vm/vm_object.h | 2 + 12 files changed, 190 insertions(+), 43 deletions(-) diff --git a/bin/cheribsdtest/cheribsdtest_ptrace.c b/bin/cheribsdtest/cheribsdtest_ptrace.c index 05107c09c25e..6bfd487e3882 100644 --- a/bin/cheribsdtest/cheribsdtest_ptrace.c +++ b/bin/cheribsdtest/cheribsdtest_ptrace.c @@ -217,11 +217,12 @@ CHERIBSDTEST(ptrace_writecap, "Basic tests of PIOD_WRITE_CHERI_CAP") uintcap_t *map, pp[2]; char capbuf[2][sizeof(uintcap_t) + 1]; - fd = CHERIBSDTEST_CHECK_SYSCALL(shm_open(SHM_ANON, O_RDWR, 0600)); + fd = CHERIBSDTEST_CHECK_SYSCALL(shm_open(SHM_ANON, O_RDWR | O_SHARECAP, + 0600)); CHERIBSDTEST_CHECK_SYSCALL(ftruncate(fd, getpagesize())); map = CHERIBSDTEST_CHECK_SYSCALL(mmap(NULL, getpagesize(), - PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0)); + PROT_READ | PROT_WRITE | PROT_CAP, MAP_SHARED, fd, 0)); pid = fork_child(); diff --git a/bin/cheribsdtest/cheribsdtest_vm.c b/bin/cheribsdtest/cheribsdtest_vm.c index 733788ad1ad8..a46db1b51fee 100644 --- a/bin/cheribsdtest/cheribsdtest_vm.c +++ b/bin/cheribsdtest/cheribsdtest_vm.c @@ -191,20 +191,37 @@ CHERIBSDTEST(vm_mmap_diallowed_prot, } CHERIBSDTEST(vm_tag_shm_open_anon_shared, - "check tags are stored for SHM_ANON MAP_SHARED pages") + "check tags are stored for SHM_ANON MAP_SHARED pages when requested") { int fd = CHERIBSDTEST_CHECK_SYSCALL(shm_open(SHM_ANON, O_RDWR, 0600)); CHERIBSDTEST_CHECK_SYSCALL(ftruncate(fd, getpagesize())); - mmap_and_check_tag_stored(fd, PROT_READ | PROT_WRITE, MAP_SHARED); + mmap_and_check_tag_stored(fd, PROT_READ | PROT_WRITE | PROT_CAP, + MAP_SHARED); cheribsdtest_success(); } +CHERIBSDTEST(vm_tag_shm_open_anon_shared_implied_cap, + "check tags are not stored for SHM_ANON MAP_SHARED pages by default", + .ct_flags = CT_FLAG_SIGNAL | CT_FLAG_SI_CODE | CT_FLAG_SI_TRAPNO, + .ct_signum = SIGSEGV, + .ct_si_code = SEGV_STORETAG, + .ct_si_trapno = TRAPNO_STORE_CAP_PF, + .ct_check_skip = skip_need_writable_tmp) +{ + int fd = CHERIBSDTEST_CHECK_SYSCALL(shm_open(SHM_ANON, O_RDWR, 0600)); + CHERIBSDTEST_CHECK_SYSCALL(ftruncate(fd, getpagesize())); + mmap_and_check_tag_stored(fd, PROT_READ | PROT_WRITE, MAP_SHARED); + + cheribsdtest_failure_errx("store succeeded"); +} + CHERIBSDTEST(vm_tag_shm_open_anon_private, "check tags are stored for SHM_ANON MAP_PRIVATE pages") { int fd = CHERIBSDTEST_CHECK_SYSCALL(shm_open(SHM_ANON, O_RDWR, 0600)); CHERIBSDTEST_CHECK_SYSCALL(ftruncate(fd, getpagesize())); - mmap_and_check_tag_stored(fd, PROT_READ | PROT_WRITE, MAP_PRIVATE); + mmap_and_check_tag_stored(fd, PROT_READ | PROT_WRITE | PROT_CAP, + MAP_PRIVATE); cheribsdtest_success(); } @@ -220,14 +237,15 @@ CHERIBSDTEST(vm_tag_shm_open_anon_shared2x, CHERIBSDTEST_CHECK_SYSCALL(ftruncate(fd, getpagesize())); map2 = CHERIBSDTEST_CHECK_SYSCALL(mmap(NULL, getpagesize(), - PROT_READ, MAP_SHARED, fd, 0)); + PROT_READ | PROT_CAP, MAP_SHARED, fd, 0)); /* Verify that no capability present */ c2 = *map2; CHERIBSDTEST_VERIFY2(cheri_gettag(c2) == 0, "tag exists on first read"); CHERIBSDTEST_VERIFY2(c2 == NULL, "Initial read NULL"); - mmap_and_check_tag_stored(fd, PROT_READ | PROT_WRITE, MAP_SHARED); + mmap_and_check_tag_stored(fd, PROT_READ | PROT_WRITE 
| PROT_CAP, + MAP_SHARED); /* And now verify that it is, thanks to the aliased maps */ c2 = *map2; @@ -237,10 +255,7 @@ CHERIBSDTEST(vm_tag_shm_open_anon_shared2x, cheribsdtest_success(); } -CHERIBSDTEST(vm_shm_open_anon_unix_surprise, - "test SHM_ANON vs SCM_RIGHTS", - .ct_xfail_reason = - "Tags currently survive cross-AS aliasing of SHM_ANON objects") +CHERIBSDTEST(vm_shm_open_anon_unix_cross_as, "test SHM_ANON vs SCM_RIGHTS") { int sv[2]; int pid; @@ -280,7 +295,7 @@ CHERIBSDTEST(vm_shm_open_anon_unix_surprise, CHERIBSDTEST_VERIFY2(fd >= 0, "fd read OK"); map = CHERIBSDTEST_CHECK_SYSCALL(mmap(NULL, getpagesize(), - PROT_READ, MAP_PRIVATE, fd, 0)); + PROT_READ | PROT_CAP, MAP_SHARED, fd, 0)); c = *map; if (verbose) @@ -308,12 +323,12 @@ CHERIBSDTEST(vm_shm_open_anon_unix_surprise, close(sv[0]); - fd = CHERIBSDTEST_CHECK_SYSCALL(shm_open(SHM_ANON, O_RDWR, 0600)); + fd = CHERIBSDTEST_CHECK_SYSCALL(shm_open(SHM_ANON, + O_RDWR | O_SHARECAP, 0600)); CHERIBSDTEST_CHECK_SYSCALL(ftruncate(fd, getpagesize())); map = CHERIBSDTEST_CHECK_SYSCALL(mmap(NULL, getpagesize(), - PROT_READ | PROT_WRITE, - MAP_SHARED, fd, 0)); + PROT_READ | PROT_WRITE | PROT_CAP, MAP_SHARED, fd, 0)); /* Just some pointer */ *map = &fd; @@ -345,9 +360,9 @@ CHERIBSDTEST(vm_shm_open_anon_unix_surprise, waitpid(pid, &res, 0); if (res == 0) { - cheribsdtest_success(); + cheribsdtest_failure_errx("tags failed to transfer"); } else if (WIFEXITED(res) && WEXITSTATUS(res) == 1) { - cheribsdtest_failure_errx("tag transfer succeeded"); + cheribsdtest_success(); } else { cheribsdtest_failure_errx("child setup error occurred (this is *unexpected*)"); } } @@ -366,7 +381,7 @@ CHERIBSDTEST(shm_open_read_nocaps, CHERIBSDTEST_CHECK_SYSCALL(ftruncate(fd, getpagesize())); map = CHERIBSDTEST_CHECK_SYSCALL(mmap(NULL, getpagesize(), - PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0)); + PROT_READ | PROT_WRITE | PROT_CAP, MAP_SHARED, fd, 0)); /* Just some pointer */ *map = &fd; @@ -637,12 +652,11 @@ skip_need_writable_tmp(const char *name __unused) * and write a tagged capability to it. * * 2) Create a second copy-on-write mapping; read back the tagged value via - * the second mapping, and confirm that it still has a tag. - * (cheribsdtest_vm_cow_read) + * the second mapping, and confirm that it still has a tag. (vm_cow_read) * * 3) Write an adjacent word in the second mapping, which should cause a * copy-on-write, then read back the capability and confirm that it still has - * a tag. (cheribsdtest_vm_cow_write) + * a tag. (vm_cow_write) */ CHERIBSDTEST(vm_cow_read, "read capabilities from a copy-on-write page") @@ -662,9 +676,11 @@ CHERIBSDTEST(vm_cow_read, * Create 'real' and copy-on-write mappings.
*/ cp_real = CHERIBSDTEST_CHECK_SYSCALL2(mmap(NULL, getpagesize(), - PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0), "mmap cp_real"); + PROT_READ | PROT_WRITE | PROT_CAP, MAP_SHARED, fd, 0), + "mmap cp_real"); cp_copy = CHERIBSDTEST_CHECK_SYSCALL2(mmap(NULL, getpagesize(), - PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0), "mmap cp_copy"); + PROT_READ | PROT_WRITE | PROT_CAP, MAP_PRIVATE, fd, 0), + "mmap cp_copy"); /* * Write out a tagged capability to 'real' mapping -- doesn't really @@ -1249,7 +1267,7 @@ CHERIBSDTEST(vm_shm_largepage_basic, "psind=%d errno=%d", psind, errno); CHERIBSDTEST_CHECK_SYSCALL(ftruncate(fd, ps[psind])); addr = CHERIBSDTEST_CHECK_SYSCALL(mmap(NULL, ps[psind], - PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0)); + PROT_READ | PROT_WRITE | PROT_CAP, MAP_SHARED, fd, 0)); /* Verify mmap output */ CHERIBSDTEST_VERIFY2(cheri_gettag(addr) != 0, @@ -2699,7 +2717,7 @@ CHERIBSDTEST(cheri_revoke_shm_anon_hoard_unmapped, CHERIBSDTEST_CHECK_SYSCALL(ftruncate(fd, getpagesize())); map = CHERIBSDTEST_CHECK_SYSCALL(mmap(NULL, getpagesize(), - PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0)); + PROT_READ | PROT_WRITE | PROT_CAP, MAP_SHARED, fd, 0)); to_revoke = malloc(1); *map = to_revoke; @@ -2785,7 +2803,7 @@ CHERIBSDTEST(cheri_revoke_shm_anon_hoard_closed, CHERIBSDTEST_CHECK_SYSCALL(ftruncate(fd, getpagesize())); map = CHERIBSDTEST_CHECK_SYSCALL(mmap(NULL, getpagesize(), - PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0)); + PROT_READ | PROT_WRITE | PROT_CAP, MAP_SHARED, fd, 0)); to_revoke = malloc(1); *map = to_revoke; diff --git a/lib/libsys/shm_open.2 b/lib/libsys/shm_open.2 index 8bea939690ba..5f50824ac88a 100644 --- a/lib/libsys/shm_open.2 +++ b/lib/libsys/shm_open.2 @@ -112,6 +112,27 @@ The size of the object can be adjusted via and queried via .Xr fstat 2 . .Pp +On systems supporting CHERI capabilities, capabilities may only be shared +within a single address space unless the shared memory object is created +with the +.Dv O_SHARECAP +flag in the initial call to +.Fn shm_open . +Additionally, capabilities can only be stored to or loaded from mappings +created using +.Xr mmap 2 +where the +.Dv PROT_CAP +permission is specified. +Without +.Dv PROT_CAP +stores of valid capabilities will trigger a fault and loads will always +yield an invalid capability. +On fork, pages with capability permissions remain shared, but are +downgraded to remove capability permissions unless +.Dv O_SHARECAP +was used during object creation. 
+.Pp The new descriptor is set to close during .Xr execve 2 system calls; diff --git a/lib/libsysdecode/mktables b/lib/libsysdecode/mktables index a52b46747f77..a41ccb8386ee 100644 --- a/lib/libsysdecode/mktables +++ b/lib/libsysdecode/mktables @@ -127,7 +127,7 @@ gen_table "rlimit" "RLIMIT_[A-Z]+[[:space:]]+[0-9]+" "sys/ gen_table "rusage" "RUSAGE_[A-Z]+[[:space:]]+[-0-9]+" "sys/resource.h" gen_table "schedpolicy" "SCHED_[A-Z]+[[:space:]]+[0-9]+" "sys/sched.h" gen_table "sendfileflags" "SF_[A-Z]+[[:space:]]+[0-9]+" "sys/socket.h" -gen_table "shm_open_flags" "O_ACCMODE|O_CREAT|O_EXCL|O_TRUNC|O_CLOEXEC" "sys/fcntl.h" +gen_table "shm_open_flags" "O_ACCMODE|O_CREAT|O_EXCL|O_TRUNC|O_CLOEXEC|O_SHARECAP" "sys/fcntl.h" gen_table "shmatflags" "SHM_[A-Z]+[[:space:]]+[0-9]{6}" "sys/shm.h" gen_table "shutdownhow" "SHUT_[A-Z]+[[:space:]]+[0-9]+" "sys/socket.h" gen_table "sigbuscode" "BUS_[A-Z]+[[:space:]]+[0-9]+" "sys/signal.h" diff --git a/sys/compat/linuxkpi/common/include/linux/list.h b/sys/compat/linuxkpi/common/include/linux/list.h index eecb517d780e..f73cfc7b72f7 100644 --- a/sys/compat/linuxkpi/common/include/linux/list.h +++ b/sys/compat/linuxkpi/common/include/linux/list.h @@ -73,6 +73,7 @@ #include #include #include +#include #endif #ifndef prefetch diff --git a/sys/kern/kern_umtx.c b/sys/kern/kern_umtx.c index 39993094bfb4..1f64d8bd741e 100644 --- a/sys/kern/kern_umtx.c +++ b/sys/kern/kern_umtx.c @@ -4560,7 +4560,7 @@ umtx_shm_create_reg(struct thread *td, const struct umtx_key *key, return (ENOMEM); reg = uma_zalloc(umtx_shm_reg_zone, M_WAITOK | M_ZERO); bcopy(key, ®->ushm_key, sizeof(*key)); - reg->ushm_obj = shm_alloc(td->td_ucred, O_RDWR, false); + reg->ushm_obj = shm_alloc(td->td_ucred, O_RDWR, false, false); reg->ushm_cred = crhold(cred); error = shm_dotruncate(reg->ushm_obj, PAGE_SIZE); if (error != 0) { diff --git a/sys/kern/uipc_shm.c b/sys/kern/uipc_shm.c index c7e34a3e9472..f94cd9034e9e 100644 --- a/sys/kern/uipc_shm.c +++ b/sys/kern/uipc_shm.c @@ -937,7 +937,7 @@ shm_dotruncate(struct shmfd *shmfd, off_t length) * routines. 
*/ struct shmfd * -shm_alloc(struct ucred *ucred, mode_t mode, bool largepage) +shm_alloc(struct ucred *ucred, mode_t mode, bool largepage, bool sharecap) { struct shmfd *shmfd; vm_object_t obj; @@ -962,6 +962,16 @@ shm_alloc(struct ucred *ucred, mode_t mode, bool largepage) } KASSERT(shmfd->shm_object != NULL, ("shm_create: vm_pager_allocate")); vm_object_set_flag(shmfd->shm_object, OBJ_HASCAP); + if (sharecap) { + shmfd->shm_vmspace = NULL; + vm_object_set_flag(shmfd->shm_object, OBJ_SHARECAP); + } else { + shmfd->shm_vmspace = curproc->p_vmspace; + vm_map_lock(&curproc->p_vmspace->vm_map); + LIST_INSERT_HEAD(&curproc->p_vmspace->vm_shm_objects, + shmfd, shm_vmspace_entry); + vm_map_unlock(&curproc->p_vmspace->vm_map); + } vfs_timestamp(&shmfd->shm_birthtime); shmfd->shm_atime = shmfd->shm_mtime = shmfd->shm_ctime = shmfd->shm_birthtime; @@ -989,11 +999,19 @@ void shm_drop(struct shmfd *shmfd) { vm_object_t obj; + struct vmspace *vm; if (refcount_release(&shmfd->shm_refs)) { #ifdef MAC mac_posixshm_destroy(shmfd); #endif + vm = atomic_load_ptr(&shmfd->shm_vmspace); + if (vm != NULL) { + vm_map_lock(&vm->vm_map); + if (vm == atomic_load_ptr(&shmfd->shm_vmspace)) + LIST_REMOVE(shmfd, shm_vmspace_entry); + vm_map_unlock(&vm->vm_map); + } rangelock_destroy(&shmfd->shm_rl); mtx_destroy(&shmfd->shm_mtx); obj = shmfd->shm_object; @@ -1007,6 +1025,25 @@ shm_drop(struct shmfd *shmfd) } } +void +shm_vmspace_free(struct vmspace *vm) +{ + struct shmfd *shmfd; + + vm_map_lock(&vm->vm_map); + while (!LIST_EMPTY(&vm->vm_shm_objects)) { + shmfd = LIST_FIRST(&vm->vm_shm_objects); + + KASSERT(shmfd->shm_vmspace == vm, ("wrong vmspace! %p != %p", + shmfd->shm_vmspace, vm)); + + LIST_REMOVE(shmfd, shm_vmspace_entry); + atomic_store_ptr(&shmfd->shm_vmspace, NULL); + /* XXX: free refs to shmfds? */ + } + vm_map_unlock(&vm->vm_map); +} + /* * Determine if the credentials have sufficient permissions for a * specified combination of FREAD and FWRITE. 
@@ -1168,7 +1205,7 @@ kern_shm_open2(struct thread *td, const char * __capability userpath, Fnv32_t fnv; mode_t cmode; int error, fd, initial_seals; - bool largepage; + bool largepage, sharecap; if ((shmflags & ~(SHM_ALLOW_SEALING | SHM_GROW_ON_WRITE | SHM_LARGEPAGE)) != 0) @@ -1184,9 +1221,16 @@ kern_shm_open2(struct thread *td, const char * __capability userpath, if ((flags & O_ACCMODE) != O_RDONLY && (flags & O_ACCMODE) != O_RDWR) return (EINVAL); - if ((flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC | O_CLOEXEC)) != 0) + if ((flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC | O_CLOEXEC | + O_SHARECAP)) != 0) return (EINVAL); + /* XXX: add a proc flag to allow/disallow */ + if (SV_PROC_FLAG(td->td_proc, SV_CHERI)) + sharecap = (flags & O_SHARECAP) != 0; + else + sharecap = true; + largepage = (shmflags & SHM_LARGEPAGE) != 0; if (largepage && !PMAP_HAS_LARGEPAGES) return (ENOTTY); @@ -1248,7 +1292,7 @@ kern_shm_open2(struct thread *td, const char * __capability userpath, fdrop(fp, td); return (EINVAL); } - shmfd = shm_alloc(td->td_ucred, cmode, largepage); + shmfd = shm_alloc(td->td_ucred, cmode, largepage, sharecap); shmfd->shm_seals = initial_seals; shmfd->shm_flags = shmflags; } else { @@ -1264,7 +1308,7 @@ kern_shm_open2(struct thread *td, const char * __capability userpath, if (error == 0) { #endif shmfd = shm_alloc(td->td_ucred, cmode, - largepage); + largepage, sharecap); shmfd->shm_seals = initial_seals; shmfd->shm_flags = shmflags; shm_insert(path, fnv, shmfd); @@ -1309,6 +1353,9 @@ kern_shm_open2(struct thread *td, const char * __capability userpath, else if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) error = EEXIST; + else if (sharecap && + (shmfd->shm_object->flags & OBJ_SHARECAP) == 0) + error = EPERM; else if (shmflags != 0 && shmflags != shmfd->shm_flags) error = EINVAL; else { @@ -1762,10 +1809,27 @@ shm_mmap(struct file *fp, vm_map_t map, vm_pointer_t *addr, goto out; } } - maxprot &= max_maxprot; + if ((max_maxprot & (VM_PROT_CAP | VM_PROT_NO_IMPLY_CAP)) != 0) { + /* + * If we want capability permissions, we must either be in + * the original address space or the object must have the + * OBJ_SHARECAP flag set. + */ + if ((max_maxprot & VM_PROT_CAP) != 0 && + (shmfd->shm_object->flags & OBJ_SHARECAP) == 0 && + shmfd->shm_vmspace != td->td_proc->p_vmspace) { + error = EACCES; + goto out; + } - prot = VM_PROT_ADD_CAP(prot); - maxprot = VM_PROT_ADD_CAP(prot); + /* + * If we've asked for (or explicitly rejected) capability + * permissions, imply them for maxprot so we don't end up + * with prot as a superset of maxprot. + */ + maxprot = VM_PROT_ADD_CAP(maxprot); + } + maxprot &= max_maxprot; /* See comment in vn_mmap(). */ if (
*/ diff --git a/sys/sys/mman.h b/sys/sys/mman.h index 060a68731fbb..1b0015e2301f 100644 --- a/sys/sys/mman.h +++ b/sys/sys/mman.h @@ -276,6 +276,8 @@ struct shmfd { vm_ooffset_t shm_size; vm_object_t shm_object; vm_pindex_t shm_pages; /* allocated pages */ + struct vmspace *shm_vmspace; + LIST_ENTRY(shmfd) shm_vmspace_entry; int shm_refs; uid_t shm_uid; gid_t shm_gid; @@ -314,9 +316,11 @@ int shm_map(struct file *fp, size_t size, off_t offset, void **memp); int shm_unmap(struct file *fp, void *mem, size_t size); int shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags); -struct shmfd *shm_alloc(struct ucred *ucred, mode_t mode, bool largepage); +struct shmfd *shm_alloc(struct ucred *ucred, mode_t mode, bool largepage, + bool sharecap); struct shmfd *shm_hold(struct shmfd *shmfd); void shm_drop(struct shmfd *shmfd); +void shm_vmspace_free(struct vmspace *vm); int shm_dotruncate(struct shmfd *shmfd, off_t length); bool shm_largepage(struct shmfd *shmfd); void shm_remove_prison(struct prison *pr); diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c index 62871a068eca..1abf6b12d116 100644 --- a/sys/vm/vm_map.c +++ b/sys/vm/vm_map.c @@ -421,6 +421,7 @@ vmspace_alloc(vm_pointer_t min, vm_pointer_t max, pmap_pinit_t pinit) */ vm->vm_prev_cid = 0; #endif + LIST_INIT(&vm->vm_shm_objects); return (vm); } @@ -451,6 +452,11 @@ vmspace_dofree(struct vmspace *vm) */ shmexit(vm); + /* + * Clean up local posix shm objects. + */ + shm_vmspace_free(vm); + /* * Lock the map, to wait out all other references to it. * Delete all of the mappings and pages they hold, then call @@ -5150,6 +5156,10 @@ vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge) #if __has_feature(capabilities) vm2->vm_prev_cid = vm1->vm_prev_cid; + /* + * NB: we don't copy vm_shm_objects as by definition, no copied + * objects will be local to the new vmspace. + */ #endif new_map->anon_loc = old_map->anon_loc; @@ -5157,6 +5167,8 @@ vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge) MAP_ASLR_STACK | MAP_RESERVATIONS | MAP_WXORX); VM_MAP_ENTRY_FOREACH(old_entry, old_map) { + bool strip_cap_perms = false; + if ((old_entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) panic("vm_map_fork: encountered a submap"); @@ -5181,6 +5193,11 @@ vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge) object = old_entry->object.vm_object; } + /* XXX: add a proc flag to allow/disallow */ + if ((old_entry->max_protection & VM_PROT_CAP) != 0 && + (object->flags & OBJ_SHARECAP) == 0) + strip_cap_perms = true; + /* * Add the reference before calling vm_object_shadow * to insure that a shadow object is created. @@ -5242,6 +5259,10 @@ vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge) *new_entry = *old_entry; new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED | MAP_ENTRY_IN_TRANSITION); + if (strip_cap_perms) { + new_entry->protection &= ~VM_PROT_CAP; + new_entry->max_protection &= ~VM_PROT_CAP; + } new_entry->wiring_thread = NULL; new_entry->wired_count = 0; if (new_entry->eflags & MAP_ENTRY_WRITECNT) { @@ -5259,11 +5280,19 @@ vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge) /* * Update the physical map + * + * If this is a shared object that might contain + * capabilities, we've removed the capability + * permissions and need to let a fault set + * hardware permissions up properly rather than + * blindly copying them. 
*/ - pmap_copy(new_map->pmap, old_map->pmap, - new_entry->start, - (old_entry->end - old_entry->start), - old_entry->start); + if (!strip_cap_perms) { + pmap_copy(new_map->pmap, old_map->pmap, + new_entry->start, + (old_entry->end - old_entry->start), + old_entry->start); + } break; case VM_INHERIT_COPY: diff --git a/sys/vm/vm_map.h b/sys/vm/vm_map.h index fd6fe58e7868..200887b4b5ae 100644 --- a/sys/vm/vm_map.h +++ b/sys/vm/vm_map.h @@ -379,6 +379,7 @@ struct vmspace { #if __has_feature(capabilities) uint64_t vm_prev_cid; /* (d) last compartment ID allocated */ #endif + LIST_HEAD(, shmfd) vm_shm_objects; /* (d) local shm objects */ /* * Keep the PMAP last, so that CPU-specific variations of that * structure on a single architecture don't result in offset diff --git a/sys/vm/vm_object.h b/sys/vm/vm_object.h index 8aadceafc1c8..028a6d854407 100644 --- a/sys/vm/vm_object.h +++ b/sys/vm/vm_object.h @@ -203,6 +203,8 @@ struct vm_object { #define OBJ_NOCAP 0x20000 /* object and all shadow objects can not store capabilities */ #define OBJ_CHERISHADOW 0x40000 /* object is the shadow bitmap */ +#define OBJ_SHARECAP 0x80000 /* capabilities in object can be + shared across vmspaces */ /* * Helpers to perform conversion between vm_object page indexes and offsets. From 757e73f9d2aa2f3f4eb55b36cafb00e1671f8f83 Mon Sep 17 00:00:00 2001 From: Brooks Davis Date: Thu, 26 Sep 2024 21:13:31 +0100 Subject: [PATCH 08/11] vm_map: add MAP_INHERIT_NONE Allow vm entries to be created with VM_INHERIT_NONE. To be used in a future commit which maps objects during a revocation pass. --- sys/vm/vm_map.c | 6 ++++++ sys/vm/vm_map.h | 1 + 2 files changed, 7 insertions(+) diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c index 1abf6b12d116..1432c6e2235f 100644 --- a/sys/vm/vm_map.c +++ b/sys/vm/vm_map.c @@ -1936,6 +1936,10 @@ vm_map_insert1(vm_map_t map, vm_object_t object, vm_ooffset_t offset, max != VM_PROT_NONE)) return (KERN_INVALID_ARGUMENT); + if ((cow & (MAP_INHERIT_SHARE | MAP_INHERIT_NONE)) == + (MAP_INHERIT_SHARE | MAP_INHERIT_NONE)) + return (KERN_INVALID_ARGUMENT); + protoeflags = 0; if (cow & MAP_COPY_ON_WRITE) protoeflags |= MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY; @@ -1961,6 +1965,8 @@ vm_map_insert1(vm_map_t map, vm_object_t object, vm_ooffset_t offset, protoeflags |= MAP_ENTRY_STACK_GAP_UP; if (cow & MAP_INHERIT_SHARE) inheritance = VM_INHERIT_SHARE; + else if (cow & MAP_INHERIT_NONE) + inheritance = VM_INHERIT_NONE; else inheritance = VM_INHERIT_DEFAULT; if ((cow & MAP_CREATE_SHADOW) != 0) diff --git a/sys/vm/vm_map.h b/sys/vm/vm_map.h index 200887b4b5ae..4fafb3a7fbaa 100644 --- a/sys/vm/vm_map.h +++ b/sys/vm/vm_map.h @@ -470,6 +470,7 @@ long vmspace_resident_count(struct vmspace *vmspace); #define MAP_SPLIT_BOUNDARY_MASK 0x00180000 #define MAP_NO_HINT 0x00200000 #define MAP_CREATE_SHADOW 0x00400000 +#define MAP_INHERIT_NONE 0x00800000 #define MAP_SPLIT_BOUNDARY_SHIFT 19 From 5909b4d5ee6d8a45c4c12d93b7e14e1400e6ab23 Mon Sep 17 00:00:00 2001 From: Brooks Davis Date: Fri, 6 Sep 2024 19:56:15 +0100 Subject: [PATCH 09/11] posix shm: revoke local objects During revocation, map all local shm objects into the address space to ensure that they are scanned even when they are not (fully) mapped. Mappings created with O_SHARECAP aren't mapped, as revocation on them is unreliable by design. I had originally envisioned a model where we maintained a mapping of only pages not currently mapped in the address space, but the required bookkeeping seems quite complex and there isn't obvious machinery to handle it.
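For context, the hoard this closes can be sketched as follows (illustrative only; it mirrors the cheri_revoke_shm_anon_hoard_unmapped test updated below, with error handling omitted and stash_capability a hypothetical helper):

#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>

/*
 * Stash a capability in an shm object, then unmap it. The object's
 * pages are no longer mapped anywhere in the address space, but the
 * tagged capability persists in the object; without this patch the
 * revoker never scans those pages, so the stash would survive
 * revocation and could be remapped and reloaded later.
 */
static int
stash_capability(void *to_revoke)
{
	int fd;
	void **map;

	fd = shm_open(SHM_ANON, O_RDWR, 0600);
	ftruncate(fd, getpagesize());
	map = mmap(NULL, getpagesize(), PROT_READ | PROT_WRITE | PROT_CAP,
	    MAP_SHARED, fd, 0);
	*map = to_revoke;
	munmap(map, getpagesize());
	return (fd);	/* keep the fd; the stash lives in the object */
}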
--- bin/cheribsdtest/cheribsdtest_vm.c | 6 +- sys/kern/kern_cheri_revoke.c | 18 ++++++ sys/kern/uipc_shm.c | 99 ++++++++++++++++++++++++++++++ sys/sys/mman.h | 7 +++ 4 files changed, 126 insertions(+), 4 deletions(-) diff --git a/bin/cheribsdtest/cheribsdtest_vm.c b/bin/cheribsdtest/cheribsdtest_vm.c index a46db1b51fee..4ca16bf420b2 100644 --- a/bin/cheribsdtest/cheribsdtest_vm.c +++ b/bin/cheribsdtest/cheribsdtest_vm.c @@ -2706,8 +2706,7 @@ CHERIBSDTEST(cheri_revoke_cow_mapping, } CHERIBSDTEST(cheri_revoke_shm_anon_hoard_unmapped, - "Capability is revoked within an unmapped shm object", - .ct_xfail_reason = "unmapped part of shm objects aren't revoked") + "Capability is revoked within an unmapped shm object") { int fd; void * volatile to_revoke; @@ -2739,8 +2738,7 @@ CHERIBSDTEST(cheri_revoke_shm_anon_hoard_unmapped, } CHERIBSDTEST(cheri_revoke_shm_anon_hoard_closed, - "Capability is revoked within an unmapped and closed shm object", - .ct_xfail_reason = "unmapped part of shm objects aren't revoked") + "Capability is revoked within an unmapped and closed shm object") { int sv[2]; int pid; diff --git a/sys/kern/kern_cheri_revoke.c b/sys/kern/kern_cheri_revoke.c index 4a2ef177b6e9..47201715cc5a 100644 --- a/sys/kern/kern_cheri_revoke.c +++ b/sys/kern/kern_cheri_revoke.c @@ -103,6 +103,20 @@ cheri_revoke_hoarders(struct proc *p, struct vm_cheri_revoke_cookie *crc) kqueue_cheri_revoke(p->p_fd, crc); } +static void +cheri_revoke_map_hoards(struct proc *p, struct vm_cheri_revoke_cookie *crc) +{ + /* POSIX shared memory */ + shm_map_local_objs(p, crc); +} + +static void +cheri_revoke_unmap_hoards(struct proc *p, struct vm_cheri_revoke_cookie *crc) +{ + /* POSIX shared memory */ + shm_unmap_local_objs(p, crc); +} + void cheri_revoke_vmspace_fork(struct vmspace *dstvm, struct vmspace *srcvm) { @@ -650,6 +664,8 @@ kern_cheri_revoke(struct thread *td, int flags, /* Per-process kernel hoarders */ cheri_revoke_hoarders(td->td_proc, &vmcrc); + cheri_revoke_map_hoards(td->td_proc, &vmcrc); + KASSERT(myst == CHERI_REVOKE_ST_INITING || myst == CHERI_REVOKE_ST_CLOSING, ("unexpected state %d in revoker", myst)); @@ -727,6 +743,8 @@ kern_cheri_revoke(struct thread *td, int flags, #ifdef DIAGNOSTIC vm_cheri_assert_consistent_clg(&vm->vm_map); #endif + cheri_revoke_unmap_hoards(td->td_proc, &vmcrc); + /* Signal the end of this revocation epoch */ epoch++; crepochs.dequeue = epoch; diff --git a/sys/kern/uipc_shm.c b/sys/kern/uipc_shm.c index f94cd9034e9e..a18c86647309 100644 --- a/sys/kern/uipc_shm.c +++ b/sys/kern/uipc_shm.c @@ -78,6 +78,7 @@ #include #include #include +#include #include #include #include @@ -1044,6 +1045,104 @@ shm_vmspace_free(struct vmspace *vm) vm_map_unlock(&vm->vm_map); } +void +shm_map_local_objs(struct proc *p, struct vm_cheri_revoke_cookie *crc) +{ + struct vmspace *vm = p->p_vmspace; + vm_map_t map = &vm->vm_map; + struct shmfd *shmfd; + void *rl_cookie; + vm_prot_t prot, max_prot; + int rv; + + vm_map_lock(map); + LIST_FOREACH(shmfd, &vm->vm_shm_objects, shm_vmspace_entry) { + vm_offset_t align = PAGE_SIZE; + vm_offset_t vaddr = 0; + vm_pointer_t reservation; + /* XXX: can we avoid mapping some objects? 
*/ + + prot = max_prot = VM_PROT_ALL; + + shm_hold(shmfd); + rl_cookie = shm_rangelock_wlock(shmfd, 0, OFF_MAX); + + vm_object_reference(shmfd->shm_object); + + KASSERT(shmfd->shm_hoard_addr == 0, + ("shm obj already mapped?")); + if (shm_largepage(shmfd)) + align = pagesizes[shmfd->shm_lp_psind]; + align = MAX(align, + CHERI_REPRESENTABLE_ALIGNMENT(shmfd->shm_size)); + + /* + * We don't need to do the retry dance to try and avoid sbrk + * space because this only applies to pure-capability + * processes and those don't do sbrk. + */ + rv = vm_map_find_aligned(map, &vaddr, + shmfd->shm_size, vm_map_max(map), align); + if (rv != KERN_SUCCESS) + goto fail; + reservation = vaddr; + rv = vm_map_reservation_create_locked(map, + &reservation, shmfd->shm_size, max_prot); + if (rv != KERN_SUCCESS) + goto fail; + rv = vm_map_insert(map, shmfd->shm_object, 0, + reservation, reservation + shmfd->shm_size, + prot, max_prot, MAP_DISABLE_COREDUMP | MAP_INHERIT_NONE, + reservation); + + if (rv != KERN_SUCCESS) + vm_map_reservation_delete_locked(map, + reservation); + shmfd->shm_hoard_addr = reservation; + shmfd->shm_hoard_size = shmfd->shm_size; +fail: + shm_rangelock_unlock(shmfd, rl_cookie); + if (rv != KERN_SUCCESS) { + vm_object_deallocate(shmfd->shm_object); + shm_drop(shmfd); + } + + if (rv != KERN_SUCCESS) { + /* + * XXX Out of suitable address space? + * What to do? Probably kill all procs in the + * vmspace as we can't revoke? + */ + panic("Can't map shm object"); + } + } + vm_map_unlock(map); +} + +void +shm_unmap_local_objs(struct proc *p, struct vm_cheri_revoke_cookie *crc) +{ + struct vmspace *vm = p->p_vmspace; + vm_map_t map = &vm->vm_map; + struct shmfd *shmfd; + void *rl_cookie; + int rv; + + vm_map_lock(map); + LIST_FOREACH(shmfd, &vm->vm_shm_objects, shm_vmspace_entry) { + rl_cookie = shm_rangelock_wlock(shmfd, 0, OFF_MAX); + rv = vm_map_delete(map, shmfd->shm_hoard_addr, + shmfd->shm_hoard_addr + shmfd->shm_hoard_size, false); + if (rv != KERN_SUCCESS) + panic("failed to delete shm hoard map entry\n"); + shmfd->shm_hoard_addr = 0; + shmfd->shm_hoard_size = 0; + shm_rangelock_unlock(shmfd, rl_cookie); + shm_drop(shmfd); + } + vm_map_unlock(map); +} + /* * Determine if the credentials have sufficient permissions for a * specified combination of FREAD and FWRITE. 
diff --git a/sys/sys/mman.h b/sys/sys/mman.h index 1b0015e2301f..175e230e986f 100644 --- a/sys/sys/mman.h +++ b/sys/sys/mman.h @@ -278,6 +278,8 @@ struct shmfd { vm_pindex_t shm_pages; /* allocated pages */ struct vmspace *shm_vmspace; LIST_ENTRY(shmfd) shm_vmspace_entry; + vm_pointer_t shm_hoard_addr; + vm_ooffset_t shm_hoard_size; int shm_refs; uid_t shm_uid; gid_t shm_gid; @@ -311,6 +313,8 @@ struct shmfd { #ifdef _KERNEL struct prison; +struct proc; +struct vm_cheri_revoke_cookie; int shm_map(struct file *fp, size_t size, off_t offset, void **memp); int shm_unmap(struct file *fp, void *mem, size_t size); @@ -324,6 +328,9 @@ void shm_vmspace_free(struct vmspace *vm); int shm_dotruncate(struct shmfd *shmfd, off_t length); bool shm_largepage(struct shmfd *shmfd); void shm_remove_prison(struct prison *pr); +void shm_map_local_objs(struct proc *p, struct vm_cheri_revoke_cookie *crc); +void shm_unmap_local_objs(struct proc *p, + struct vm_cheri_revoke_cookie *crc); extern struct fileops shm_ops; From 19b23ee763168c0ac9c907147d97e32eb15190f5 Mon Sep 17 00:00:00 2001 From: Brooks Davis Date: Fri, 27 Sep 2024 22:12:45 +0100 Subject: [PATCH 10/11] Rework MAP_ANON|MAP_SHARED and shmat cap support mmap(..., MAP_ANON|MAP_SHARED, ...): Require PROT_CAP explicitly to enable capability support in shared, anonymous mappings. When specified, set the MAP_SHARECAP cow flag which causes a backing object to be allocated and the OBJ_SHARECAP flag set to allow sharing capabilities across address spaces. shmat: Always set OBJ_SHARECAP on SysV shared memory objects. Use of them is straightforwardly auditable. We might want to add an explicit SHM_SHARECAP flag at some point rather than making this universal, but shmat is probably best left in the dustbin of history. --- sys/kern/sysv_shm.c | 1 + sys/vm/vm_map.c | 12 ++++++++++++ sys/vm/vm_map.h | 1 + sys/vm/vm_mmap.c | 18 ++++++++++++++++-- 4 files changed, 30 insertions(+), 2 deletions(-) diff --git a/sys/kern/sysv_shm.c b/sys/kern/sysv_shm.c index ca8e20338f41..42e21242752d 100644 --- a/sys/kern/sysv_shm.c +++ b/sys/kern/sysv_shm.c @@ -900,6 +900,7 @@ shmget_allocate_segment(struct thread *td, key_t key, size_t size, int mode) } vm_object_set_flag(shm_object, OBJ_HASCAP); + vm_object_set_flag(shm_object, OBJ_SHARECAP); shmseg->object = shm_object; shmseg->u.shm_perm.cuid = shmseg->u.shm_perm.uid = cred->cr_uid; shmseg->u.shm_perm.cgid = shmseg->u.shm_perm.gid = cred->cr_gid; diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c index 1432c6e2235f..a85d0cc1d72f 100644 --- a/sys/vm/vm_map.c +++ b/sys/vm/vm_map.c @@ -138,6 +138,7 @@ static uma_zone_t vmspace_zone; static int vmspace_zinit(void *mem, int size, int flags); static void _vm_map_init(vm_map_t map, pmap_t pmap, vm_pointer_t min, vm_pointer_t max); +static inline void vm_map_entry_back(vm_map_entry_t entry); static void vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map); static void vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry); static void vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry); @@ -2121,6 +2122,17 @@ vm_map_insert1(vm_map_t map, vm_object_t object, vm_ooffset_t offset, ("overcommit: vm_map_insert leaks vm_map %p", new_entry)); new_entry->cred = cred; + if ((cow & MAP_SHARECAP) != 0) { + KASSERT(new_entry->inheritance == VM_INHERIT_SHARE, + ("MAP_SHARECAP on unshared mapping")); + if (new_entry->object.vm_object == NULL) + vm_map_entry_back(new_entry); + VM_OBJECT_WLOCK(new_entry->object.vm_object); + vm_object_set_flag(new_entry->object.vm_object, OBJ_HASCAP); +
vm_object_set_flag(new_entry->object.vm_object, OBJ_SHARECAP); + VM_OBJECT_WUNLOCK(new_entry->object.vm_object); + } + /* * Insert the new entry into the list */ diff --git a/sys/vm/vm_map.h b/sys/vm/vm_map.h index 4fafb3a7fbaa..0aad14950f5a 100644 --- a/sys/vm/vm_map.h +++ b/sys/vm/vm_map.h @@ -471,6 +471,7 @@ long vmspace_resident_count(struct vmspace *vmspace); #define MAP_NO_HINT 0x00200000 #define MAP_CREATE_SHADOW 0x00400000 #define MAP_INHERIT_NONE 0x00800000 +#define MAP_SHARECAP 0x01000000 #define MAP_SPLIT_BOUNDARY_SHIFT 19 diff --git a/sys/vm/vm_mmap.c b/sys/vm/vm_mmap.c index a16a288e0879..b5831f2d0849 100644 --- a/sys/vm/vm_mmap.c +++ b/sys/vm/vm_mmap.c @@ -696,9 +696,12 @@ kern_mmap(struct thread *td, const struct mmap_req *mrp) /* * Mapping blank space is trivial. */ + if ((flags & MAP_SHARED) == 0) { + prot = VM_PROT_ADD_CAP(prot); + max_prot = VM_PROT_ADD_CAP(max_prot); + } error = vm_mmap_object(&vms->vm_map, &addr, max_addr, size, - VM_PROT_ADD_CAP(prot), VM_PROT_ADD_CAP(max_prot), flags, - NULL, pos, FALSE, td); + prot, max_prot, flags, NULL, pos, FALSE, td); } else { /* * Mapping file, get fp for validation and don't let the @@ -2080,6 +2083,17 @@ vm_mmap_object(vm_map_t map, vm_pointer_t *addr, vm_offset_t max_addr, docow |= MAP_CHECK_EXCL; if ((flags & MAP_GUARD) != 0) docow |= MAP_CREATE_GUARD; + /* + * If we're creating a new shared reservation with anonymous + * backing and capability permissions, let the vm system know so + * the entry can be backed and OBJ_SHARECAP set. + * + * XXX: there should be a permission check... + */ + if ((flags & (MAP_ANON | MAP_SHARED | MAP_RESERVATION_CREATE)) == + (MAP_ANON | MAP_SHARED | MAP_RESERVATION_CREATE) && + (maxprot & (VM_PROT_READ_CAP | VM_PROT_WRITE_CAP)) != 0) + docow |= MAP_SHARECAP; if (fitit) { if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER) From 9f0e1ac109ec47f1d4f67bd61112b94c5661aaf6 Mon Sep 17 00:00:00 2001 From: Brooks Davis Date: Wed, 2 Oct 2024 00:18:12 +0100 Subject: [PATCH 11/11] minherit: limit INHERIT_SHARE CheriABI: mostly disallow post-fork sharing via minherit(). Developers should use mmap and MAP_SHARED instead. Do allow no-op requests and sharing of mappings that either have no capabilities or where objects have the OBJ_SHARECAP flag. --- bin/cheribsdtest/cheribsdtest_cheriabi.c | 9 +++++++-- sys/vm/vm_map.c | 25 ++++++++++++++++++++-- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/bin/cheribsdtest/cheribsdtest_cheriabi.c b/bin/cheribsdtest/cheribsdtest_cheriabi.c index b7f9e439368c..34082b49f2f1 100644 --- a/bin/cheribsdtest/cheribsdtest_cheriabi.c +++ b/bin/cheribsdtest/cheribsdtest_cheriabi.c @@ -389,11 +389,16 @@ CHERIBSDTEST(cheriabi_minherit_invalid_ptr, CHERIBSDTEST_CHECK_CALL_ERROR(minherit(mappings.middle + mappings.maplen, mappings.maplen, INHERIT_NONE), EPROT); + /* + * minherit() should not be able to mark a MAP_ANON mapping shared + * unless it was initially marked as shared. + */ + CHERIBSDTEST_CHECK_CALL_ERROR(minherit(mappings.middle, mappings.maplen, + INHERIT_SHARE), EACCES); + /* Sanity check: minherit() on a valid capability should succeed. */ CHERIBSDTEST_CHECK_SYSCALL(minherit(mappings.middle, mappings.maplen, INHERIT_NONE)); - CHERIBSDTEST_CHECK_SYSCALL(minherit(mappings.middle, mappings.maplen, - INHERIT_SHARE)); /* Unmapping the original capabilities should succeed.
*/ free_adjacent_mappings(&mappings); diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c index a85d0cc1d72f..73503e455f0b 100644 --- a/sys/vm/vm_map.c +++ b/sys/vm/vm_map.c @@ -3778,14 +3778,35 @@ vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end, } } #endif - if (new_inheritance == VM_INHERIT_COPY) { + if (new_inheritance == VM_INHERIT_COPY || + new_inheritance == VM_INHERIT_SHARE) { for (entry = start_entry; entry->start < end; prev_entry = entry, entry = vm_map_entry_succ(entry)) { - if ((entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) + if (new_inheritance == VM_INHERIT_COPY && + (entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) != 0) { rv = KERN_INVALID_ARGUMENT; goto unlock; } + /* + * CheriABI: mostly disallow post-fork sharing via + * minherit(). Developers should use mmap and + * MAP_SHARED instead. Do allow no-op requests + * and sharing of mappings that either have no + * capabilities or where objects have the + * OBJ_SHARECAP flag. + */ + if (new_inheritance == VM_INHERIT_SHARE && + entry->inheritance != VM_INHERIT_SHARE && + /* XXX: check reservations instead? */ + SV_CURPROC_FLAG(SV_CHERI) && + (entry->object.vm_object == NULL || + (entry->object.vm_object->flags & + (OBJ_NOCAP | OBJ_SHARECAP)) == 0)) { + rv = KERN_PROTECTION_FAILURE; + goto unlock; + } + } } for (entry = start_entry; entry->start < end; prev_entry = entry,