Skip to content

Commit 7440a1c

Browse files
giuseppe and claude committed
linux: add OPEN_TREE_NAMESPACE support
Use open_tree(OPEN_TREE_NAMESPACE) + setns(CLONE_NEWNS) to replace the traditional unshare(CLONE_NEWNS) + bind mount rootfs + pivot_root sequence.

OPEN_TREE_NAMESPACE creates a new mount namespace with the rootfs as the root mount. setns() enters that namespace directly, so no bind mount or pivot_root is needed. The kernel automatically sets the process root and cwd to the new namespace's root when the old root is not reachable.

On older kernels (< 7.0) or when OPEN_TREE_NAMESPACE is not supported, the code falls back to the traditional path.

Closes: #2086

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
1 parent a15cbd1 commit 7440a1c

1 file changed

Lines changed: 204 additions & 67 deletions

File tree

src/libcrun/linux.c

Lines changed: 204 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,10 @@
8888
# define OPEN_TREE_CLOEXEC O_CLOEXEC
8989
#endif
9090

91+
/* Fallback definition used only when the included headers do not already
   provide OPEN_TREE_NAMESPACE.  NOTE(review): the value 2 is assumed to
   match the kernel UAPI flag -- TODO confirm against <linux/mount.h>.  */
#ifndef OPEN_TREE_NAMESPACE
92+
# define OPEN_TREE_NAMESPACE 2
93+
#endif
94+
9195
#ifndef MOVE_MOUNT_F_EMPTY_PATH
9296
# define MOVE_MOUNT_F_EMPTY_PATH 0x00000004
9397
#endif
@@ -162,6 +166,8 @@ struct private_data_s
162166
bool maskdir_bind_failed;
163167
bool maskdir_warned;
164168
bool joined_mount_ns;
169+
bool needs_pivot;
170+
bool no_pivot;
165171
};
166172

167173
struct linux_namespace_s
@@ -940,6 +946,7 @@ fsopen_mount (const char *type, const char *source_name, const char *labeltype,
940946
if (eq)
941947
{
942948
*eq = '\0';
949+
943950
ret = syscall_fsconfig (fsfd, FSCONFIG_SET_STRING, token, eq + 1, 0);
944951
}
945952
else
@@ -1496,7 +1503,7 @@ do_mount (libcrun_container_t *container, const char *source, int targetfd,
14961503
}
14971504

14981505
close_and_reset (&fd);
1499-
fd = openat (procfd, "self/root", O_PATH | O_CLOEXEC);
1506+
fd = open (get_private_data (container)->rootfs, O_PATH | O_CLOEXEC);
15001507
if (UNLIKELY (fd < 0))
15011508
return crun_make_error (err, errno, "reopen rootfs after mount on /");
15021509

@@ -2696,6 +2703,25 @@ process_single_mount (libcrun_container_t *container, const char *rootfs,
26962703
if (UNLIKELY (ret < 0))
26972704
return ret;
26982705
}
2706+
2707+
if (is_empty_string (target))
2708+
{
2709+
cleanup_close int fd = -1;
2710+
2711+
fd = open (get_private_data (container)->rootfs, O_PATH | O_CLOEXEC);
2712+
if (UNLIKELY (fd < 0))
2713+
return crun_make_error (err, errno, "reopen rootfs after mount on /");
2714+
2715+
{
2716+
int tmp = dup (fd);
2717+
if (UNLIKELY (tmp < 0))
2718+
return crun_make_error (err, errno, "dup");
2719+
2720+
TEMP_FAILURE_RETRY (close (get_private_data (container)->rootfsfd));
2721+
get_private_data (container)->rootfsfd = tmp;
2722+
}
2723+
}
2724+
26992725
mounted = true;
27002726

27012727
if (is_empty_string (target))
@@ -3154,10 +3180,6 @@ libcrun_set_mounts (struct container_entrypoint_s *entrypoint_args, libcrun_cont
31543180
return ret;
31553181
}
31563182

3157-
ret = do_finalize_notify_socket (container, err);
3158-
if (UNLIKELY (ret < 0))
3159-
return ret;
3160-
31613183
if (def->process && def->process->cwd)
31623184
{
31633185
libcrun_error_t tmp_err = NULL;
@@ -3174,25 +3196,6 @@ libcrun_set_mounts (struct container_entrypoint_s *entrypoint_args, libcrun_cont
31743196
return 0;
31753197
}
31763198

3177-
int
3178-
libcrun_finalize_mounts (struct container_entrypoint_s *entrypoint_args, libcrun_container_t *container, const char *rootfs, libcrun_error_t *err)
3179-
{
3180-
int ret;
3181-
3182-
ret = finalize_mounts (container, err);
3183-
if (UNLIKELY (ret < 0))
3184-
return ret;
3185-
3186-
// configure handler mounts for phase: HANDLER_CONFIGURE_AFTER_MOUNTS
3187-
ret = libcrun_container_notify_handler (entrypoint_args, HANDLER_CONFIGURE_AFTER_MOUNTS, container, rootfs, err);
3188-
if (UNLIKELY (ret < 0))
3189-
return crun_error_wrap (err, "failed configuring mounts for handler at phase: HANDLER_CONFIGURE_AFTER_MOUNTS");
3190-
3191-
close_and_reset (&(get_private_data (container)->rootfsfd));
3192-
3193-
return 0;
3194-
}
3195-
31963199
static int
31973200
umount_or_hide (const char *target, libcrun_error_t *err)
31983201
{
@@ -3250,6 +3253,83 @@ move_root (const char *rootfs, libcrun_error_t *err)
32503253
return 0;
32513254
}
32523255

3256+
/* Final stage of the container mount setup.

   Steps, in order:
     1. notify the handler for phase HANDLER_CONFIGURE_AFTER_MOUNTS;
     2. close the cached rootfs file descriptor;
     3. if a pivot was deferred (private data `needs_pivot` is set), switch
        root with move_root () or do_pivot (), re-apply the stored rootfs
        propagation on "/" and chdir to "/";
     4. call do_finalize_notify_socket () and finalize_mounts ().

   ROOTFS is passed both to the handler notification and to the root
   switch.  Returns 0 on success.  On failure the negative result of the
   failing helper is returned, or whatever crun_error_wrap () /
   crun_make_error () produce, with the details stored in ERR.  */
int
3257+
libcrun_finalize_mounts (struct container_entrypoint_s *entrypoint_args, libcrun_container_t *container, const char *rootfs, libcrun_error_t *err)
3258+
{
3259+
int ret;
3260+
3261+
// configure handler mounts for phase: HANDLER_CONFIGURE_AFTER_MOUNTS
3262+
ret = libcrun_container_notify_handler (entrypoint_args, HANDLER_CONFIGURE_AFTER_MOUNTS, container, rootfs, err);
3263+
if (UNLIKELY (ret < 0))
3264+
return crun_error_wrap (err, "failed configuring mounts for handler at phase: HANDLER_CONFIGURE_AFTER_MOUNTS");
3265+
3266+
/* The rootfs fd is released before the root switch below.  */
close_and_reset (&(get_private_data (container)->rootfsfd));
3267+
3268+
/* needs_pivot is set by setup_mount_namespace () on the traditional
   (unshare + bind mount) path and left false on the open_tree path.  */
if (get_private_data (container)->needs_pivot)
3269+
{
3270+
/* Clear the flag first so the root switch is attempted at most once.  */
get_private_data (container)->needs_pivot = false;
3271+
3272+
if (get_private_data (container)->no_pivot)
3273+
{
3274+
ret = move_root (rootfs, err);
3275+
if (UNLIKELY (ret < 0))
3276+
return ret;
3277+
}
3278+
else
3279+
{
3280+
ret = do_pivot (container, rootfs, err);
3281+
if (UNLIKELY (ret < 0))
3282+
return ret;
3283+
}
3284+
3285+
/* Re-apply the propagation flags recorded in the private data on the
   new "/".  */
ret = do_mount (container, NULL, -1, "/", NULL,
3286+
get_private_data (container)->rootfs_propagation,
3287+
NULL, LABEL_MOUNT, err);
3288+
if (UNLIKELY (ret < 0))
3289+
return ret;
3290+
3291+
ret = chdir ("/");
3292+
if (UNLIKELY (ret < 0))
3293+
return crun_make_error (err, errno, "chdir to `/`");
3294+
}
3295+
3296+
/* Runs after the optional root switch above.  */
ret = do_finalize_notify_socket (container, err);
3297+
if (UNLIKELY (ret < 0))
3298+
return ret;
3299+
3300+
ret = finalize_mounts (container, err);
3301+
if (UNLIKELY (ret < 0))
3302+
return ret;
3303+
3304+
return 0;
3305+
}
3306+
3307+
/* Try to obtain a file descriptor for ROOTFS through open_tree () with the
   OPEN_TREE_NAMESPACE flag.

   *OUT_FD is always initialised to -1.  On success it receives the fd
   returned by syscall_open_tree () and the caller owns it.

   Returns 0 both when the fd was obtained and when open_tree failed with
   EINVAL, ENOSYS or EPERM; in the latter case *OUT_FD stays -1, so the
   caller must check it to decide whether to fall back.  Any other failure
   (including failing to open ROOTFS) is reported through ERR using
   crun_make_error ().  */
static int
3308+
maybe_open_tree_namespace (const char *rootfs, int *out_fd, libcrun_error_t *err)
3309+
{
3310+
/* Declared with cleanup_close; presumably closed automatically when it
   goes out of scope -- confirm against the macro definition.  */
cleanup_close int rootfs_fd = -1;
3311+
int tree_fd;
3312+
3313+
*out_fd = -1;
3314+
3315+
rootfs_fd = open (rootfs, O_DIRECTORY | O_PATH | O_CLOEXEC);
3316+
if (UNLIKELY (rootfs_fd < 0))
3317+
return crun_make_error (err, errno, "open `%s`", rootfs);
3318+
3319+
/* Empty path + AT_EMPTY_PATH: operate on rootfs_fd itself.  */
tree_fd = syscall_open_tree (rootfs_fd, "",
3320+
OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC
3321+
| AT_EMPTY_PATH | AT_RECURSIVE);
3322+
if (tree_fd < 0)
3323+
{
3324+
/* These errno values are treated as "not available" rather than as
   an error: report success without an fd.  */
if (errno == EINVAL || errno == ENOSYS || errno == EPERM)
3325+
return 0;
3326+
return crun_make_error (err, errno, "open_tree `%s`", rootfs);
3327+
}
3328+
3329+
*out_fd = tree_fd;
3330+
return 0;
3331+
}
3332+
32533333
static struct libcrun_fd_map *
32543334
get_fd_map (libcrun_container_t *container)
32553335
{
@@ -3317,12 +3397,37 @@ open_mount_of_type (libcrun_container_t *container,
33173397
return mnt_fd;
33183398
}
33193399

3400+
/* Decide whether the OPEN_TREE_NAMESPACE path may be attempted for
   CONTAINER.

   Returns false when any of the following holds:
     - the container definition has prestart or create_runtime hooks;
     - CLONE_NEWUSER is set in the private data's unshare_flags;
     - any of the first def->mounts_len entries of the mount fd map is
       negative.
   Returns true otherwise.

   NOTE(review): assumes get_fd_map () never returns NULL and that the map
   holds at least def->mounts_len entries -- TODO confirm.  */
static bool
3401+
can_use_open_tree_namespace (libcrun_container_t *container)
3402+
{
3403+
runtime_spec_schema_config_schema *def = container->container_def;
3404+
struct libcrun_fd_map *mount_fds;
3405+
bool has_hooks = def->hooks
3406+
&& (def->hooks->prestart_len || def->hooks->create_runtime_len);
3407+
bool has_userns = get_private_data (container)->unshare_flags & CLONE_NEWUSER;
3408+
size_t i;
3409+
3410+
if (has_hooks || has_userns)
3411+
return false;
3412+
3413+
mount_fds = get_fd_map (container);
3414+
/* Every configured mount must already have an fd recorded.  */
for (i = 0; i < def->mounts_len; i++)
3415+
{
3416+
if (mount_fds->fds[i] < 0)
3417+
return false;
3418+
}
3419+
3420+
return true;
3421+
}
3422+
33203423
static int
33213424
setup_mount_namespace (libcrun_container_t *container, bool no_pivot, char **rootfs, libcrun_error_t *err)
33223425
{
33233426
runtime_spec_schema_config_schema *def = container->container_def;
33243427
unsigned long rootfs_propagation = 0;
3428+
cleanup_close int tree_fd = -1;
33253429
libcrun_error_t tmp_err = NULL;
3430+
bool use_open_tree = false;
33263431
size_t i;
33273432
int ret;
33283433

@@ -3334,25 +3439,6 @@ setup_mount_namespace (libcrun_container_t *container, bool no_pivot, char **roo
33343439

33353440
get_private_data (container)->rootfs_propagation = rootfs_propagation;
33363441

3337-
if (! get_private_data (container)->joined_mount_ns)
3338-
{
3339-
ret = unshare (CLONE_NEWNS);
3340-
if (UNLIKELY (ret < 0))
3341-
return crun_make_error (err, errno, "unshare `CLONE_NEWNS`");
3342-
}
3343-
3344-
ret = do_mount (container, NULL, -1, "/", NULL, rootfs_propagation, NULL, LABEL_MOUNT, err);
3345-
if (UNLIKELY (ret < 0))
3346-
return ret;
3347-
3348-
ret = make_parent_mount_private (*rootfs, err);
3349-
if (UNLIKELY (ret < 0))
3350-
return ret;
3351-
3352-
ret = do_mount (container, *rootfs, -1, *rootfs, NULL, MS_BIND | MS_REC | MS_PRIVATE, NULL, LABEL_MOUNT, err);
3353-
if (UNLIKELY (ret < 0))
3354-
return ret;
3355-
33563442
/* Pre-create mounts and cache paths before pivot_root,
33573443
while the host file system is still reachable. */
33583444
for (i = 0; i < def->mounts_len; i++)
@@ -3442,6 +3528,65 @@ setup_mount_namespace (libcrun_container_t *container, bool no_pivot, char **roo
34423528
mnt_fd = -1;
34433529
}
34443530

3531+
if (! get_private_data (container)->joined_mount_ns
3532+
&& can_use_open_tree_namespace (container))
3533+
use_open_tree = true;
3534+
3535+
if (use_open_tree)
3536+
{
3537+
ret = maybe_open_tree_namespace (*rootfs, &tree_fd, err);
3538+
if (UNLIKELY (ret < 0))
3539+
return ret;
3540+
}
3541+
3542+
if (tree_fd >= 0)
3543+
{
3544+
ret = setns (tree_fd, CLONE_NEWNS);
3545+
if (UNLIKELY (ret < 0))
3546+
return crun_make_error (err, errno, "setns `CLONE_NEWNS`");
3547+
3548+
ret = mount (NULL, "/", NULL, MS_REMOUNT | MS_BIND, NULL);
3549+
if (UNLIKELY (ret < 0))
3550+
return crun_make_error (err, errno, "remount `/`");
3551+
3552+
ret = do_mount (container, NULL, -1, "/", NULL, MS_REC | MS_PRIVATE, NULL, LABEL_MOUNT, err);
3553+
if (UNLIKELY (ret < 0))
3554+
return ret;
3555+
3556+
ret = do_mount (container, NULL, -1, "/", NULL, rootfs_propagation, NULL, LABEL_MOUNT, err);
3557+
if (UNLIKELY (ret < 0))
3558+
return ret;
3559+
3560+
get_private_data (container)->needs_pivot = false;
3561+
get_private_data (container)->no_pivot = no_pivot;
3562+
free (*rootfs);
3563+
*rootfs = xstrdup ("/");
3564+
}
3565+
else
3566+
{
3567+
if (! get_private_data (container)->joined_mount_ns)
3568+
{
3569+
ret = unshare (CLONE_NEWNS);
3570+
if (UNLIKELY (ret < 0))
3571+
return crun_make_error (err, errno, "unshare `CLONE_NEWNS`");
3572+
}
3573+
3574+
ret = do_mount (container, NULL, -1, "/", NULL, rootfs_propagation, NULL, LABEL_MOUNT, err);
3575+
if (UNLIKELY (ret < 0))
3576+
return ret;
3577+
3578+
ret = make_parent_mount_private (*rootfs, err);
3579+
if (UNLIKELY (ret < 0))
3580+
return ret;
3581+
3582+
ret = do_mount (container, *rootfs, -1, *rootfs, NULL, MS_BIND | MS_REC | MS_PRIVATE, NULL, LABEL_MOUNT, err);
3583+
if (UNLIKELY (ret < 0))
3584+
return ret;
3585+
3586+
get_private_data (container)->needs_pivot = true;
3587+
get_private_data (container)->no_pivot = no_pivot;
3588+
}
3589+
34453590
/* Mount everything before pivot_root while host paths are still reachable.
34463591
Use the pre-created fd when available, fall back to mount(). */
34473592
ret = open (*rootfs, O_PATH | O_CLOEXEC);
@@ -3533,23 +3678,6 @@ setup_mount_namespace (libcrun_container_t *container, bool no_pivot, char **roo
35333678
get_private_data (container)->maskdir_bind_failed = true;
35343679
}
35353680

3536-
if (no_pivot)
3537-
{
3538-
ret = move_root (*rootfs, err);
3539-
if (UNLIKELY (ret < 0))
3540-
return ret;
3541-
}
3542-
else
3543-
{
3544-
ret = do_pivot (container, *rootfs, err);
3545-
if (UNLIKELY (ret < 0))
3546-
return ret;
3547-
}
3548-
3549-
ret = do_mount (container, NULL, -1, "/", NULL, rootfs_propagation, NULL, LABEL_MOUNT, err);
3550-
if (UNLIKELY (ret < 0))
3551-
return ret;
3552-
35533681
return 0;
35543682
}
35553683

@@ -3566,20 +3694,30 @@ libcrun_do_pivot_root (libcrun_container_t *container, bool no_pivot, char **roo
35663694
ret = setup_mount_namespace (container, no_pivot, rootfs, err);
35673695
if (UNLIKELY (ret < 0))
35683696
return ret;
3697+
3698+
/* If setup_mount_namespace used OPEN_TREE_NAMESPACE, rootfs is
3699+
already set to "/". Otherwise pivot_root is deferred until
3700+
after the mounts are created. */
3701+
if (strcmp (*rootfs, "/") == 0)
3702+
{
3703+
ret = chdir ("/");
3704+
if (UNLIKELY (ret < 0))
3705+
return crun_make_error (err, errno, "chdir to `/`");
3706+
}
35693707
}
35703708
else
35713709
{
35723710
ret = chroot (*rootfs);
35733711
if (UNLIKELY (ret < 0))
35743712
return crun_make_error (err, errno, "chroot to `%s`", *rootfs);
3575-
}
35763713

3577-
ret = chdir ("/");
3578-
if (UNLIKELY (ret < 0))
3579-
return crun_make_error (err, errno, "chdir to `/`");
3714+
ret = chdir ("/");
3715+
if (UNLIKELY (ret < 0))
3716+
return crun_make_error (err, errno, "chdir to `/`");
35803717

3581-
free (*rootfs);
3582-
*rootfs = xstrdup ("/");
3718+
free (*rootfs);
3719+
*rootfs = xstrdup ("/");
3720+
}
35833721

35843722
return 0;
35853723
}
@@ -5175,7 +5313,6 @@ prepare_and_send_mount_mounts (libcrun_container_t *container, pid_t pid, int sy
51755313
if (propagation == 0)
51765314
propagation = MS_PRIVATE;
51775315

5178-
/* If the bind mount failed, do not fail here, but attempt to create it from within the container. */
51795316
mount_fd = get_bind_mount (-1, def->mounts[i]->source, recursive, false, nofollow, propagation, err);
51805317
if (UNLIKELY (mount_fd < 0))
51815318
crun_error_release (err);

0 commit comments

Comments
 (0)