Skip to content

Commit 419132e

Browse files
giuseppe and claude committed
linux: add OPEN_TREE_NAMESPACE support
Use open_tree(OPEN_TREE_NAMESPACE) + setns(CLONE_NEWNS) to replace the traditional unshare(CLONE_NEWNS) + bind mount rootfs + pivot_root sequence.

OPEN_TREE_NAMESPACE creates a new mount namespace with the rootfs as the root mount. setns() enters that namespace directly, so no bind mount or pivot_root is needed. The kernel automatically sets the process root and cwd to the new namespace's root when the old root is not reachable.

On older kernels (< 7.0) or when OPEN_TREE_NAMESPACE is not supported, the code falls back to the traditional path.

Closes: #2086
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
1 parent a58f134 commit 419132e

1 file changed

Lines changed: 186 additions & 67 deletions

File tree

src/libcrun/linux.c

Lines changed: 186 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,10 @@
8888
# define OPEN_TREE_CLOEXEC O_CLOEXEC
8989
#endif
9090

91+
/* Fallback value for OPEN_TREE_NAMESPACE, used only when the included
   headers do not already define it. */
#ifndef OPEN_TREE_NAMESPACE
92+
# define OPEN_TREE_NAMESPACE 2
93+
#endif
94+
9195
#ifndef MOVE_MOUNT_F_EMPTY_PATH
9296
# define MOVE_MOUNT_F_EMPTY_PATH 0x00000004
9397
#endif
@@ -162,6 +166,8 @@ struct private_data_s
162166
bool maskdir_bind_failed;
163167
bool maskdir_warned;
164168
bool joined_mount_ns;
169+
bool needs_pivot;
170+
bool no_pivot;
165171
};
166172

167173
struct linux_namespace_s
@@ -961,6 +967,7 @@ fsopen_mount (const char *type, const char *source_name, const char *labeltype,
961967
if (eq)
962968
{
963969
*eq = '\0';
970+
964971
ret = syscall_fsconfig (fsfd, FSCONFIG_SET_STRING, token, eq + 1, 0);
965972
}
966973
else
@@ -1528,7 +1535,7 @@ do_mount (libcrun_container_t *container, const char *source, int targetfd,
15281535
}
15291536

15301537
close_and_reset (&fd);
1531-
fd = openat (procfd, "self/root", O_PATH | O_CLOEXEC);
1538+
fd = open (get_private_data (container)->rootfs, O_PATH | O_CLOEXEC);
15321539
if (UNLIKELY (fd < 0))
15331540
return crun_make_error (err, errno, "reopen rootfs after mount on /");
15341541

@@ -2728,6 +2735,7 @@ process_single_mount (libcrun_container_t *container, const char *rootfs,
27282735
if (UNLIKELY (ret < 0))
27292736
return ret;
27302737
}
2738+
27312739
mounted = true;
27322740

27332741
if (is_empty_string (target))
@@ -3186,10 +3194,6 @@ libcrun_set_mounts (struct container_entrypoint_s *entrypoint_args, libcrun_cont
31863194
return ret;
31873195
}
31883196

3189-
ret = do_finalize_notify_socket (container, err);
3190-
if (UNLIKELY (ret < 0))
3191-
return ret;
3192-
31933197
if (def->process && def->process->cwd)
31943198
{
31953199
libcrun_error_t tmp_err = NULL;
@@ -3206,25 +3210,6 @@ libcrun_set_mounts (struct container_entrypoint_s *entrypoint_args, libcrun_cont
32063210
return 0;
32073211
}
32083212

3209-
int
3210-
libcrun_finalize_mounts (struct container_entrypoint_s *entrypoint_args, libcrun_container_t *container, const char *rootfs, libcrun_error_t *err)
3211-
{
3212-
int ret;
3213-
3214-
ret = finalize_mounts (container, err);
3215-
if (UNLIKELY (ret < 0))
3216-
return ret;
3217-
3218-
// configure handler mounts for phase: HANDLER_CONFIGURE_AFTER_MOUNTS
3219-
ret = libcrun_container_notify_handler (entrypoint_args, HANDLER_CONFIGURE_AFTER_MOUNTS, container, rootfs, err);
3220-
if (UNLIKELY (ret < 0))
3221-
return crun_error_wrap (err, "failed configuring mounts for handler at phase: HANDLER_CONFIGURE_AFTER_MOUNTS");
3222-
3223-
close_and_reset (&(get_private_data (container)->rootfsfd));
3224-
3225-
return 0;
3226-
}
3227-
32283213
static int
32293214
umount_or_hide (const char *target, libcrun_error_t *err)
32303215
{
@@ -3282,6 +3267,83 @@ move_root (const char *rootfs, libcrun_error_t *err)
32823267
return 0;
32833268
}
32843269

3270+
/* Finish the container mount setup.

   Steps, in order:
     1. notify the handler for phase HANDLER_CONFIGURE_AFTER_MOUNTS;
     2. close the cached rootfs fd stored in the container private data;
     3. if the private data has `needs_pivot` set, switch root (move_root
        when `no_pivot` is set, do_pivot otherwise), apply the stored
        rootfs propagation to "/" and chdir to "/";
     4. finalize the notify socket;
     5. run finalize_mounts.

   ENTRYPOINT_ARGS, CONTAINER and ROOTFS are forwarded to the handler
   notification; ROOTFS is also the path handed to move_root/do_pivot.
   Returns 0 on success, or the negative value produced by the failing
   step with ERR filled in.  */
int
3271+
libcrun_finalize_mounts (struct container_entrypoint_s *entrypoint_args, libcrun_container_t *container, const char *rootfs, libcrun_error_t *err)
3272+
{
3273+
int ret;
3274+
3275+
// configure handler mounts for phase: HANDLER_CONFIGURE_AFTER_MOUNTS
3276+
ret = libcrun_container_notify_handler (entrypoint_args, HANDLER_CONFIGURE_AFTER_MOUNTS, container, rootfs, err);
3277+
if (UNLIKELY (ret < 0))
3278+
return crun_error_wrap (err, "failed configuring mounts for handler at phase: HANDLER_CONFIGURE_AFTER_MOUNTS");
3279+
3280+
close_and_reset (&(get_private_data (container)->rootfsfd));
3281+
3282+
/* The root switch runs only when `needs_pivot` is set in the private data.  */
if (get_private_data (container)->needs_pivot)
3283+
{
3284+
/* Clear the flag before attempting the switch.  */
get_private_data (container)->needs_pivot = false;
3285+
3286+
if (get_private_data (container)->no_pivot)
3287+
{
3288+
ret = move_root (rootfs, err);
3289+
if (UNLIKELY (ret < 0))
3290+
return ret;
3291+
}
3292+
else
3293+
{
3294+
ret = do_pivot (container, rootfs, err);
3295+
if (UNLIKELY (ret < 0))
3296+
return ret;
3297+
}
3298+
3299+
/* Apply the rootfs propagation flags stored in the private data to "/".  */
ret = do_mount (container, NULL, -1, "/", NULL,
3300+
get_private_data (container)->rootfs_propagation,
3301+
NULL, LABEL_MOUNT, err);
3302+
if (UNLIKELY (ret < 0))
3303+
return ret;
3304+
3305+
ret = chdir ("/");
3306+
if (UNLIKELY (ret < 0))
3307+
return crun_make_error (err, errno, "chdir to `/`");
3308+
}
3309+
3310+
/* The notify socket and the remaining mount finalization run after the
   optional root switch above.  */
ret = do_finalize_notify_socket (container, err);
3311+
if (UNLIKELY (ret < 0))
3312+
return ret;
3313+
3314+
ret = finalize_mounts (container, err);
3315+
if (UNLIKELY (ret < 0))
3316+
return ret;
3317+
3318+
return 0;
3319+
}
3320+
3321+
/* Try to obtain an OPEN_TREE_NAMESPACE file descriptor for ROOTFS.

   ROOTFS is opened as an O_PATH directory and passed to open_tree with
   OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC | AT_EMPTY_PATH | AT_RECURSIVE.

   On success the new fd is stored in *OUT_FD and 0 is returned.
   If open_tree fails with EINVAL, ENOSYS or EPERM the function still
   returns 0 but leaves *OUT_FD at -1, i.e. the feature is reported as
   unavailable rather than as an error.
   Any other failure returns a negative value with ERR filled in.  */
static int
3322+
maybe_open_tree_namespace (const char *rootfs, int *out_fd, libcrun_error_t *err)
3323+
{
3324+
/* NOTE(review): cleanup_close presumably closes the fd when it goes out
   of scope -- confirm against the macro definition.  */
cleanup_close int rootfs_fd = -1;
3325+
int tree_fd;
3326+
3327+
/* Default result: no tree fd available.  */
*out_fd = -1;
3328+
3329+
rootfs_fd = open (rootfs, O_DIRECTORY | O_PATH | O_CLOEXEC);
3330+
if (UNLIKELY (rootfs_fd < 0))
3331+
return crun_make_error (err, errno, "open `%s`", rootfs);
3332+
3333+
tree_fd = syscall_open_tree (rootfs_fd, "",
3334+
OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC
3335+
| AT_EMPTY_PATH | AT_RECURSIVE);
3336+
if (tree_fd < 0)
3337+
{
3338+
/* These errno values are not reported as errors; the function
   returns success with *out_fd still set to -1.  */
if (errno == EINVAL || errno == ENOSYS || errno == EPERM)
3339+
return 0;
3340+
return crun_make_error (err, errno, "open_tree `%s`", rootfs);
3341+
}
3342+
3343+
*out_fd = tree_fd;
3344+
return 0;
3345+
}
3346+
32853347
static struct libcrun_fd_map *
32863348
get_fd_map (libcrun_container_t *container)
32873349
{
@@ -3351,12 +3413,37 @@ open_mount_of_type (libcrun_container_t *container,
33513413
return mnt_fd;
33523414
}
33533415

3416+
/* Decide whether the OPEN_TREE_NAMESPACE path may be used for CONTAINER.

   Returns false when any of the following holds:
     - the container definition has prestart or createRuntime hooks;
     - CLONE_NEWUSER is set in the private data `unshare_flags`;
     - any of the first `mounts_len` entries of the mount fd map is
       negative.
   Returns true otherwise.  */
static bool
3417+
can_use_open_tree_namespace (libcrun_container_t *container)
3418+
{
3419+
runtime_spec_schema_config_schema *def = container->container_def;
3420+
struct libcrun_fd_map *mount_fds;
3421+
bool has_hooks = def->hooks
3422+
&& (def->hooks->prestart_len || def->hooks->create_runtime_len);
3423+
bool has_userns = get_private_data (container)->unshare_flags & CLONE_NEWUSER;
3424+
size_t i;
3425+
3426+
if (has_hooks || has_userns)
3427+
return false;
3428+
3429+
/* NOTE(review): the result of get_fd_map is dereferenced without a NULL
   check and is assumed to hold at least def->mounts_len entries --
   confirm both against get_fd_map and its callers.  */
mount_fds = get_fd_map (container);
3430+
for (i = 0; i < def->mounts_len; i++)
3431+
{
3432+
/* A negative entry means there is no fd recorded for this mount.  */
if (mount_fds->fds[i] < 0)
3433+
return false;
3434+
}
3435+
3436+
return true;
3437+
}
}
3438+
33543439
static int
33553440
setup_mount_namespace (libcrun_container_t *container, bool no_pivot, char **rootfs, libcrun_error_t *err)
33563441
{
33573442
runtime_spec_schema_config_schema *def = container->container_def;
33583443
unsigned long rootfs_propagation = 0;
3444+
cleanup_close int tree_fd = -1;
33593445
libcrun_error_t tmp_err = NULL;
3446+
bool use_open_tree = false;
33603447
size_t i;
33613448
int ret;
33623449

@@ -3372,25 +3459,6 @@ setup_mount_namespace (libcrun_container_t *container, bool no_pivot, char **roo
33723459
if (UNLIKELY (ret < 0))
33733460
return ret;
33743461

3375-
if (! get_private_data (container)->joined_mount_ns)
3376-
{
3377-
ret = unshare (CLONE_NEWNS);
3378-
if (UNLIKELY (ret < 0))
3379-
return crun_make_error (err, errno, "unshare `CLONE_NEWNS`");
3380-
}
3381-
3382-
ret = do_mount (container, NULL, -1, "/", NULL, rootfs_propagation, NULL, LABEL_MOUNT, err);
3383-
if (UNLIKELY (ret < 0))
3384-
return ret;
3385-
3386-
ret = make_parent_mount_private (*rootfs, err);
3387-
if (UNLIKELY (ret < 0))
3388-
return ret;
3389-
3390-
ret = do_mount (container, *rootfs, -1, *rootfs, NULL, MS_BIND | MS_REC | MS_PRIVATE, NULL, LABEL_MOUNT, err);
3391-
if (UNLIKELY (ret < 0))
3392-
return ret;
3393-
33943462
/* Pre-create mounts and cache paths before pivot_root,
33953463
while the host file system is still reachable. */
33963464
for (i = 0; i < def->mounts_len; i++)
@@ -3481,6 +3549,65 @@ setup_mount_namespace (libcrun_container_t *container, bool no_pivot, char **roo
34813549
mnt_fd = -1;
34823550
}
34833551

3552+
if (! get_private_data (container)->joined_mount_ns
3553+
&& can_use_open_tree_namespace (container))
3554+
use_open_tree = true;
3555+
3556+
if (use_open_tree)
3557+
{
3558+
ret = maybe_open_tree_namespace (*rootfs, &tree_fd, err);
3559+
if (UNLIKELY (ret < 0))
3560+
return ret;
3561+
}
3562+
3563+
if (tree_fd >= 0)
3564+
{
3565+
ret = setns (tree_fd, CLONE_NEWNS);
3566+
if (UNLIKELY (ret < 0))
3567+
return crun_make_error (err, errno, "setns `CLONE_NEWNS`");
3568+
3569+
ret = mount (NULL, "/", NULL, MS_REMOUNT | MS_BIND, NULL);
3570+
if (UNLIKELY (ret < 0))
3571+
return crun_make_error (err, errno, "remount `/`");
3572+
3573+
ret = do_mount (container, NULL, -1, "/", NULL, MS_REC | MS_PRIVATE, NULL, LABEL_MOUNT, err);
3574+
if (UNLIKELY (ret < 0))
3575+
return ret;
3576+
3577+
ret = do_mount (container, NULL, -1, "/", NULL, rootfs_propagation, NULL, LABEL_MOUNT, err);
3578+
if (UNLIKELY (ret < 0))
3579+
return ret;
3580+
3581+
get_private_data (container)->needs_pivot = false;
3582+
get_private_data (container)->no_pivot = no_pivot;
3583+
free (*rootfs);
3584+
*rootfs = xstrdup ("/");
3585+
}
3586+
else
3587+
{
3588+
if (! get_private_data (container)->joined_mount_ns)
3589+
{
3590+
ret = unshare (CLONE_NEWNS);
3591+
if (UNLIKELY (ret < 0))
3592+
return crun_make_error (err, errno, "unshare `CLONE_NEWNS`");
3593+
}
3594+
3595+
ret = do_mount (container, NULL, -1, "/", NULL, rootfs_propagation, NULL, LABEL_MOUNT, err);
3596+
if (UNLIKELY (ret < 0))
3597+
return ret;
3598+
3599+
ret = make_parent_mount_private (*rootfs, err);
3600+
if (UNLIKELY (ret < 0))
3601+
return ret;
3602+
3603+
ret = do_mount (container, *rootfs, -1, *rootfs, NULL, MS_BIND | MS_REC | MS_PRIVATE, NULL, LABEL_MOUNT, err);
3604+
if (UNLIKELY (ret < 0))
3605+
return ret;
3606+
3607+
get_private_data (container)->needs_pivot = true;
3608+
get_private_data (container)->no_pivot = no_pivot;
3609+
}
3610+
34843611
/* Mount everything before pivot_root while host paths are still reachable.
34853612
Use the pre-created fd when available, fall back to mount(). */
34863613
ret = open (*rootfs, O_PATH | O_CLOEXEC);
@@ -3568,23 +3695,6 @@ setup_mount_namespace (libcrun_container_t *container, bool no_pivot, char **roo
35683695
get_private_data (container)->maskdir_bind_failed = true;
35693696
}
35703697

3571-
if (no_pivot)
3572-
{
3573-
ret = move_root (*rootfs, err);
3574-
if (UNLIKELY (ret < 0))
3575-
return ret;
3576-
}
3577-
else
3578-
{
3579-
ret = do_pivot (container, *rootfs, err);
3580-
if (UNLIKELY (ret < 0))
3581-
return ret;
3582-
}
3583-
3584-
ret = do_mount (container, NULL, -1, "/", NULL, rootfs_propagation, NULL, LABEL_MOUNT, err);
3585-
if (UNLIKELY (ret < 0))
3586-
return ret;
3587-
35883698
return 0;
35893699
}
35903700

@@ -3601,20 +3711,30 @@ libcrun_do_pivot_root (libcrun_container_t *container, bool no_pivot, char **roo
36013711
ret = setup_mount_namespace (container, no_pivot, rootfs, err);
36023712
if (UNLIKELY (ret < 0))
36033713
return ret;
3714+
3715+
/* If setup_mount_namespace used OPEN_TREE_NAMESPACE, rootfs is
3716+
already set to "/". Otherwise pivot_root is deferred until
3717+
after the mounts are created. */
3718+
if (strcmp (*rootfs, "/") == 0)
3719+
{
3720+
ret = chdir ("/");
3721+
if (UNLIKELY (ret < 0))
3722+
return crun_make_error (err, errno, "chdir to `/`");
3723+
}
36043724
}
36053725
else
36063726
{
36073727
ret = chroot (*rootfs);
36083728
if (UNLIKELY (ret < 0))
36093729
return crun_make_error (err, errno, "chroot to `%s`", *rootfs);
3610-
}
36113730

3612-
ret = chdir ("/");
3613-
if (UNLIKELY (ret < 0))
3614-
return crun_make_error (err, errno, "chdir to `/`");
3731+
ret = chdir ("/");
3732+
if (UNLIKELY (ret < 0))
3733+
return crun_make_error (err, errno, "chdir to `/`");
36153734

3616-
free (*rootfs);
3617-
*rootfs = xstrdup ("/");
3735+
free (*rootfs);
3736+
*rootfs = xstrdup ("/");
3737+
}
36183738

36193739
return 0;
36203740
}
@@ -5210,7 +5330,6 @@ prepare_and_send_mount_mounts (libcrun_container_t *container, pid_t pid, int sy
52105330
if (propagation == 0)
52115331
propagation = MS_PRIVATE;
52125332

5213-
/* If the bind mount failed, do not fail here, but attempt to create it from within the container. */
52145333
mount_fd = get_bind_mount (-1, def->mounts[i]->source, recursive, false, nofollow, propagation, err);
52155334
if (UNLIKELY (mount_fd < 0))
52165335
crun_error_release (err);

0 commit comments

Comments
 (0)