8888# define OPEN_TREE_CLOEXEC O_CLOEXEC
8989#endif
9090
/* Fallback definition, used only when no included header already defines
   OPEN_TREE_NAMESPACE.
   NOTE(review): the value 2 has to match the kernel UAPI flag - confirm. */
91+ #ifndef OPEN_TREE_NAMESPACE
92+ # define OPEN_TREE_NAMESPACE 2
93+ #endif
94+
9195#ifndef MOVE_MOUNT_F_EMPTY_PATH
9296# define MOVE_MOUNT_F_EMPTY_PATH 0x00000004
9397#endif
@@ -162,6 +166,8 @@ struct private_data_s
162166 bool maskdir_bind_failed ;
163167 bool maskdir_warned ;
164168 bool joined_mount_ns ;
169+ bool needs_pivot ;
170+ bool no_pivot ;
165171};
166172
167173struct linux_namespace_s
@@ -961,6 +967,7 @@ fsopen_mount (const char *type, const char *source_name, const char *labeltype,
961967 if (eq )
962968 {
963969 * eq = '\0' ;
970+
964971 ret = syscall_fsconfig (fsfd , FSCONFIG_SET_STRING , token , eq + 1 , 0 );
965972 }
966973 else
@@ -1528,7 +1535,7 @@ do_mount (libcrun_container_t *container, const char *source, int targetfd,
15281535 }
15291536
15301537 close_and_reset (& fd );
1531- fd = openat ( procfd , "self/root" , O_PATH | O_CLOEXEC );
1538+ fd = open ( get_private_data ( container ) -> rootfs , O_PATH | O_CLOEXEC );
15321539 if (UNLIKELY (fd < 0 ))
15331540 return crun_make_error (err , errno , "reopen rootfs after mount on /" );
15341541
@@ -2728,6 +2735,7 @@ process_single_mount (libcrun_container_t *container, const char *rootfs,
27282735 if (UNLIKELY (ret < 0 ))
27292736 return ret ;
27302737 }
2738+
27312739 mounted = true;
27322740
27332741 if (is_empty_string (target ))
@@ -3186,10 +3194,6 @@ libcrun_set_mounts (struct container_entrypoint_s *entrypoint_args, libcrun_cont
31863194 return ret ;
31873195 }
31883196
3189- ret = do_finalize_notify_socket (container , err );
3190- if (UNLIKELY (ret < 0 ))
3191- return ret ;
3192-
31933197 if (def -> process && def -> process -> cwd )
31943198 {
31953199 libcrun_error_t tmp_err = NULL ;
@@ -3206,25 +3210,6 @@ libcrun_set_mounts (struct container_entrypoint_s *entrypoint_args, libcrun_cont
32063210 return 0 ;
32073211}
32083212
3209- int
3210- libcrun_finalize_mounts (struct container_entrypoint_s * entrypoint_args , libcrun_container_t * container , const char * rootfs , libcrun_error_t * err )
3211- {
3212- int ret ;
3213-
3214- ret = finalize_mounts (container , err );
3215- if (UNLIKELY (ret < 0 ))
3216- return ret ;
3217-
3218- // configure handler mounts for phase: HANDLER_CONFIGURE_AFTER_MOUNTS
3219- ret = libcrun_container_notify_handler (entrypoint_args , HANDLER_CONFIGURE_AFTER_MOUNTS , container , rootfs , err );
3220- if (UNLIKELY (ret < 0 ))
3221- return crun_error_wrap (err , "failed configuring mounts for handler at phase: HANDLER_CONFIGURE_AFTER_MOUNTS" );
3222-
3223- close_and_reset (& (get_private_data (container )-> rootfsfd ));
3224-
3225- return 0 ;
3226- }
3227-
32283213static int
32293214umount_or_hide (const char * target , libcrun_error_t * err )
32303215{
@@ -3282,6 +3267,83 @@ move_root (const char *rootfs, libcrun_error_t *err)
32823267 return 0 ;
32833268}
32843269
/* Last stage of the mount setup.  Steps, in the order they run here:
     1. notify the handler for HANDLER_CONFIGURE_AFTER_MOUNTS;
     2. close the rootfsfd stored in the container private data;
     3. if the private data has needs_pivot set, switch root now
        (move_root when no_pivot is set, do_pivot otherwise), apply the
        stored rootfs_propagation to "/" and chdir to "/";
     4. do_finalize_notify_socket;
     5. finalize_mounts.
   Returns 0 when every step succeeds; otherwise returns the result of the
   first failing step (a callee's return value, crun_error_wrap or
   crun_make_error) with the details reported through ERR.  */
3270+ int
3271+ libcrun_finalize_mounts (struct container_entrypoint_s * entrypoint_args , libcrun_container_t * container , const char * rootfs , libcrun_error_t * err )
3272+ {
3273+ int ret ;
3274+
3275+ // configure handler mounts for phase: HANDLER_CONFIGURE_AFTER_MOUNTS
3276+ ret = libcrun_container_notify_handler (entrypoint_args , HANDLER_CONFIGURE_AFTER_MOUNTS , container , rootfs , err );
3277+ if (UNLIKELY (ret < 0 ))
3278+ return crun_error_wrap (err , "failed configuring mounts for handler at phase: HANDLER_CONFIGURE_AFTER_MOUNTS" );
3279+
/* The cached rootfs fd is released before the root is switched below.  */
3280+ close_and_reset (& (get_private_data (container )-> rootfsfd ));
3281+
/* needs_pivot is set to true by setup_mount_namespace on its unshare path
   and to false on its OPEN_TREE_NAMESPACE path, so this branch runs only
   for the former.  */
3282+ if (get_private_data (container )-> needs_pivot )
3283+ {
/* The flag is cleared before the root switch is attempted; presumably so
   that a repeated call does not pivot a second time - TODO confirm.  */
3284+ get_private_data (container )-> needs_pivot = false;
3285+
3286+ if (get_private_data (container )-> no_pivot )
3287+ {
3288+ ret = move_root (rootfs , err );
3289+ if (UNLIKELY (ret < 0 ))
3290+ return ret ;
3291+ }
3292+ else
3293+ {
3294+ ret = do_pivot (container , rootfs , err );
3295+ if (UNLIKELY (ret < 0 ))
3296+ return ret ;
3297+ }
3298+
/* Apply the propagation flags saved in the private data to the new "/".  */
3299+ ret = do_mount (container , NULL , -1 , "/" , NULL ,
3300+ get_private_data (container )-> rootfs_propagation ,
3301+ NULL , LABEL_MOUNT , err );
3302+ if (UNLIKELY (ret < 0 ))
3303+ return ret ;
3304+
3305+ ret = chdir ("/" );
3306+ if (UNLIKELY (ret < 0 ))
3307+ return crun_make_error (err , errno , "chdir to `/`" );
3308+ }
3309+
/* The notify socket and the remaining mount finalization are handled only
   after the optional root switch above.  */
3310+ ret = do_finalize_notify_socket (container , err );
3311+ if (UNLIKELY (ret < 0 ))
3312+ return ret ;
3313+
3314+ ret = finalize_mounts (container , err );
3315+ if (UNLIKELY (ret < 0 ))
3316+ return ret ;
3317+
3318+ return 0 ;
3319+ }
3320+
/* Try to get a file descriptor for ROOTFS through open_tree with the
   OPEN_TREE_NAMESPACE flag.

   *OUT_FD is set to -1 on entry and is replaced by the new descriptor only
   when open_tree succeeds.  The function returns 0 in two cases: the
   descriptor was obtained, or open_tree failed with EINVAL, ENOSYS or EPERM
   (*OUT_FD then stays -1, so the caller has to test it to tell the two
   apart).  Failing to open ROOTFS, or any other open_tree error, is
   reported through ERR with crun_make_error.  */
3321+ static int
3322+ maybe_open_tree_namespace (const char * rootfs , int * out_fd , libcrun_error_t * err )
3323+ {
/* NOTE(review): cleanup_close presumably closes rootfs_fd automatically when
   it goes out of scope; the macro is defined outside this view - confirm.  */
3324+ cleanup_close int rootfs_fd = -1 ;
3325+ int tree_fd ;
3326+
3327+ * out_fd = -1 ;
3328+
3329+ rootfs_fd = open (rootfs , O_DIRECTORY | O_PATH | O_CLOEXEC );
3330+ if (UNLIKELY (rootfs_fd < 0 ))
3331+ return crun_make_error (err , errno , "open `%s`" , rootfs );
3332+
/* Empty path plus AT_EMPTY_PATH: the call operates on rootfs_fd itself.  */
3333+ tree_fd = syscall_open_tree (rootfs_fd , "" ,
3334+ OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC
3335+ | AT_EMPTY_PATH | AT_RECURSIVE );
3336+ if (tree_fd < 0 )
3337+ {
/* These three errno values are not turned into an error; presumably they
   mean the flag is unsupported or not permitted here - TODO confirm.  */
3338+ if (errno == EINVAL || errno == ENOSYS || errno == EPERM )
3339+ return 0 ;
3340+ return crun_make_error (err , errno , "open_tree `%s`" , rootfs );
3341+ }
3342+
3343+ * out_fd = tree_fd ;
3344+ return 0 ;
3345+ }
3346+
32853347static struct libcrun_fd_map *
32863348get_fd_map (libcrun_container_t * container )
32873349{
@@ -3351,12 +3413,37 @@ open_mount_of_type (libcrun_container_t *container,
33513413 return mnt_fd ;
33523414}
33533415
/* Tell whether the OPEN_TREE_NAMESPACE path may be used for CONTAINER.

   Returns false when the configuration has at least one prestart or
   create_runtime hook, when CLONE_NEWUSER is set in the private data
   unshare_flags, or when any of the first def->mounts_len entries of the
   mount fd map holds a negative descriptor.  Returns true otherwise.  */
3416+ static bool
3417+ can_use_open_tree_namespace (libcrun_container_t * container )
3418+ {
3419+ runtime_spec_schema_config_schema * def = container -> container_def ;
3420+ struct libcrun_fd_map * mount_fds ;
3421+ bool has_hooks = def -> hooks
3422+ && (def -> hooks -> prestart_len || def -> hooks -> create_runtime_len );
3423+ bool has_userns = get_private_data (container )-> unshare_flags & CLONE_NEWUSER ;
3424+ size_t i ;
3425+
3426+ if (has_hooks || has_userns )
3427+ return false;
3428+
/* NOTE(review): the result of get_fd_map is dereferenced without a NULL
   check; its body is outside this view - confirm it cannot return NULL.  */
3429+ mount_fds = get_fd_map (container );
3430+ for (i = 0 ; i < def -> mounts_len ; i ++ )
3431+ {
/* A negative entry means no descriptor is stored for mount i; presumably
   such a mount would need the original host paths - TODO confirm.  */
3432+ if (mount_fds -> fds [i ] < 0 )
3433+ return false;
3434+ }
3435+
3436+ return true;
3437+ }
3438+
33543439static int
33553440setup_mount_namespace (libcrun_container_t * container , bool no_pivot , char * * rootfs , libcrun_error_t * err )
33563441{
33573442 runtime_spec_schema_config_schema * def = container -> container_def ;
33583443 unsigned long rootfs_propagation = 0 ;
3444+ cleanup_close int tree_fd = -1 ;
33593445 libcrun_error_t tmp_err = NULL ;
3446+ bool use_open_tree = false;
33603447 size_t i ;
33613448 int ret ;
33623449
@@ -3372,25 +3459,6 @@ setup_mount_namespace (libcrun_container_t *container, bool no_pivot, char **roo
33723459 if (UNLIKELY (ret < 0 ))
33733460 return ret ;
33743461
3375- if (! get_private_data (container )-> joined_mount_ns )
3376- {
3377- ret = unshare (CLONE_NEWNS );
3378- if (UNLIKELY (ret < 0 ))
3379- return crun_make_error (err , errno , "unshare `CLONE_NEWNS`" );
3380- }
3381-
3382- ret = do_mount (container , NULL , -1 , "/" , NULL , rootfs_propagation , NULL , LABEL_MOUNT , err );
3383- if (UNLIKELY (ret < 0 ))
3384- return ret ;
3385-
3386- ret = make_parent_mount_private (* rootfs , err );
3387- if (UNLIKELY (ret < 0 ))
3388- return ret ;
3389-
3390- ret = do_mount (container , * rootfs , -1 , * rootfs , NULL , MS_BIND | MS_REC | MS_PRIVATE , NULL , LABEL_MOUNT , err );
3391- if (UNLIKELY (ret < 0 ))
3392- return ret ;
3393-
33943462 /* Pre-create mounts and cache paths before pivot_root,
33953463 while the host file system is still reachable. */
33963464 for (i = 0 ; i < def -> mounts_len ; i ++ )
@@ -3481,6 +3549,65 @@ setup_mount_namespace (libcrun_container_t *container, bool no_pivot, char **roo
34813549 mnt_fd = -1 ;
34823550 }
34833551
3552+ if (! get_private_data (container )-> joined_mount_ns
3553+ && can_use_open_tree_namespace (container ))
3554+ use_open_tree = true;
3555+
3556+ if (use_open_tree )
3557+ {
3558+ ret = maybe_open_tree_namespace (* rootfs , & tree_fd , err );
3559+ if (UNLIKELY (ret < 0 ))
3560+ return ret ;
3561+ }
3562+
3563+ if (tree_fd >= 0 )
3564+ {
3565+ ret = setns (tree_fd , CLONE_NEWNS );
3566+ if (UNLIKELY (ret < 0 ))
3567+ return crun_make_error (err , errno , "setns `CLONE_NEWNS`" );
3568+
3569+ ret = mount (NULL , "/" , NULL , MS_REMOUNT | MS_BIND , NULL );
3570+ if (UNLIKELY (ret < 0 ))
3571+ return crun_make_error (err , errno , "remount `/`" );
3572+
3573+ ret = do_mount (container , NULL , -1 , "/" , NULL , MS_REC | MS_PRIVATE , NULL , LABEL_MOUNT , err );
3574+ if (UNLIKELY (ret < 0 ))
3575+ return ret ;
3576+
3577+ ret = do_mount (container , NULL , -1 , "/" , NULL , rootfs_propagation , NULL , LABEL_MOUNT , err );
3578+ if (UNLIKELY (ret < 0 ))
3579+ return ret ;
3580+
3581+ get_private_data (container )-> needs_pivot = false;
3582+ get_private_data (container )-> no_pivot = no_pivot ;
3583+ free (* rootfs );
3584+ * rootfs = xstrdup ("/" );
3585+ }
3586+ else
3587+ {
3588+ if (! get_private_data (container )-> joined_mount_ns )
3589+ {
3590+ ret = unshare (CLONE_NEWNS );
3591+ if (UNLIKELY (ret < 0 ))
3592+ return crun_make_error (err , errno , "unshare `CLONE_NEWNS`" );
3593+ }
3594+
3595+ ret = do_mount (container , NULL , -1 , "/" , NULL , rootfs_propagation , NULL , LABEL_MOUNT , err );
3596+ if (UNLIKELY (ret < 0 ))
3597+ return ret ;
3598+
3599+ ret = make_parent_mount_private (* rootfs , err );
3600+ if (UNLIKELY (ret < 0 ))
3601+ return ret ;
3602+
3603+ ret = do_mount (container , * rootfs , -1 , * rootfs , NULL , MS_BIND | MS_REC | MS_PRIVATE , NULL , LABEL_MOUNT , err );
3604+ if (UNLIKELY (ret < 0 ))
3605+ return ret ;
3606+
3607+ get_private_data (container )-> needs_pivot = true;
3608+ get_private_data (container )-> no_pivot = no_pivot ;
3609+ }
3610+
34843611 /* Mount everything before pivot_root while host paths are still reachable.
34853612 Use the pre-created fd when available, fall back to mount(). */
34863613 ret = open (* rootfs , O_PATH | O_CLOEXEC );
@@ -3568,23 +3695,6 @@ setup_mount_namespace (libcrun_container_t *container, bool no_pivot, char **roo
35683695 get_private_data (container )-> maskdir_bind_failed = true;
35693696 }
35703697
3571- if (no_pivot )
3572- {
3573- ret = move_root (* rootfs , err );
3574- if (UNLIKELY (ret < 0 ))
3575- return ret ;
3576- }
3577- else
3578- {
3579- ret = do_pivot (container , * rootfs , err );
3580- if (UNLIKELY (ret < 0 ))
3581- return ret ;
3582- }
3583-
3584- ret = do_mount (container , NULL , -1 , "/" , NULL , rootfs_propagation , NULL , LABEL_MOUNT , err );
3585- if (UNLIKELY (ret < 0 ))
3586- return ret ;
3587-
35883698 return 0 ;
35893699}
35903700
@@ -3601,20 +3711,30 @@ libcrun_do_pivot_root (libcrun_container_t *container, bool no_pivot, char **roo
36013711 ret = setup_mount_namespace (container , no_pivot , rootfs , err );
36023712 if (UNLIKELY (ret < 0 ))
36033713 return ret ;
3714+
3715+ /* If setup_mount_namespace used OPEN_TREE_NAMESPACE, rootfs is
3716+ already set to "/". Otherwise pivot_root is deferred until
3717+ after the mounts are created. */
3718+ if (strcmp (* rootfs , "/" ) == 0 )
3719+ {
3720+ ret = chdir ("/" );
3721+ if (UNLIKELY (ret < 0 ))
3722+ return crun_make_error (err , errno , "chdir to `/`" );
3723+ }
36043724 }
36053725 else
36063726 {
36073727 ret = chroot (* rootfs );
36083728 if (UNLIKELY (ret < 0 ))
36093729 return crun_make_error (err , errno , "chroot to `%s`" , * rootfs );
3610- }
36113730
3612- ret = chdir ("/" );
3613- if (UNLIKELY (ret < 0 ))
3614- return crun_make_error (err , errno , "chdir to `/`" );
3731+ ret = chdir ("/" );
3732+ if (UNLIKELY (ret < 0 ))
3733+ return crun_make_error (err , errno , "chdir to `/`" );
36153734
3616- free (* rootfs );
3617- * rootfs = xstrdup ("/" );
3735+ free (* rootfs );
3736+ * rootfs = xstrdup ("/" );
3737+ }
36183738
36193739 return 0 ;
36203740}
@@ -5210,7 +5330,6 @@ prepare_and_send_mount_mounts (libcrun_container_t *container, pid_t pid, int sy
52105330 if (propagation == 0 )
52115331 propagation = MS_PRIVATE ;
52125332
5213- /* If the bind mount failed, do not fail here, but attempt to create it from within the container. */
52145333 mount_fd = get_bind_mount (-1 , def -> mounts [i ]-> source , recursive , false, nofollow , propagation , err );
52155334 if (UNLIKELY (mount_fd < 0 ))
52165335 crun_error_release (err );
0 commit comments