8888# define OPEN_TREE_CLOEXEC O_CLOEXEC
8989#endif
9090
91+ #ifndef OPEN_TREE_NAMESPACE
92+ # define OPEN_TREE_NAMESPACE 2
93+ #endif
94+
9195#ifndef MOVE_MOUNT_F_EMPTY_PATH
9296# define MOVE_MOUNT_F_EMPTY_PATH 0x00000004
9397#endif
@@ -162,6 +166,8 @@ struct private_data_s
162166 bool maskdir_bind_failed ;
163167 bool maskdir_warned ;
164168 bool joined_mount_ns ;
169+ bool needs_pivot ;
170+ bool no_pivot ;
165171};
166172
167173struct linux_namespace_s
@@ -940,6 +946,7 @@ fsopen_mount (const char *type, const char *source_name, const char *labeltype,
940946 if (eq )
941947 {
942948 * eq = '\0' ;
949+
943950 ret = syscall_fsconfig (fsfd , FSCONFIG_SET_STRING , token , eq + 1 , 0 );
944951 }
945952 else
@@ -1496,7 +1503,7 @@ do_mount (libcrun_container_t *container, const char *source, int targetfd,
14961503 }
14971504
14981505 close_and_reset (& fd );
1499- fd = openat ( procfd , "self/root" , O_PATH | O_CLOEXEC );
1506+ fd = open ( get_private_data ( container ) -> rootfs , O_PATH | O_CLOEXEC );
15001507 if (UNLIKELY (fd < 0 ))
15011508 return crun_make_error (err , errno , "reopen rootfs after mount on /" );
15021509
@@ -2696,6 +2703,25 @@ process_single_mount (libcrun_container_t *container, const char *rootfs,
26962703 if (UNLIKELY (ret < 0 ))
26972704 return ret ;
26982705 }
2706+
2707+ if (is_empty_string (target ))
2708+ {
2709+ cleanup_close int fd = -1 ;
2710+
2711+ fd = open (get_private_data (container )-> rootfs , O_PATH | O_CLOEXEC );
2712+ if (UNLIKELY (fd < 0 ))
2713+ return crun_make_error (err , errno , "reopen rootfs after mount on /" );
2714+
2715+ {
2716+ int tmp = dup (fd );
2717+ if (UNLIKELY (tmp < 0 ))
2718+ return crun_make_error (err , errno , "dup" );
2719+
2720+ TEMP_FAILURE_RETRY (close (get_private_data (container )-> rootfsfd ));
2721+ get_private_data (container )-> rootfsfd = tmp ;
2722+ }
2723+ }
2724+
26992725 mounted = true;
27002726
27012727 if (is_empty_string (target ))
@@ -3154,10 +3180,6 @@ libcrun_set_mounts (struct container_entrypoint_s *entrypoint_args, libcrun_cont
31543180 return ret ;
31553181 }
31563182
3157- ret = do_finalize_notify_socket (container , err );
3158- if (UNLIKELY (ret < 0 ))
3159- return ret ;
3160-
31613183 if (def -> process && def -> process -> cwd )
31623184 {
31633185 libcrun_error_t tmp_err = NULL ;
@@ -3174,25 +3196,6 @@ libcrun_set_mounts (struct container_entrypoint_s *entrypoint_args, libcrun_cont
31743196 return 0 ;
31753197}
31763198
3177- int
3178- libcrun_finalize_mounts (struct container_entrypoint_s * entrypoint_args , libcrun_container_t * container , const char * rootfs , libcrun_error_t * err )
3179- {
3180- int ret ;
3181-
3182- ret = finalize_mounts (container , err );
3183- if (UNLIKELY (ret < 0 ))
3184- return ret ;
3185-
3186- // configure handler mounts for phase: HANDLER_CONFIGURE_AFTER_MOUNTS
3187- ret = libcrun_container_notify_handler (entrypoint_args , HANDLER_CONFIGURE_AFTER_MOUNTS , container , rootfs , err );
3188- if (UNLIKELY (ret < 0 ))
3189- return crun_error_wrap (err , "failed configuring mounts for handler at phase: HANDLER_CONFIGURE_AFTER_MOUNTS" );
3190-
3191- close_and_reset (& (get_private_data (container )-> rootfsfd ));
3192-
3193- return 0 ;
3194- }
3195-
31963199static int
31973200umount_or_hide (const char * target , libcrun_error_t * err )
31983201{
@@ -3250,6 +3253,83 @@ move_root (const char *rootfs, libcrun_error_t *err)
32503253 return 0 ;
32513254}
32523255
3256+ int
3257+ libcrun_finalize_mounts (struct container_entrypoint_s * entrypoint_args , libcrun_container_t * container , const char * rootfs , libcrun_error_t * err )
3258+ {
3259+ int ret ;
3260+
3261+ // configure handler mounts for phase: HANDLER_CONFIGURE_AFTER_MOUNTS
3262+ ret = libcrun_container_notify_handler (entrypoint_args , HANDLER_CONFIGURE_AFTER_MOUNTS , container , rootfs , err );
3263+ if (UNLIKELY (ret < 0 ))
3264+ return crun_error_wrap (err , "failed configuring mounts for handler at phase: HANDLER_CONFIGURE_AFTER_MOUNTS" );
3265+
3266+ close_and_reset (& (get_private_data (container )-> rootfsfd ));
3267+
3268+ if (get_private_data (container )-> needs_pivot )
3269+ {
3270+ get_private_data (container )-> needs_pivot = false;
3271+
3272+ if (get_private_data (container )-> no_pivot )
3273+ {
3274+ ret = move_root (rootfs , err );
3275+ if (UNLIKELY (ret < 0 ))
3276+ return ret ;
3277+ }
3278+ else
3279+ {
3280+ ret = do_pivot (container , rootfs , err );
3281+ if (UNLIKELY (ret < 0 ))
3282+ return ret ;
3283+ }
3284+
3285+ ret = do_mount (container , NULL , -1 , "/" , NULL ,
3286+ get_private_data (container )-> rootfs_propagation ,
3287+ NULL , LABEL_MOUNT , err );
3288+ if (UNLIKELY (ret < 0 ))
3289+ return ret ;
3290+
3291+ ret = chdir ("/" );
3292+ if (UNLIKELY (ret < 0 ))
3293+ return crun_make_error (err , errno , "chdir to `/`" );
3294+ }
3295+
3296+ ret = do_finalize_notify_socket (container , err );
3297+ if (UNLIKELY (ret < 0 ))
3298+ return ret ;
3299+
3300+ ret = finalize_mounts (container , err );
3301+ if (UNLIKELY (ret < 0 ))
3302+ return ret ;
3303+
3304+ return 0 ;
3305+ }
3306+
3307+ static int
3308+ maybe_open_tree_namespace (const char * rootfs , int * out_fd , libcrun_error_t * err )
3309+ {
3310+ cleanup_close int rootfs_fd = -1 ;
3311+ int tree_fd ;
3312+
3313+ * out_fd = -1 ;
3314+
3315+ rootfs_fd = open (rootfs , O_DIRECTORY | O_PATH | O_CLOEXEC );
3316+ if (UNLIKELY (rootfs_fd < 0 ))
3317+ return crun_make_error (err , errno , "open `%s`" , rootfs );
3318+
3319+ tree_fd = syscall_open_tree (rootfs_fd , "" ,
3320+ OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC
3321+ | AT_EMPTY_PATH | AT_RECURSIVE );
3322+ if (tree_fd < 0 )
3323+ {
3324+ if (errno == EINVAL || errno == ENOSYS || errno == EPERM )
3325+ return 0 ;
3326+ return crun_make_error (err , errno , "open_tree `%s`" , rootfs );
3327+ }
3328+
3329+ * out_fd = tree_fd ;
3330+ return 0 ;
3331+ }
3332+
32533333static struct libcrun_fd_map *
32543334get_fd_map (libcrun_container_t * container )
32553335{
@@ -3317,12 +3397,37 @@ open_mount_of_type (libcrun_container_t *container,
33173397 return mnt_fd ;
33183398}
33193399
3400+ static bool
3401+ can_use_open_tree_namespace (libcrun_container_t * container )
3402+ {
3403+ runtime_spec_schema_config_schema * def = container -> container_def ;
3404+ struct libcrun_fd_map * mount_fds ;
3405+ bool has_hooks = def -> hooks
3406+ && (def -> hooks -> prestart_len || def -> hooks -> create_runtime_len );
3407+ bool has_userns = get_private_data (container )-> unshare_flags & CLONE_NEWUSER ;
3408+ size_t i ;
3409+
3410+ if (has_hooks || has_userns )
3411+ return false;
3412+
3413+ mount_fds = get_fd_map (container );
3414+ for (i = 0 ; i < def -> mounts_len ; i ++ )
3415+ {
3416+ if (mount_fds -> fds [i ] < 0 )
3417+ return false;
3418+ }
3419+
3420+ return true;
3421+ }
3422+
33203423static int
33213424setup_mount_namespace (libcrun_container_t * container , bool no_pivot , char * * rootfs , libcrun_error_t * err )
33223425{
33233426 runtime_spec_schema_config_schema * def = container -> container_def ;
33243427 unsigned long rootfs_propagation = 0 ;
3428+ cleanup_close int tree_fd = -1 ;
33253429 libcrun_error_t tmp_err = NULL ;
3430+ bool use_open_tree = false;
33263431 size_t i ;
33273432 int ret ;
33283433
@@ -3334,25 +3439,6 @@ setup_mount_namespace (libcrun_container_t *container, bool no_pivot, char **roo
33343439
33353440 get_private_data (container )-> rootfs_propagation = rootfs_propagation ;
33363441
3337- if (! get_private_data (container )-> joined_mount_ns )
3338- {
3339- ret = unshare (CLONE_NEWNS );
3340- if (UNLIKELY (ret < 0 ))
3341- return crun_make_error (err , errno , "unshare `CLONE_NEWNS`" );
3342- }
3343-
3344- ret = do_mount (container , NULL , -1 , "/" , NULL , rootfs_propagation , NULL , LABEL_MOUNT , err );
3345- if (UNLIKELY (ret < 0 ))
3346- return ret ;
3347-
3348- ret = make_parent_mount_private (* rootfs , err );
3349- if (UNLIKELY (ret < 0 ))
3350- return ret ;
3351-
3352- ret = do_mount (container , * rootfs , -1 , * rootfs , NULL , MS_BIND | MS_REC | MS_PRIVATE , NULL , LABEL_MOUNT , err );
3353- if (UNLIKELY (ret < 0 ))
3354- return ret ;
3355-
33563442 /* Pre-create mounts and cache paths before pivot_root,
33573443 while the host file system is still reachable. */
33583444 for (i = 0 ; i < def -> mounts_len ; i ++ )
@@ -3442,6 +3528,65 @@ setup_mount_namespace (libcrun_container_t *container, bool no_pivot, char **roo
34423528 mnt_fd = -1 ;
34433529 }
34443530
3531+ if (! get_private_data (container )-> joined_mount_ns
3532+ && can_use_open_tree_namespace (container ))
3533+ use_open_tree = true;
3534+
3535+ if (use_open_tree )
3536+ {
3537+ ret = maybe_open_tree_namespace (* rootfs , & tree_fd , err );
3538+ if (UNLIKELY (ret < 0 ))
3539+ return ret ;
3540+ }
3541+
3542+ if (tree_fd >= 0 )
3543+ {
3544+ ret = setns (tree_fd , CLONE_NEWNS );
3545+ if (UNLIKELY (ret < 0 ))
3546+ return crun_make_error (err , errno , "setns `CLONE_NEWNS`" );
3547+
3548+ ret = mount (NULL , "/" , NULL , MS_REMOUNT | MS_BIND , NULL );
3549+ if (UNLIKELY (ret < 0 ))
3550+ return crun_make_error (err , errno , "remount `/`" );
3551+
3552+ ret = do_mount (container , NULL , -1 , "/" , NULL , MS_REC | MS_PRIVATE , NULL , LABEL_MOUNT , err );
3553+ if (UNLIKELY (ret < 0 ))
3554+ return ret ;
3555+
3556+ ret = do_mount (container , NULL , -1 , "/" , NULL , rootfs_propagation , NULL , LABEL_MOUNT , err );
3557+ if (UNLIKELY (ret < 0 ))
3558+ return ret ;
3559+
3560+ get_private_data (container )-> needs_pivot = false;
3561+ get_private_data (container )-> no_pivot = no_pivot ;
3562+ free (* rootfs );
3563+ * rootfs = xstrdup ("/" );
3564+ }
3565+ else
3566+ {
3567+ if (! get_private_data (container )-> joined_mount_ns )
3568+ {
3569+ ret = unshare (CLONE_NEWNS );
3570+ if (UNLIKELY (ret < 0 ))
3571+ return crun_make_error (err , errno , "unshare `CLONE_NEWNS`" );
3572+ }
3573+
3574+ ret = do_mount (container , NULL , -1 , "/" , NULL , rootfs_propagation , NULL , LABEL_MOUNT , err );
3575+ if (UNLIKELY (ret < 0 ))
3576+ return ret ;
3577+
3578+ ret = make_parent_mount_private (* rootfs , err );
3579+ if (UNLIKELY (ret < 0 ))
3580+ return ret ;
3581+
3582+ ret = do_mount (container , * rootfs , -1 , * rootfs , NULL , MS_BIND | MS_REC | MS_PRIVATE , NULL , LABEL_MOUNT , err );
3583+ if (UNLIKELY (ret < 0 ))
3584+ return ret ;
3585+
3586+ get_private_data (container )-> needs_pivot = true;
3587+ get_private_data (container )-> no_pivot = no_pivot ;
3588+ }
3589+
34453590 /* Mount everything before pivot_root while host paths are still reachable.
34463591 Use the pre-created fd when available, fall back to mount(). */
34473592 ret = open (* rootfs , O_PATH | O_CLOEXEC );
@@ -3533,23 +3678,6 @@ setup_mount_namespace (libcrun_container_t *container, bool no_pivot, char **roo
35333678 get_private_data (container )-> maskdir_bind_failed = true;
35343679 }
35353680
3536- if (no_pivot )
3537- {
3538- ret = move_root (* rootfs , err );
3539- if (UNLIKELY (ret < 0 ))
3540- return ret ;
3541- }
3542- else
3543- {
3544- ret = do_pivot (container , * rootfs , err );
3545- if (UNLIKELY (ret < 0 ))
3546- return ret ;
3547- }
3548-
3549- ret = do_mount (container , NULL , -1 , "/" , NULL , rootfs_propagation , NULL , LABEL_MOUNT , err );
3550- if (UNLIKELY (ret < 0 ))
3551- return ret ;
3552-
35533681 return 0 ;
35543682}
35553683
@@ -3566,20 +3694,30 @@ libcrun_do_pivot_root (libcrun_container_t *container, bool no_pivot, char **roo
35663694 ret = setup_mount_namespace (container , no_pivot , rootfs , err );
35673695 if (UNLIKELY (ret < 0 ))
35683696 return ret ;
3697+
3698+ /* If setup_mount_namespace used OPEN_TREE_NAMESPACE, rootfs is
3699+ already set to "/". Otherwise pivot_root is deferred until
3700+ after the mounts are created. */
3701+ if (strcmp (* rootfs , "/" ) == 0 )
3702+ {
3703+ ret = chdir ("/" );
3704+ if (UNLIKELY (ret < 0 ))
3705+ return crun_make_error (err , errno , "chdir to `/`" );
3706+ }
35693707 }
35703708 else
35713709 {
35723710 ret = chroot (* rootfs );
35733711 if (UNLIKELY (ret < 0 ))
35743712 return crun_make_error (err , errno , "chroot to `%s`" , * rootfs );
3575- }
35763713
3577- ret = chdir ("/" );
3578- if (UNLIKELY (ret < 0 ))
3579- return crun_make_error (err , errno , "chdir to `/`" );
3714+ ret = chdir ("/" );
3715+ if (UNLIKELY (ret < 0 ))
3716+ return crun_make_error (err , errno , "chdir to `/`" );
35803717
3581- free (* rootfs );
3582- * rootfs = xstrdup ("/" );
3718+ free (* rootfs );
3719+ * rootfs = xstrdup ("/" );
3720+ }
35833721
35843722 return 0 ;
35853723}
@@ -5175,7 +5313,6 @@ prepare_and_send_mount_mounts (libcrun_container_t *container, pid_t pid, int sy
51755313 if (propagation == 0 )
51765314 propagation = MS_PRIVATE ;
51775315
5178- /* If the bind mount failed, do not fail here, but attempt to create it from within the container. */
51795316 mount_fd = get_bind_mount (-1 , def -> mounts [i ]-> source , recursive , false, nofollow , propagation , err );
51805317 if (UNLIKELY (mount_fd < 0 ))
51815318 crun_error_release (err );
0 commit comments