Merge tag 'vfs-7.1-rc1.mount.v2' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

+113 -83

fs/namespace.c

··· 2646 2646 2647 2647 if (unlikely(shorter) && child != source_mnt) 2648 2648 mp = shorter; 2649 + /* 2650 + * If @q was locked it was meant to hide 2651 + * whatever was under it. Let @child take over 2652 + * that job and lock it, then we can unlock @q. 2653 + * That'll allow another namespace to shed @q 2654 + * and reveal @child. Clearly, that mounter 2655 + * consented to this by not severing the mount 2656 + * relationship. Otherwise, what's the point. 2657 + */ 2658 + if (IS_MNT_LOCKED(q)) { 2659 + child->mnt.mnt_flags |= MNT_LOCKED; 2660 + q->mnt.mnt_flags &= ~MNT_LOCKED; 2661 + } 2649 2662 mnt_change_mountpoint(r, mp, q); 2650 2663 } 2651 2664 } ··· 2735 2722 * In all cases the location must not have been unmounted and the 2736 2723 * chosen mountpoint must be allowed to be mounted on. For "beneath" 2737 2724 * case we also require the location to be at the root of a mount 2738 - * that has a parent (i.e. is not a root of some namespace). 2725 + * that has something mounted on top of it (i.e. has an overmount). 2739 2726 */ 2740 2727 static void do_lock_mount(const struct path *path, 2741 2728 struct pinned_mountpoint *res, ··· 2971 2958 } 2972 2959 2973 2960 static struct mount *__do_loopback(const struct path *old_path, 2974 - unsigned int flags, unsigned int copy_flags) 2961 + bool recurse, unsigned int copy_flags) 2975 2962 { 2976 2963 struct mount *old = real_mount(old_path->mnt); 2977 - bool recurse = flags & AT_RECURSIVE; 2978 2964 2979 2965 if (IS_MNT_UNBINDABLE(old)) 2980 2966 return ERR_PTR(-EINVAL); ··· 2983 2971 2984 2972 if (!recurse && __has_locked_children(old, old_path->dentry)) 2985 2973 return ERR_PTR(-EINVAL); 2986 - 2987 - /* 2988 - * When creating a new mount namespace we don't want to copy over 2989 - * mounts of mount namespaces to avoid the risk of cycles and also to 2990 - * minimize the default complex interdependencies between mount 2991 - * namespaces. 
2992 - * 2993 - * We could ofc just check whether all mount namespace files aren't 2994 - * creating cycles but really let's keep this simple. 2995 - */ 2996 - if (!(flags & OPEN_TREE_NAMESPACE)) 2997 - copy_flags |= CL_COPY_MNT_NS_FILE; 2998 2974 2999 2975 if (recurse) 3000 2976 return copy_tree(old, old_path->dentry, copy_flags); ··· 2998 2998 { 2999 2999 struct path old_path __free(path_put) = {}; 3000 3000 struct mount *mnt = NULL; 3001 - unsigned int flags = recurse ? AT_RECURSIVE : 0; 3002 3001 int err; 3003 3002 3004 3003 if (!old_name || !*old_name) ··· 3016 3017 if (!check_mnt(mp.parent)) 3017 3018 return -EINVAL; 3018 3019 3019 - mnt = __do_loopback(&old_path, flags, 0); 3020 + mnt = __do_loopback(&old_path, recurse, CL_COPY_MNT_NS_FILE); 3020 3021 if (IS_ERR(mnt)) 3021 3022 return PTR_ERR(mnt); 3022 3023 ··· 3054 3055 ns->seq_origin = src_mnt_ns->ns.ns_id; 3055 3056 } 3056 3057 3057 - mnt = __do_loopback(path, flags, 0); 3058 + mnt = __do_loopback(path, (flags & AT_RECURSIVE), CL_COPY_MNT_NS_FILE); 3058 3059 if (IS_ERR(mnt)) { 3059 3060 emptied_ns = ns; 3060 3061 return ERR_CAST(mnt); ··· 3086 3087 return file; 3087 3088 } 3088 3089 3089 - static struct mnt_namespace *create_new_namespace(struct path *path, unsigned int flags) 3090 + enum mount_copy_flags_t { 3091 + MOUNT_COPY_RECURSIVE = (1 << 0), 3092 + MOUNT_COPY_NEW = (1 << 1), 3093 + }; 3094 + 3095 + static struct mnt_namespace *create_new_namespace(struct path *path, 3096 + enum mount_copy_flags_t flags) 3090 3097 { 3091 3098 struct mnt_namespace *ns = current->nsproxy->mnt_ns; 3092 3099 struct user_namespace *user_ns = current_user_ns(); ··· 3101 3096 struct path to_path; 3102 3097 struct mount *mnt; 3103 3098 unsigned int copy_flags = 0; 3104 - bool locked = false; 3099 + bool locked = false, recurse = flags & MOUNT_COPY_RECURSIVE; 3105 3100 3106 3101 if (user_ns != ns->user_ns) 3107 3102 copy_flags |= CL_SLAVE; ··· 3136 3131 } 3137 3132 3138 3133 /* 3139 - * We don't emulate unshare()ing a mount 
namespace. We stick 3140 - * to the restrictions of creating detached bind-mounts. It 3141 - * has a lot saner and simpler semantics. 3134 + * We don't emulate unshare()ing a mount namespace. We stick to 3135 + * the restrictions of creating detached bind-mounts. It has a 3136 + * lot saner and simpler semantics. 3142 3137 */ 3143 - mnt = __do_loopback(path, flags, copy_flags); 3138 + if (flags & MOUNT_COPY_NEW) 3139 + mnt = clone_mnt(real_mount(path->mnt), path->dentry, copy_flags); 3140 + else 3141 + mnt = __do_loopback(path, recurse, copy_flags); 3144 3142 scoped_guard(mount_writer) { 3145 3143 if (IS_ERR(mnt)) { 3146 3144 emptied_ns = new_ns; ··· 3172 3164 return new_ns; 3173 3165 } 3174 3166 3175 - static struct file *open_new_namespace(struct path *path, unsigned int flags) 3167 + static struct file *open_new_namespace(struct path *path, 3168 + enum mount_copy_flags_t flags) 3176 3169 { 3177 3170 struct mnt_namespace *new_ns; 3178 3171 ··· 3226 3217 return ERR_PTR(ret); 3227 3218 3228 3219 if (flags & OPEN_TREE_NAMESPACE) 3229 - return open_new_namespace(&path, flags); 3220 + return open_new_namespace(&path, (flags & AT_RECURSIVE) ? MOUNT_COPY_RECURSIVE : 0); 3230 3221 3231 3222 if (flags & OPEN_TREE_CLONE) 3232 3223 return open_detached_copy(&path, flags); ··· 3522 3513 * @mnt_to: mount under which to mount 3523 3514 * @mp: mountpoint of @mnt_to 3524 3515 * 3525 - * - Make sure that nothing can be mounted beneath the caller's current 3526 - * root or the rootfs of the namespace. 3527 3516 * - Make sure that the caller can unmount the topmost mount ensuring 3528 3517 * that the caller could reveal the underlying mountpoint. 
3529 3518 * - Ensure that nothing has been mounted on top of @mnt_from before we ··· 3535 3528 */ 3536 3529 static int can_move_mount_beneath(const struct mount *mnt_from, 3537 3530 const struct mount *mnt_to, 3538 - const struct mountpoint *mp) 3531 + struct pinned_mountpoint *mp) 3539 3532 { 3540 3533 struct mount *parent_mnt_to = mnt_to->mnt_parent; 3541 3534 3542 - if (IS_MNT_LOCKED(mnt_to)) 3543 - return -EINVAL; 3544 - 3545 3535 /* Avoid creating shadow mounts during mount propagation. */ 3546 3536 if (mnt_from->overmount) 3547 - return -EINVAL; 3548 - 3549 - /* 3550 - * Mounting beneath the rootfs only makes sense when the 3551 - * semantics of pivot_root(".", ".") are used. 3552 - */ 3553 - if (&mnt_to->mnt == current->fs->root.mnt) 3554 - return -EINVAL; 3555 - if (parent_mnt_to == current->nsproxy->mnt_ns->root) 3556 3537 return -EINVAL; 3557 3538 3558 3539 if (mount_is_ancestor(mnt_to, mnt_from)) ··· 3552 3557 * propagating a copy @c of @mnt_from on top of @mnt_to. This 3553 3558 * defeats the whole purpose of mounting beneath another mount. 3554 3559 */ 3555 - if (propagation_would_overmount(parent_mnt_to, mnt_to, mp)) 3560 + if (propagation_would_overmount(parent_mnt_to, mnt_to, mp->mp)) 3556 3561 return -EINVAL; 3557 3562 3558 3563 /* ··· 3568 3573 * @mnt_from beneath @mnt_to. 
3569 3574 */ 3570 3575 if (check_mnt(mnt_from) && 3571 - propagation_would_overmount(parent_mnt_to, mnt_from, mp)) 3576 + propagation_would_overmount(parent_mnt_to, mnt_from, mp->mp)) 3572 3577 return -EINVAL; 3573 3578 3574 3579 return 0; ··· 3677 3682 3678 3683 if (mp.parent != over->mnt_parent) 3679 3684 over = mp.parent->overmount; 3680 - err = can_move_mount_beneath(old, over, mp.mp); 3685 + err = can_move_mount_beneath(old, over, &mp); 3681 3686 if (err) 3682 3687 return err; 3683 3688 } ··· 4226 4231 struct user_namespace *user_ns, struct fs_struct *new_fs) 4227 4232 { 4228 4233 struct mnt_namespace *new_ns; 4229 - struct vfsmount *rootmnt __free(mntput) = NULL; 4230 - struct vfsmount *pwdmnt __free(mntput) = NULL; 4234 + struct path old_root __free(path_put) = {}; 4235 + struct path old_pwd __free(path_put) = {}; 4231 4236 struct mount *p, *q; 4232 4237 struct mount *old; 4233 4238 struct mount *new; ··· 4247 4252 return new_ns; 4248 4253 4249 4254 guard(namespace_excl)(); 4250 - /* First pass: copy the tree topology */ 4251 - copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE; 4255 + 4256 + if (flags & CLONE_EMPTY_MNTNS) 4257 + copy_flags = 0; 4258 + else 4259 + copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE; 4252 4260 if (user_ns != ns->user_ns) 4253 4261 copy_flags |= CL_SLAVE; 4254 - new = copy_tree(old, old->mnt.mnt_root, copy_flags); 4262 + 4263 + if (flags & CLONE_EMPTY_MNTNS) 4264 + new = clone_mnt(old, old->mnt.mnt_root, copy_flags); 4265 + else 4266 + new = copy_tree(old, old->mnt.mnt_root, copy_flags); 4255 4267 if (IS_ERR(new)) { 4256 4268 emptied_ns = new_ns; 4257 4269 return ERR_CAST(new); ··· 4269 4267 } 4270 4268 new_ns->root = new; 4271 4269 4272 - /* 4273 - * Second pass: switch the tsk->fs->* elements and mark new vfsmounts 4274 - * as belonging to new namespace. We have already acquired a private 4275 - * fs_struct, so tsk->fs->lock is not needed. 
4276 - */ 4277 - p = old; 4278 - q = new; 4279 - while (p) { 4280 - mnt_add_to_ns(new_ns, q); 4281 - new_ns->nr_mounts++; 4270 + if (flags & CLONE_EMPTY_MNTNS) { 4271 + /* 4272 + * Empty mount namespace: only the root mount exists. 4273 + * Reset root and pwd to the cloned mount's root dentry. 4274 + */ 4282 4275 if (new_fs) { 4283 - if (&p->mnt == new_fs->root.mnt) { 4284 - new_fs->root.mnt = mntget(&q->mnt); 4285 - rootmnt = &p->mnt; 4286 - } 4287 - if (&p->mnt == new_fs->pwd.mnt) { 4288 - new_fs->pwd.mnt = mntget(&q->mnt); 4289 - pwdmnt = &p->mnt; 4290 - } 4276 + old_root = new_fs->root; 4277 + old_pwd = new_fs->pwd; 4278 + 4279 + new_fs->root.mnt = mntget(&new->mnt); 4280 + new_fs->root.dentry = dget(new->mnt.mnt_root); 4281 + 4282 + new_fs->pwd.mnt = mntget(&new->mnt); 4283 + new_fs->pwd.dentry = dget(new->mnt.mnt_root); 4291 4284 } 4292 - p = next_mnt(p, old); 4293 - q = next_mnt(q, new); 4294 - if (!q) 4295 - break; 4296 - // an mntns binding we'd skipped? 4297 - while (p->mnt.mnt_root != q->mnt.mnt_root) 4298 - p = next_mnt(skip_mnt_tree(p), old); 4285 + mnt_add_to_ns(new_ns, new); 4286 + new_ns->nr_mounts++; 4287 + } else { 4288 + /* 4289 + * Full copy: walk old and new trees in parallel, switching 4290 + * the tsk->fs->* elements and marking new vfsmounts as 4291 + * belonging to new namespace. We have already acquired a 4292 + * private fs_struct, so tsk->fs->lock is not needed. 4293 + */ 4294 + p = old; 4295 + q = new; 4296 + while (p) { 4297 + mnt_add_to_ns(new_ns, q); 4298 + new_ns->nr_mounts++; 4299 + if (new_fs) { 4300 + if (&p->mnt == new_fs->root.mnt) { 4301 + old_root.mnt = new_fs->root.mnt; 4302 + new_fs->root.mnt = mntget(&q->mnt); 4303 + } 4304 + if (&p->mnt == new_fs->pwd.mnt) { 4305 + old_pwd.mnt = new_fs->pwd.mnt; 4306 + new_fs->pwd.mnt = mntget(&q->mnt); 4307 + } 4308 + } 4309 + p = next_mnt(p, old); 4310 + q = next_mnt(q, new); 4311 + if (!q) 4312 + break; 4313 + // an mntns binding we'd skipped? 
4314 + while (p->mnt.mnt_root != q->mnt.mnt_root) 4315 + p = next_mnt(skip_mnt_tree(p), old); 4316 + } 4299 4317 } 4300 4318 ns_tree_add_raw(new_ns); 4301 4319 return new_ns; ··· 4436 4414 unsigned int mnt_flags = 0; 4437 4415 long ret; 4438 4416 4439 - if (!may_mount()) 4417 + if ((flags & ~(FSMOUNT_CLOEXEC | FSMOUNT_NAMESPACE)) != 0) 4418 + return -EINVAL; 4419 + 4420 + if ((flags & FSMOUNT_NAMESPACE) && 4421 + !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) 4440 4422 return -EPERM; 4441 4423 4442 - if ((flags & ~(FSMOUNT_CLOEXEC)) != 0) 4443 - return -EINVAL; 4424 + if (!(flags & FSMOUNT_NAMESPACE) && !may_mount()) 4425 + return -EPERM; 4444 4426 4445 4427 if (attr_flags & ~FSMOUNT_VALID_FLAGS) 4446 4428 return -EINVAL; ··· 4510 4484 * don't want to have to handle any errors incurred. 4511 4485 */ 4512 4486 vfs_clean_context(fc); 4487 + 4488 + if (flags & FSMOUNT_NAMESPACE) 4489 + return FD_ADD((flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0, 4490 + open_new_namespace(&new_path, MOUNT_COPY_NEW)); 4513 4491 4514 4492 ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, true); 4515 4493 if (IS_ERR(ns)) ··· 5679 5649 if (mnt_ns_empty(ns)) 5680 5650 return -ENOENT; 5681 5651 5682 - first = child = ns->root; 5683 - for (;;) { 5684 - child = listmnt_next(child, false); 5685 - if (!child) 5686 - return -ENOENT; 5687 - if (child->mnt_parent == first) 5652 + first = ns->root; 5653 + for (child = node_to_mount(ns->mnt_first_node); child; 5654 + child = listmnt_next(child, false)) { 5655 + if (child != first && child->mnt_parent == first) 5688 5656 break; 5689 5657 } 5658 + if (!child) 5659 + return -ENOENT; 5690 5660 5691 5661 root->mnt = mntget(&child->mnt); 5692 5662 root->dentry = dget(root->mnt->mnt_root);

+1

include/uapi/linux/mount.h

··· 110 110 * fsmount() flags. 111 111 */ 112 112 #define FSMOUNT_CLOEXEC 0x00000001 113 + #define FSMOUNT_NAMESPACE 0x00000002 /* Create the mount in a new mount namespace */ 113 114 114 115 /* 115 116 * Mount attributes.

+7

include/uapi/linux/sched.h

··· 39 39 #define CLONE_AUTOREAP (1ULL << 34) /* Auto-reap child on exit. */ 40 40 #define CLONE_NNP (1ULL << 35) /* Set no_new_privs on child. */ 41 41 #define CLONE_PIDFD_AUTOKILL (1ULL << 36) /* Kill child when clone pidfd closes. */ 42 + #define CLONE_EMPTY_MNTNS (1ULL << 37) /* Create an empty mount namespace. */ 42 43 43 44 /* 44 45 * cloning flags intersect with CSIGNAL so can be used with unshare and clone3 45 46 * syscalls only: 46 47 */ 47 48 #define CLONE_NEWTIME 0x00000080 /* New time namespace */ 49 + 50 + /* 51 + * unshare flags share the bit space with clone flags but only apply to the 52 + * unshare syscall: 53 + */ 54 + #define UNSHARE_EMPTY_MNTNS 0x00100000 /* Unshare an empty mount namespace. */ 48 55 49 56 #ifndef __ASSEMBLY__ 50 57 /**

+16 -3

kernel/fork.c

··· 2667 2667 pid_t nr; 2668 2668 2669 2669 /* 2670 + * Creating an empty mount namespace implies creating a new mount 2671 + * namespace. Set this before copy_process() so that the 2672 + * CLONE_NEWNS|CLONE_FS mutual exclusion check works correctly. 2673 + */ 2674 + if (clone_flags & CLONE_EMPTY_MNTNS) { 2675 + clone_flags |= CLONE_NEWNS; 2676 + args->flags = clone_flags; 2677 + } 2678 + 2679 + /* 2670 2680 * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument 2671 2681 * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are 2672 2682 * mutually exclusive. With clone3() CLONE_PIDFD has grown a separate ··· 2954 2944 { 2955 2945 /* Verify that no unknown flags are passed along. */ 2956 2946 if (kargs->flags & 2957 - ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP | 2958 - CLONE_AUTOREAP | CLONE_NNP | CLONE_PIDFD_AUTOKILL)) 2947 + ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | 2948 + CLONE_INTO_CGROUP | CLONE_AUTOREAP | CLONE_NNP | 2949 + CLONE_PIDFD_AUTOKILL | CLONE_EMPTY_MNTNS)) 2959 2950 return false; 2960 2951 2961 2952 /* ··· 3107 3096 { 3108 3097 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_SIGHAND| 3109 3098 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| 3110 - CLONE_NS_ALL)) 3099 + CLONE_NS_ALL | UNSHARE_EMPTY_MNTNS)) 3111 3100 return -EINVAL; 3112 3101 /* 3113 3102 * Not implemented, but pretend it works if there is nothing ··· 3206 3195 /* 3207 3196 * If unsharing namespace, must also unshare filesystem information. 3208 3197 */ 3198 + if (unshare_flags & UNSHARE_EMPTY_MNTNS) 3199 + unshare_flags |= CLONE_NEWNS; 3209 3200 if (unshare_flags & CLONE_NEWNS) 3210 3201 unshare_flags |= CLONE_FS; 3211 3202

+14 -3

kernel/nsproxy.c

··· 96 96 if (!new_nsp) 97 97 return ERR_PTR(-ENOMEM); 98 98 99 - new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs); 99 + new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, 100 + user_ns, new_fs); 100 101 if (IS_ERR(new_nsp->mnt_ns)) { 101 102 err = PTR_ERR(new_nsp->mnt_ns); 102 103 goto out_ns; ··· 212 211 struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs) 213 212 { 214 213 struct user_namespace *user_ns; 214 + u64 flags = unshare_flags; 215 215 int err = 0; 216 216 217 - if (!(unshare_flags & (CLONE_NS_ALL & ~CLONE_NEWUSER))) 217 + if (!(flags & (CLONE_NS_ALL & ~CLONE_NEWUSER))) 218 218 return 0; 219 219 220 220 user_ns = new_cred ? new_cred->user_ns : current_user_ns(); 221 221 if (!ns_capable(user_ns, CAP_SYS_ADMIN)) 222 222 return -EPERM; 223 223 224 - *new_nsp = create_new_namespaces(unshare_flags, current, user_ns, 224 + /* 225 + * Convert the 32-bit UNSHARE_EMPTY_MNTNS (which aliases 226 + * CLONE_PARENT_SETTID) to the unique 64-bit CLONE_EMPTY_MNTNS. 227 + */ 228 + if (flags & UNSHARE_EMPTY_MNTNS) { 229 + flags &= ~(u64)UNSHARE_EMPTY_MNTNS; 230 + flags |= CLONE_EMPTY_MNTNS; 231 + } 232 + 233 + *new_nsp = create_new_namespaces(flags, current, user_ns, 225 234 new_fs ? new_fs : current->fs); 226 235 if (IS_ERR(*new_nsp)) { 227 236 err = PTR_ERR(*new_nsp);

+12 -2

tools/include/uapi/linux/mount.h

··· 61 61 /* 62 62 * open_tree() flags. 63 63 */ 64 - #define OPEN_TREE_CLONE 1 /* Clone the target tree and attach the clone */ 64 + #define OPEN_TREE_CLONE (1 << 0) /* Clone the target tree and attach the clone */ 65 + #define OPEN_TREE_NAMESPACE (1 << 1) /* Clone the target tree into a new mount namespace */ 65 66 #define OPEN_TREE_CLOEXEC O_CLOEXEC /* Close the file on execve() */ 66 67 67 68 /* ··· 110 109 * fsmount() flags. 111 110 */ 112 111 #define FSMOUNT_CLOEXEC 0x00000001 112 + #define FSMOUNT_NAMESPACE 0x00000002 /* Create the mount in a new mount namespace */ 113 113 114 114 /* 115 115 * Mount attributes. ··· 199 197 */ 200 198 struct mnt_id_req { 201 199 __u32 size; 202 - __u32 spare; 200 + union { 201 + __u32 mnt_ns_fd; 202 + __u32 mnt_fd; 203 + }; 203 204 __u64 mnt_id; 204 205 __u64 param; 205 206 __u64 mnt_ns_id; ··· 236 231 */ 237 232 #define LSMT_ROOT 0xffffffffffffffff /* root mount */ 238 233 #define LISTMOUNT_REVERSE (1 << 0) /* List later mounts first */ 234 + 235 + /* 236 + * @flag bits for statmount(2) 237 + */ 238 + #define STATMOUNT_BY_FD 0x00000001U /* want mountinfo for given fd */ 239 239 240 240 #endif /* _UAPI_LINUX_MOUNT_H */

+3

tools/testing/selftests/Makefile

··· 38 38 TARGETS += filesystems/statmount 39 39 TARGETS += filesystems/mount-notify 40 40 TARGETS += filesystems/fuse 41 + TARGETS += filesystems/move_mount 42 + TARGETS += filesystems/empty_mntns 43 + TARGETS += filesystems/fsmount_ns 41 44 TARGETS += firmware 42 45 TARGETS += fpu 43 46 TARGETS += ftrace

+4

tools/testing/selftests/filesystems/empty_mntns/.gitignore

··· 1 + # SPDX-License-Identifier: GPL-2.0-only 2 + clone3_empty_mntns_test 3 + empty_mntns_test 4 + overmount_chroot_test

+12

tools/testing/selftests/filesystems/empty_mntns/Makefile

··· 1 + # SPDX-License-Identifier: GPL-2.0-or-later 2 + 3 + CFLAGS += -Wall -O2 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES) 4 + LDLIBS += -lcap 5 + 6 + TEST_GEN_PROGS := empty_mntns_test overmount_chroot_test clone3_empty_mntns_test 7 + 8 + include ../../lib.mk 9 + 10 + $(OUTPUT)/empty_mntns_test: ../utils.c 11 + $(OUTPUT)/overmount_chroot_test: ../utils.c 12 + $(OUTPUT)/clone3_empty_mntns_test: ../utils.c

+938

tools/testing/selftests/filesystems/empty_mntns/clone3_empty_mntns_test.c

··· 1 + // SPDX-License-Identifier: GPL-2.0-or-later 2 + /* 3 + * Tests for empty mount namespace creation via clone3() CLONE_EMPTY_MNTNS 4 + * 5 + * These tests exercise the clone3() code path for creating empty mount 6 + * namespaces, which is distinct from the unshare() path tested in 7 + * empty_mntns_test.c. With clone3(), CLONE_EMPTY_MNTNS (0x2000000000ULL) 8 + * is a 64-bit flag that implies CLONE_NEWNS. The implication happens in 9 + * kernel_clone() before copy_process(), unlike unshare() where it goes 10 + * through UNSHARE_EMPTY_MNTNS -> CLONE_EMPTY_MNTNS conversion in 11 + * unshare_nsproxy_namespaces(). 12 + * 13 + * Copyright (c) 2024 Christian Brauner <brauner@kernel.org> 14 + */ 15 + 16 + #define _GNU_SOURCE 17 + #include <fcntl.h> 18 + #include <linux/mount.h> 19 + #include <linux/stat.h> 20 + #include <stdio.h> 21 + #include <string.h> 22 + #include <sys/mount.h> 23 + #include <sys/stat.h> 24 + #include <sys/types.h> 25 + #include <unistd.h> 26 + 27 + #include "../utils.h" 28 + #include "../wrappers.h" 29 + #include "clone3/clone3_selftests.h" 30 + #include "empty_mntns.h" 31 + #include "kselftest_harness.h" 32 + 33 + static pid_t clone3_empty_mntns(uint64_t extra_flags) 34 + { 35 + struct __clone_args args = { 36 + .flags = CLONE_EMPTY_MNTNS | extra_flags, 37 + .exit_signal = SIGCHLD, 38 + }; 39 + 40 + return sys_clone3(&args, sizeof(args)); 41 + } 42 + 43 + static bool clone3_empty_mntns_supported(void) 44 + { 45 + pid_t pid; 46 + int status; 47 + 48 + pid = fork(); 49 + if (pid < 0) 50 + return false; 51 + 52 + if (pid == 0) { 53 + if (enter_userns()) 54 + _exit(1); 55 + 56 + pid = clone3_empty_mntns(0); 57 + if (pid < 0) 58 + _exit(1); 59 + 60 + if (pid == 0) 61 + _exit(0); 62 + 63 + _exit(wait_for_pid(pid) != 0); 64 + } 65 + 66 + if (waitpid(pid, &status, 0) != pid) 67 + return false; 68 + 69 + if (!WIFEXITED(status)) 70 + return false; 71 + 72 + return WEXITSTATUS(status) == 0; 73 + } 74 + 75 + FIXTURE(clone3_empty_mntns) {}; 76 + 77 + 
FIXTURE_SETUP(clone3_empty_mntns) 78 + { 79 + if (!clone3_empty_mntns_supported()) 80 + SKIP(return, "CLONE_EMPTY_MNTNS via clone3 not supported"); 81 + } 82 + 83 + FIXTURE_TEARDOWN(clone3_empty_mntns) {} 84 + 85 + /* 86 + * Basic clone3() with CLONE_EMPTY_MNTNS: child gets empty mount namespace 87 + * with exactly 1 mount and root == cwd. 88 + */ 89 + TEST_F(clone3_empty_mntns, basic) 90 + { 91 + pid_t pid, inner; 92 + 93 + pid = fork(); 94 + ASSERT_GE(pid, 0); 95 + 96 + if (pid == 0) { 97 + if (enter_userns()) 98 + _exit(1); 99 + 100 + inner = clone3_empty_mntns(0); 101 + if (inner < 0) 102 + _exit(2); 103 + 104 + if (inner == 0) { 105 + uint64_t root_id, cwd_id; 106 + 107 + if (count_mounts() != 1) 108 + _exit(3); 109 + 110 + root_id = get_unique_mnt_id("/"); 111 + cwd_id = get_unique_mnt_id("."); 112 + if (root_id == 0 || cwd_id == 0) 113 + _exit(4); 114 + 115 + if (root_id != cwd_id) 116 + _exit(5); 117 + 118 + _exit(0); 119 + } 120 + 121 + _exit(wait_for_pid(inner)); 122 + } 123 + 124 + ASSERT_EQ(wait_for_pid(pid), 0); 125 + } 126 + 127 + /* 128 + * CLONE_EMPTY_MNTNS implies CLONE_NEWNS. Verify that it works without 129 + * explicitly setting CLONE_NEWNS (tests fork.c:2627-2630). 130 + */ 131 + TEST_F(clone3_empty_mntns, implies_newns) 132 + { 133 + pid_t pid, inner; 134 + 135 + pid = fork(); 136 + ASSERT_GE(pid, 0); 137 + 138 + if (pid == 0) { 139 + ssize_t parent_mounts; 140 + 141 + if (enter_userns()) 142 + _exit(1); 143 + 144 + /* Verify we have mounts in our current namespace. */ 145 + parent_mounts = count_mounts(); 146 + if (parent_mounts < 1) 147 + _exit(2); 148 + 149 + /* Only CLONE_EMPTY_MNTNS, no explicit CLONE_NEWNS. */ 150 + inner = clone3_empty_mntns(0); 151 + if (inner < 0) 152 + _exit(3); 153 + 154 + if (inner == 0) { 155 + if (count_mounts() != 1) 156 + _exit(4); 157 + 158 + _exit(0); 159 + } 160 + 161 + /* Parent still has its mounts. 
*/ 162 + if (count_mounts() != parent_mounts) 163 + _exit(5); 164 + 165 + _exit(wait_for_pid(inner)); 166 + } 167 + 168 + ASSERT_EQ(wait_for_pid(pid), 0); 169 + } 170 + 171 + /* 172 + * Helper macro: generate a test that clones with CLONE_EMPTY_MNTNS | 173 + * @extra_flags and verifies the child has exactly one mount. 174 + */ 175 + #define TEST_CLONE3_FLAGS(test_name, extra_flags) \ 176 + TEST_F(clone3_empty_mntns, test_name) \ 177 + { \ 178 + pid_t pid, inner; \ 179 + \ 180 + pid = fork(); \ 181 + ASSERT_GE(pid, 0); \ 182 + \ 183 + if (pid == 0) { \ 184 + if (enter_userns()) \ 185 + _exit(1); \ 186 + \ 187 + inner = clone3_empty_mntns(extra_flags); \ 188 + if (inner < 0) \ 189 + _exit(2); \ 190 + \ 191 + if (inner == 0) { \ 192 + if (count_mounts() != 1) \ 193 + _exit(3); \ 194 + _exit(0); \ 195 + } \ 196 + \ 197 + _exit(wait_for_pid(inner)); \ 198 + } \ 199 + \ 200 + ASSERT_EQ(wait_for_pid(pid), 0); \ 201 + } 202 + 203 + /* Redundant CLONE_NEWNS | CLONE_EMPTY_MNTNS should succeed. */ 204 + TEST_CLONE3_FLAGS(with_explicit_newns, CLONE_NEWNS) 205 + 206 + /* CLONE_EMPTY_MNTNS combined with CLONE_NEWUSER. */ 207 + TEST_CLONE3_FLAGS(with_newuser, CLONE_NEWUSER) 208 + 209 + /* CLONE_EMPTY_MNTNS combined with other namespace flags. */ 210 + TEST_CLONE3_FLAGS(with_other_ns_flags, CLONE_NEWUTS | CLONE_NEWIPC) 211 + 212 + /* 213 + * CLONE_EMPTY_MNTNS combined with CLONE_NEWPID. 214 + */ 215 + TEST_F(clone3_empty_mntns, with_newpid) 216 + { 217 + pid_t pid, inner; 218 + 219 + pid = fork(); 220 + ASSERT_GE(pid, 0); 221 + 222 + if (pid == 0) { 223 + if (enter_userns()) 224 + _exit(1); 225 + 226 + inner = clone3_empty_mntns(CLONE_NEWPID); 227 + if (inner < 0) 228 + _exit(2); 229 + 230 + if (inner == 0) { 231 + if (count_mounts() != 1) 232 + _exit(3); 233 + 234 + /* In a new PID namespace, getpid() returns 1. 
*/ 235 + if (getpid() != 1) 236 + _exit(4); 237 + 238 + _exit(0); 239 + } 240 + 241 + _exit(wait_for_pid(inner)); 242 + } 243 + 244 + ASSERT_EQ(wait_for_pid(pid), 0); 245 + } 246 + 247 + /* 248 + * CLONE_EMPTY_MNTNS | CLONE_FS must fail because the implied CLONE_NEWNS 249 + * and CLONE_FS are mutually exclusive (fork.c:1981). 250 + */ 251 + TEST_F(clone3_empty_mntns, with_clone_fs_fails) 252 + { 253 + pid_t pid; 254 + 255 + pid = fork(); 256 + ASSERT_GE(pid, 0); 257 + 258 + if (pid == 0) { 259 + struct __clone_args args = { 260 + .flags = CLONE_EMPTY_MNTNS | CLONE_FS, 261 + .exit_signal = SIGCHLD, 262 + }; 263 + pid_t ret; 264 + 265 + if (enter_userns()) 266 + _exit(1); 267 + 268 + ret = sys_clone3(&args, sizeof(args)); 269 + if (ret >= 0) { 270 + if (ret == 0) 271 + _exit(0); 272 + wait_for_pid(ret); 273 + _exit(2); 274 + } 275 + 276 + if (errno != EINVAL) 277 + _exit(3); 278 + 279 + _exit(0); 280 + } 281 + 282 + ASSERT_EQ(wait_for_pid(pid), 0); 283 + } 284 + 285 + /* 286 + * CLONE_EMPTY_MNTNS combined with CLONE_PIDFD returns a valid pidfd. 287 + */ 288 + TEST_F(clone3_empty_mntns, with_pidfd) 289 + { 290 + pid_t pid; 291 + 292 + pid = fork(); 293 + ASSERT_GE(pid, 0); 294 + 295 + if (pid == 0) { 296 + struct __clone_args args = { 297 + .flags = CLONE_EMPTY_MNTNS | CLONE_PIDFD, 298 + .exit_signal = SIGCHLD, 299 + }; 300 + int pidfd = -1; 301 + pid_t inner; 302 + 303 + if (enter_userns()) 304 + _exit(1); 305 + 306 + args.pidfd = (uintptr_t)&pidfd; 307 + 308 + inner = sys_clone3(&args, sizeof(args)); 309 + if (inner < 0) 310 + _exit(2); 311 + 312 + if (inner == 0) { 313 + if (count_mounts() != 1) 314 + _exit(3); 315 + 316 + _exit(0); 317 + } 318 + 319 + /* Verify we got a valid pidfd. */ 320 + if (pidfd < 0) 321 + _exit(4); 322 + 323 + close(pidfd); 324 + _exit(wait_for_pid(inner)); 325 + } 326 + 327 + ASSERT_EQ(wait_for_pid(pid), 0); 328 + } 329 + 330 + /* 331 + * clone3 without CAP_SYS_ADMIN must fail with EPERM. 
332 + */ 333 + TEST_F(clone3_empty_mntns, eperm_without_caps) 334 + { 335 + pid_t pid; 336 + 337 + pid = fork(); 338 + ASSERT_GE(pid, 0); 339 + 340 + if (pid == 0) { 341 + pid_t ret; 342 + 343 + /* Skip if already root. */ 344 + if (getuid() == 0) 345 + _exit(0); 346 + 347 + ret = clone3_empty_mntns(0); 348 + if (ret >= 0) { 349 + if (ret == 0) 350 + _exit(0); 351 + wait_for_pid(ret); 352 + _exit(1); 353 + } 354 + 355 + if (errno != EPERM) 356 + _exit(2); 357 + 358 + _exit(0); 359 + } 360 + 361 + ASSERT_EQ(wait_for_pid(pid), 0); 362 + } 363 + 364 + /* 365 + * Parent's mount namespace is unaffected after clone3 with CLONE_EMPTY_MNTNS. 366 + */ 367 + TEST_F(clone3_empty_mntns, parent_unchanged) 368 + { 369 + pid_t pid; 370 + 371 + pid = fork(); 372 + ASSERT_GE(pid, 0); 373 + 374 + if (pid == 0) { 375 + ssize_t nr_before, nr_after; 376 + pid_t inner; 377 + 378 + if (enter_userns()) 379 + _exit(1); 380 + 381 + nr_before = count_mounts(); 382 + if (nr_before < 1) 383 + _exit(2); 384 + 385 + inner = clone3_empty_mntns(0); 386 + if (inner < 0) 387 + _exit(3); 388 + 389 + if (inner == 0) 390 + _exit(0); 391 + 392 + if (wait_for_pid(inner) != 0) 393 + _exit(4); 394 + 395 + nr_after = count_mounts(); 396 + if (nr_after != nr_before) 397 + _exit(5); 398 + 399 + _exit(0); 400 + } 401 + 402 + ASSERT_EQ(wait_for_pid(pid), 0); 403 + } 404 + 405 + /* 406 + * Parent with many mounts: child still gets exactly 1 mount. 
407 + */ 408 + TEST_F(clone3_empty_mntns, many_parent_mounts) 409 + { 410 + pid_t pid; 411 + 412 + pid = fork(); 413 + ASSERT_GE(pid, 0); 414 + 415 + if (pid == 0) { 416 + char tmpdir[] = "/tmp/clone3_mntns_test.XXXXXX"; 417 + pid_t inner; 418 + int i; 419 + 420 + if (enter_userns()) 421 + _exit(1); 422 + 423 + if (unshare(CLONE_NEWNS)) 424 + _exit(2); 425 + 426 + if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL)) 427 + _exit(3); 428 + 429 + if (!mkdtemp(tmpdir)) 430 + _exit(4); 431 + 432 + if (mount("tmpfs", tmpdir, "tmpfs", 0, "size=1M")) 433 + _exit(5); 434 + 435 + for (i = 0; i < 5; i++) { 436 + char subdir[256]; 437 + 438 + snprintf(subdir, sizeof(subdir), "%s/sub%d", tmpdir, i); 439 + if (mkdir(subdir, 0755) && errno != EEXIST) 440 + _exit(6); 441 + if (mount(subdir, subdir, NULL, MS_BIND, NULL)) 442 + _exit(7); 443 + } 444 + 445 + if (count_mounts() < 5) 446 + _exit(8); 447 + 448 + inner = clone3_empty_mntns(0); 449 + if (inner < 0) 450 + _exit(9); 451 + 452 + if (inner == 0) { 453 + if (count_mounts() != 1) 454 + _exit(10); 455 + 456 + _exit(0); 457 + } 458 + 459 + _exit(wait_for_pid(inner)); 460 + } 461 + 462 + ASSERT_EQ(wait_for_pid(pid), 0); 463 + } 464 + 465 + /* 466 + * Verify the child's root mount is nullfs with expected statmount properties. 467 + */ 468 + TEST_F(clone3_empty_mntns, mount_properties) 469 + { 470 + pid_t pid; 471 + 472 + pid = fork(); 473 + ASSERT_GE(pid, 0); 474 + 475 + if (pid == 0) { 476 + pid_t inner; 477 + 478 + if (enter_userns()) 479 + _exit(1); 480 + 481 + inner = clone3_empty_mntns(0); 482 + if (inner < 0) 483 + _exit(2); 484 + 485 + if (inner == 0) { 486 + struct statmount *sm; 487 + uint64_t root_id; 488 + 489 + root_id = get_unique_mnt_id("/"); 490 + if (!root_id) 491 + _exit(3); 492 + 493 + sm = statmount_alloc(root_id, 0, 494 + STATMOUNT_MNT_BASIC | 495 + STATMOUNT_MNT_POINT | 496 + STATMOUNT_FS_TYPE, 0); 497 + if (!sm) 498 + _exit(4); 499 + 500 + /* Root mount point is "/". 
*/ 501 + if (!(sm->mask & STATMOUNT_MNT_POINT)) 502 + _exit(5); 503 + if (strcmp(sm->str + sm->mnt_point, "/") != 0) 504 + _exit(6); 505 + 506 + /* Filesystem type is nullfs. */ 507 + if (!(sm->mask & STATMOUNT_FS_TYPE)) 508 + _exit(7); 509 + if (strcmp(sm->str + sm->fs_type, "nullfs") != 0) 510 + _exit(8); 511 + 512 + /* Root mount is its own parent. */ 513 + if (!(sm->mask & STATMOUNT_MNT_BASIC)) 514 + _exit(9); 515 + if (sm->mnt_parent_id != sm->mnt_id) 516 + _exit(10); 517 + 518 + free(sm); 519 + _exit(0); 520 + } 521 + 522 + _exit(wait_for_pid(inner)); 523 + } 524 + 525 + ASSERT_EQ(wait_for_pid(pid), 0); 526 + } 527 + 528 + /* 529 + * Listmount returns only the root mount in the child's empty namespace. 530 + */ 531 + TEST_F(clone3_empty_mntns, listmount_single_entry) 532 + { 533 + pid_t pid; 534 + 535 + pid = fork(); 536 + ASSERT_GE(pid, 0); 537 + 538 + if (pid == 0) { 539 + pid_t inner; 540 + 541 + if (enter_userns()) 542 + _exit(1); 543 + 544 + inner = clone3_empty_mntns(0); 545 + if (inner < 0) 546 + _exit(2); 547 + 548 + if (inner == 0) { 549 + uint64_t list[16]; 550 + ssize_t nr_mounts; 551 + uint64_t root_id; 552 + 553 + nr_mounts = listmount(LSMT_ROOT, 0, 0, list, 16, 0); 554 + if (nr_mounts != 1) 555 + _exit(3); 556 + 557 + root_id = get_unique_mnt_id("/"); 558 + if (!root_id) 559 + _exit(4); 560 + 561 + if (list[0] != root_id) 562 + _exit(5); 563 + 564 + _exit(0); 565 + } 566 + 567 + _exit(wait_for_pid(inner)); 568 + } 569 + 570 + ASSERT_EQ(wait_for_pid(pid), 0); 571 + } 572 + 573 + /* 574 + * Child can mount tmpfs over nullfs root (the primary container use case). 575 + * 576 + * Uses the new mount API (fsopen/fsmount/move_mount) because resolving 577 + * "/" returns the process root directly without following overmounts. 578 + * The mount fd from fsmount lets us fchdir + chroot into the new tmpfs. 
579 + */ 580 + TEST_F(clone3_empty_mntns, child_overmount_tmpfs) 581 + { 582 + pid_t pid; 583 + 584 + pid = fork(); 585 + ASSERT_GE(pid, 0); 586 + 587 + if (pid == 0) { 588 + pid_t inner; 589 + 590 + if (enter_userns()) 591 + _exit(1); 592 + 593 + inner = clone3_empty_mntns(0); 594 + if (inner < 0) 595 + _exit(2); 596 + 597 + if (inner == 0) { 598 + struct statmount *sm; 599 + uint64_t root_id; 600 + int fd, fsfd, mntfd; 601 + 602 + if (count_mounts() != 1) 603 + _exit(3); 604 + 605 + /* Verify root is nullfs. */ 606 + root_id = get_unique_mnt_id("/"); 607 + if (!root_id) 608 + _exit(4); 609 + 610 + sm = statmount_alloc(root_id, 0, STATMOUNT_FS_TYPE, 0); 611 + if (!sm) 612 + _exit(5); 613 + if (!(sm->mask & STATMOUNT_FS_TYPE)) 614 + _exit(6); 615 + if (strcmp(sm->str + sm->fs_type, "nullfs") != 0) 616 + _exit(7); 617 + free(sm); 618 + 619 + /* Create tmpfs via the new mount API. */ 620 + fsfd = sys_fsopen("tmpfs", 0); 621 + if (fsfd < 0) 622 + _exit(8); 623 + 624 + if (sys_fsconfig(fsfd, FSCONFIG_SET_STRING, 625 + "size", "1M", 0)) { 626 + close(fsfd); 627 + _exit(9); 628 + } 629 + 630 + if (sys_fsconfig(fsfd, FSCONFIG_CMD_CREATE, 631 + NULL, NULL, 0)) { 632 + close(fsfd); 633 + _exit(10); 634 + } 635 + 636 + mntfd = sys_fsmount(fsfd, 0, 0); 637 + close(fsfd); 638 + if (mntfd < 0) 639 + _exit(11); 640 + 641 + /* Attach tmpfs to "/". */ 642 + if (sys_move_mount(mntfd, "", AT_FDCWD, "/", 643 + MOVE_MOUNT_F_EMPTY_PATH)) { 644 + close(mntfd); 645 + _exit(12); 646 + } 647 + 648 + if (count_mounts() != 2) { 649 + close(mntfd); 650 + _exit(13); 651 + } 652 + 653 + /* Enter the tmpfs. */ 654 + if (fchdir(mntfd)) { 655 + close(mntfd); 656 + _exit(14); 657 + } 658 + 659 + if (chroot(".")) { 660 + close(mntfd); 661 + _exit(15); 662 + } 663 + 664 + close(mntfd); 665 + 666 + /* Verify "/" is now tmpfs. 
*/ 667 + root_id = get_unique_mnt_id("/"); 668 + if (!root_id) 669 + _exit(16); 670 + 671 + sm = statmount_alloc(root_id, 0, STATMOUNT_FS_TYPE, 0); 672 + if (!sm) 673 + _exit(17); 674 + if (!(sm->mask & STATMOUNT_FS_TYPE)) 675 + _exit(18); 676 + if (strcmp(sm->str + sm->fs_type, "tmpfs") != 0) 677 + _exit(19); 678 + free(sm); 679 + 680 + /* Verify tmpfs is writable. */ 681 + fd = open("/testfile", O_CREAT | O_RDWR, 0644); 682 + if (fd < 0) 683 + _exit(20); 684 + 685 + if (write(fd, "test", 4) != 4) { 686 + close(fd); 687 + _exit(21); 688 + } 689 + close(fd); 690 + 691 + if (access("/testfile", F_OK)) 692 + _exit(22); 693 + 694 + _exit(0); 695 + } 696 + 697 + _exit(wait_for_pid(inner)); 698 + } 699 + 700 + ASSERT_EQ(wait_for_pid(pid), 0); 701 + } 702 + 703 + /* 704 + * Multiple clone3 calls with CLONE_EMPTY_MNTNS produce children with 705 + * distinct mount namespace root mount IDs. 706 + */ 707 + TEST_F(clone3_empty_mntns, repeated) 708 + { 709 + pid_t pid; 710 + 711 + pid = fork(); 712 + ASSERT_GE(pid, 0); 713 + 714 + if (pid == 0) { 715 + int pipe1[2], pipe2[2]; 716 + uint64_t id1 = 0, id2 = 0; 717 + pid_t inner1, inner2; 718 + 719 + if (enter_userns()) 720 + _exit(1); 721 + 722 + if (pipe(pipe1) || pipe(pipe2)) 723 + _exit(2); 724 + 725 + inner1 = clone3_empty_mntns(0); 726 + if (inner1 < 0) 727 + _exit(3); 728 + 729 + if (inner1 == 0) { 730 + uint64_t root_id; 731 + 732 + close(pipe1[0]); 733 + root_id = get_unique_mnt_id("/"); 734 + if (write(pipe1[1], &root_id, sizeof(root_id)) != sizeof(root_id)) 735 + _exit(1); 736 + close(pipe1[1]); 737 + _exit(0); 738 + } 739 + 740 + inner2 = clone3_empty_mntns(0); 741 + if (inner2 < 0) 742 + _exit(4); 743 + 744 + if (inner2 == 0) { 745 + uint64_t root_id; 746 + 747 + close(pipe2[0]); 748 + root_id = get_unique_mnt_id("/"); 749 + if (write(pipe2[1], &root_id, sizeof(root_id)) != sizeof(root_id)) 750 + _exit(1); 751 + close(pipe2[1]); 752 + _exit(0); 753 + } 754 + 755 + close(pipe1[1]); 756 + close(pipe2[1]); 757 + 758 + 
if (read(pipe1[0], &id1, sizeof(id1)) != sizeof(id1)) 759 + _exit(5); 760 + if (read(pipe2[0], &id2, sizeof(id2)) != sizeof(id2)) 761 + _exit(6); 762 + 763 + close(pipe1[0]); 764 + close(pipe2[0]); 765 + 766 + if (wait_for_pid(inner1) || wait_for_pid(inner2)) 767 + _exit(7); 768 + 769 + /* Each child must have a distinct root mount ID. */ 770 + if (id1 == 0 || id2 == 0) 771 + _exit(8); 772 + if (id1 == id2) 773 + _exit(9); 774 + 775 + _exit(0); 776 + } 777 + 778 + ASSERT_EQ(wait_for_pid(pid), 0); 779 + } 780 + 781 + /* 782 + * Verify setns() into a child's empty mount namespace works. 783 + */ 784 + TEST_F(clone3_empty_mntns, setns_into_child_mntns) 785 + { 786 + pid_t pid; 787 + 788 + pid = fork(); 789 + ASSERT_GE(pid, 0); 790 + 791 + if (pid == 0) { 792 + int pipe_fd[2]; 793 + pid_t inner; 794 + char c; 795 + 796 + if (enter_userns()) 797 + _exit(1); 798 + 799 + if (pipe(pipe_fd)) 800 + _exit(2); 801 + 802 + inner = clone3_empty_mntns(0); 803 + if (inner < 0) 804 + _exit(3); 805 + 806 + if (inner == 0) { 807 + /* Signal parent we're ready. */ 808 + close(pipe_fd[0]); 809 + if (write(pipe_fd[1], "r", 1) != 1) 810 + _exit(1); 811 + 812 + /* 813 + * Wait for parent to finish. Reading from our 814 + * write end will block until the parent closes 815 + * its read end, giving us an implicit barrier. 816 + */ 817 + if (read(pipe_fd[1], &c, 1) < 0) 818 + ; 819 + close(pipe_fd[1]); 820 + _exit(0); 821 + } 822 + 823 + close(pipe_fd[1]); 824 + 825 + /* Wait for child to be ready. */ 826 + if (read(pipe_fd[0], &c, 1) != 1) 827 + _exit(4); 828 + 829 + /* Open child's mount namespace. */ 830 + { 831 + char path[64]; 832 + int mntns_fd; 833 + 834 + snprintf(path, sizeof(path), "/proc/%d/ns/mnt", inner); 835 + mntns_fd = open(path, O_RDONLY); 836 + if (mntns_fd < 0) 837 + _exit(5); 838 + 839 + if (setns(mntns_fd, CLONE_NEWNS)) 840 + _exit(6); 841 + 842 + close(mntns_fd); 843 + } 844 + 845 + /* Now we should be in the child's empty mntns. 
*/ 846 + if (count_mounts() != 1) 847 + _exit(7); 848 + 849 + close(pipe_fd[0]); 850 + _exit(wait_for_pid(inner)); 851 + } 852 + 853 + ASSERT_EQ(wait_for_pid(pid), 0); 854 + } 855 + 856 + /* 857 + * Tests below do not require CLONE_EMPTY_MNTNS support. 858 + */ 859 + 860 + /* 861 + * Unknown 64-bit flags beyond the known set are rejected. 862 + */ 863 + TEST(unknown_flags_rejected) 864 + { 865 + pid_t pid; 866 + 867 + pid = fork(); 868 + ASSERT_GE(pid, 0); 869 + 870 + if (pid == 0) { 871 + struct __clone_args args = { 872 + .flags = 0x800000000ULL, 873 + .exit_signal = SIGCHLD, 874 + }; 875 + pid_t ret; 876 + 877 + ret = sys_clone3(&args, sizeof(args)); 878 + if (ret >= 0) { 879 + if (ret == 0) 880 + _exit(0); 881 + wait_for_pid(ret); 882 + _exit(1); 883 + } 884 + 885 + if (errno != EINVAL) 886 + _exit(2); 887 + 888 + _exit(0); 889 + } 890 + 891 + ASSERT_EQ(wait_for_pid(pid), 0); 892 + } 893 + 894 + /* 895 + * Regular clone3 with CLONE_NEWNS (without CLONE_EMPTY_MNTNS) still 896 + * copies the full mount tree. 897 + */ 898 + TEST(clone3_newns_full_copy) 899 + { 900 + pid_t pid; 901 + 902 + pid = fork(); 903 + ASSERT_GE(pid, 0); 904 + 905 + if (pid == 0) { 906 + struct __clone_args args = { 907 + .flags = CLONE_NEWNS, 908 + .exit_signal = SIGCHLD, 909 + }; 910 + ssize_t parent_mounts; 911 + pid_t inner; 912 + 913 + if (enter_userns()) 914 + _exit(1); 915 + 916 + parent_mounts = count_mounts(); 917 + if (parent_mounts < 1) 918 + _exit(2); 919 + 920 + inner = sys_clone3(&args, sizeof(args)); 921 + if (inner < 0) 922 + _exit(3); 923 + 924 + if (inner == 0) { 925 + /* Full copy should have at least as many mounts. */ 926 + if (count_mounts() < parent_mounts) 927 + _exit(1); 928 + 929 + _exit(0); 930 + } 931 + 932 + _exit(wait_for_pid(inner)); 933 + } 934 + 935 + ASSERT_EQ(wait_for_pid(pid), 0); 936 + } 937 + 938 + TEST_HARNESS_MAIN

+25

tools/testing/selftests/filesystems/empty_mntns/empty_mntns.h

··· 1 + /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 + #ifndef EMPTY_MNTNS_H 3 + #define EMPTY_MNTNS_H 4 + 5 + #include <errno.h> 6 + #include <stdlib.h> 7 + 8 + #include "../statmount/statmount.h" 9 + 10 + #ifndef UNSHARE_EMPTY_MNTNS 11 + #define UNSHARE_EMPTY_MNTNS 0x00100000 12 + #endif 13 + 14 + #ifndef CLONE_EMPTY_MNTNS 15 + #define CLONE_EMPTY_MNTNS (1ULL << 37) 16 + #endif 17 + 18 + static inline ssize_t count_mounts(void) 19 + { 20 + uint64_t list[4096]; 21 + 22 + return listmount(LSMT_ROOT, 0, 0, list, sizeof(list) / sizeof(list[0]), 0); 23 + } 24 + 25 + #endif /* EMPTY_MNTNS_H */

+725

tools/testing/selftests/filesystems/empty_mntns/empty_mntns_test.c

··· 1 + // SPDX-License-Identifier: GPL-2.0-or-later 2 + /* 3 + * Tests for empty mount namespace creation via UNSHARE_EMPTY_MNTNS 4 + * 5 + * Copyright (c) 2024 Christian Brauner <brauner@kernel.org> 6 + */ 7 + 8 + #define _GNU_SOURCE 9 + #include <fcntl.h> 10 + #include <linux/mount.h> 11 + #include <linux/stat.h> 12 + #include <sched.h> 13 + #include <stdio.h> 14 + #include <string.h> 15 + #include <sys/mount.h> 16 + #include <sys/stat.h> 17 + #include <sys/types.h> 18 + #include <sys/wait.h> 19 + #include <unistd.h> 20 + 21 + #include "../utils.h" 22 + #include "../wrappers.h" 23 + #include "empty_mntns.h" 24 + #include "kselftest_harness.h" 25 + 26 + static bool unshare_empty_mntns_supported(void) 27 + { 28 + pid_t pid; 29 + int status; 30 + 31 + pid = fork(); 32 + if (pid < 0) 33 + return false; 34 + 35 + if (pid == 0) { 36 + if (enter_userns()) 37 + _exit(1); 38 + 39 + if (unshare(UNSHARE_EMPTY_MNTNS) && errno == EINVAL) 40 + _exit(1); 41 + _exit(0); 42 + } 43 + 44 + if (waitpid(pid, &status, 0) != pid) 45 + return false; 46 + 47 + if (!WIFEXITED(status)) 48 + return false; 49 + 50 + return WEXITSTATUS(status) == 0; 51 + } 52 + 53 + 54 + FIXTURE(empty_mntns) {}; 55 + 56 + FIXTURE_SETUP(empty_mntns) 57 + { 58 + if (!unshare_empty_mntns_supported()) 59 + SKIP(return, "UNSHARE_EMPTY_MNTNS not supported"); 60 + } 61 + 62 + FIXTURE_TEARDOWN(empty_mntns) {} 63 + 64 + /* Verify unshare succeeds, produces exactly 1 mount, and root == cwd */ 65 + TEST_F(empty_mntns, basic) 66 + { 67 + pid_t pid; 68 + 69 + pid = fork(); 70 + ASSERT_GE(pid, 0); 71 + 72 + if (pid == 0) { 73 + uint64_t root_id, cwd_id; 74 + 75 + if (enter_userns()) 76 + _exit(1); 77 + 78 + if (unshare(UNSHARE_EMPTY_MNTNS)) 79 + _exit(2); 80 + 81 + if (count_mounts() != 1) 82 + _exit(3); 83 + 84 + root_id = get_unique_mnt_id("/"); 85 + cwd_id = get_unique_mnt_id("."); 86 + if (root_id == 0 || cwd_id == 0) 87 + _exit(4); 88 + 89 + if (root_id != cwd_id) 90 + _exit(5); 91 + 92 + _exit(0); 93 + } 94 + 95 + 
ASSERT_EQ(wait_for_pid(pid), 0); 96 + } 97 + 98 + /* 99 + * UNSHARE_EMPTY_MNTNS combined with CLONE_NEWUSER. 100 + * 101 + * The user namespace must be created first so /proc is still accessible 102 + * for writing uid_map/gid_map. The empty mount namespace is created 103 + * afterwards. 104 + */ 105 + TEST_F(empty_mntns, with_clone_newuser) 106 + { 107 + pid_t pid; 108 + 109 + pid = fork(); 110 + ASSERT_GE(pid, 0); 111 + 112 + if (pid == 0) { 113 + uid_t uid = getuid(); 114 + gid_t gid = getgid(); 115 + char map[100]; 116 + 117 + if (unshare(CLONE_NEWUSER)) 118 + _exit(1); 119 + 120 + snprintf(map, sizeof(map), "0 %d 1", uid); 121 + if (write_file("/proc/self/uid_map", map)) 122 + _exit(2); 123 + 124 + if (write_file("/proc/self/setgroups", "deny")) 125 + _exit(3); 126 + 127 + snprintf(map, sizeof(map), "0 %d 1", gid); 128 + if (write_file("/proc/self/gid_map", map)) 129 + _exit(4); 130 + 131 + if (unshare(UNSHARE_EMPTY_MNTNS)) 132 + _exit(5); 133 + 134 + if (count_mounts() != 1) 135 + _exit(6); 136 + 137 + _exit(0); 138 + } 139 + 140 + ASSERT_EQ(wait_for_pid(pid), 0); 141 + } 142 + 143 + /* UNSHARE_EMPTY_MNTNS combined with other namespace flags */ 144 + TEST_F(empty_mntns, with_other_ns_flags) 145 + { 146 + pid_t pid; 147 + 148 + pid = fork(); 149 + ASSERT_GE(pid, 0); 150 + 151 + if (pid == 0) { 152 + if (enter_userns()) 153 + _exit(1); 154 + 155 + if (unshare(UNSHARE_EMPTY_MNTNS | CLONE_NEWUTS | CLONE_NEWIPC)) 156 + _exit(2); 157 + 158 + if (count_mounts() != 1) 159 + _exit(3); 160 + 161 + _exit(0); 162 + } 163 + 164 + ASSERT_EQ(wait_for_pid(pid), 0); 165 + } 166 + 167 + /* EPERM without proper capabilities */ 168 + TEST_F(empty_mntns, eperm_without_caps) 169 + { 170 + pid_t pid; 171 + 172 + pid = fork(); 173 + ASSERT_GE(pid, 0); 174 + 175 + if (pid == 0) { 176 + /* Skip if already root */ 177 + if (getuid() == 0) 178 + _exit(0); 179 + 180 + if (unshare(UNSHARE_EMPTY_MNTNS) == 0) 181 + _exit(1); 182 + 183 + if (errno != EPERM) 184 + _exit(2); 185 + 186 + 
_exit(0); 187 + } 188 + 189 + ASSERT_EQ(wait_for_pid(pid), 0); 190 + } 191 + 192 + /* Many source mounts still result in exactly 1 mount */ 193 + TEST_F(empty_mntns, many_source_mounts) 194 + { 195 + pid_t pid; 196 + 197 + pid = fork(); 198 + ASSERT_GE(pid, 0); 199 + 200 + if (pid == 0) { 201 + char tmpdir[] = "/tmp/empty_mntns_test.XXXXXX"; 202 + int i; 203 + 204 + if (enter_userns()) 205 + _exit(1); 206 + 207 + if (unshare(CLONE_NEWNS)) 208 + _exit(2); 209 + 210 + if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL)) 211 + _exit(3); 212 + 213 + if (!mkdtemp(tmpdir)) 214 + _exit(4); 215 + 216 + if (mount("tmpfs", tmpdir, "tmpfs", 0, "size=1M")) 217 + _exit(5); 218 + 219 + for (i = 0; i < 5; i++) { 220 + char subdir[256]; 221 + 222 + snprintf(subdir, sizeof(subdir), "%s/sub%d", tmpdir, i); 223 + if (mkdir(subdir, 0755) && errno != EEXIST) 224 + _exit(6); 225 + if (mount(subdir, subdir, NULL, MS_BIND, NULL)) 226 + _exit(7); 227 + } 228 + 229 + if (count_mounts() < 5) 230 + _exit(8); 231 + 232 + if (unshare(UNSHARE_EMPTY_MNTNS)) 233 + _exit(9); 234 + 235 + if (count_mounts() != 1) 236 + _exit(10); 237 + 238 + _exit(0); 239 + } 240 + 241 + ASSERT_EQ(wait_for_pid(pid), 0); 242 + } 243 + 244 + /* CWD on a different mount gets reset to root */ 245 + TEST_F(empty_mntns, cwd_reset) 246 + { 247 + pid_t pid; 248 + 249 + pid = fork(); 250 + ASSERT_GE(pid, 0); 251 + 252 + if (pid == 0) { 253 + char tmpdir[] = "/tmp/empty_mntns_cwd.XXXXXX"; 254 + uint64_t root_id, cwd_id; 255 + struct statmount *sm; 256 + 257 + if (enter_userns()) 258 + _exit(1); 259 + 260 + if (unshare(CLONE_NEWNS)) 261 + _exit(2); 262 + 263 + if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL)) 264 + _exit(3); 265 + 266 + if (!mkdtemp(tmpdir)) 267 + _exit(4); 268 + 269 + if (mount("tmpfs", tmpdir, "tmpfs", 0, "size=1M")) 270 + _exit(5); 271 + 272 + if (chdir(tmpdir)) 273 + _exit(6); 274 + 275 + if (unshare(UNSHARE_EMPTY_MNTNS)) 276 + _exit(7); 277 + 278 + root_id = get_unique_mnt_id("/"); 279 + cwd_id = 
get_unique_mnt_id("."); 280 + if (root_id == 0 || cwd_id == 0) 281 + _exit(8); 282 + 283 + if (root_id != cwd_id) 284 + _exit(9); 285 + 286 + sm = statmount_alloc(root_id, 0, STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT, 0); 287 + if (!sm) 288 + _exit(10); 289 + 290 + if (strcmp(sm->str + sm->mnt_point, "/") != 0) 291 + _exit(11); 292 + 293 + free(sm); 294 + _exit(0); 295 + } 296 + 297 + ASSERT_EQ(wait_for_pid(pid), 0); 298 + } 299 + 300 + /* Verify statmount properties of the root mount */ 301 + TEST_F(empty_mntns, mount_properties) 302 + { 303 + pid_t pid; 304 + 305 + pid = fork(); 306 + ASSERT_GE(pid, 0); 307 + 308 + if (pid == 0) { 309 + struct statmount *sm; 310 + uint64_t root_id; 311 + 312 + if (enter_userns()) 313 + _exit(1); 314 + 315 + if (unshare(UNSHARE_EMPTY_MNTNS)) 316 + _exit(2); 317 + 318 + root_id = get_unique_mnt_id("/"); 319 + if (!root_id) 320 + _exit(3); 321 + 322 + sm = statmount_alloc(root_id, 0, STATMOUNT_MNT_BASIC | STATMOUNT_MNT_ROOT | 323 + STATMOUNT_MNT_POINT | STATMOUNT_FS_TYPE, 0); 324 + if (!sm) 325 + _exit(4); 326 + 327 + if (!(sm->mask & STATMOUNT_MNT_POINT)) 328 + _exit(5); 329 + 330 + if (strcmp(sm->str + sm->mnt_point, "/") != 0) 331 + _exit(6); 332 + 333 + if (!(sm->mask & STATMOUNT_MNT_BASIC)) 334 + _exit(7); 335 + 336 + if (sm->mnt_id != root_id) 337 + _exit(8); 338 + 339 + free(sm); 340 + _exit(0); 341 + } 342 + 343 + ASSERT_EQ(wait_for_pid(pid), 0); 344 + } 345 + 346 + /* Consecutive UNSHARE_EMPTY_MNTNS calls produce new namespaces */ 347 + TEST_F(empty_mntns, repeated_unshare) 348 + { 349 + pid_t pid; 350 + 351 + pid = fork(); 352 + ASSERT_GE(pid, 0); 353 + 354 + if (pid == 0) { 355 + uint64_t first_root_id, second_root_id; 356 + 357 + if (enter_userns()) 358 + _exit(1); 359 + 360 + if (unshare(UNSHARE_EMPTY_MNTNS)) 361 + _exit(2); 362 + 363 + if (count_mounts() != 1) 364 + _exit(3); 365 + 366 + first_root_id = get_unique_mnt_id("/"); 367 + 368 + if (unshare(UNSHARE_EMPTY_MNTNS)) 369 + _exit(4); 370 + 371 + if (count_mounts() 
!= 1) 372 + _exit(5); 373 + 374 + second_root_id = get_unique_mnt_id("/"); 375 + 376 + if (first_root_id == second_root_id) 377 + _exit(6); 378 + 379 + _exit(0); 380 + } 381 + 382 + ASSERT_EQ(wait_for_pid(pid), 0); 383 + } 384 + 385 + /* Root mount's parent is itself */ 386 + TEST_F(empty_mntns, root_is_own_parent) 387 + { 388 + pid_t pid; 389 + 390 + pid = fork(); 391 + ASSERT_GE(pid, 0); 392 + 393 + if (pid == 0) { 394 + struct statmount sm; 395 + uint64_t root_id; 396 + 397 + if (enter_userns()) 398 + _exit(1); 399 + 400 + if (unshare(UNSHARE_EMPTY_MNTNS)) 401 + _exit(2); 402 + 403 + root_id = get_unique_mnt_id("/"); 404 + if (!root_id) 405 + _exit(3); 406 + 407 + if (statmount(root_id, 0, 0, STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0) < 0) 408 + _exit(4); 409 + 410 + if (!(sm.mask & STATMOUNT_MNT_BASIC)) 411 + _exit(5); 412 + 413 + if (sm.mnt_parent_id != sm.mnt_id) 414 + _exit(6); 415 + 416 + _exit(0); 417 + } 418 + 419 + ASSERT_EQ(wait_for_pid(pid), 0); 420 + } 421 + 422 + /* Listmount returns only the root mount */ 423 + TEST_F(empty_mntns, listmount_single_entry) 424 + { 425 + pid_t pid; 426 + 427 + pid = fork(); 428 + ASSERT_GE(pid, 0); 429 + 430 + if (pid == 0) { 431 + uint64_t list[16]; 432 + ssize_t nr_mounts; 433 + uint64_t root_id; 434 + 435 + if (enter_userns()) 436 + _exit(1); 437 + 438 + if (unshare(UNSHARE_EMPTY_MNTNS)) 439 + _exit(2); 440 + 441 + nr_mounts = listmount(LSMT_ROOT, 0, 0, list, 16, 0); 442 + if (nr_mounts != 1) 443 + _exit(3); 444 + 445 + root_id = get_unique_mnt_id("/"); 446 + if (!root_id) 447 + _exit(4); 448 + 449 + if (list[0] != root_id) 450 + _exit(5); 451 + 452 + _exit(0); 453 + } 454 + 455 + ASSERT_EQ(wait_for_pid(pid), 0); 456 + } 457 + 458 + /* 459 + * Mount tmpfs over nullfs root to build a writable filesystem from scratch. 460 + * This exercises the intended usage pattern: create an empty mount namespace 461 + * (which has a nullfs root), then mount a real filesystem over it. 
462 + * 463 + * Because resolving "/" returns the process root directly (via nd_jump_root) 464 + * without following overmounts, we use the new mount API (fsopen/fsmount) 465 + * to obtain a mount fd, then fchdir + chroot to enter the new filesystem. 466 + */ 467 + TEST_F(empty_mntns, overmount_tmpfs) 468 + { 469 + pid_t pid; 470 + 471 + pid = fork(); 472 + ASSERT_GE(pid, 0); 473 + 474 + if (pid == 0) { 475 + struct statmount *sm; 476 + uint64_t root_id, cwd_id; 477 + int fd, fsfd, mntfd; 478 + 479 + if (enter_userns()) 480 + _exit(1); 481 + 482 + if (unshare(UNSHARE_EMPTY_MNTNS)) 483 + _exit(2); 484 + 485 + if (count_mounts() != 1) 486 + _exit(3); 487 + 488 + root_id = get_unique_mnt_id("/"); 489 + if (!root_id) 490 + _exit(4); 491 + 492 + /* Verify root is nullfs */ 493 + sm = statmount_alloc(root_id, 0, STATMOUNT_FS_TYPE, 0); 494 + if (!sm) 495 + _exit(5); 496 + 497 + if (!(sm->mask & STATMOUNT_FS_TYPE)) 498 + _exit(6); 499 + 500 + if (strcmp(sm->str + sm->fs_type, "nullfs") != 0) 501 + _exit(7); 502 + 503 + free(sm); 504 + 505 + cwd_id = get_unique_mnt_id("."); 506 + if (!cwd_id || root_id != cwd_id) 507 + _exit(8); 508 + 509 + /* 510 + * nullfs root is immutable. open(O_CREAT) returns ENOENT 511 + * because empty_dir_lookup() returns -ENOENT before the 512 + * IS_IMMUTABLE permission check in may_o_create() is reached. 513 + */ 514 + fd = open("/test", O_CREAT | O_RDWR, 0644); 515 + if (fd >= 0) { 516 + close(fd); 517 + _exit(9); 518 + } 519 + if (errno != ENOENT) 520 + _exit(10); 521 + 522 + /* 523 + * Use the new mount API to create tmpfs and get a mount fd. 524 + * We need the fd because after attaching the tmpfs on top of 525 + * "/", path resolution of "/" still returns the process root 526 + * (nullfs) without following the overmount. The mount fd 527 + * lets us fchdir + chroot into the tmpfs. 
528 + */ 529 + fsfd = sys_fsopen("tmpfs", 0); 530 + if (fsfd < 0) 531 + _exit(11); 532 + 533 + if (sys_fsconfig(fsfd, FSCONFIG_SET_STRING, "size", "1M", 0)) { 534 + close(fsfd); 535 + _exit(12); 536 + } 537 + 538 + if (sys_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0)) { 539 + close(fsfd); 540 + _exit(13); 541 + } 542 + 543 + mntfd = sys_fsmount(fsfd, 0, 0); 544 + close(fsfd); 545 + if (mntfd < 0) 546 + _exit(14); 547 + 548 + if (sys_move_mount(mntfd, "", AT_FDCWD, "/", 549 + MOVE_MOUNT_F_EMPTY_PATH)) { 550 + close(mntfd); 551 + _exit(15); 552 + } 553 + 554 + if (count_mounts() != 2) { 555 + close(mntfd); 556 + _exit(16); 557 + } 558 + 559 + /* Enter the tmpfs via the mount fd */ 560 + if (fchdir(mntfd)) { 561 + close(mntfd); 562 + _exit(17); 563 + } 564 + 565 + if (chroot(".")) { 566 + close(mntfd); 567 + _exit(18); 568 + } 569 + 570 + close(mntfd); 571 + 572 + /* Verify "/" now resolves to tmpfs */ 573 + root_id = get_unique_mnt_id("/"); 574 + if (!root_id) 575 + _exit(19); 576 + 577 + sm = statmount_alloc(root_id, 0, STATMOUNT_FS_TYPE, 0); 578 + if (!sm) 579 + _exit(20); 580 + 581 + if (!(sm->mask & STATMOUNT_FS_TYPE)) 582 + _exit(21); 583 + 584 + if (strcmp(sm->str + sm->fs_type, "tmpfs") != 0) 585 + _exit(22); 586 + 587 + free(sm); 588 + 589 + /* Verify tmpfs is writable */ 590 + fd = open("/testfile", O_CREAT | O_RDWR, 0644); 591 + if (fd < 0) 592 + _exit(23); 593 + 594 + if (write(fd, "test", 4) != 4) { 595 + close(fd); 596 + _exit(24); 597 + } 598 + 599 + close(fd); 600 + 601 + if (access("/testfile", F_OK)) 602 + _exit(25); 603 + 604 + _exit(0); 605 + } 606 + 607 + ASSERT_EQ(wait_for_pid(pid), 0); 608 + } 609 + 610 + /* 611 + * Tests below do not require UNSHARE_EMPTY_MNTNS support. 
612 + */ 613 + 614 + /* Invalid unshare flags return EINVAL */ 615 + TEST(invalid_flags) 616 + { 617 + pid_t pid; 618 + 619 + pid = fork(); 620 + ASSERT_GE(pid, 0); 621 + 622 + if (pid == 0) { 623 + if (enter_userns()) 624 + _exit(1); 625 + 626 + if (unshare(0x80000000) == 0) 627 + _exit(2); 628 + 629 + if (errno != EINVAL) 630 + _exit(3); 631 + 632 + _exit(0); 633 + } 634 + 635 + ASSERT_EQ(wait_for_pid(pid), 0); 636 + } 637 + 638 + /* Regular CLONE_NEWNS still copies the full mount tree */ 639 + TEST(clone_newns_full_copy) 640 + { 641 + pid_t pid; 642 + 643 + pid = fork(); 644 + ASSERT_GE(pid, 0); 645 + 646 + if (pid == 0) { 647 + ssize_t nr_mounts_before, nr_mounts_after; 648 + char tmpdir[] = "/tmp/empty_mntns_regr.XXXXXX"; 649 + int i; 650 + 651 + if (enter_userns()) 652 + _exit(1); 653 + 654 + if (unshare(CLONE_NEWNS)) 655 + _exit(2); 656 + 657 + if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL)) 658 + _exit(3); 659 + 660 + if (!mkdtemp(tmpdir)) 661 + _exit(4); 662 + 663 + if (mount("tmpfs", tmpdir, "tmpfs", 0, "size=1M")) 664 + _exit(5); 665 + 666 + for (i = 0; i < 3; i++) { 667 + char subdir[256]; 668 + 669 + snprintf(subdir, sizeof(subdir), "%s/sub%d", tmpdir, i); 670 + if (mkdir(subdir, 0755) && errno != EEXIST) 671 + _exit(6); 672 + if (mount(subdir, subdir, NULL, MS_BIND, NULL)) 673 + _exit(7); 674 + } 675 + 676 + nr_mounts_before = count_mounts(); 677 + if (nr_mounts_before < 3) 678 + _exit(8); 679 + 680 + if (unshare(CLONE_NEWNS)) 681 + _exit(9); 682 + 683 + nr_mounts_after = count_mounts(); 684 + if (nr_mounts_after < nr_mounts_before) 685 + _exit(10); 686 + 687 + _exit(0); 688 + } 689 + 690 + ASSERT_EQ(wait_for_pid(pid), 0); 691 + } 692 + 693 + /* Other namespace unshares are unaffected */ 694 + TEST(other_ns_unaffected) 695 + { 696 + pid_t pid; 697 + 698 + pid = fork(); 699 + ASSERT_GE(pid, 0); 700 + 701 + if (pid == 0) { 702 + char hostname[256]; 703 + 704 + if (enter_userns()) 705 + _exit(1); 706 + 707 + if (unshare(CLONE_NEWUTS)) 708 + 
_exit(2); 709 + 710 + if (sethostname("test-empty-mntns", 16)) 711 + _exit(3); 712 + 713 + if (gethostname(hostname, sizeof(hostname))) 714 + _exit(4); 715 + 716 + if (strcmp(hostname, "test-empty-mntns") != 0) 717 + _exit(5); 718 + 719 + _exit(0); 720 + } 721 + 722 + ASSERT_EQ(wait_for_pid(pid), 0); 723 + } 724 + 725 + TEST_HARNESS_MAIN

+225

tools/testing/selftests/filesystems/empty_mntns/overmount_chroot_test.c

··· 1 + // SPDX-License-Identifier: GPL-2.0-or-later 2 + /* 3 + * Test: rootfs overmounted multiple times with chroot into topmost 4 + * 5 + * This test creates a scenario where: 6 + * 1. A new mount namespace is created with a tmpfs root (via pivot_root) 7 + * 2. A mountpoint is created and overmounted multiple times 8 + * 3. The caller chroots into the topmost mount layer 9 + * 10 + * The test verifies that: 11 + * - Multiple overmounts create separate mount layers 12 + * - Each layer's files are isolated 13 + * - chroot correctly sets the process's root to the topmost layer 14 + * - After chroot, only the topmost layer's files are visible 15 + * 16 + * Copyright (c) 2024 Christian Brauner <brauner@kernel.org> 17 + */ 18 + 19 + #define _GNU_SOURCE 20 + #include <fcntl.h> 21 + #include <linux/mount.h> 22 + #include <linux/stat.h> 23 + #include <sched.h> 24 + #include <stdio.h> 25 + #include <string.h> 26 + #include <sys/mount.h> 27 + #include <sys/stat.h> 28 + #include <sys/syscall.h> 29 + #include <sys/types.h> 30 + #include <sys/wait.h> 31 + #include <unistd.h> 32 + 33 + #include "../utils.h" 34 + #include "empty_mntns.h" 35 + #include "kselftest_harness.h" 36 + 37 + #define NR_OVERMOUNTS 5 38 + 39 + /* 40 + * Setup a proper root filesystem using pivot_root. 41 + * This ensures we own the root directory in our user namespace. 
42 + */ 43 + static int setup_root(void) 44 + { 45 + char tmpdir[] = "/tmp/overmount_test.XXXXXX"; 46 + char oldroot[256]; 47 + 48 + if (!mkdtemp(tmpdir)) 49 + return -1; 50 + 51 + /* Mount tmpfs at the temporary directory */ 52 + if (mount("tmpfs", tmpdir, "tmpfs", 0, "size=10M")) 53 + return -1; 54 + 55 + /* Create directory for old root */ 56 + snprintf(oldroot, sizeof(oldroot), "%s/oldroot", tmpdir); 57 + if (mkdir(oldroot, 0755)) 58 + return -1; 59 + 60 + /* pivot_root to use the tmpfs as new root */ 61 + if (syscall(SYS_pivot_root, tmpdir, oldroot)) 62 + return -1; 63 + 64 + if (chdir("/")) 65 + return -1; 66 + 67 + /* Unmount old root */ 68 + if (umount2("/oldroot", MNT_DETACH)) 69 + return -1; 70 + 71 + /* Remove oldroot directory */ 72 + if (rmdir("/oldroot")) 73 + return -1; 74 + 75 + return 0; 76 + } 77 + 78 + /* 79 + * Test scenario: 80 + * 1. Enter a user namespace to gain CAP_SYS_ADMIN 81 + * 2. Create a new mount namespace 82 + * 3. Setup a tmpfs root via pivot_root 83 + * 4. Create a mountpoint /newroot and overmount it multiple times 84 + * 5. Create a marker file in each layer 85 + * 6. Chroot into /newroot (the topmost overmount) 86 + * 7. 
Verify we're in the topmost layer (only topmost marker visible) 87 + */ 88 + TEST(overmount_chroot) 89 + { 90 + pid_t pid; 91 + 92 + pid = fork(); 93 + ASSERT_GE(pid, 0); 94 + 95 + if (pid == 0) { 96 + ssize_t nr_mounts; 97 + uint64_t mnt_ids[NR_OVERMOUNTS + 1]; 98 + uint64_t root_id_before, root_id_after; 99 + struct statmount *sm; 100 + char marker[64]; 101 + int fd, i; 102 + 103 + /* Step 1: Enter user namespace for privileges */ 104 + if (enter_userns()) 105 + _exit(1); 106 + 107 + /* Step 2: Create a new mount namespace */ 108 + if (unshare(CLONE_NEWNS)) 109 + _exit(2); 110 + 111 + /* Step 3: Make the mount tree private */ 112 + if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL)) 113 + _exit(3); 114 + 115 + /* Step 4: Setup a proper tmpfs root via pivot_root */ 116 + if (setup_root()) 117 + _exit(4); 118 + 119 + /* Create the base mount point for overmounting */ 120 + if (mkdir("/newroot", 0755)) 121 + _exit(5); 122 + 123 + /* Mount base tmpfs on /newroot */ 124 + if (mount("tmpfs", "/newroot", "tmpfs", 0, "size=1M")) 125 + _exit(6); 126 + 127 + /* Record base mount ID */ 128 + mnt_ids[0] = get_unique_mnt_id("/newroot"); 129 + if (!mnt_ids[0]) 130 + _exit(7); 131 + 132 + /* Create marker in base layer */ 133 + fd = open("/newroot/layer_0", O_CREAT | O_RDWR, 0644); 134 + if (fd < 0) 135 + _exit(8); 136 + if (write(fd, "layer_0", 7) != 7) { 137 + close(fd); 138 + _exit(9); 139 + } 140 + close(fd); 141 + 142 + /* Step 5: Overmount /newroot multiple times with tmpfs */ 143 + for (i = 0; i < NR_OVERMOUNTS; i++) { 144 + if (mount("tmpfs", "/newroot", "tmpfs", 0, "size=1M")) 145 + _exit(10); 146 + 147 + /* Record mount ID for this layer */ 148 + mnt_ids[i + 1] = get_unique_mnt_id("/newroot"); 149 + if (!mnt_ids[i + 1]) 150 + _exit(11); 151 + 152 + /* Create a marker file in each layer */ 153 + snprintf(marker, sizeof(marker), "/newroot/layer_%d", i + 1); 154 + fd = open(marker, O_CREAT | O_RDWR, 0644); 155 + if (fd < 0) 156 + _exit(12); 157 + 158 + if (write(fd, 
marker, strlen(marker)) != (ssize_t)strlen(marker)) { 159 + close(fd); 160 + _exit(13); 161 + } 162 + close(fd); 163 + } 164 + 165 + /* Verify mount count increased */ 166 + nr_mounts = count_mounts(); 167 + if (nr_mounts < NR_OVERMOUNTS + 2) 168 + _exit(14); 169 + 170 + /* Record root mount ID before chroot */ 171 + root_id_before = get_unique_mnt_id("/newroot"); 172 + 173 + /* Verify this is the topmost layer's mount */ 174 + if (root_id_before != mnt_ids[NR_OVERMOUNTS]) 175 + _exit(15); 176 + 177 + /* Step 6: Chroot into /newroot (the topmost overmount) */ 178 + if (chroot("/newroot")) 179 + _exit(16); 180 + 181 + /* Change to root directory within the chroot */ 182 + if (chdir("/")) 183 + _exit(17); 184 + 185 + /* Step 7: Verify we're in the topmost layer */ 186 + root_id_after = get_unique_mnt_id("/"); 187 + 188 + /* The mount ID should be the same as the topmost layer */ 189 + if (root_id_after != mnt_ids[NR_OVERMOUNTS]) 190 + _exit(18); 191 + 192 + /* Verify the topmost layer's marker file exists */ 193 + snprintf(marker, sizeof(marker), "/layer_%d", NR_OVERMOUNTS); 194 + if (access(marker, F_OK)) 195 + _exit(19); 196 + 197 + /* Verify we cannot see markers from lower layers (they're hidden) */ 198 + for (i = 0; i < NR_OVERMOUNTS; i++) { 199 + snprintf(marker, sizeof(marker), "/layer_%d", i); 200 + if (access(marker, F_OK) == 0) 201 + _exit(20); 202 + } 203 + 204 + /* Verify the root mount is tmpfs */ 205 + sm = statmount_alloc(root_id_after, 0, 206 + STATMOUNT_MNT_BASIC | STATMOUNT_MNT_ROOT | 207 + STATMOUNT_MNT_POINT | STATMOUNT_FS_TYPE, 0); 208 + if (!sm) 209 + _exit(21); 210 + 211 + if (sm->mask & STATMOUNT_FS_TYPE) { 212 + if (strcmp(sm->str + sm->fs_type, "tmpfs") != 0) { 213 + free(sm); 214 + _exit(22); 215 + } 216 + } 217 + 218 + free(sm); 219 + _exit(0); 220 + } 221 + 222 + ASSERT_EQ(wait_for_pid(pid), 0); 223 + } 224 + 225 + TEST_HARNESS_MAIN

+1

tools/testing/selftests/filesystems/fsmount_ns/.gitignore

··· 1 + fsmount_ns_test

+10

tools/testing/selftests/filesystems/fsmount_ns/Makefile

··· 1 + # SPDX-License-Identifier: GPL-2.0 2 + TEST_GEN_PROGS := fsmount_ns_test 3 + 4 + CFLAGS += -Wall -O0 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES) 5 + LDLIBS := -lcap 6 + 7 + include ../../lib.mk 8 + 9 + $(OUTPUT)/fsmount_ns_test: fsmount_ns_test.c ../utils.c 10 + $(CC) $(CFLAGS) -o $@ $^ $(LDLIBS)

+1135

tools/testing/selftests/filesystems/fsmount_ns/fsmount_ns_test.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Copyright (C) 2026 Christian Brauner <brauner@kernel.org> 4 + * 5 + * Test for FSMOUNT_NAMESPACE flag. 6 + * 7 + * Test that fsmount() with FSMOUNT_NAMESPACE creates a new mount 8 + * namespace containing the specified mount. 9 + */ 10 + #define _GNU_SOURCE 11 + 12 + #include <errno.h> 13 + #include <fcntl.h> 14 + #include <limits.h> 15 + #include <linux/nsfs.h> 16 + #include <sched.h> 17 + #include <stdio.h> 18 + #include <stdlib.h> 19 + #include <string.h> 20 + #include <sys/ioctl.h> 21 + #include <sys/mount.h> 22 + #include <sys/stat.h> 23 + #include <sys/wait.h> 24 + #include <unistd.h> 25 + 26 + #include "../wrappers.h" 27 + #include "../statmount/statmount.h" 28 + #include "../utils.h" 29 + #include "../../kselftest_harness.h" 30 + 31 + #ifndef FSMOUNT_NAMESPACE 32 + #define FSMOUNT_NAMESPACE 0x00000002 33 + #endif 34 + 35 + #ifndef FSMOUNT_CLOEXEC 36 + #define FSMOUNT_CLOEXEC 0x00000001 37 + #endif 38 + 39 + #ifndef FSCONFIG_CMD_CREATE 40 + #define FSCONFIG_CMD_CREATE 6 41 + #endif 42 + 43 + static int get_mnt_ns_id(int fd, uint64_t *mnt_ns_id) 44 + { 45 + if (ioctl(fd, NS_GET_MNTNS_ID, mnt_ns_id) < 0) 46 + return -errno; 47 + return 0; 48 + } 49 + 50 + static int get_mnt_ns_id_from_path(const char *path, uint64_t *mnt_ns_id) 51 + { 52 + int fd, ret; 53 + 54 + fd = open(path, O_RDONLY); 55 + if (fd < 0) 56 + return -errno; 57 + 58 + ret = get_mnt_ns_id(fd, mnt_ns_id); 59 + close(fd); 60 + return ret; 61 + } 62 + 63 + static void log_mount(struct __test_metadata *_metadata, struct statmount *sm) 64 + { 65 + const char *fs_type = ""; 66 + const char *mnt_root = ""; 67 + const char *mnt_point = ""; 68 + 69 + if (sm->mask & STATMOUNT_FS_TYPE) 70 + fs_type = sm->str + sm->fs_type; 71 + if (sm->mask & STATMOUNT_MNT_ROOT) 72 + mnt_root = sm->str + sm->mnt_root; 73 + if (sm->mask & STATMOUNT_MNT_POINT) 74 + mnt_point = sm->str + sm->mnt_point; 75 + 76 + TH_LOG(" mnt_id: %llu, parent_id: %llu, fs_type: %s, 
root: %s, point: %s", 77 + (unsigned long long)sm->mnt_id, 78 + (unsigned long long)sm->mnt_parent_id, 79 + fs_type, mnt_root, mnt_point); 80 + } 81 + 82 + static void dump_mounts(struct __test_metadata *_metadata, uint64_t mnt_ns_id) 83 + { 84 + uint64_t list[256]; 85 + ssize_t nr_mounts; 86 + 87 + nr_mounts = listmount(LSMT_ROOT, mnt_ns_id, 0, list, 256, 0); 88 + if (nr_mounts < 0) { 89 + TH_LOG("listmount failed: %s", strerror(errno)); 90 + return; 91 + } 92 + 93 + TH_LOG("Mount namespace %llu contains %zd mount(s):", 94 + (unsigned long long)mnt_ns_id, nr_mounts); 95 + 96 + for (ssize_t i = 0; i < nr_mounts; i++) { 97 + struct statmount *sm; 98 + 99 + sm = statmount_alloc(list[i], mnt_ns_id, 100 + STATMOUNT_MNT_BASIC | 101 + STATMOUNT_FS_TYPE | 102 + STATMOUNT_MNT_ROOT | 103 + STATMOUNT_MNT_POINT, 0); 104 + if (!sm) { 105 + TH_LOG(" [%zd] mnt_id %llu: statmount failed: %s", 106 + i, (unsigned long long)list[i], strerror(errno)); 107 + continue; 108 + } 109 + 110 + log_mount(_metadata, sm); 111 + free(sm); 112 + } 113 + } 114 + 115 + static int create_tmpfs_fd(void) 116 + { 117 + int fs_fd, ret; 118 + 119 + fs_fd = sys_fsopen("tmpfs", FSOPEN_CLOEXEC); 120 + if (fs_fd < 0) 121 + return -errno; 122 + 123 + ret = sys_fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0); 124 + if (ret < 0) { 125 + close(fs_fd); 126 + return -errno; 127 + } 128 + 129 + return fs_fd; 130 + } 131 + 132 + FIXTURE(fsmount_ns) 133 + { 134 + int fd; 135 + int fs_fd; 136 + uint64_t current_ns_id; 137 + }; 138 + 139 + FIXTURE_VARIANT(fsmount_ns) 140 + { 141 + const char *fstype; 142 + unsigned int flags; 143 + bool expect_success; 144 + bool expect_different_ns; 145 + int min_mounts; 146 + }; 147 + 148 + FIXTURE_VARIANT_ADD(fsmount_ns, basic_tmpfs) 149 + { 150 + .fstype = "tmpfs", 151 + .flags = FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC, 152 + .expect_success = true, 153 + .expect_different_ns = true, 154 + .min_mounts = 1, 155 + }; 156 + 157 + FIXTURE_VARIANT_ADD(fsmount_ns, cloexec_only) 158 + { 
159 + .fstype = "tmpfs", 160 + .flags = FSMOUNT_CLOEXEC, 161 + .expect_success = true, 162 + .expect_different_ns = false, 163 + .min_mounts = 1, 164 + }; 165 + 166 + FIXTURE_VARIANT_ADD(fsmount_ns, namespace_only) 167 + { 168 + .fstype = "tmpfs", 169 + .flags = FSMOUNT_NAMESPACE, 170 + .expect_success = true, 171 + .expect_different_ns = true, 172 + .min_mounts = 1, 173 + }; 174 + 175 + FIXTURE_SETUP(fsmount_ns) 176 + { 177 + int ret; 178 + 179 + self->fd = -1; 180 + self->fs_fd = -1; 181 + 182 + /* Check if fsopen syscall is supported */ 183 + ret = sys_fsopen("tmpfs", 0); 184 + if (ret == -1 && errno == ENOSYS) 185 + SKIP(return, "fsopen() syscall not supported"); 186 + if (ret >= 0) 187 + close(ret); 188 + 189 + /* Check if statmount/listmount are supported */ 190 + ret = statmount(0, 0, 0, 0, NULL, 0, 0); 191 + if (ret == -1 && errno == ENOSYS) 192 + SKIP(return, "statmount() syscall not supported"); 193 + 194 + /* Get current mount namespace ID for comparison */ 195 + ret = get_mnt_ns_id_from_path("/proc/self/ns/mnt", &self->current_ns_id); 196 + if (ret < 0) 197 + SKIP(return, "Failed to get current mount namespace ID"); 198 + } 199 + 200 + FIXTURE_TEARDOWN(fsmount_ns) 201 + { 202 + if (self->fd >= 0) 203 + close(self->fd); 204 + if (self->fs_fd >= 0) 205 + close(self->fs_fd); 206 + } 207 + 208 + TEST_F(fsmount_ns, create_namespace) 209 + { 210 + uint64_t new_ns_id; 211 + uint64_t list[256]; 212 + ssize_t nr_mounts; 213 + int ret; 214 + 215 + self->fs_fd = create_tmpfs_fd(); 216 + ASSERT_GE(self->fs_fd, 0); 217 + 218 + self->fd = sys_fsmount(self->fs_fd, variant->flags, 0); 219 + 220 + if (!variant->expect_success) { 221 + ASSERT_LT(self->fd, 0); 222 + return; 223 + } 224 + 225 + if (self->fd < 0 && errno == EINVAL) 226 + SKIP(return, "FSMOUNT_NAMESPACE not supported"); 227 + 228 + ASSERT_GE(self->fd, 0); 229 + 230 + if (variant->expect_different_ns) { 231 + /* Verify we can get the namespace ID from the fd */ 232 + ret = get_mnt_ns_id(self->fd, &new_ns_id); 
233 + ASSERT_EQ(ret, 0); 234 + 235 + /* Verify it's a different namespace */ 236 + ASSERT_NE(new_ns_id, self->current_ns_id); 237 + 238 + /* List mounts in the new namespace */ 239 + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0); 240 + ASSERT_GE(nr_mounts, 0) { 241 + TH_LOG("%m - listmount failed"); 242 + } 243 + 244 + /* Verify minimum expected mounts */ 245 + ASSERT_GE(nr_mounts, variant->min_mounts); 246 + TH_LOG("Namespace contains %zd mounts", nr_mounts); 247 + } 248 + } 249 + 250 + TEST_F(fsmount_ns, setns_into_namespace) 251 + { 252 + uint64_t new_ns_id; 253 + pid_t pid; 254 + int status; 255 + int ret; 256 + 257 + /* Only test with FSMOUNT_NAMESPACE flag */ 258 + if (!(variant->flags & FSMOUNT_NAMESPACE)) 259 + SKIP(return, "setns test only for FSMOUNT_NAMESPACE case"); 260 + 261 + self->fs_fd = create_tmpfs_fd(); 262 + ASSERT_GE(self->fs_fd, 0); 263 + 264 + self->fd = sys_fsmount(self->fs_fd, variant->flags, 0); 265 + if (self->fd < 0 && errno == EINVAL) 266 + SKIP(return, "FSMOUNT_NAMESPACE not supported"); 267 + 268 + ASSERT_GE(self->fd, 0); 269 + 270 + /* Get namespace ID and dump all mounts */ 271 + ret = get_mnt_ns_id(self->fd, &new_ns_id); 272 + ASSERT_EQ(ret, 0); 273 + 274 + dump_mounts(_metadata, new_ns_id); 275 + 276 + pid = fork(); 277 + ASSERT_GE(pid, 0); 278 + 279 + if (pid == 0) { 280 + /* Child: try to enter the namespace */ 281 + if (setns(self->fd, CLONE_NEWNS) < 0) 282 + _exit(1); 283 + _exit(0); 284 + } 285 + 286 + ASSERT_EQ(waitpid(pid, &status, 0), pid); 287 + ASSERT_TRUE(WIFEXITED(status)); 288 + ASSERT_EQ(WEXITSTATUS(status), 0); 289 + } 290 + 291 + TEST_F(fsmount_ns, verify_mount_properties) 292 + { 293 + struct statmount sm; 294 + uint64_t new_ns_id; 295 + uint64_t list[256]; 296 + ssize_t nr_mounts; 297 + int ret; 298 + 299 + /* Only test with basic FSMOUNT_NAMESPACE flags */ 300 + if (variant->flags != (FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC)) 301 + SKIP(return, "mount properties test only for basic case"); 302 + 303 + 
self->fs_fd = create_tmpfs_fd(); 304 + ASSERT_GE(self->fs_fd, 0); 305 + 306 + self->fd = sys_fsmount(self->fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC, 0); 307 + if (self->fd < 0 && errno == EINVAL) 308 + SKIP(return, "FSMOUNT_NAMESPACE not supported"); 309 + 310 + ASSERT_GE(self->fd, 0); 311 + 312 + ret = get_mnt_ns_id(self->fd, &new_ns_id); 313 + ASSERT_EQ(ret, 0); 314 + 315 + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0); 316 + ASSERT_GE(nr_mounts, 1); 317 + 318 + /* Get info about the root mount */ 319 + ret = statmount(list[0], new_ns_id, 0, STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0); 320 + ASSERT_EQ(ret, 0); 321 + 322 + TH_LOG("Root mount id: %llu, parent: %llu", 323 + (unsigned long long)sm.mnt_id, 324 + (unsigned long long)sm.mnt_parent_id); 325 + } 326 + 327 + TEST_F(fsmount_ns, verify_tmpfs_type) 328 + { 329 + struct statmount *sm; 330 + uint64_t new_ns_id; 331 + uint64_t list[256]; 332 + ssize_t nr_mounts; 333 + const char *fs_type; 334 + int ret; 335 + 336 + /* Only test with basic FSMOUNT_NAMESPACE flags */ 337 + if (variant->flags != (FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC)) 338 + SKIP(return, "fs type test only for basic case"); 339 + 340 + self->fs_fd = create_tmpfs_fd(); 341 + ASSERT_GE(self->fs_fd, 0); 342 + 343 + self->fd = sys_fsmount(self->fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC, 0); 344 + if (self->fd < 0 && errno == EINVAL) 345 + SKIP(return, "FSMOUNT_NAMESPACE not supported"); 346 + 347 + ASSERT_GE(self->fd, 0); 348 + 349 + ret = get_mnt_ns_id(self->fd, &new_ns_id); 350 + ASSERT_EQ(ret, 0); 351 + 352 + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0); 353 + ASSERT_GE(nr_mounts, 1); 354 + 355 + sm = statmount_alloc(list[0], new_ns_id, STATMOUNT_FS_TYPE, 0); 356 + ASSERT_NE(sm, NULL); 357 + 358 + fs_type = sm->str + sm->fs_type; 359 + ASSERT_STREQ(fs_type, "tmpfs"); 360 + 361 + free(sm); 362 + } 363 + 364 + FIXTURE(fsmount_ns_caps) 365 + { 366 + bool has_caps; 367 + }; 368 + 369 + FIXTURE_SETUP(fsmount_ns_caps) 370 + { 
371 + int ret; 372 + 373 + /* Check if fsopen syscall is supported */ 374 + ret = sys_fsopen("tmpfs", 0); 375 + if (ret == -1 && errno == ENOSYS) 376 + SKIP(return, "fsopen() syscall not supported"); 377 + if (ret >= 0) 378 + close(ret); 379 + 380 + self->has_caps = (geteuid() == 0); 381 + } 382 + 383 + FIXTURE_TEARDOWN(fsmount_ns_caps) 384 + { 385 + } 386 + 387 + TEST_F(fsmount_ns_caps, requires_cap_sys_admin) 388 + { 389 + pid_t pid; 390 + int status; 391 + int fs_fd; 392 + 393 + /* 394 + * Prepare the configured filesystem fd as root before forking. 395 + * fsopen() requires CAP_SYS_ADMIN in the mount namespace's 396 + * user_ns, which won't be available after enter_userns(). 397 + */ 398 + fs_fd = sys_fsopen("tmpfs", FSOPEN_CLOEXEC); 399 + ASSERT_GE(fs_fd, 0); 400 + 401 + ASSERT_EQ(sys_fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0); 402 + 403 + pid = fork(); 404 + ASSERT_GE(pid, 0); 405 + 406 + if (pid == 0) { 407 + int fd; 408 + 409 + /* Child: drop privileges using utils.h helper */ 410 + if (enter_userns() != 0) 411 + _exit(2); 412 + 413 + /* Drop all caps using utils.h helper */ 414 + if (caps_down() == 0) 415 + _exit(3); 416 + 417 + fd = sys_fsmount(fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC, 0); 418 + close(fs_fd); 419 + 420 + if (fd >= 0) { 421 + close(fd); 422 + /* Should have failed without caps */ 423 + _exit(1); 424 + } 425 + 426 + if (errno == EPERM) 427 + _exit(0); 428 + 429 + /* EINVAL means FSMOUNT_NAMESPACE not supported */ 430 + if (errno == EINVAL) 431 + _exit(6); 432 + 433 + /* Unexpected error */ 434 + _exit(7); 435 + } 436 + 437 + close(fs_fd); 438 + ASSERT_EQ(waitpid(pid, &status, 0), pid); 439 + ASSERT_TRUE(WIFEXITED(status)); 440 + 441 + switch (WEXITSTATUS(status)) { 442 + case 0: 443 + /* Expected: EPERM without caps */ 444 + break; 445 + case 1: 446 + ASSERT_FALSE(true) TH_LOG("FSMOUNT_NAMESPACE succeeded without caps"); 447 + break; 448 + case 2: 449 + SKIP(return, "enter_userns failed"); 450 + break; 451 + case 3: 452 + 
SKIP(return, "caps_down failed"); 453 + break; 454 + case 6: 455 + SKIP(return, "FSMOUNT_NAMESPACE not supported"); 456 + break; 457 + default: 458 + ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)", 459 + WEXITSTATUS(status)); 460 + break; 461 + } 462 + } 463 + 464 + FIXTURE(fsmount_ns_userns) 465 + { 466 + int fd; 467 + int fs_fd; 468 + }; 469 + 470 + FIXTURE_SETUP(fsmount_ns_userns) 471 + { 472 + int ret; 473 + 474 + self->fd = -1; 475 + self->fs_fd = -1; 476 + 477 + /* Check if fsopen syscall is supported */ 478 + ret = sys_fsopen("tmpfs", 0); 479 + if (ret == -1 && errno == ENOSYS) 480 + SKIP(return, "fsopen() syscall not supported"); 481 + if (ret >= 0) 482 + close(ret); 483 + 484 + /* Check if statmount/listmount are supported */ 485 + ret = statmount(0, 0, 0, 0, NULL, 0, 0); 486 + if (ret == -1 && errno == ENOSYS) 487 + SKIP(return, "statmount() syscall not supported"); 488 + } 489 + 490 + FIXTURE_TEARDOWN(fsmount_ns_userns) 491 + { 492 + if (self->fd >= 0) 493 + close(self->fd); 494 + if (self->fs_fd >= 0) 495 + close(self->fs_fd); 496 + } 497 + 498 + TEST_F(fsmount_ns_userns, create_in_userns) 499 + { 500 + pid_t pid; 501 + int status; 502 + 503 + pid = fork(); 504 + ASSERT_GE(pid, 0); 505 + 506 + if (pid == 0) { 507 + uint64_t new_ns_id; 508 + uint64_t list[256]; 509 + ssize_t nr_mounts; 510 + int fs_fd, fd; 511 + 512 + /* Create new user namespace (also creates mount namespace) */ 513 + if (setup_userns() != 0) 514 + _exit(2); 515 + 516 + /* Now we have CAP_SYS_ADMIN in the user namespace */ 517 + fs_fd = sys_fsopen("tmpfs", FSOPEN_CLOEXEC); 518 + if (fs_fd < 0) 519 + _exit(3); 520 + 521 + if (sys_fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) { 522 + close(fs_fd); 523 + _exit(4); 524 + } 525 + 526 + fd = sys_fsmount(fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC, 0); 527 + close(fs_fd); 528 + 529 + if (fd < 0) { 530 + if (errno == EINVAL) 531 + _exit(6); /* FSMOUNT_NAMESPACE not supported */ 532 + _exit(1); 533 + } 534 + 535 + /* 
Verify we can get the namespace ID */ 536 + if (get_mnt_ns_id(fd, &new_ns_id) != 0) 537 + _exit(7); 538 + 539 + /* Verify we can list mounts in the new namespace */ 540 + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0); 541 + if (nr_mounts < 0) 542 + _exit(8); 543 + 544 + /* Should have at least 1 mount (the tmpfs) */ 545 + if (nr_mounts < 1) 546 + _exit(9); 547 + 548 + close(fd); 549 + _exit(0); 550 + } 551 + 552 + ASSERT_EQ(waitpid(pid, &status, 0), pid); 553 + ASSERT_TRUE(WIFEXITED(status)); 554 + 555 + switch (WEXITSTATUS(status)) { 556 + case 0: 557 + /* Success */ 558 + break; 559 + case 1: 560 + ASSERT_FALSE(true) TH_LOG("fsmount(FSMOUNT_NAMESPACE) failed in userns"); 561 + break; 562 + case 2: 563 + SKIP(return, "setup_userns failed"); 564 + break; 565 + case 3: 566 + SKIP(return, "fsopen failed in userns"); 567 + break; 568 + case 4: 569 + SKIP(return, "fsconfig CMD_CREATE failed in userns"); 570 + break; 571 + case 6: 572 + SKIP(return, "FSMOUNT_NAMESPACE not supported"); 573 + break; 574 + case 7: 575 + ASSERT_FALSE(true) TH_LOG("Failed to get mount namespace ID"); 576 + break; 577 + case 8: 578 + ASSERT_FALSE(true) TH_LOG("listmount failed in new namespace"); 579 + break; 580 + case 9: 581 + ASSERT_FALSE(true) TH_LOG("New namespace has no mounts"); 582 + break; 583 + default: 584 + ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)", 585 + WEXITSTATUS(status)); 586 + break; 587 + } 588 + } 589 + 590 + TEST_F(fsmount_ns_userns, setns_in_userns) 591 + { 592 + pid_t pid; 593 + int status; 594 + 595 + pid = fork(); 596 + ASSERT_GE(pid, 0); 597 + 598 + if (pid == 0) { 599 + uint64_t new_ns_id; 600 + int fs_fd, fd; 601 + pid_t inner_pid; 602 + int inner_status; 603 + 604 + /* Create new user namespace */ 605 + if (setup_userns() != 0) 606 + _exit(2); 607 + 608 + fs_fd = sys_fsopen("tmpfs", FSOPEN_CLOEXEC); 609 + if (fs_fd < 0) 610 + _exit(3); 611 + 612 + if (sys_fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) { 613 + 
close(fs_fd); 614 + _exit(4); 615 + } 616 + 617 + fd = sys_fsmount(fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC, 0); 618 + close(fs_fd); 619 + 620 + if (fd < 0) { 621 + if (errno == EINVAL) 622 + _exit(6); 623 + _exit(1); 624 + } 625 + 626 + if (get_mnt_ns_id(fd, &new_ns_id) != 0) 627 + _exit(7); 628 + 629 + /* Fork again to test setns into the new namespace */ 630 + inner_pid = fork(); 631 + if (inner_pid < 0) 632 + _exit(10); 633 + 634 + if (inner_pid == 0) { 635 + /* Inner child: enter the new namespace */ 636 + if (setns(fd, CLONE_NEWNS) < 0) 637 + _exit(1); 638 + _exit(0); 639 + } 640 + 641 + if (waitpid(inner_pid, &inner_status, 0) != inner_pid) 642 + _exit(11); 643 + 644 + if (!WIFEXITED(inner_status) || WEXITSTATUS(inner_status) != 0) 645 + _exit(12); 646 + 647 + close(fd); 648 + _exit(0); 649 + } 650 + 651 + ASSERT_EQ(waitpid(pid, &status, 0), pid); 652 + ASSERT_TRUE(WIFEXITED(status)); 653 + 654 + switch (WEXITSTATUS(status)) { 655 + case 0: 656 + /* Success */ 657 + break; 658 + case 1: 659 + ASSERT_FALSE(true) TH_LOG("fsmount or setns failed in userns"); 660 + break; 661 + case 2: 662 + SKIP(return, "setup_userns failed"); 663 + break; 664 + case 3: 665 + SKIP(return, "fsopen failed in userns"); 666 + break; 667 + case 4: 668 + SKIP(return, "fsconfig CMD_CREATE failed in userns"); 669 + break; 670 + case 6: 671 + SKIP(return, "FSMOUNT_NAMESPACE not supported"); 672 + break; 673 + case 7: 674 + ASSERT_FALSE(true) TH_LOG("Failed to get mount namespace ID"); 675 + break; 676 + case 10: 677 + ASSERT_FALSE(true) TH_LOG("Inner fork failed"); 678 + break; 679 + case 11: 680 + ASSERT_FALSE(true) TH_LOG("Inner waitpid failed"); 681 + break; 682 + case 12: 683 + ASSERT_FALSE(true) TH_LOG("setns into new namespace failed"); 684 + break; 685 + default: 686 + ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)", 687 + WEXITSTATUS(status)); 688 + break; 689 + } 690 + } 691 + 692 + TEST_F(fsmount_ns_userns, umount_fails_einval) 693 + { 694 + pid_t pid; 695 + 
int status; 696 + 697 + pid = fork(); 698 + ASSERT_GE(pid, 0); 699 + 700 + if (pid == 0) { 701 + uint64_t new_ns_id; 702 + uint64_t list[256]; 703 + ssize_t nr_mounts; 704 + int fs_fd, fd; 705 + ssize_t i; 706 + 707 + /* Create new user namespace */ 708 + if (setup_userns() != 0) 709 + _exit(2); 710 + 711 + fs_fd = sys_fsopen("tmpfs", FSOPEN_CLOEXEC); 712 + if (fs_fd < 0) 713 + _exit(3); 714 + 715 + if (sys_fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) { 716 + close(fs_fd); 717 + _exit(4); 718 + } 719 + 720 + fd = sys_fsmount(fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC, 0); 721 + close(fs_fd); 722 + 723 + if (fd < 0) { 724 + if (errno == EINVAL) 725 + _exit(6); 726 + _exit(1); 727 + } 728 + 729 + if (get_mnt_ns_id(fd, &new_ns_id) != 0) 730 + _exit(7); 731 + 732 + /* Get all mounts in the new namespace */ 733 + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, LISTMOUNT_REVERSE); 734 + if (nr_mounts < 0) 735 + _exit(13); 736 + 737 + if (nr_mounts < 1) 738 + _exit(14); 739 + 740 + /* Enter the new namespace */ 741 + if (setns(fd, CLONE_NEWNS) < 0) 742 + _exit(8); 743 + 744 + for (i = 0; i < nr_mounts; i++) { 745 + struct statmount *sm; 746 + const char *mnt_point; 747 + 748 + sm = statmount_alloc(list[i], new_ns_id, 749 + STATMOUNT_MNT_POINT, 0); 750 + if (!sm) 751 + _exit(15); 752 + 753 + mnt_point = sm->str + sm->mnt_point; 754 + 755 + if (umount2(mnt_point, MNT_DETACH) == 0) { 756 + free(sm); 757 + _exit(9); 758 + } 759 + 760 + if (errno != EINVAL) { 761 + /* Wrong error */ 762 + free(sm); 763 + _exit(10); 764 + } 765 + 766 + free(sm); 767 + } 768 + 769 + close(fd); 770 + _exit(0); 771 + } 772 + 773 + ASSERT_EQ(waitpid(pid, &status, 0), pid); 774 + ASSERT_TRUE(WIFEXITED(status)); 775 + 776 + switch (WEXITSTATUS(status)) { 777 + case 0: 778 + break; 779 + case 1: 780 + ASSERT_FALSE(true) TH_LOG("fsmount(FSMOUNT_NAMESPACE) failed"); 781 + break; 782 + case 2: 783 + SKIP(return, "setup_userns failed"); 784 + break; 785 + case 3: 786 + SKIP(return, 
"fsopen failed in userns"); 787 + break; 788 + case 4: 789 + SKIP(return, "fsconfig CMD_CREATE failed in userns"); 790 + break; 791 + case 6: 792 + SKIP(return, "FSMOUNT_NAMESPACE not supported"); 793 + break; 794 + case 7: 795 + ASSERT_FALSE(true) TH_LOG("Failed to get mount namespace ID"); 796 + break; 797 + case 8: 798 + ASSERT_FALSE(true) TH_LOG("setns into new namespace failed"); 799 + break; 800 + case 9: 801 + ASSERT_FALSE(true) TH_LOG("umount succeeded but should have failed with EINVAL"); 802 + break; 803 + case 10: 804 + ASSERT_FALSE(true) TH_LOG("umount failed with wrong error (expected EINVAL)"); 805 + break; 806 + case 13: 807 + ASSERT_FALSE(true) TH_LOG("listmount failed"); 808 + break; 809 + case 14: 810 + ASSERT_FALSE(true) TH_LOG("No mounts in new namespace"); 811 + break; 812 + case 15: 813 + ASSERT_FALSE(true) TH_LOG("statmount_alloc failed"); 814 + break; 815 + default: 816 + ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)", 817 + WEXITSTATUS(status)); 818 + break; 819 + } 820 + } 821 + 822 + TEST_F(fsmount_ns_userns, umount_succeeds) 823 + { 824 + pid_t pid; 825 + int status; 826 + 827 + pid = fork(); 828 + ASSERT_GE(pid, 0); 829 + 830 + if (pid == 0) { 831 + uint64_t new_ns_id; 832 + uint64_t list[256]; 833 + ssize_t nr_mounts; 834 + int fs_fd, fd; 835 + ssize_t i; 836 + 837 + if (unshare(CLONE_NEWNS)) 838 + _exit(1); 839 + 840 + if (sys_mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) != 0) 841 + _exit(1); 842 + 843 + fs_fd = sys_fsopen("tmpfs", FSOPEN_CLOEXEC); 844 + if (fs_fd < 0) 845 + _exit(3); 846 + 847 + if (sys_fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) { 848 + close(fs_fd); 849 + _exit(4); 850 + } 851 + 852 + fd = sys_fsmount(fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC, 0); 853 + close(fs_fd); 854 + 855 + if (fd < 0) { 856 + if (errno == EINVAL) 857 + _exit(6); 858 + _exit(1); 859 + } 860 + 861 + if (get_mnt_ns_id(fd, &new_ns_id) != 0) 862 + _exit(7); 863 + 864 + /* Get all mounts in the new namespace */ 865 
+ nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, LISTMOUNT_REVERSE); 866 + if (nr_mounts < 0) 867 + _exit(13); 868 + 869 + if (nr_mounts < 1) 870 + _exit(14); 871 + 872 + /* Enter the new namespace */ 873 + if (setns(fd, CLONE_NEWNS) < 0) 874 + _exit(8); 875 + 876 + for (i = 0; i < nr_mounts; i++) { 877 + struct statmount *sm; 878 + const char *mnt_point; 879 + 880 + sm = statmount_alloc(list[i], new_ns_id, 881 + STATMOUNT_MNT_POINT, 0); 882 + if (!sm) 883 + _exit(15); 884 + 885 + mnt_point = sm->str + sm->mnt_point; 886 + 887 + if (umount2(mnt_point, MNT_DETACH) != 0) { 888 + free(sm); 889 + _exit(9); 890 + } 891 + 892 + free(sm); 893 + } 894 + 895 + close(fd); 896 + _exit(0); 897 + } 898 + 899 + ASSERT_EQ(waitpid(pid, &status, 0), pid); 900 + ASSERT_TRUE(WIFEXITED(status)); 901 + 902 + switch (WEXITSTATUS(status)) { 903 + case 0: 904 + break; 905 + case 1: 906 + ASSERT_FALSE(true) TH_LOG("fsmount(FSMOUNT_NAMESPACE) failed or unshare failed"); 907 + break; 908 + case 3: 909 + SKIP(return, "fsopen failed"); 910 + break; 911 + case 4: 912 + SKIP(return, "fsconfig CMD_CREATE failed"); 913 + break; 914 + case 6: 915 + SKIP(return, "FSMOUNT_NAMESPACE not supported"); 916 + break; 917 + case 7: 918 + ASSERT_FALSE(true) TH_LOG("Failed to get mount namespace ID"); 919 + break; 920 + case 8: 921 + ASSERT_FALSE(true) TH_LOG("setns into new namespace failed"); 922 + break; 923 + case 9: 924 + ASSERT_FALSE(true) TH_LOG("umount failed but should have succeeded"); 925 + break; 926 + case 13: 927 + ASSERT_FALSE(true) TH_LOG("listmount failed"); 928 + break; 929 + case 14: 930 + ASSERT_FALSE(true) TH_LOG("No mounts in new namespace"); 931 + break; 932 + case 15: 933 + ASSERT_FALSE(true) TH_LOG("statmount_alloc failed"); 934 + break; 935 + default: 936 + ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)", 937 + WEXITSTATUS(status)); 938 + break; 939 + } 940 + } 941 + 942 + FIXTURE(fsmount_ns_mount_attrs) 943 + { 944 + int fd; 945 + int fs_fd; 946 + }; 947 + 
948 + FIXTURE_SETUP(fsmount_ns_mount_attrs) 949 + { 950 + int ret; 951 + 952 + self->fd = -1; 953 + self->fs_fd = -1; 954 + 955 + /* Check if fsopen syscall is supported */ 956 + ret = sys_fsopen("tmpfs", 0); 957 + if (ret == -1 && errno == ENOSYS) 958 + SKIP(return, "fsopen() syscall not supported"); 959 + if (ret >= 0) 960 + close(ret); 961 + 962 + /* Check if statmount/listmount are supported */ 963 + ret = statmount(0, 0, 0, 0, NULL, 0, 0); 964 + if (ret == -1 && errno == ENOSYS) 965 + SKIP(return, "statmount() syscall not supported"); 966 + } 967 + 968 + FIXTURE_TEARDOWN(fsmount_ns_mount_attrs) 969 + { 970 + if (self->fd >= 0) 971 + close(self->fd); 972 + if (self->fs_fd >= 0) 973 + close(self->fs_fd); 974 + } 975 + 976 + TEST_F(fsmount_ns_mount_attrs, readonly) 977 + { 978 + struct statmount sm; 979 + uint64_t new_ns_id; 980 + uint64_t list[256]; 981 + ssize_t nr_mounts; 982 + int ret; 983 + 984 + self->fs_fd = create_tmpfs_fd(); 985 + ASSERT_GE(self->fs_fd, 0); 986 + 987 + self->fd = sys_fsmount(self->fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC, 988 + MOUNT_ATTR_RDONLY); 989 + if (self->fd < 0 && errno == EINVAL) 990 + SKIP(return, "FSMOUNT_NAMESPACE not supported"); 991 + 992 + ASSERT_GE(self->fd, 0); 993 + 994 + ret = get_mnt_ns_id(self->fd, &new_ns_id); 995 + ASSERT_EQ(ret, 0); 996 + 997 + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0); 998 + ASSERT_GE(nr_mounts, 1); 999 + 1000 + ret = statmount(list[0], new_ns_id, 0, STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0); 1001 + ASSERT_EQ(ret, 0); 1002 + 1003 + /* Verify the mount is read-only */ 1004 + ASSERT_TRUE(sm.mnt_attr & MOUNT_ATTR_RDONLY); 1005 + } 1006 + 1007 + TEST_F(fsmount_ns_mount_attrs, noexec) 1008 + { 1009 + struct statmount sm; 1010 + uint64_t new_ns_id; 1011 + uint64_t list[256]; 1012 + ssize_t nr_mounts; 1013 + int ret; 1014 + 1015 + self->fs_fd = create_tmpfs_fd(); 1016 + ASSERT_GE(self->fs_fd, 0); 1017 + 1018 + self->fd = sys_fsmount(self->fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC, 
1019 + MOUNT_ATTR_NOEXEC); 1020 + if (self->fd < 0 && errno == EINVAL) 1021 + SKIP(return, "FSMOUNT_NAMESPACE not supported"); 1022 + 1023 + ASSERT_GE(self->fd, 0); 1024 + 1025 + ret = get_mnt_ns_id(self->fd, &new_ns_id); 1026 + ASSERT_EQ(ret, 0); 1027 + 1028 + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0); 1029 + ASSERT_GE(nr_mounts, 1); 1030 + 1031 + ret = statmount(list[0], new_ns_id, 0, STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0); 1032 + ASSERT_EQ(ret, 0); 1033 + 1034 + /* Verify the mount is noexec */ 1035 + ASSERT_TRUE(sm.mnt_attr & MOUNT_ATTR_NOEXEC); 1036 + } 1037 + 1038 + TEST_F(fsmount_ns_mount_attrs, nosuid) 1039 + { 1040 + struct statmount sm; 1041 + uint64_t new_ns_id; 1042 + uint64_t list[256]; 1043 + ssize_t nr_mounts; 1044 + int ret; 1045 + 1046 + self->fs_fd = create_tmpfs_fd(); 1047 + ASSERT_GE(self->fs_fd, 0); 1048 + 1049 + self->fd = sys_fsmount(self->fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC, 1050 + MOUNT_ATTR_NOSUID); 1051 + if (self->fd < 0 && errno == EINVAL) 1052 + SKIP(return, "FSMOUNT_NAMESPACE not supported"); 1053 + 1054 + ASSERT_GE(self->fd, 0); 1055 + 1056 + ret = get_mnt_ns_id(self->fd, &new_ns_id); 1057 + ASSERT_EQ(ret, 0); 1058 + 1059 + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0); 1060 + ASSERT_GE(nr_mounts, 1); 1061 + 1062 + ret = statmount(list[0], new_ns_id, 0, STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0); 1063 + ASSERT_EQ(ret, 0); 1064 + 1065 + /* Verify the mount is nosuid */ 1066 + ASSERT_TRUE(sm.mnt_attr & MOUNT_ATTR_NOSUID); 1067 + } 1068 + 1069 + TEST_F(fsmount_ns_mount_attrs, noatime) 1070 + { 1071 + struct statmount sm; 1072 + uint64_t new_ns_id; 1073 + uint64_t list[256]; 1074 + ssize_t nr_mounts; 1075 + int ret; 1076 + 1077 + self->fs_fd = create_tmpfs_fd(); 1078 + ASSERT_GE(self->fs_fd, 0); 1079 + 1080 + self->fd = sys_fsmount(self->fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC, 1081 + MOUNT_ATTR_NOATIME); 1082 + if (self->fd < 0 && errno == EINVAL) 1083 + SKIP(return, "FSMOUNT_NAMESPACE not 
supported"); 1084 + 1085 + ASSERT_GE(self->fd, 0); 1086 + 1087 + ret = get_mnt_ns_id(self->fd, &new_ns_id); 1088 + ASSERT_EQ(ret, 0); 1089 + 1090 + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0); 1091 + ASSERT_GE(nr_mounts, 1); 1092 + 1093 + ret = statmount(list[0], new_ns_id, 0, STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0); 1094 + ASSERT_EQ(ret, 0); 1095 + 1096 + /* Verify the mount is noatime */ 1097 + ASSERT_TRUE(sm.mnt_attr & MOUNT_ATTR_NOATIME); 1098 + } 1099 + 1100 + TEST_F(fsmount_ns_mount_attrs, combined) 1101 + { 1102 + struct statmount sm; 1103 + uint64_t new_ns_id; 1104 + uint64_t list[256]; 1105 + ssize_t nr_mounts; 1106 + int ret; 1107 + 1108 + self->fs_fd = create_tmpfs_fd(); 1109 + ASSERT_GE(self->fs_fd, 0); 1110 + 1111 + self->fd = sys_fsmount(self->fs_fd, FSMOUNT_NAMESPACE | FSMOUNT_CLOEXEC, 1112 + MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOEXEC | 1113 + MOUNT_ATTR_NOSUID | MOUNT_ATTR_NOATIME); 1114 + if (self->fd < 0 && errno == EINVAL) 1115 + SKIP(return, "FSMOUNT_NAMESPACE not supported"); 1116 + 1117 + ASSERT_GE(self->fd, 0); 1118 + 1119 + ret = get_mnt_ns_id(self->fd, &new_ns_id); 1120 + ASSERT_EQ(ret, 0); 1121 + 1122 + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0); 1123 + ASSERT_GE(nr_mounts, 1); 1124 + 1125 + ret = statmount(list[0], new_ns_id, 0, STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0); 1126 + ASSERT_EQ(ret, 0); 1127 + 1128 + /* Verify all attributes are set */ 1129 + ASSERT_TRUE(sm.mnt_attr & MOUNT_ATTR_RDONLY); 1130 + ASSERT_TRUE(sm.mnt_attr & MOUNT_ATTR_NOEXEC); 1131 + ASSERT_TRUE(sm.mnt_attr & MOUNT_ATTR_NOSUID); 1132 + ASSERT_TRUE(sm.mnt_attr & MOUNT_ATTR_NOATIME); 1133 + } 1134 + 1135 + TEST_HARNESS_MAIN

+2

tools/testing/selftests/filesystems/move_mount/.gitignore

··· 1 + # SPDX-License-Identifier: GPL-2.0-only 2 + move_mount_test

+10

tools/testing/selftests/filesystems/move_mount/Makefile

··· 1 + # SPDX-License-Identifier: GPL-2.0-or-later 2 + 3 + CFLAGS += -Wall -O2 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES) 4 + LDLIBS += -lcap 5 + 6 + TEST_GEN_PROGS := move_mount_test 7 + 8 + include ../../lib.mk 9 + 10 + $(OUTPUT)/move_mount_test: ../utils.c

+492

tools/testing/selftests/filesystems/move_mount/move_mount_test.c

··· 1 + // SPDX-License-Identifier: GPL-2.0-or-later 2 + // Copyright (c) 2026 Christian Brauner <brauner@kernel.org> 3 + 4 + #define _GNU_SOURCE 5 + 6 + #include <errno.h> 7 + #include <fcntl.h> 8 + #include <sched.h> 9 + #include <stdio.h> 10 + #include <string.h> 11 + #include <sys/stat.h> 12 + #include <sys/mount.h> 13 + #include <unistd.h> 14 + #include <sys/syscall.h> 15 + 16 + #include "../wrappers.h" 17 + #include "../utils.h" 18 + #include "../statmount/statmount.h" 19 + #include "../../kselftest_harness.h" 20 + 21 + #include <linux/stat.h> 22 + 23 + #ifndef MOVE_MOUNT_BENEATH 24 + #define MOVE_MOUNT_BENEATH 0x00000200 25 + #endif 26 + 27 + static uint64_t get_unique_mnt_id_fd(int fd) 28 + { 29 + struct statx sx; 30 + int ret; 31 + 32 + ret = statx(fd, "", AT_EMPTY_PATH, STATX_MNT_ID_UNIQUE, &sx); 33 + if (ret) 34 + return 0; 35 + 36 + if (!(sx.stx_mask & STATX_MNT_ID_UNIQUE)) 37 + return 0; 38 + 39 + return sx.stx_mnt_id; 40 + } 41 + 42 + /* 43 + * Create a locked overmount stack at /mnt_dir for testing MNT_LOCKED 44 + * transfer on non-rootfs mounts. 45 + * 46 + * Mounts tmpfs A at /mnt_dir, overmounts with tmpfs B, then enters a 47 + * new user+mount namespace where both become locked. Returns the exit 48 + * code to use on failure, or 0 on success. 49 + */ 50 + static int setup_locked_overmount(void) 51 + { 52 + /* Isolate so mounts don't leak. */ 53 + if (unshare(CLONE_NEWNS)) 54 + return 1; 55 + if (mount("", "/", NULL, MS_REC | MS_PRIVATE, NULL)) 56 + return 2; 57 + 58 + /* 59 + * Create mounts while still in the initial user namespace so 60 + * they become locked after the subsequent user namespace 61 + * unshare. 62 + */ 63 + rmdir("/mnt_dir"); 64 + if (mkdir("/mnt_dir", 0755)) 65 + return 3; 66 + 67 + /* Mount tmpfs A */ 68 + if (mount("tmpfs", "/mnt_dir", "tmpfs", 0, NULL)) 69 + return 4; 70 + 71 + /* Overmount with tmpfs B */ 72 + if (mount("tmpfs", "/mnt_dir", "tmpfs", 0, NULL)) 73 + return 5; 74 + 75 + /* 76 + * Create user+mount namespace. 
Mounts A and B become locked 77 + * because they might be covering something that is not supposed 78 + * to be revealed. 79 + */ 80 + if (setup_userns()) 81 + return 6; 82 + 83 + /* Sanity check: B must be locked */ 84 + if (!umount2("/mnt_dir", MNT_DETACH) || errno != EINVAL) 85 + return 7; 86 + 87 + return 0; 88 + } 89 + 90 + /* 91 + * Create a detached tmpfs mount and return its fd, or -1 on failure. 92 + */ 93 + static int create_detached_tmpfs(void) 94 + { 95 + int fs_fd, mnt_fd; 96 + 97 + fs_fd = sys_fsopen("tmpfs", FSOPEN_CLOEXEC); 98 + if (fs_fd < 0) 99 + return -1; 100 + 101 + if (sys_fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0)) { 102 + close(fs_fd); 103 + return -1; 104 + } 105 + 106 + mnt_fd = sys_fsmount(fs_fd, FSMOUNT_CLOEXEC, 0); 107 + close(fs_fd); 108 + return mnt_fd; 109 + } 110 + 111 + FIXTURE(move_mount) { 112 + uint64_t orig_root_id; 113 + }; 114 + 115 + FIXTURE_SETUP(move_mount) 116 + { 117 + ASSERT_EQ(unshare(CLONE_NEWNS), 0); 118 + 119 + ASSERT_EQ(mount("", "/", NULL, MS_REC | MS_PRIVATE, NULL), 0); 120 + 121 + self->orig_root_id = get_unique_mnt_id("/"); 122 + ASSERT_NE(self->orig_root_id, 0); 123 + } 124 + 125 + FIXTURE_TEARDOWN(move_mount) 126 + { 127 + } 128 + 129 + /* 130 + * Test successful MOVE_MOUNT_BENEATH on the rootfs. 131 + * Mount a clone beneath /, fchdir to the clone, chroot to switch root, 132 + * then detach the old root. 
133 + */ 134 + TEST_F(move_mount, beneath_rootfs_success) 135 + { 136 + int fd_tree, ret; 137 + uint64_t clone_id, root_id; 138 + 139 + fd_tree = sys_open_tree(AT_FDCWD, "/", 140 + OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC); 141 + ASSERT_GE(fd_tree, 0); 142 + 143 + clone_id = get_unique_mnt_id_fd(fd_tree); 144 + ASSERT_NE(clone_id, 0); 145 + ASSERT_NE(clone_id, self->orig_root_id); 146 + 147 + ASSERT_EQ(fchdir(fd_tree), 0); 148 + 149 + ret = sys_move_mount(fd_tree, "", AT_FDCWD, "/", 150 + MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_BENEATH); 151 + ASSERT_EQ(ret, 0); 152 + 153 + close(fd_tree); 154 + 155 + /* Switch root to the clone */ 156 + ASSERT_EQ(chroot("."), 0); 157 + 158 + /* Verify "/" is now the clone */ 159 + root_id = get_unique_mnt_id("/"); 160 + ASSERT_NE(root_id, 0); 161 + ASSERT_EQ(root_id, clone_id); 162 + 163 + /* Detach old root */ 164 + ASSERT_EQ(umount2(".", MNT_DETACH), 0); 165 + } 166 + 167 + /* 168 + * Test that after MOVE_MOUNT_BENEATH on the rootfs the old root is 169 + * stacked on top of the clone. Verify via statmount that the old 170 + * root's parent is the clone. 
171 + */ 172 + TEST_F(move_mount, beneath_rootfs_old_root_stacked) 173 + { 174 + int fd_tree, ret; 175 + uint64_t clone_id; 176 + struct statmount sm; 177 + 178 + fd_tree = sys_open_tree(AT_FDCWD, "/", 179 + OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC); 180 + ASSERT_GE(fd_tree, 0); 181 + 182 + clone_id = get_unique_mnt_id_fd(fd_tree); 183 + ASSERT_NE(clone_id, 0); 184 + ASSERT_NE(clone_id, self->orig_root_id); 185 + 186 + ASSERT_EQ(fchdir(fd_tree), 0); 187 + 188 + ret = sys_move_mount(fd_tree, "", AT_FDCWD, "/", 189 + MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_BENEATH); 190 + ASSERT_EQ(ret, 0); 191 + 192 + close(fd_tree); 193 + 194 + ASSERT_EQ(chroot("."), 0); 195 + 196 + /* Old root's parent should now be the clone */ 197 + ASSERT_EQ(statmount(self->orig_root_id, 0, 0, 198 + STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0), 0); 199 + ASSERT_EQ(sm.mnt_parent_id, clone_id); 200 + 201 + ASSERT_EQ(umount2(".", MNT_DETACH), 0); 202 + } 203 + 204 + /* 205 + * Test that MOVE_MOUNT_BENEATH on rootfs fails when chroot'd into a 206 + * subdirectory of the same mount. The caller's fs->root.dentry doesn't 207 + * match mnt->mnt_root so the kernel rejects it. 208 + */ 209 + TEST_F(move_mount, beneath_rootfs_in_chroot_fail) 210 + { 211 + int fd_tree, ret; 212 + uint64_t chroot_id, clone_id; 213 + 214 + rmdir("/chroot_dir"); 215 + ASSERT_EQ(mkdir("/chroot_dir", 0755), 0); 216 + 217 + chroot_id = get_unique_mnt_id("/chroot_dir"); 218 + ASSERT_NE(chroot_id, 0); 219 + ASSERT_EQ(self->orig_root_id, chroot_id); 220 + 221 + ASSERT_EQ(chdir("/chroot_dir"), 0); 222 + ASSERT_EQ(chroot("."), 0); 223 + 224 + fd_tree = sys_open_tree(AT_FDCWD, "/", 225 + OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC); 226 + ASSERT_GE(fd_tree, 0); 227 + 228 + clone_id = get_unique_mnt_id_fd(fd_tree); 229 + ASSERT_NE(clone_id, 0); 230 + ASSERT_NE(clone_id, chroot_id); 231 + 232 + ASSERT_EQ(fchdir(fd_tree), 0); 233 + 234 + /* 235 + * Should fail: fs->root.dentry (/chroot_dir) doesn't match 236 + * the mount's mnt_root (/). 
237 + */ 238 + ret = sys_move_mount(fd_tree, "", AT_FDCWD, "/", 239 + MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_BENEATH); 240 + ASSERT_EQ(ret, -1); 241 + ASSERT_EQ(errno, EINVAL); 242 + 243 + close(fd_tree); 244 + } 245 + 246 + /* 247 + * Test that MOVE_MOUNT_BENEATH on rootfs succeeds when chroot'd into a 248 + * separate tmpfs mount. The caller's root dentry matches the mount's 249 + * mnt_root since it's a dedicated mount. 250 + */ 251 + TEST_F(move_mount, beneath_rootfs_in_chroot_success) 252 + { 253 + int fd_tree, ret; 254 + uint64_t chroot_id, clone_id, root_id; 255 + struct statmount sm; 256 + 257 + rmdir("/chroot_dir"); 258 + ASSERT_EQ(mkdir("/chroot_dir", 0755), 0); 259 + ASSERT_EQ(mount("tmpfs", "/chroot_dir", "tmpfs", 0, NULL), 0); 260 + 261 + chroot_id = get_unique_mnt_id("/chroot_dir"); 262 + ASSERT_NE(chroot_id, 0); 263 + 264 + ASSERT_EQ(chdir("/chroot_dir"), 0); 265 + ASSERT_EQ(chroot("."), 0); 266 + 267 + ASSERT_EQ(get_unique_mnt_id("/"), chroot_id); 268 + 269 + fd_tree = sys_open_tree(AT_FDCWD, "/", 270 + OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC); 271 + ASSERT_GE(fd_tree, 0); 272 + 273 + clone_id = get_unique_mnt_id_fd(fd_tree); 274 + ASSERT_NE(clone_id, 0); 275 + ASSERT_NE(clone_id, chroot_id); 276 + 277 + ASSERT_EQ(fchdir(fd_tree), 0); 278 + 279 + ret = sys_move_mount(fd_tree, "", AT_FDCWD, "/", 280 + MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_BENEATH); 281 + ASSERT_EQ(ret, 0); 282 + 283 + close(fd_tree); 284 + 285 + ASSERT_EQ(chroot("."), 0); 286 + 287 + root_id = get_unique_mnt_id("/"); 288 + ASSERT_NE(root_id, 0); 289 + ASSERT_EQ(root_id, clone_id); 290 + 291 + ASSERT_EQ(statmount(chroot_id, 0, 0, 292 + STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0), 0); 293 + ASSERT_EQ(sm.mnt_parent_id, clone_id); 294 + 295 + ASSERT_EQ(umount2(".", MNT_DETACH), 0); 296 + } 297 + 298 + /* 299 + * Test MNT_LOCKED transfer when mounting beneath rootfs in a user+mount 300 + * namespace. 
After mount-beneath the new root gets MNT_LOCKED and the 301 + * old root has MNT_LOCKED cleared so it can be unmounted. 302 + */ 303 + TEST_F(move_mount, beneath_rootfs_locked_transfer) 304 + { 305 + int fd_tree, ret; 306 + uint64_t clone_id, root_id; 307 + 308 + ASSERT_EQ(setup_userns(), 0); 309 + 310 + ASSERT_EQ(mount("", "/", NULL, MS_REC | MS_PRIVATE, NULL), 0); 311 + 312 + fd_tree = sys_open_tree(AT_FDCWD, "/", 313 + OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC | 314 + AT_RECURSIVE); 315 + ASSERT_GE(fd_tree, 0); 316 + 317 + clone_id = get_unique_mnt_id_fd(fd_tree); 318 + ASSERT_NE(clone_id, 0); 319 + 320 + ASSERT_EQ(fchdir(fd_tree), 0); 321 + 322 + ret = sys_move_mount(fd_tree, "", AT_FDCWD, "/", 323 + MOVE_MOUNT_F_EMPTY_PATH | 324 + MOVE_MOUNT_BENEATH); 325 + ASSERT_EQ(ret, 0); 326 + 327 + close(fd_tree); 328 + 329 + ASSERT_EQ(chroot("."), 0); 330 + 331 + root_id = get_unique_mnt_id("/"); 332 + ASSERT_EQ(root_id, clone_id); 333 + 334 + /* 335 + * The old root should be unmountable (MNT_LOCKED was 336 + * transferred to the clone). If MNT_LOCKED wasn't 337 + * cleared, this would fail with EINVAL. 338 + */ 339 + ASSERT_EQ(umount2(".", MNT_DETACH), 0); 340 + 341 + /* Verify "/" is still the clone after detaching old root */ 342 + root_id = get_unique_mnt_id("/"); 343 + ASSERT_EQ(root_id, clone_id); 344 + } 345 + 346 + /* 347 + * Test containment invariant: after mount-beneath rootfs in a user+mount 348 + * namespace, the new root must be MNT_LOCKED. The lock transfer from the 349 + * old root preserves containment -- the process cannot unmount the new root 350 + * to escape the namespace. 
351 + */ 352 + TEST_F(move_mount, beneath_rootfs_locked_containment) 353 + { 354 + int fd_tree, ret; 355 + uint64_t clone_id, root_id; 356 + 357 + ASSERT_EQ(setup_userns(), 0); 358 + 359 + ASSERT_EQ(mount("", "/", NULL, MS_REC | MS_PRIVATE, NULL), 0); 360 + 361 + /* Sanity: rootfs must be locked in the new userns */ 362 + ASSERT_EQ(umount2("/", MNT_DETACH), -1); 363 + ASSERT_EQ(errno, EINVAL); 364 + 365 + fd_tree = sys_open_tree(AT_FDCWD, "/", 366 + OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC | 367 + AT_RECURSIVE); 368 + ASSERT_GE(fd_tree, 0); 369 + 370 + clone_id = get_unique_mnt_id_fd(fd_tree); 371 + ASSERT_NE(clone_id, 0); 372 + 373 + ASSERT_EQ(fchdir(fd_tree), 0); 374 + 375 + ret = sys_move_mount(fd_tree, "", AT_FDCWD, "/", 376 + MOVE_MOUNT_F_EMPTY_PATH | 377 + MOVE_MOUNT_BENEATH); 378 + ASSERT_EQ(ret, 0); 379 + 380 + close(fd_tree); 381 + 382 + ASSERT_EQ(chroot("."), 0); 383 + 384 + root_id = get_unique_mnt_id("/"); 385 + ASSERT_EQ(root_id, clone_id); 386 + 387 + /* Detach old root (MNT_LOCKED was cleared from it) */ 388 + ASSERT_EQ(umount2(".", MNT_DETACH), 0); 389 + 390 + /* Verify "/" is still the clone after detaching old root */ 391 + root_id = get_unique_mnt_id("/"); 392 + ASSERT_EQ(root_id, clone_id); 393 + 394 + /* 395 + * The new root must be locked (MNT_LOCKED was transferred 396 + * from the old root). Attempting to unmount it must fail 397 + * with EINVAL, preserving the containment invariant. 398 + */ 399 + ASSERT_EQ(umount2("/", MNT_DETACH), -1); 400 + ASSERT_EQ(errno, EINVAL); 401 + } 402 + 403 + /* 404 + * Test MNT_LOCKED transfer when mounting beneath a non-rootfs locked mount. 405 + * Mounts created before unshare(CLONE_NEWUSER | CLONE_NEWNS) become locked 406 + * in the new namespace. Mount-beneath transfers the lock from the displaced 407 + * mount to the new mount, so the displaced mount can be unmounted. 
408 + */ 409 + TEST_F(move_mount, beneath_non_rootfs_locked_transfer) 410 + { 411 + int mnt_fd, ret; 412 + uint64_t mnt_new_id, mnt_visible_id; 413 + 414 + ASSERT_EQ(setup_locked_overmount(), 0); 415 + 416 + mnt_fd = create_detached_tmpfs(); 417 + ASSERT_GE(mnt_fd, 0); 418 + 419 + mnt_new_id = get_unique_mnt_id_fd(mnt_fd); 420 + ASSERT_NE(mnt_new_id, 0); 421 + 422 + /* Move mount beneath B (which is locked) */ 423 + ret = sys_move_mount(mnt_fd, "", AT_FDCWD, "/mnt_dir", 424 + MOVE_MOUNT_F_EMPTY_PATH | 425 + MOVE_MOUNT_BENEATH); 426 + ASSERT_EQ(ret, 0); 427 + 428 + close(mnt_fd); 429 + 430 + /* 431 + * B should now be unmountable (MNT_LOCKED was transferred 432 + * to the new mount beneath it). If MNT_LOCKED wasn't 433 + * cleared from B, this would fail with EINVAL. 434 + */ 435 + ASSERT_EQ(umount2("/mnt_dir", MNT_DETACH), 0); 436 + 437 + /* Verify the new mount is now visible */ 438 + mnt_visible_id = get_unique_mnt_id("/mnt_dir"); 439 + ASSERT_EQ(mnt_visible_id, mnt_new_id); 440 + } 441 + 442 + /* 443 + * Test MNT_LOCKED containment when mounting beneath a non-rootfs mount 444 + * that was locked during unshare(CLONE_NEWUSER | CLONE_NEWNS). 445 + * Mounts created before unshare become locked in the new namespace. 446 + * Mount-beneath transfers the lock, preserving containment: the new 447 + * mount cannot be unmounted, but the displaced mount can. 448 + */ 449 + TEST_F(move_mount, beneath_non_rootfs_locked_containment) 450 + { 451 + int mnt_fd, ret; 452 + uint64_t mnt_new_id, mnt_visible_id; 453 + 454 + ASSERT_EQ(setup_locked_overmount(), 0); 455 + 456 + mnt_fd = create_detached_tmpfs(); 457 + ASSERT_GE(mnt_fd, 0); 458 + 459 + mnt_new_id = get_unique_mnt_id_fd(mnt_fd); 460 + ASSERT_NE(mnt_new_id, 0); 461 + 462 + /* 463 + * Move new tmpfs beneath B at /mnt_dir. 464 + * Stack becomes: A -> new -> B 465 + * Lock transfers from B to new. 
466 + */ 467 + ret = sys_move_mount(mnt_fd, "", AT_FDCWD, "/mnt_dir", 468 + MOVE_MOUNT_F_EMPTY_PATH | 469 + MOVE_MOUNT_BENEATH); 470 + ASSERT_EQ(ret, 0); 471 + 472 + close(mnt_fd); 473 + 474 + /* 475 + * B lost MNT_LOCKED -- unmounting it must succeed. 476 + * This reveals the new mount at /mnt_dir. 477 + */ 478 + ASSERT_EQ(umount2("/mnt_dir", MNT_DETACH), 0); 479 + 480 + /* Verify the new mount is now visible */ 481 + mnt_visible_id = get_unique_mnt_id("/mnt_dir"); 482 + ASSERT_EQ(mnt_visible_id, mnt_new_id); 483 + 484 + /* 485 + * The new mount gained MNT_LOCKED -- unmounting it must 486 + * fail with EINVAL, preserving the containment invariant. 487 + */ 488 + ASSERT_EQ(umount2("/mnt_dir", MNT_DETACH), -1); 489 + ASSERT_EQ(errno, EINVAL); 490 + } 491 + 492 + TEST_HARNESS_MAIN

+1 -1

tools/testing/selftests/filesystems/open_tree_ns/Makefile

··· 1 1 # SPDX-License-Identifier: GPL-2.0 2 2 TEST_GEN_PROGS := open_tree_ns_test 3 3 4 - CFLAGS := -Wall -Werror -g $(KHDR_INCLUDES) 4 + CFLAGS += -Wall -O0 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES) 5 5 LDLIBS := -lcap 6 6 7 7 include ../../lib.mk

+10 -33

tools/testing/selftests/filesystems/open_tree_ns/open_tree_ns_test.c

··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 /* 3 + * Copyright (c) 2026 Christian Brauner <brauner@kernel.org> 4 + * 3 5 * Test for OPEN_TREE_NAMESPACE flag. 4 6 * 5 7 * Test that open_tree() with OPEN_TREE_NAMESPACE creates a new mount ··· 52 50 return ret; 53 51 } 54 52 55 - #define STATMOUNT_BUFSIZE (1 << 15) 56 - 57 - static struct statmount *statmount_alloc(uint64_t mnt_id, uint64_t mnt_ns_id, uint64_t mask) 58 - { 59 - struct statmount *buf; 60 - size_t bufsize = STATMOUNT_BUFSIZE; 61 - int ret; 62 - 63 - for (;;) { 64 - buf = malloc(bufsize); 65 - if (!buf) 66 - return NULL; 67 - 68 - ret = statmount(mnt_id, mnt_ns_id, mask, buf, bufsize, 0); 69 - if (ret == 0) 70 - return buf; 71 - 72 - free(buf); 73 - if (errno != EOVERFLOW) 74 - return NULL; 75 - 76 - bufsize <<= 1; 77 - } 78 - } 79 - 80 53 static void log_mount(struct __test_metadata *_metadata, struct statmount *sm) 81 54 { 82 55 const char *fs_type = ""; ··· 92 115 STATMOUNT_MNT_BASIC | 93 116 STATMOUNT_FS_TYPE | 94 117 STATMOUNT_MNT_ROOT | 95 - STATMOUNT_MNT_POINT); 118 + STATMOUNT_MNT_POINT, 0); 96 119 if (!sm) { 97 120 TH_LOG(" [%zd] mnt_id %llu: statmount failed: %s", 98 121 i, (unsigned long long)list[i], strerror(errno)); ··· 198 221 SKIP(return, "open_tree() syscall not supported"); 199 222 200 223 /* Check if statmount/listmount are supported */ 201 - ret = statmount(0, 0, 0, NULL, 0, 0); 224 + ret = statmount(0, 0, 0, 0, NULL, 0, 0); 202 225 if (ret == -1 && errno == ENOSYS) 203 226 SKIP(return, "statmount() syscall not supported"); 204 227 ··· 317 340 ASSERT_GE(nr_mounts, 1); 318 341 319 342 /* Get info about the root mount (the bind mount, rootfs is hidden) */ 320 - ret = statmount(list[0], new_ns_id, STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0); 343 + ret = statmount(list[0], new_ns_id, 0, STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0); 321 344 ASSERT_EQ(ret, 0); 322 345 323 346 ASSERT_NE(sm.mnt_id, sm.mnt_parent_id); ··· 429 452 SKIP(return, "open_tree() syscall not supported"); 430 453 431 454 
/* Check if statmount/listmount are supported */ 432 - ret = statmount(0, 0, 0, NULL, 0, 0); 455 + ret = statmount(0, 0, 0, 0, NULL, 0, 0); 433 456 if (ret == -1 && errno == ENOSYS) 434 457 SKIP(return, "statmount() syscall not supported"); 435 458 } ··· 723 746 const char *mnt_point; 724 747 725 748 sm = statmount_alloc(list[i], new_ns_id, 726 - STATMOUNT_MNT_POINT); 749 + STATMOUNT_MNT_POINT, 0); 727 750 if (!sm) 728 751 _exit(11); 729 752 ··· 840 863 const char *mnt_point; 841 864 842 865 sm = statmount_alloc(list[i], new_ns_id, 843 - STATMOUNT_MNT_POINT); 866 + STATMOUNT_MNT_POINT, 0); 844 867 if (!sm) 845 868 _exit(11); 846 869 ··· 881 904 ASSERT_FALSE(true) TH_LOG("setns into new namespace failed"); 882 905 break; 883 906 case 7: 884 - ASSERT_FALSE(true) TH_LOG("umount succeeded but should have failed with EINVAL"); 907 + ASSERT_FALSE(true) TH_LOG("umount failed but should have succeeded"); 885 908 break; 886 909 case 9: 887 910 ASSERT_FALSE(true) TH_LOG("listmount failed"); ··· 980 1003 struct statmount *sm; 981 1004 const char *mnt_point; 982 1005 983 - sm = statmount_alloc(list[i], new_ns_id, STATMOUNT_MNT_POINT); 1006 + sm = statmount_alloc(list[i], new_ns_id, STATMOUNT_MNT_POINT, 0); 984 1007 ASSERT_NE(sm, NULL) { 985 1008 TH_LOG("statmount_alloc failed for mnt_id %llu", 986 1009 (unsigned long long)list[i]);

+51

tools/testing/selftests/filesystems/statmount/statmount.h

··· 3 3 #ifndef __STATMOUNT_H 4 4 #define __STATMOUNT_H 5 5 6 + #include <errno.h> 6 7 #include <stdint.h> 8 + #include <stdlib.h> 7 9 #include <linux/mount.h> 8 10 #include <asm/unistd.h> 11 + 12 + #define STATMOUNT_BUFSIZE (1 << 15) 9 13 10 14 #ifndef __NR_statmount 11 15 #if defined __alpha__ ··· 86 82 } 87 83 88 84 return syscall(__NR_listmount, &req, list, num, flags); 85 + } 86 + 87 + static inline struct statmount *statmount_alloc(uint64_t mnt_id, uint64_t mnt_ns_id, 88 + uint64_t mask, unsigned int flags) 89 + { 90 + struct statmount *buf; 91 + size_t bufsize = STATMOUNT_BUFSIZE; 92 + int ret; 93 + 94 + for (;;) { 95 + buf = malloc(bufsize); 96 + if (!buf) 97 + return NULL; 98 + 99 + ret = statmount(mnt_id, mnt_ns_id, 0, mask, buf, bufsize, flags); 100 + if (ret == 0) 101 + return buf; 102 + 103 + free(buf); 104 + if (errno != EOVERFLOW) 105 + return NULL; 106 + 107 + bufsize <<= 1; 108 + } 109 + } 110 + 111 + static inline struct statmount *statmount_alloc_by_fd(int fd, uint64_t mask) 112 + { 113 + struct statmount *buf; 114 + size_t bufsize = STATMOUNT_BUFSIZE; 115 + int ret; 116 + 117 + for (;;) { 118 + buf = malloc(bufsize); 119 + if (!buf) 120 + return NULL; 121 + 122 + ret = statmount(0, 0, fd, mask, buf, bufsize, STATMOUNT_BY_FD); 123 + if (ret == 0) 124 + return buf; 125 + 126 + free(buf); 127 + if (errno != EOVERFLOW) 128 + return NULL; 129 + 130 + bufsize <<= 1; 131 + } 89 132 } 90 133 91 134 #endif /* __STATMOUNT_H */

+3 -42

tools/testing/selftests/filesystems/statmount/statmount_test.c

··· 33 33 "sysv", "tmpfs", "tracefs", "ubifs", "udf", "ufs", "v7", "vboxsf", 34 34 "vfat", "virtiofs", "vxfs", "xenfs", "xfs", "zonefs", NULL }; 35 35 36 - static struct statmount *statmount_alloc(uint64_t mnt_id, int fd, uint64_t mask, unsigned int flags) 37 - { 38 - size_t bufsize = 1 << 15; 39 - struct statmount *buf = NULL, *tmp = NULL; 40 - int tofree = 0; 41 - int ret; 42 - 43 - if (flags & STATMOUNT_BY_FD && fd < 0) 44 - return NULL; 45 - 46 - tmp = alloca(bufsize); 47 - 48 - for (;;) { 49 - if (flags & STATMOUNT_BY_FD) 50 - ret = statmount(0, 0, (uint32_t) fd, mask, tmp, bufsize, flags); 51 - else 52 - ret = statmount(mnt_id, 0, 0, mask, tmp, bufsize, flags); 53 - 54 - if (ret != -1) 55 - break; 56 - if (tofree) 57 - free(tmp); 58 - if (errno != EOVERFLOW) 59 - return NULL; 60 - bufsize <<= 1; 61 - tofree = 1; 62 - tmp = malloc(bufsize); 63 - if (!tmp) 64 - return NULL; 65 - } 66 - buf = malloc(tmp->size); 67 - if (buf) 68 - memcpy(buf, tmp, tmp->size); 69 - if (tofree) 70 - free(tmp); 71 - 72 - return buf; 73 - } 74 - 75 36 static void write_file(const char *path, const char *val) 76 37 { 77 38 int fd = open(path, O_WRONLY); ··· 676 715 goto err_fd; 677 716 } 678 717 679 - sm = statmount_alloc(0, fd, STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT, STATMOUNT_BY_FD); 718 + sm = statmount_alloc_by_fd(fd, STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT); 680 719 if (!sm) { 681 720 ksft_test_result_fail("statmount by fd failed: %s\n", strerror(errno)); 682 721 goto err_chroot; ··· 711 750 } 712 751 713 752 free(sm); 714 - sm = statmount_alloc(0, fd, STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT, STATMOUNT_BY_FD); 753 + sm = statmount_alloc_by_fd(fd, STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT); 715 754 if (!sm) { 716 755 ksft_test_result_fail("statmount by fd failed: %s\n", strerror(errno)); 717 756 goto err_fd; ··· 805 844 goto err_fd; 806 845 } 807 846 808 - sm = statmount_alloc(0, fd, STATMOUNT_MNT_POINT | STATMOUNT_MNT_ROOT, STATMOUNT_BY_FD); 847 + sm = 
statmount_alloc_by_fd(fd, STATMOUNT_MNT_POINT | STATMOUNT_MNT_ROOT); 809 848 if (!sm) { 810 849 ksft_test_result_fail("statmount by fd unmounted: %s\n", 811 850 strerror(errno));

-25

tools/testing/selftests/filesystems/statmount/statmount_test_ns.c

··· 34 34 ksft_test_result_skip("%s\n", testname); 35 35 } 36 36 37 - static inline int wait_for_pid(pid_t pid) 38 - { 39 - int status, ret; 40 - 41 - again: 42 - ret = waitpid(pid, &status, 0); 43 - if (ret == -1) { 44 - if (errno == EINTR) 45 - goto again; 46 - 47 - ksft_print_msg("waitpid returned -1, errno=%d\n", errno); 48 - return -1; 49 - } 50 - 51 - if (!WIFEXITED(status)) { 52 - ksft_print_msg( 53 - "waitpid !WIFEXITED, WIFSIGNALED=%d, WTERMSIG=%d\n", 54 - WIFSIGNALED(status), WTERMSIG(status)); 55 - return -1; 56 - } 57 - 58 - ret = WEXITSTATUS(status); 59 - return ret; 60 - } 61 - 62 37 static int get_mnt_ns_id(const char *mnt_ns, uint64_t *mnt_ns_id) 63 38 { 64 39 int fd = open(mnt_ns, O_RDONLY);

+2 -2

tools/testing/selftests/filesystems/utils.c

··· 158 158 _exit(0); 159 159 } 160 160 161 - static int wait_for_pid(pid_t pid) 161 + int wait_for_pid(pid_t pid) 162 162 { 163 163 int status, ret; 164 164 ··· 450 450 return fret; 451 451 } 452 452 453 - static int write_file(const char *path, const char *val) 453 + int write_file(const char *path, const char *val) 454 454 { 455 455 int fd = open(path, O_WRONLY); 456 456 size_t len = strlen(val);

+2

tools/testing/selftests/filesystems/utils.h

··· 44 44 return true; 45 45 } 46 46 47 + extern int wait_for_pid(pid_t pid); 48 + extern int write_file(const char *path, const char *val); 47 49 extern uint64_t get_unique_mnt_id(const char *path); 48 50 49 51 #endif /* __IDMAP_UTILS_H */

-1

tools/testing/selftests/namespaces/listns_efault_test.c

··· 19 19 #include <sys/wait.h> 20 20 #include <unistd.h> 21 21 #include "../kselftest_harness.h" 22 - #include "../filesystems/utils.h" 23 22 #include "../pidfd/pidfd.h" 24 23 #include "wrappers.h" 25 24

Configure Feed