Merge tag 'vfs-7.0-rc1.nullfs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

+11 -13

Documentation/filesystems/ramfs-rootfs-initramfs.rst

··· 76 76 --------------- 77 77 78 78 Rootfs is a special instance of ramfs (or tmpfs, if that's enabled), which is 79 - always present in 2.6 systems. You can't unmount rootfs for approximately the 80 - same reason you can't kill the init process; rather than having special code 81 - to check for and handle an empty list, it's smaller and simpler for the kernel 82 - to just make sure certain lists can't become empty. 79 + always present in Linux systems. The kernel uses an immutable empty filesystem 80 + called nullfs as the true root of the VFS hierarchy, with the mutable rootfs 81 + (tmpfs/ramfs) mounted on top of it. This allows pivot_root() and unmounting 82 + of the initramfs to work normally. 83 83 84 84 Most systems just mount another filesystem over rootfs and ignore it. The 85 85 amount of space an empty instance of ramfs takes up is tiny. ··· 121 121 program. See the switch_root utility, below.) 122 122 123 123 - When switching another root device, initrd would pivot_root and then 124 - umount the ramdisk. But initramfs is rootfs: you can neither pivot_root 125 - rootfs, nor unmount it. Instead delete everything out of rootfs to 126 - free up the space (find -xdev / -exec rm '{}' ';'), overmount rootfs 127 - with the new root (cd /newmount; mount --move . /; chroot .), attach 128 - stdin/stdout/stderr to the new /dev/console, and exec the new init. 124 + umount the ramdisk. With nullfs as the true root, pivot_root() works 125 + normally from the initramfs. Userspace can simply do:: 129 126 130 - Since this is a remarkably persnickety process (and involves deleting 131 - commands before you can run them), the klibc package introduced a helper 132 - program (utils/run_init.c) to do all this for you. Most other packages 133 - (such as busybox) have named this command "switch_root". 127 + chdir(new_root); 128 + pivot_root(".", "."); 129 + umount2(".", MNT_DETACH); 130 + 131 + This is the preferred method for switching root filesystems. 134 132 135 133 Populating initramfs: 136 134 ---------------------

+1 -1

fs/Makefile

··· 16 16 stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \ 17 17 fs_dirent.o fs_context.o fs_parser.o fsopen.o init.o \ 18 18 kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o \ 19 - file_attr.o fserror.o 19 + file_attr.o fserror.o nullfs.o 20 20 21 21 obj-$(CONFIG_BUFFER_HEAD) += buffer.o mpage.o 22 22 obj-$(CONFIG_PROC_FS) += proc_namespace.o

+17

fs/init.c

··· 13 13 #include <linux/security.h> 14 14 #include "internal.h" 15 15 16 + int __init init_pivot_root(const char *new_root, const char *put_old) 17 + { 18 + struct path new_path __free(path_put) = {}; 19 + struct path old_path __free(path_put) = {}; 20 + int ret; 21 + 22 + ret = kern_path(new_root, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new_path); 23 + if (ret) 24 + return ret; 25 + 26 + ret = kern_path(put_old, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old_path); 27 + if (ret) 28 + return ret; 29 + 30 + return path_pivot_root(&new_path, &old_path); 31 + } 32 + 16 33 int __init init_mount(const char *dev_name, const char *dir_name, 17 34 const char *type_page, unsigned long flags, void *data_page) 18 35 {

+1

fs/internal.h

··· 90 90 int path_mount(const char *dev_name, const struct path *path, 91 91 const char *type_page, unsigned long flags, void *data_page); 92 92 int path_umount(const struct path *path, int flags); 93 + int path_pivot_root(struct path *new, struct path *old); 93 94 94 95 int show_path(struct seq_file *m, struct dentry *root); 95 96

+1

fs/mount.h

··· 5 5 #include <linux/ns_common.h> 6 6 #include <linux/fs_pin.h> 7 7 8 + extern struct file_system_type nullfs_fs_type; 8 9 extern struct list_head notify_list; 9 10 10 11 struct mnt_namespace {

+102 -57

fs/namespace.c

··· 221 221 int res; 222 222 223 223 xa_lock(&mnt_id_xa); 224 - res = __xa_alloc(&mnt_id_xa, &mnt->mnt_id, mnt, XA_LIMIT(1, INT_MAX), GFP_KERNEL); 224 + res = __xa_alloc(&mnt_id_xa, &mnt->mnt_id, mnt, xa_limit_31b, GFP_KERNEL); 225 225 if (!res) 226 226 mnt->mnt_id_unique = ++mnt_id_ctr; 227 227 xa_unlock(&mnt_id_xa); ··· 4498 4498 } 4499 4499 EXPORT_SYMBOL(path_is_under); 4500 4500 4501 - /* 4502 - * pivot_root Semantics: 4503 - * Moves the root file system of the current process to the directory put_old, 4504 - * makes new_root as the new root file system of the current process, and sets 4505 - * root/cwd of all processes which had them on the current root to new_root. 4506 - * 4507 - * Restrictions: 4508 - * The new_root and put_old must be directories, and must not be on the 4509 - * same file system as the current process root. The put_old must be 4510 - * underneath new_root, i.e. adding a non-zero number of /.. to the string 4511 - * pointed to by put_old must yield the same directory as new_root. No other 4512 - * file system may be mounted on put_old. After all, new_root is a mountpoint. 4513 - * 4514 - * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem. 4515 - * See Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives 4516 - * in this situation. 4517 - * 4518 - * Notes: 4519 - * - we don't move root/cwd if they are not at the root (reason: if something 4520 - * cared enough to change them, it's probably wrong to force them elsewhere) 4521 - * - it's okay to pick a root that isn't the root of a file system, e.g. 4522 - * /nfs/my_root where /nfs is the mount point. It must be a mountpoint, 4523 - * though, so you may need to say mount --bind /nfs/my_root /nfs/my_root 4524 - * first. 4525 - */ 4526 - SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, 4527 - const char __user *, put_old) 4501 + int path_pivot_root(struct path *new, struct path *old) 4528 4502 { 4529 - struct path new __free(path_put) = {}; 4530 - struct path old __free(path_put) = {}; 4531 4503 struct path root __free(path_put) = {}; 4532 4504 struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent; 4533 4505 int error; ··· 4507 4535 if (!may_mount()) 4508 4536 return -EPERM; 4509 4537 4510 - error = user_path_at(AT_FDCWD, new_root, 4511 - LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new); 4512 - if (error) 4513 - return error; 4514 - 4515 - error = user_path_at(AT_FDCWD, put_old, 4516 - LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old); 4517 - if (error) 4518 - return error; 4519 - 4520 - error = security_sb_pivotroot(&old, &new); 4538 + error = security_sb_pivotroot(old, new); 4521 4539 if (error) 4522 4540 return error; 4523 4541 4524 4542 get_fs_root(current->fs, &root); 4525 4543 4526 - LOCK_MOUNT(old_mp, &old); 4544 + LOCK_MOUNT(old_mp, old); 4527 4545 old_mnt = old_mp.parent; 4528 4546 if (IS_ERR(old_mnt)) 4529 4547 return PTR_ERR(old_mnt); 4530 4548 4531 - new_mnt = real_mount(new.mnt); 4549 + new_mnt = real_mount(new->mnt); 4532 4550 root_mnt = real_mount(root.mnt); 4533 4551 ex_parent = new_mnt->mnt_parent; 4534 4552 root_parent = root_mnt->mnt_parent; ··· 4530 4568 return -EINVAL; 4531 4569 if (new_mnt->mnt.mnt_flags & MNT_LOCKED) 4532 4570 return -EINVAL; 4533 - if (d_unlinked(new.dentry)) 4571 + if (d_unlinked(new->dentry)) 4534 4572 return -ENOENT; 4535 4573 if (new_mnt == root_mnt || old_mnt == root_mnt) 4536 4574 return -EBUSY; /* loop, on the same file system */ ··· 4538 4576 return -EINVAL; /* not a mountpoint */ 4539 4577 if (!mnt_has_parent(root_mnt)) 4540 4578 return -EINVAL; /* absolute root */ 4541 - if (!path_mounted(&new)) 4579 + if (!path_mounted(new)) 4542 4580 return -EINVAL; /* not a mountpoint */ 4543 4581 if (!mnt_has_parent(new_mnt)) 4544 4582 return -EINVAL; /* absolute root */ 4545 4583 /* make sure we can reach put_old from new_root */ 4546 - if (!is_path_reachable(old_mnt, old_mp.mp->m_dentry, &new)) 4584 + if (!is_path_reachable(old_mnt, old_mp.mp->m_dentry, new)) 4547 4585 return -EINVAL; 4548 4586 /* make certain new is below the root */ 4549 - if (!is_path_reachable(new_mnt, new.dentry, &root)) 4587 + if (!is_path_reachable(new_mnt, new->dentry, &root)) 4550 4588 return -EINVAL; 4551 4589 lock_mount_hash(); 4552 4590 umount_mnt(new_mnt); ··· 4565 4603 unlock_mount_hash(); 4566 4604 mnt_notify_add(root_mnt); 4567 4605 mnt_notify_add(new_mnt); 4568 - chroot_fs_refs(&root, &new); 4606 + chroot_fs_refs(&root, new); 4569 4607 return 0; 4608 + } 4609 + 4610 + /* 4611 + * pivot_root Semantics: 4612 + * Moves the root file system of the current process to the directory put_old, 4613 + * makes new_root as the new root file system of the current process, and sets 4614 + * root/cwd of all processes which had them on the current root to new_root. 4615 + * 4616 + * Restrictions: 4617 + * The new_root and put_old must be directories, and must not be on the 4618 + * same file system as the current process root. The put_old must be 4619 + * underneath new_root, i.e. adding a non-zero number of /.. to the string 4620 + * pointed to by put_old must yield the same directory as new_root. No other 4621 + * file system may be mounted on put_old. After all, new_root is a mountpoint. 4622 + * 4623 + * The immutable nullfs filesystem is mounted as the true root of the VFS 4624 + * hierarchy. The mutable rootfs (tmpfs/ramfs) is layered on top of this, 4625 + * allowing pivot_root() to work normally from initramfs. 4626 + * 4627 + * Notes: 4628 + * - we don't move root/cwd if they are not at the root (reason: if something 4629 + * cared enough to change them, it's probably wrong to force them elsewhere) 4630 + * - it's okay to pick a root that isn't the root of a file system, e.g. 4631 + * /nfs/my_root where /nfs is the mount point. It must be a mountpoint, 4632 + * though, so you may need to say mount --bind /nfs/my_root /nfs/my_root 4633 + * first. 4634 + */ 4635 + SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, 4636 + const char __user *, put_old) 4637 + { 4638 + struct path new __free(path_put) = {}; 4639 + struct path old __free(path_put) = {}; 4640 + int error; 4641 + 4642 + error = user_path_at(AT_FDCWD, new_root, 4643 + LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new); 4644 + if (error) 4645 + return error; 4646 + 4647 + error = user_path_at(AT_FDCWD, put_old, 4648 + LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old); 4649 + if (error) 4650 + return error; 4651 + 4652 + return path_pivot_root(&new, &old); 4570 4653 } 4571 4654 4572 4655 static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt) ··· 5976 5969 5977 5970 static void __init init_mount_tree(void) 5978 5971 { 5979 - struct vfsmount *mnt; 5980 - struct mount *m; 5972 + struct vfsmount *mnt, *nullfs_mnt; 5973 + struct mount *mnt_root; 5981 5974 struct path root; 5975 + 5976 + /* 5977 + * We create two mounts: 5978 + * 5979 + * (1) nullfs with mount id 1 5980 + * (2) mutable rootfs with mount id 2 5981 + * 5982 + * with (2) mounted on top of (1). 5983 + */ 5984 + nullfs_mnt = vfs_kern_mount(&nullfs_fs_type, 0, "nullfs", NULL); 5985 + if (IS_ERR(nullfs_mnt)) 5986 + panic("VFS: Failed to create nullfs"); 5982 5987 5983 5988 mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", initramfs_options); 5984 5989 if (IS_ERR(mnt)) 5985 5990 panic("Can't create rootfs"); 5986 5991 5987 - m = real_mount(mnt); 5988 - init_mnt_ns.root = m; 5989 - init_mnt_ns.nr_mounts = 1; 5990 - mnt_add_to_ns(&init_mnt_ns, m); 5992 + VFS_WARN_ON_ONCE(real_mount(nullfs_mnt)->mnt_id != 1); 5993 + VFS_WARN_ON_ONCE(real_mount(mnt)->mnt_id != 2); 5994 + 5995 + /* The namespace root is the nullfs mnt. */ 5996 + mnt_root = real_mount(nullfs_mnt); 5997 + init_mnt_ns.root = mnt_root; 5998 + 5999 + /* Mount mutable rootfs on top of nullfs. */ 6000 + root.mnt = nullfs_mnt; 6001 + root.dentry = nullfs_mnt->mnt_root; 6002 + 6003 + LOCK_MOUNT_EXACT(mp, &root); 6004 + if (unlikely(IS_ERR(mp.parent))) 6005 + panic("VFS: Failed to mount rootfs on nullfs"); 6006 + scoped_guard(mount_writer) 6007 + attach_mnt(real_mount(mnt), mp.parent, mp.mp); 6008 + 6009 + pr_info("VFS: Finished mounting rootfs on nullfs\n"); 6010 + 6011 + /* 6012 + * We've dropped all locks here but that's fine. Not just are we 6013 + * the only task that's running, there's no other mount 6014 + * namespace in existence and the initial mount namespace is 6015 + * completely empty until we add the mounts we just created. 6016 + */ 6017 + for (struct mount *p = mnt_root; p; p = next_mnt(p, mnt_root)) { 6018 + mnt_add_to_ns(&init_mnt_ns, p); 6019 + init_mnt_ns.nr_mounts++; 6020 + } 6021 + 5991 6022 init_task.nsproxy->mnt_ns = &init_mnt_ns; 5992 6023 get_mnt_ns(&init_mnt_ns); 5993 6024 5994 - root.mnt = mnt; 5995 - root.dentry = mnt->mnt_root; 5996 - 6025 + /* The root and pwd always point to the mutable rootfs. */ 6026 + root.mnt = mnt; 6027 + root.dentry = mnt->mnt_root; 5997 6028 set_fs_pwd(current->fs, &root); 5998 6029 set_fs_root(current->fs, &root); 5999 6030

+70

fs/nullfs.c

··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* Copyright (c) 2026 Christian Brauner <brauner@kernel.org> */ 3 + #include <linux/fs/super_types.h> 4 + #include <linux/fs_context.h> 5 + #include <linux/magic.h> 6 + 7 + static const struct super_operations nullfs_super_operations = { 8 + .statfs = simple_statfs, 9 + }; 10 + 11 + static int nullfs_fs_fill_super(struct super_block *s, struct fs_context *fc) 12 + { 13 + struct inode *inode; 14 + 15 + s->s_maxbytes = MAX_LFS_FILESIZE; 16 + s->s_blocksize = PAGE_SIZE; 17 + s->s_blocksize_bits = PAGE_SHIFT; 18 + s->s_magic = NULL_FS_MAGIC; 19 + s->s_op = &nullfs_super_operations; 20 + s->s_export_op = NULL; 21 + s->s_xattr = NULL; 22 + s->s_time_gran = 1; 23 + s->s_d_flags = 0; 24 + 25 + inode = new_inode(s); 26 + if (!inode) 27 + return -ENOMEM; 28 + 29 + /* nullfs is permanently empty... */ 30 + make_empty_dir_inode(inode); 31 + simple_inode_init_ts(inode); 32 + inode->i_ino = 1; 33 + /* ... and immutable. */ 34 + inode->i_flags |= S_IMMUTABLE; 35 + 36 + s->s_root = d_make_root(inode); 37 + if (!s->s_root) 38 + return -ENOMEM; 39 + 40 + return 0; 41 + } 42 + 43 + /* 44 + * For now this is a single global instance. If needed we can make it 45 + * mountable by userspace at which point we will need to make it 46 + * multi-instance. 47 + */ 48 + static int nullfs_fs_get_tree(struct fs_context *fc) 49 + { 50 + return get_tree_single(fc, nullfs_fs_fill_super); 51 + } 52 + 53 + static const struct fs_context_operations nullfs_fs_context_ops = { 54 + .get_tree = nullfs_fs_get_tree, 55 + }; 56 + 57 + static int nullfs_init_fs_context(struct fs_context *fc) 58 + { 59 + fc->ops = &nullfs_fs_context_ops; 60 + fc->global = true; 61 + fc->sb_flags = SB_NOUSER; 62 + fc->s_iflags = SB_I_NOEXEC | SB_I_NODEV; 63 + return 0; 64 + } 65 + 66 + struct file_system_type nullfs_fs_type = { 67 + .name = "nullfs", 68 + .init_fs_context = nullfs_init_fs_context, 69 + .kill_sb = kill_anon_super, 70 + };

+1

include/linux/init_syscalls.h

··· 17 17 int __init init_rmdir(const char *pathname); 18 18 int __init init_utimes(char *filename, struct timespec64 *ts); 19 19 int __init init_dup(struct file *file); 20 + int __init init_pivot_root(const char *new_root, const char *put_old);

+1

include/uapi/linux/magic.h

··· 104 104 #define SECRETMEM_MAGIC 0x5345434d /* "SECM" */ 105 105 #define PID_FS_MAGIC 0x50494446 /* "PIDF" */ 106 106 #define GUEST_MEMFD_MAGIC 0x474d454d /* "GMEM" */ 107 + #define NULL_FS_MAGIC 0x4E554C4C /* "NULL" */ 107 108 108 109 #endif /* __LINUX_MAGIC_H__ */

+10 -2

init/do_mounts.c

··· 483 483 wait_for_root(saved_root_name); 484 484 mount_root(saved_root_name); 485 485 devtmpfs_mount(); 486 - init_mount(".", "/", NULL, MS_MOVE, NULL); 487 - init_chroot("."); 486 + 487 + if (init_pivot_root(".", ".")) { 488 + pr_err("VFS: Failed to pivot into new rootfs\n"); 489 + return; 490 + } 491 + if (init_umount(".", MNT_DETACH)) { 492 + pr_err("VFS: Failed to unmount old rootfs\n"); 493 + return; 494 + } 495 + pr_info("VFS: Pivoted into new rootfs\n"); 488 496 } 489 497 490 498 static bool is_tmpfs;

Configure Feed

Configure Feed