Merge tag 'ovl-update-5.13' of git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/vfs

+12 -14

Documentation/filesystems/overlayfs.rst

··· 40 40 underlying filesystem, the same compliant behavior could be achieved 41 41 with the "xino" feature. The "xino" feature composes a unique object 42 42 identifier from the real object st_ino and an underlying fsid index. 43 - 44 - If all underlying filesystems support NFS file handles and export file 45 - handles with 32bit inode number encoding (e.g. ext4), overlay filesystem 46 - will use the high inode number bits for fsid. Even when the underlying 47 - filesystem uses 64bit inode numbers, users can still enable the "xino" 48 - feature with the "-o xino=on" overlay mount option. That is useful for the 49 - case of underlying filesystems like xfs and tmpfs, which use 64bit inode 50 - numbers, but are very unlikely to use the high inode number bits. In case 43 + The "xino" feature uses the high inode number bits for fsid, because the 44 + underlying filesystems rarely use the high inode number bits. In case 51 45 the underlying inode number does overflow into the high xino bits, overlay 52 46 filesystem will fall back to the non xino behavior for that inode. 47 + 48 + The "xino" feature can be enabled with the "-o xino=on" overlay mount option. 49 + If all underlying filesystems support NFS file handles, the value of st_ino 50 + for overlay filesystem objects is not only unique, but also persistent over 51 + the lifetime of the filesystem. The "-o xino=auto" overlay mount option 52 + enables the "xino" feature only if the persistent st_ino requirement is met. 53 53 54 54 The following table summarizes what can be expected in different overlay 55 55 configurations. ··· 66 66 | All layers | Y | Y | Y | Y | Y | Y | Y | Y | 67 67 | on same fs | | | | | | | | | 68 68 +--------------+-----+------+-----+------+--------+--------+--------+-------+ 69 - | Layers not | N | Y | Y | N | N | Y | N | Y | 69 + | Layers not | N | N | Y | N | N | Y | N | Y | 70 70 | on same fs, | | | | | | | | | 71 71 | xino=off | | | | | | | | | 72 72 +--------------+-----+------+-----+------+--------+--------+--------+-------+ 73 73 | xino=on/auto | Y | Y | Y | Y | Y | Y | Y | Y | 74 - | | | | | | | | | | 75 74 +--------------+-----+------+-----+------+--------+--------+--------+-------+ 76 - | xino=on/auto,| N | Y | Y | N | N | Y | N | Y | 75 + | xino=on/auto,| N | N | Y | N | N | Y | N | Y | 77 76 | ino overflow | | | | | | | | | 78 77 +--------------+-----+------+-----+------+--------+--------+--------+-------+ 79 78 80 79 [*] nfsd v3 readdirplus verifies d_ino == i_ino. i_ino is exposed via several 81 80 /proc files, such as /proc/locks and /proc/self/fdinfo/<fd> of an inotify 82 81 file descriptor. 83 - 84 82 85 83 Upper and Lower 86 84 --------------- ··· 459 461 guarantee that the values of st_ino and st_dev returned by stat(2) and the 460 462 value of d_ino returned by readdir(3) will act like on a normal filesystem. 461 463 E.g. the value of st_dev may be different for two objects in the same 462 - overlay filesystem and the value of st_ino for directory objects may not be 464 + overlay filesystem and the value of st_ino for filesystem objects may not be 463 465 persistent and could change even while the overlay filesystem is mounted, as 464 466 summarized in the `Inode properties`_ table above. 465 467 ··· 474 476 475 477 Offline changes, when the overlay is not mounted, are allowed to the 476 478 upper tree. Offline changes to the lower tree are only allowed if the 477 - "metadata only copy up", "inode index", and "redirect_dir" features 479 + "metadata only copy up", "inode index", "xino" and "redirect_dir" features 478 480 have not been used. If the lower tree is modified and any of these 479 481 features has been used, the behavior of the overlay is undefined, 480 482 though it will not result in a crash or deadlock.

+2 -1

fs/overlayfs/copy_up.c

··· 932 932 static int ovl_copy_up_flags(struct dentry *dentry, int flags) 933 933 { 934 934 int err = 0; 935 - const struct cred *old_cred = ovl_override_creds(dentry->d_sb); 935 + const struct cred *old_cred; 936 936 bool disconnected = (dentry->d_flags & DCACHE_DISCONNECTED); 937 937 938 938 /* ··· 943 943 if (WARN_ON(disconnected && d_is_dir(dentry))) 944 944 return -EIO; 945 945 946 + old_cred = ovl_override_creds(dentry->d_sb); 946 947 while (!err) { 947 948 struct dentry *next; 948 949 struct dentry *parent = NULL;

+21

fs/overlayfs/file.c

··· 571 571 remap_flags, op); 572 572 } 573 573 574 + static int ovl_flush(struct file *file, fl_owner_t id) 575 + { 576 + struct fd real; 577 + const struct cred *old_cred; 578 + int err; 579 + 580 + err = ovl_real_fdget(file, &real); 581 + if (err) 582 + return err; 583 + 584 + if (real.file->f_op->flush) { 585 + old_cred = ovl_override_creds(file_inode(file)->i_sb); 586 + err = real.file->f_op->flush(real.file, id); 587 + revert_creds(old_cred); 588 + } 589 + fdput(real); 590 + 591 + return err; 592 + } 593 + 574 594 const struct file_operations ovl_file_operations = { 575 595 .open = ovl_open, 576 596 .release = ovl_release, ··· 601 581 .mmap = ovl_mmap, 602 582 .fallocate = ovl_fallocate, 603 583 .fadvise = ovl_fadvise, 584 + .flush = ovl_flush, 604 585 .splice_read = generic_file_splice_read, 605 586 .splice_write = iter_file_splice_write, 606 587

+7 -11

fs/overlayfs/inode.c

··· 97 97 return err; 98 98 } 99 99 100 - static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid) 100 + static void ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid) 101 101 { 102 102 bool samefs = ovl_same_fs(dentry->d_sb); 103 103 unsigned int xinobits = ovl_xino_bits(dentry->d_sb); ··· 110 110 * which is friendly to du -x. 111 111 */ 112 112 stat->dev = dentry->d_sb->s_dev; 113 - return 0; 113 + return; 114 114 } else if (xinobits) { 115 115 /* 116 116 * All inode numbers of underlying fs should not be using the 117 117 * high xinobits, so we use high xinobits to partition the 118 118 * overlay st_ino address space. The high bits holds the fsid 119 119 * (upper fsid is 0). The lowest xinobit is reserved for mapping 120 - * the non-peresistent inode numbers range in case of overflow. 120 + * the non-persistent inode numbers range in case of overflow. 121 121 * This way all overlay inode numbers are unique and use the 122 122 * overlay st_dev. 123 123 */ 124 124 if (likely(!(stat->ino >> xinoshift))) { 125 125 stat->ino |= ((u64)fsid) << (xinoshift + 1); 126 126 stat->dev = dentry->d_sb->s_dev; 127 - return 0; 127 + return; 128 128 } else if (ovl_xino_warn(dentry->d_sb)) { 129 129 pr_warn_ratelimited("inode number too big (%pd2, ino=%llu, xinobits=%d)\n", 130 130 dentry, stat->ino, xinobits); ··· 153 153 */ 154 154 stat->dev = OVL_FS(dentry->d_sb)->fs[fsid].pseudo_dev; 155 155 } 156 - 157 - return 0; 158 156 } 159 157 160 158 int ovl_getattr(struct user_namespace *mnt_userns, const struct path *path, ··· 251 253 } 252 254 } 253 255 254 - err = ovl_map_dev_ino(dentry, stat, fsid); 255 - if (err) 256 - goto out; 256 + ovl_map_dev_ino(dentry, stat, fsid); 257 257 258 258 /* 259 259 * It's probably not worth it to count subdirs to get the ··· 406 410 if (ovl_is_private_xattr(sb, s)) 407 411 return false; 408 412 409 - /* List all non-trusted xatts */ 413 + /* List all non-trusted xattrs */ 410 414 if (strncmp(s, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) != 0) 411 415 return true; 412 416 ··· 611 615 * stackable i_mutex locks according to stack level of the super 612 616 * block instance. An overlayfs instance can never be in stack 613 617 * depth 0 (there is always a real fs below it). An overlayfs 614 - * inode lock will use the lockdep annotaion ovl_i_mutex_key[depth]. 618 + * inode lock will use the lockdep annotation ovl_i_mutex_key[depth]. 615 619 * 616 620 * For example, here is a snip from /proc/lockdep_chains after 617 621 * dir_iterate of nested overlayfs:

+1

fs/overlayfs/namei.c

··· 919 919 continue; 920 920 921 921 if ((uppermetacopy || d.metacopy) && !ofs->config.metacopy) { 922 + dput(this); 922 923 err = -EPERM; 923 924 pr_warn_ratelimited("refusing to follow metacopy origin for (%pd2)\n", dentry); 924 925 goto out_put;

+33 -4

fs/overlayfs/overlayfs.h

··· 186 186 size_t size) 187 187 { 188 188 const char *name = ovl_xattr(ofs, ox); 189 - return vfs_getxattr(&init_user_ns, dentry, name, value, size); 189 + int err = vfs_getxattr(&init_user_ns, dentry, name, value, size); 190 + int len = (value && err > 0) ? err : 0; 191 + 192 + pr_debug("getxattr(%pd2, \"%s\", \"%*pE\", %zu, 0) = %i\n", 193 + dentry, name, min(len, 48), value, size, err); 194 + return err; 190 195 } 191 196 192 197 static inline int ovl_do_setxattr(struct ovl_fs *ofs, struct dentry *dentry, ··· 324 319 enum ovl_xattr ox, const void *value, size_t size, 325 320 int xerr); 326 321 int ovl_set_impure(struct dentry *dentry, struct dentry *upperdentry); 327 - void ovl_set_flag(unsigned long flag, struct inode *inode); 328 - void ovl_clear_flag(unsigned long flag, struct inode *inode); 329 - bool ovl_test_flag(unsigned long flag, struct inode *inode); 330 322 bool ovl_inuse_trylock(struct dentry *dentry); 331 323 void ovl_inuse_unlock(struct dentry *dentry); 332 324 bool ovl_is_inuse(struct dentry *dentry); ··· 336 334 char *ovl_get_redirect_xattr(struct ovl_fs *ofs, struct dentry *dentry, 337 335 int padding); 338 336 int ovl_sync_status(struct ovl_fs *ofs); 337 + 338 + static inline void ovl_set_flag(unsigned long flag, struct inode *inode) 339 + { 340 + set_bit(flag, &OVL_I(inode)->flags); 341 + } 342 + 343 + static inline void ovl_clear_flag(unsigned long flag, struct inode *inode) 344 + { 345 + clear_bit(flag, &OVL_I(inode)->flags); 346 + } 347 + 348 + static inline bool ovl_test_flag(unsigned long flag, struct inode *inode) 349 + { 350 + return test_bit(flag, &OVL_I(inode)->flags); 351 + } 339 352 340 353 static inline bool ovl_is_impuredir(struct super_block *sb, 341 354 struct dentry *dentry) ··· 455 438 int ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt, 456 439 struct dentry *dentry, int level); 457 440 int ovl_indexdir_cleanup(struct ovl_fs *ofs); 441 + 442 + /* 443 + * Can we iterate real dir directly? 444 + * 445 + * Non-merge dir may contain whiteouts from a time it was a merge upper, before 446 + * lower dir was removed under it and possibly before it was rotated from upper 447 + * to lower layer. 448 + */ 449 + static inline bool ovl_dir_is_real(struct dentry *dir) 450 + { 451 + return !ovl_test_flag(OVL_WHITEOUTS, d_inode(dir)); 452 + } 458 453 459 454 /* inode.c */ 460 455 int ovl_set_nlink_upper(struct dentry *dentry);

-12

fs/overlayfs/readdir.c

··· 319 319 return err; 320 320 } 321 321 322 - /* 323 - * Can we iterate real dir directly? 324 - * 325 - * Non-merge dir may contain whiteouts from a time it was a merge upper, before 326 - * lower dir was removed under it and possibly before it was rotated from upper 327 - * to lower layer. 328 - */ 329 - static bool ovl_dir_is_real(struct dentry *dir) 330 - { 331 - return !ovl_test_flag(OVL_WHITEOUTS, d_inode(dir)); 332 - } 333 - 334 322 static void ovl_dir_reset(struct file *file) 335 323 { 336 324 struct ovl_dir_file *od = file->private_data;

+50 -16

fs/overlayfs/super.c

··· 380 380 ofs->config.metacopy ? "on" : "off"); 381 381 if (ofs->config.ovl_volatile) 382 382 seq_puts(m, ",volatile"); 383 + if (ofs->config.userxattr) 384 + seq_puts(m, ",userxattr"); 383 385 return 0; 384 386 } 385 387 ··· 947 945 pr_warn("fs on '%s' does not support file handles, falling back to index=off,nfs_export=off.\n", 948 946 name); 949 947 } 948 + /* 949 + * Decoding origin file handle is required for persistent st_ino. 950 + * Without persistent st_ino, xino=auto falls back to xino=off. 951 + */ 952 + if (ofs->config.xino == OVL_XINO_AUTO && 953 + ofs->config.upperdir && !fh_type) { 954 + ofs->config.xino = OVL_XINO_OFF; 955 + pr_warn("fs on '%s' does not support file handles, falling back to xino=off.\n", 956 + name); 957 + } 950 958 951 959 /* Check if lower fs has 32bit inode numbers */ 952 960 if (fh_type != FILEID_INO32_GEN) ··· 1054 1042 } 1055 1043 1056 1044 err = ovl_xattr_set(dentry, inode, handler->name, value, size, flags); 1057 - if (!err) 1058 - ovl_copyattr(ovl_inode_real(inode), inode); 1059 - 1060 1045 return err; 1061 1046 1062 1047 out_acl_release: ··· 1194 1185 if (err) 1195 1186 goto out; 1196 1187 1197 - /* Upper fs should not be r/o */ 1198 - if (sb_rdonly(upperpath->mnt->mnt_sb)) { 1188 + /* Upperdir path should not be r/o */ 1189 + if (__mnt_is_readonly(upperpath->mnt)) { 1199 1190 pr_err("upper fs is r/o, try multi-lower layers mount\n"); 1200 1191 err = -EINVAL; 1201 1192 goto out; ··· 1410 1401 err = ovl_do_setxattr(ofs, ofs->workdir, OVL_XATTR_OPAQUE, "0", 1); 1411 1402 if (err) { 1412 1403 ofs->noxattr = true; 1413 - ofs->config.index = false; 1414 - ofs->config.metacopy = false; 1415 - pr_warn("upper fs does not support xattr, falling back to index=off and metacopy=off.\n"); 1404 + if (ofs->config.index || ofs->config.metacopy) { 1405 + ofs->config.index = false; 1406 + ofs->config.metacopy = false; 1407 + pr_warn("upper fs does not support xattr, falling back to index=off,metacopy=off.\n"); 1408 + } 1409 + /* 1410 + * xattr support is required for persistent st_ino. 1411 + * Without persistent st_ino, xino=auto falls back to xino=off. 1412 + */ 1413 + if (ofs->config.xino == OVL_XINO_AUTO) { 1414 + ofs->config.xino = OVL_XINO_OFF; 1415 + pr_warn("upper fs does not support xattr, falling back to xino=off.\n"); 1416 + } 1416 1417 err = 0; 1417 1418 } else { 1418 1419 ovl_do_removexattr(ofs, ofs->workdir, OVL_XATTR_OPAQUE); ··· 1599 1580 * user opted-in to one of the new features that require following the 1600 1581 * lower inode of non-dir upper. 1601 1582 */ 1602 - if (!ofs->config.index && !ofs->config.metacopy && !ofs->config.xino && 1583 + if (!ofs->config.index && !ofs->config.metacopy && 1584 + ofs->config.xino != OVL_XINO_ON && 1603 1585 uuid_is_null(uuid)) 1604 1586 return false; 1605 1587 ··· 1629 1609 dev_t dev; 1630 1610 int err; 1631 1611 bool bad_uuid = false; 1612 + bool warn = false; 1632 1613 1633 1614 for (i = 0; i < ofs->numfs; i++) { 1634 1615 if (ofs->fs[i].sb == sb) ··· 1638 1617 1639 1618 if (!ovl_lower_uuid_ok(ofs, &sb->s_uuid)) { 1640 1619 bad_uuid = true; 1620 + if (ofs->config.xino == OVL_XINO_AUTO) { 1621 + ofs->config.xino = OVL_XINO_OFF; 1622 + warn = true; 1623 + } 1641 1624 if (ofs->config.index || ofs->config.nfs_export) { 1642 1625 ofs->config.index = false; 1643 1626 ofs->config.nfs_export = false; 1644 - pr_warn("%s uuid detected in lower fs '%pd2', falling back to index=off,nfs_export=off.\n", 1627 + warn = true; 1628 + } 1629 + if (warn) { 1630 + pr_warn("%s uuid detected in lower fs '%pd2', falling back to xino=%s,index=off,nfs_export=off.\n", 1645 1631 uuid_is_null(&sb->s_uuid) ? "null" : 1646 1632 "conflicting", 1647 - path->dentry); 1633 + path->dentry, ovl_xino_str[ofs->config.xino]); 1648 1634 } 1649 1635 } 1650 1636 ··· 1854 1826 * - upper/work dir of any overlayfs instance 1855 1827 */ 1856 1828 static int ovl_check_layer(struct super_block *sb, struct ovl_fs *ofs, 1857 - struct dentry *dentry, const char *name) 1829 + struct dentry *dentry, const char *name, 1830 + bool is_lower) 1858 1831 { 1859 1832 struct dentry *next = dentry, *parent; 1860 1833 int err = 0; ··· 1867 1838 1868 1839 /* Walk back ancestors to root (inclusive) looking for traps */ 1869 1840 while (!err && parent != next) { 1870 - if (ovl_lookup_trap_inode(sb, parent)) { 1841 + if (is_lower && ovl_lookup_trap_inode(sb, parent)) { 1871 1842 err = -ELOOP; 1872 1843 pr_err("overlapping %s path\n", name); 1873 1844 } else if (ovl_is_inuse(parent)) { ··· 1893 1864 1894 1865 if (ovl_upper_mnt(ofs)) { 1895 1866 err = ovl_check_layer(sb, ofs, ovl_upper_mnt(ofs)->mnt_root, 1896 - "upperdir"); 1867 + "upperdir", false); 1897 1868 if (err) 1898 1869 return err; 1899 1870 ··· 1904 1875 * workbasedir. In that case, we already have their traps in 1905 1876 * inode cache and we will catch that case on lookup. 1906 1877 */ 1907 - err = ovl_check_layer(sb, ofs, ofs->workbasedir, "workdir"); 1878 + err = ovl_check_layer(sb, ofs, ofs->workbasedir, "workdir", 1879 + false); 1908 1880 if (err) 1909 1881 return err; 1910 1882 } ··· 1913 1883 for (i = 1; i < ofs->numlayer; i++) { 1914 1884 err = ovl_check_layer(sb, ofs, 1915 1885 ofs->layers[i].mnt->mnt_root, 1916 - "lowerdir"); 1886 + "lowerdir", true); 1917 1887 if (err) 1918 1888 return err; 1919 1889 } ··· 1982 1952 if (!ofs) 1983 1953 goto out; 1984 1954 1955 + err = -ENOMEM; 1985 1956 ofs->creator_cred = cred = prepare_creds(); 1986 1957 if (!cred) 1987 1958 goto out_err; ··· 2011 1980 if (!splitlower) 2012 1981 goto out_err; 2013 1982 1983 + err = -EINVAL; 2014 1984 numlower = ovl_split_lowerdirs(splitlower); 2015 1985 if (numlower > OVL_MAX_STACK) { 2016 1986 pr_err("too many lower directories, limit is %d\n", ··· 2019 1987 goto out_err; 2020 1988 } 2021 1989 1990 + err = -ENOMEM; 2022 1991 layers = kcalloc(numlower + 1, sizeof(struct ovl_layer), GFP_KERNEL); 2023 1992 if (!layers) 2024 1993 goto out_err; ··· 2046 2013 if (ofs->config.upperdir) { 2047 2014 struct super_block *upper_sb; 2048 2015 2016 + err = -EINVAL; 2049 2017 if (!ofs->config.workdir) { 2050 2018 pr_err("missing 'workdir'\n"); 2051 2019 goto out_err;

+10 -23

fs/overlayfs/util.c

··· 214 214 215 215 /* 216 216 * ovl_dentry_lower() could return either a data dentry or metacopy dentry 217 - * dependig on what is stored in lowerstack[0]. At times we need to find 217 + * depending on what is stored in lowerstack[0]. At times we need to find 218 218 * lower dentry which has data (and not metacopy dentry). This helper 219 219 * returns the lower data dentry. 220 220 */ ··· 422 422 } 423 423 } 424 424 425 - static void ovl_dentry_version_inc(struct dentry *dentry, bool impurity) 425 + static void ovl_dir_version_inc(struct dentry *dentry, bool impurity) 426 426 { 427 427 struct inode *inode = d_inode(dentry); 428 428 429 429 WARN_ON(!inode_is_locked(inode)); 430 + WARN_ON(!d_is_dir(dentry)); 430 431 /* 431 - * Version is used by readdir code to keep cache consistent. For merge 432 - * dirs all changes need to be noted. For non-merge dirs, cache only 433 - * contains impure (ones which have been copied up and have origins) 434 - * entries, so only need to note changes to impure entries. 432 + * Version is used by readdir code to keep cache consistent. 433 + * For merge dirs (or dirs with origin) all changes need to be noted. 434 + * For non-merge dirs, cache contains only impure entries (i.e. ones 435 + * which have been copied up and have origins), so only need to note 436 + * changes to impure entries. 435 437 */ 436 - if (OVL_TYPE_MERGE(ovl_path_type(dentry)) || impurity) 438 + if (!ovl_dir_is_real(dentry) || impurity) 437 439 OVL_I(inode)->version++; 438 440 } 439 441 ··· 444 442 /* Copy mtime/ctime */ 445 443 ovl_copyattr(d_inode(ovl_dentry_upper(dentry)), d_inode(dentry)); 446 444 447 - ovl_dentry_version_inc(dentry, impurity); 445 + ovl_dir_version_inc(dentry, impurity); 448 446 } 449 447 450 448 u64 ovl_dentry_version_get(struct dentry *dentry) ··· 638 636 ovl_set_flag(OVL_IMPURE, d_inode(dentry)); 639 637 640 638 return err; 641 - } 642 - 643 - void ovl_set_flag(unsigned long flag, struct inode *inode) 644 - { 645 - set_bit(flag, &OVL_I(inode)->flags); 646 - } 647 - 648 - void ovl_clear_flag(unsigned long flag, struct inode *inode) 649 - { 650 - clear_bit(flag, &OVL_I(inode)->flags); 651 - } 652 - 653 - bool ovl_test_flag(unsigned long flag, struct inode *inode) 654 - { 655 - return test_bit(flag, &OVL_I(inode)->flags); 656 639 } 657 640 658 641 /**

Configure Feed

Configure Feed