Merge tag 'ovl-fixes-5.11-rc7' of git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/vfs

+8

Documentation/filesystems/overlayfs.rst

··· 586 586 The advantage of mounting with the "volatile" option is that all forms of 587 587 sync calls to the upper filesystem are omitted. 588 588 589 + In order to avoid giving a false sense of safety, the syncfs (and fsync) 590 + semantics of volatile mounts are slightly different than that of the rest of 591 + VFS. If any writeback error occurs on the upperdir's filesystem after a 592 + volatile mount takes place, all sync functions will return an error. Once this 593 + condition is reached, the filesystem will not recover, and every subsequent sync 594 + call will return an error, even if the upperdir has not experienced a new error 595 + since the last sync call. 596 + 589 597 When overlay is mounted with "volatile" option, the directory 590 598 "$workdir/work/incompat/volatile" is created. During next mount, overlay 591 599 checks for this directory and refuses to mount if present. This is a strong

+8 -7

fs/overlayfs/copy_up.c

··· 84 84 85 85 if (ovl_is_private_xattr(sb, name)) 86 86 continue; 87 + 88 + error = security_inode_copy_up_xattr(name); 89 + if (error < 0 && error != -EOPNOTSUPP) 90 + break; 91 + if (error == 1) { 92 + error = 0; 93 + continue; /* Discard */ 94 + } 87 95 retry: 88 96 size = vfs_getxattr(old, name, value, value_size); 89 97 if (size == -ERANGE) ··· 115 107 goto retry; 116 108 } 117 109 118 - error = security_inode_copy_up_xattr(name); 119 - if (error < 0 && error != -EOPNOTSUPP) 120 - break; 121 - if (error == 1) { 122 - error = 0; 123 - continue; /* Discard */ 124 - } 125 110 error = vfs_setxattr(new, name, value, size, 0); 126 111 if (error) { 127 112 if (error != -EOPNOTSUPP || ovl_must_copy_xattr(name))

+1 -1

fs/overlayfs/dir.c

··· 992 992 993 993 buflen -= thislen; 994 994 memcpy(&buf[buflen], name, thislen); 995 - tmp = dget_dlock(d->d_parent); 996 995 spin_unlock(&d->d_lock); 996 + tmp = dget_parent(d); 997 997 998 998 dput(d); 999 999 d = tmp;

+3 -2

fs/overlayfs/file.c

··· 398 398 const struct cred *old_cred; 399 399 int ret; 400 400 401 - if (!ovl_should_sync(OVL_FS(file_inode(file)->i_sb))) 402 - return 0; 401 + ret = ovl_sync_status(OVL_FS(file_inode(file)->i_sb)); 402 + if (ret <= 0) 403 + return ret; 403 404 404 405 ret = ovl_real_fdget_meta(file, &real, !datasync); 405 406 if (ret)

+2

fs/overlayfs/inode.c

··· 352 352 goto out; 353 353 354 354 if (!value && !upperdentry) { 355 + old_cred = ovl_override_creds(dentry->d_sb); 355 356 err = vfs_getxattr(realdentry, name, NULL, 0); 357 + revert_creds(old_cred); 356 358 if (err < 0) 357 359 goto out_drop_write; 358 360 }

+1

fs/overlayfs/overlayfs.h

··· 324 324 bool ovl_is_metacopy_dentry(struct dentry *dentry); 325 325 char *ovl_get_redirect_xattr(struct ovl_fs *ofs, struct dentry *dentry, 326 326 int padding); 327 + int ovl_sync_status(struct ovl_fs *ofs); 327 328 328 329 static inline bool ovl_is_impuredir(struct super_block *sb, 329 330 struct dentry *dentry)

+2

fs/overlayfs/ovl_entry.h

··· 81 81 atomic_long_t last_ino; 82 82 /* Whiteout dentry cache */ 83 83 struct dentry *whiteout; 84 + /* r/o snapshot of upperdir sb's only taken on volatile mounts */ 85 + errseq_t errseq; 84 86 }; 85 87 86 88 static inline struct vfsmount *ovl_upper_mnt(struct ovl_fs *ofs)

+10 -18

fs/overlayfs/readdir.c

··· 865 865 866 866 struct ovl_dir_file *od = file->private_data; 867 867 struct dentry *dentry = file->f_path.dentry; 868 - struct file *realfile = od->realfile; 868 + struct file *old, *realfile = od->realfile; 869 869 870 870 if (!OVL_TYPE_UPPER(ovl_path_type(dentry))) 871 871 return want_upper ? NULL : realfile; ··· 874 874 * Need to check if we started out being a lower dir, but got copied up 875 875 */ 876 876 if (!od->is_upper) { 877 - struct inode *inode = file_inode(file); 878 - 879 877 realfile = READ_ONCE(od->upperfile); 880 878 if (!realfile) { 881 879 struct path upperpath; 882 880 883 881 ovl_path_upper(dentry, &upperpath); 884 882 realfile = ovl_dir_open_realfile(file, &upperpath); 883 + if (IS_ERR(realfile)) 884 + return realfile; 885 885 886 - inode_lock(inode); 887 - if (!od->upperfile) { 888 - if (IS_ERR(realfile)) { 889 - inode_unlock(inode); 890 - return realfile; 891 - } 892 - smp_store_release(&od->upperfile, realfile); 893 - } else { 894 - /* somebody has beaten us to it */ 895 - if (!IS_ERR(realfile)) 896 - fput(realfile); 897 - realfile = od->upperfile; 886 + old = cmpxchg_release(&od->upperfile, NULL, realfile); 887 + if (old) { 888 + fput(realfile); 889 + realfile = old; 898 890 } 899 - inode_unlock(inode); 900 891 } 901 892 } 902 893 ··· 900 909 struct file *realfile; 901 910 int err; 902 911 903 - if (!ovl_should_sync(OVL_FS(file->f_path.dentry->d_sb))) 904 - return 0; 912 + err = ovl_sync_status(OVL_FS(file->f_path.dentry->d_sb)); 913 + if (err <= 0) 914 + return err; 905 915 906 916 realfile = ovl_dir_real_file(file, true); 907 917 err = PTR_ERR_OR_ZERO(realfile);

+31 -7

fs/overlayfs/super.c

··· 264 264 struct super_block *upper_sb; 265 265 int ret; 266 266 267 - if (!ovl_upper_mnt(ofs)) 268 - return 0; 267 + ret = ovl_sync_status(ofs); 268 + /* 269 + * We have to always set the err, because the return value isn't 270 + * checked in syncfs, and instead indirectly return an error via 271 + * the sb's writeback errseq, which VFS inspects after this call. 272 + */ 273 + if (ret < 0) { 274 + errseq_set(&sb->s_wb_err, -EIO); 275 + return -EIO; 276 + } 269 277 270 - if (!ovl_should_sync(ofs)) 271 - return 0; 278 + if (!ret) 279 + return ret; 280 + 272 281 /* 273 282 * Not called for sync(2) call or an emergency sync (SB_I_SKIP_SYNC). 274 283 * All the super blocks will be iterated, including upper_sb. ··· 1932 1923 unsigned int numlower; 1933 1924 int err; 1934 1925 1926 + err = -EIO; 1927 + if (WARN_ON(sb->s_user_ns != current_user_ns())) 1928 + goto out; 1929 + 1935 1930 sb->s_d_op = &ovl_dentry_operations; 1936 1931 1937 1932 err = -ENOMEM; ··· 2002 1989 sb->s_op = &ovl_super_operations; 2003 1990 2004 1991 if (ofs->config.upperdir) { 1992 + struct super_block *upper_sb; 1993 + 2005 1994 if (!ofs->config.workdir) { 2006 1995 pr_err("missing 'workdir'\n"); 2007 1996 goto out_err; ··· 2013 1998 if (err) 2014 1999 goto out_err; 2015 2000 2001 + upper_sb = ovl_upper_mnt(ofs)->mnt_sb; 2002 + if (!ovl_should_sync(ofs)) { 2003 + ofs->errseq = errseq_sample(&upper_sb->s_wb_err); 2004 + if (errseq_check(&upper_sb->s_wb_err, ofs->errseq)) { 2005 + err = -EIO; 2006 + pr_err("Cannot mount volatile when upperdir has an unseen error. 
Sync upperdir fs to clear state.\n"); 2007 + goto out_err; 2008 + } 2009 + } 2010 + 2016 2011 err = ovl_get_workdir(sb, ofs, &upperpath); 2017 2012 if (err) 2018 2013 goto out_err; ··· 2030 2005 if (!ofs->workdir) 2031 2006 sb->s_flags |= SB_RDONLY; 2032 2007 2033 - sb->s_stack_depth = ovl_upper_mnt(ofs)->mnt_sb->s_stack_depth; 2034 - sb->s_time_gran = ovl_upper_mnt(ofs)->mnt_sb->s_time_gran; 2035 - 2008 + sb->s_stack_depth = upper_sb->s_stack_depth; 2009 + sb->s_time_gran = upper_sb->s_time_gran; 2036 2010 } 2037 2011 oe = ovl_get_lowerstack(sb, splitlower, numlower, ofs, layers); 2038 2012 err = PTR_ERR(oe);

+27

fs/overlayfs/util.c

··· 962 962 kfree(buf); 963 963 return ERR_PTR(res); 964 964 } 965 + 966 + /* 967 + * ovl_sync_status() - Check fs sync status for volatile mounts 968 + * 969 + * Returns 1 if this is not a volatile mount and a real sync is required. 970 + * 971 + * Returns 0 if syncing can be skipped because mount is volatile, and no errors 972 + * have occurred on the upperdir since the mount. 973 + * 974 + * Returns -errno if it is a volatile mount, and the error that occurred since 975 + * the last mount. If the error code changes, it'll return the latest error 976 + * code. 977 + */ 978 + 979 + int ovl_sync_status(struct ovl_fs *ofs) 980 + { 981 + struct vfsmount *mnt; 982 + 983 + if (ovl_should_sync(ofs)) 984 + return 1; 985 + 986 + mnt = ovl_upper_mnt(ofs); 987 + if (!mnt) 988 + return 0; 989 + 990 + return errseq_check(&mnt->mnt_sb->s_wb_err, ofs->errseq); 991 + }

+43 -24

security/commoncap.c

··· 371 371 { 372 372 int size, ret; 373 373 kuid_t kroot; 374 + u32 nsmagic, magic; 374 375 uid_t root, mappedroot; 375 376 char *tmpbuf = NULL; 376 377 struct vfs_cap_data *cap; 377 - struct vfs_ns_cap_data *nscap; 378 + struct vfs_ns_cap_data *nscap = NULL; 378 379 struct dentry *dentry; 379 380 struct user_namespace *fs_ns; 380 381 ··· 397 396 fs_ns = inode->i_sb->s_user_ns; 398 397 cap = (struct vfs_cap_data *) tmpbuf; 399 398 if (is_v2header((size_t) ret, cap)) { 400 - /* If this is sizeof(vfs_cap_data) then we're ok with the 401 - * on-disk value, so return that. */ 402 - if (alloc) 403 - *buffer = tmpbuf; 404 - else 405 - kfree(tmpbuf); 406 - return ret; 407 - } else if (!is_v3header((size_t) ret, cap)) { 408 - kfree(tmpbuf); 409 - return -EINVAL; 399 + root = 0; 400 + } else if (is_v3header((size_t) ret, cap)) { 401 + nscap = (struct vfs_ns_cap_data *) tmpbuf; 402 + root = le32_to_cpu(nscap->rootid); 403 + } else { 404 + size = -EINVAL; 405 + goto out_free; 410 406 } 411 407 412 - nscap = (struct vfs_ns_cap_data *) tmpbuf; 413 - root = le32_to_cpu(nscap->rootid); 414 408 kroot = make_kuid(fs_ns, root); 415 409 416 410 /* If the root kuid maps to a valid uid in current ns, then return 417 411 * this as a nscap. 
*/ 418 412 mappedroot = from_kuid(current_user_ns(), kroot); 419 413 if (mappedroot != (uid_t)-1 && mappedroot != (uid_t)0) { 414 + size = sizeof(struct vfs_ns_cap_data); 420 415 if (alloc) { 421 - *buffer = tmpbuf; 416 + if (!nscap) { 417 + /* v2 -> v3 conversion */ 418 + nscap = kzalloc(size, GFP_ATOMIC); 419 + if (!nscap) { 420 + size = -ENOMEM; 421 + goto out_free; 422 + } 423 + nsmagic = VFS_CAP_REVISION_3; 424 + magic = le32_to_cpu(cap->magic_etc); 425 + if (magic & VFS_CAP_FLAGS_EFFECTIVE) 426 + nsmagic |= VFS_CAP_FLAGS_EFFECTIVE; 427 + memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32); 428 + nscap->magic_etc = cpu_to_le32(nsmagic); 429 + } else { 430 + /* use allocated v3 buffer */ 431 + tmpbuf = NULL; 432 + } 422 433 nscap->rootid = cpu_to_le32(mappedroot); 423 - } else 424 - kfree(tmpbuf); 425 - return size; 434 + *buffer = nscap; 435 + } 436 + goto out_free; 426 437 } 427 438 428 439 if (!rootid_owns_currentns(kroot)) { 429 - kfree(tmpbuf); 430 - return -EOPNOTSUPP; 440 + size = -EOVERFLOW; 441 + goto out_free; 431 442 } 432 443 433 444 /* This comes from a parent namespace. Return as a v2 capability */ 434 445 size = sizeof(struct vfs_cap_data); 435 446 if (alloc) { 436 - *buffer = kmalloc(size, GFP_ATOMIC); 437 - if (*buffer) { 438 - struct vfs_cap_data *cap = *buffer; 439 - __le32 nsmagic, magic; 447 + if (nscap) { 448 + /* v3 -> v2 conversion */ 449 + cap = kzalloc(size, GFP_ATOMIC); 450 + if (!cap) { 451 + size = -ENOMEM; 452 + goto out_free; 453 + } 440 454 magic = VFS_CAP_REVISION_2; 441 455 nsmagic = le32_to_cpu(nscap->magic_etc); 442 456 if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE) ··· 459 443 memcpy(&cap->data, &nscap->data, sizeof(__le32) * 2 * VFS_CAP_U32); 460 444 cap->magic_etc = cpu_to_le32(magic); 461 445 } else { 462 - size = -ENOMEM; 446 + /* use unconverted v2 */ 447 + tmpbuf = NULL; 463 448 } 449 + *buffer = cap; 464 450 } 451 + out_free: 465 452 kfree(tmpbuf); 466 453 return size; 467 454 }

Configure Feed

Configure Feed