Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'ovl-fixes-4.19-rc4' of git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/vfs

Pull overlayfs fixes from Miklos Szeredi:
"This fixes a regression in the recent file stacking update, reported
and fixed by Amir Goldstein. The fix is fairly trivial, but involves
adding a fadvise() f_op and the associated churn in the vfs. As
discussed on -fsdevel, there are other possible uses for this method
than allowing proper stacking for overlays.

And there's one other fix for a syzkaller-detected oops"

* tag 'ovl-fixes-4.19-rc4' of git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/vfs:
ovl: fix oopses in ovl_fill_super() failure paths
ovl: add ovl_fadvise()
vfs: implement readahead(2) using POSIX_FADV_WILLNEED
vfs: add the fadvise() file operation
Documentation/filesystems: update documentation of file_operations
ovl: fix GPF in swapfile_activate of file from overlayfs over xfs
ovl: respect FIEMAP_FLAG_SYNC flag

+134 -80
+19 -2
Documentation/filesystems/vfs.txt
··· 848 848 ---------------------- 849 849 850 850 This describes how the VFS can manipulate an open file. As of kernel 851 - 4.1, the following members are defined: 851 + 4.18, the following members are defined: 852 852 853 853 struct file_operations { 854 854 struct module *owner; ··· 858 858 ssize_t (*read_iter) (struct kiocb *, struct iov_iter *); 859 859 ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); 860 860 int (*iterate) (struct file *, struct dir_context *); 861 + int (*iterate_shared) (struct file *, struct dir_context *); 861 862 __poll_t (*poll) (struct file *, struct poll_table_struct *); 862 863 long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); 863 864 long (*compat_ioctl) (struct file *, unsigned int, unsigned long); 864 865 int (*mmap) (struct file *, struct vm_area_struct *); 865 - int (*mremap)(struct file *, struct vm_area_struct *); 866 866 int (*open) (struct inode *, struct file *); 867 867 int (*flush) (struct file *, fl_owner_t id); 868 868 int (*release) (struct inode *, struct file *); ··· 882 882 #ifndef CONFIG_MMU 883 883 unsigned (*mmap_capabilities)(struct file *); 884 884 #endif 885 + ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, unsigned int); 886 + int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t, u64); 887 + int (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t, u64); 888 + int (*fadvise)(struct file *, loff_t, loff_t, int); 885 889 }; 886 890 887 891 Again, all methods are called without any locks being held, unless ··· 902 898 write_iter: possibly asynchronous write with iov_iter as source 903 899 904 900 iterate: called when the VFS needs to read the directory contents 901 + 902 + iterate_shared: called when the VFS needs to read the directory contents 903 + when filesystem supports concurrent dir iterators 905 904 906 905 poll: called by the VFS when a process wants to check if there is 907 906 activity on this file and 
(optionally) go to sleep until there ··· 957 950 the lease in the inode after setting it. 958 951 959 952 fallocate: called by the VFS to preallocate blocks or punch a hole. 953 + 954 + copy_file_range: called by the copy_file_range(2) system call. 955 + 956 + clone_file_range: called by the ioctl(2) system call for FICLONERANGE and 957 + FICLONE commands. 958 + 959 + dedupe_file_range: called by the ioctl(2) system call for FIDEDUPERANGE 960 + command. 961 + 962 + fadvise: possibly called by the fadvise64() system call. 960 963 961 964 Note that the file operations are implemented by the specific 962 965 filesystem in which the inode resides. When opening a device node
+20 -3
fs/overlayfs/file.c
··· 131 131 if (IS_ERR(realfile)) 132 132 return PTR_ERR(realfile); 133 133 134 - /* For O_DIRECT dentry_open() checks f_mapping->a_ops->direct_IO */ 135 - file->f_mapping = realfile->f_mapping; 136 - 137 134 file->private_data = realfile; 138 135 139 136 return 0; ··· 331 334 return ret; 332 335 } 333 336 337 + static int ovl_fadvise(struct file *file, loff_t offset, loff_t len, int advice) 338 + { 339 + struct fd real; 340 + const struct cred *old_cred; 341 + int ret; 342 + 343 + ret = ovl_real_fdget(file, &real); 344 + if (ret) 345 + return ret; 346 + 347 + old_cred = ovl_override_creds(file_inode(file)->i_sb); 348 + ret = vfs_fadvise(real.file, offset, len, advice); 349 + revert_creds(old_cred); 350 + 351 + fdput(real); 352 + 353 + return ret; 354 + } 355 + 334 356 static long ovl_real_ioctl(struct file *file, unsigned int cmd, 335 357 unsigned long arg) 336 358 { ··· 518 502 .fsync = ovl_fsync, 519 503 .mmap = ovl_mmap, 520 504 .fallocate = ovl_fallocate, 505 + .fadvise = ovl_fadvise, 521 506 .unlocked_ioctl = ovl_ioctl, 522 507 .compat_ioctl = ovl_compat_ioctl, 523 508
+10
fs/overlayfs/inode.c
··· 467 467 return -EOPNOTSUPP; 468 468 469 469 old_cred = ovl_override_creds(inode->i_sb); 470 + 471 + if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) 472 + filemap_write_and_wait(realinode->i_mapping); 473 + 470 474 err = realinode->i_op->fiemap(realinode, fieinfo, start, len); 471 475 revert_creds(old_cred); 472 476 ··· 502 498 .listxattr = ovl_listxattr, 503 499 .get_acl = ovl_get_acl, 504 500 .update_time = ovl_update_time, 501 + }; 502 + 503 + const struct address_space_operations ovl_aops = { 504 + /* For O_DIRECT dentry_open() checks f_mapping->a_ops->direct_IO */ 505 + .direct_IO = noop_direct_IO, 505 506 }; 506 507 507 508 /* ··· 580 571 case S_IFREG: 581 572 inode->i_op = &ovl_file_inode_operations; 582 573 inode->i_fop = &ovl_file_operations; 574 + inode->i_mapping->a_ops = &ovl_aops; 583 575 break; 584 576 585 577 case S_IFDIR:
+14 -12
fs/overlayfs/super.c
··· 982 982 if (err) 983 983 goto out; 984 984 985 - err = -EBUSY; 986 - if (ovl_inuse_trylock(upperpath->dentry)) { 987 - ofs->upperdir_locked = true; 988 - } else if (ofs->config.index) { 989 - pr_err("overlayfs: upperdir is in-use by another mount, mount with '-o index=off' to override exclusive upperdir protection.\n"); 990 - goto out; 991 - } else { 992 - pr_warn("overlayfs: upperdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n"); 993 - } 994 - 995 985 upper_mnt = clone_private_mount(upperpath); 996 986 err = PTR_ERR(upper_mnt); 997 987 if (IS_ERR(upper_mnt)) { ··· 992 1002 /* Don't inherit atime flags */ 993 1003 upper_mnt->mnt_flags &= ~(MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME); 994 1004 ofs->upper_mnt = upper_mnt; 1005 + 1006 + err = -EBUSY; 1007 + if (ovl_inuse_trylock(ofs->upper_mnt->mnt_root)) { 1008 + ofs->upperdir_locked = true; 1009 + } else if (ofs->config.index) { 1010 + pr_err("overlayfs: upperdir is in-use by another mount, mount with '-o index=off' to override exclusive upperdir protection.\n"); 1011 + goto out; 1012 + } else { 1013 + pr_warn("overlayfs: upperdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n"); 1014 + } 1015 + 995 1016 err = 0; 996 1017 out: 997 1018 return err; ··· 1102 1101 goto out; 1103 1102 } 1104 1103 1104 + ofs->workbasedir = dget(workpath.dentry); 1105 + 1105 1106 err = -EBUSY; 1106 - if (ovl_inuse_trylock(workpath.dentry)) { 1107 + if (ovl_inuse_trylock(ofs->workbasedir)) { 1107 1108 ofs->workdir_locked = true; 1108 1109 } else if (ofs->config.index) { 1109 1110 pr_err("overlayfs: workdir is in-use by another mount, mount with '-o index=off' to override exclusive workdir protection.\n"); ··· 1114 1111 pr_warn("overlayfs: workdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n"); 1115 1112 } 1116 1113 1117 - ofs->workbasedir = dget(workpath.dentry); 1118 1114 err = 
ovl_make_workdir(ofs, &workpath); 1119 1115 if (err) 1120 1116 goto out;
+5
include/linux/fs.h
··· 1763 1763 u64); 1764 1764 int (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t, 1765 1765 u64); 1766 + int (*fadvise)(struct file *, loff_t, loff_t, int); 1766 1767 } __randomize_layout; 1767 1768 1768 1769 struct inode_operations { ··· 3459 3458 3460 3459 extern bool path_noexec(const struct path *path); 3461 3460 extern void inode_nohighmem(struct inode *inode); 3461 + 3462 + /* mm/fadvise.c */ 3463 + extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len, 3464 + int advice); 3462 3465 3463 3466 #endif /* _LINUX_FS_H */
+1 -2
mm/Makefile
··· 32 32 mmu-$(CONFIG_MMU) += process_vm_access.o 33 33 endif 34 34 35 - obj-y := filemap.o mempool.o oom_kill.o \ 35 + obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ 36 36 maccess.o page_alloc.o page-writeback.o \ 37 37 readahead.o swap.o truncate.o vmscan.o shmem.o \ 38 38 util.o mmzone.o vmstat.o backing-dev.o \ ··· 49 49 obj-y += bootmem.o 50 50 endif 51 51 52 - obj-$(CONFIG_ADVISE_SYSCALLS) += fadvise.o 53 52 ifdef CONFIG_MMU 54 53 obj-$(CONFIG_ADVISE_SYSCALLS) += madvise.o 55 54 endif
+48 -33
mm/fadvise.c
··· 27 27 * deactivate the pages and clear PG_Referenced. 28 28 */ 29 29 30 - int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) 30 + static int generic_fadvise(struct file *file, loff_t offset, loff_t len, 31 + int advice) 31 32 { 32 - struct fd f = fdget(fd); 33 33 struct inode *inode; 34 34 struct address_space *mapping; 35 35 struct backing_dev_info *bdi; ··· 37 37 pgoff_t start_index; 38 38 pgoff_t end_index; 39 39 unsigned long nrpages; 40 - int ret = 0; 41 40 42 - if (!f.file) 43 - return -EBADF; 41 + inode = file_inode(file); 42 + if (S_ISFIFO(inode->i_mode)) 43 + return -ESPIPE; 44 44 45 - inode = file_inode(f.file); 46 - if (S_ISFIFO(inode->i_mode)) { 47 - ret = -ESPIPE; 48 - goto out; 49 - } 50 - 51 - mapping = f.file->f_mapping; 52 - if (!mapping || len < 0) { 53 - ret = -EINVAL; 54 - goto out; 55 - } 45 + mapping = file->f_mapping; 46 + if (!mapping || len < 0) 47 + return -EINVAL; 56 48 57 49 bdi = inode_to_bdi(mapping->host); 58 50 ··· 59 67 /* no bad return value, but ignore advice */ 60 68 break; 61 69 default: 62 - ret = -EINVAL; 70 + return -EINVAL; 63 71 } 64 - goto out; 72 + return 0; 65 73 } 66 74 67 75 /* ··· 77 85 78 86 switch (advice) { 79 87 case POSIX_FADV_NORMAL: 80 - f.file->f_ra.ra_pages = bdi->ra_pages; 81 - spin_lock(&f.file->f_lock); 82 - f.file->f_mode &= ~FMODE_RANDOM; 83 - spin_unlock(&f.file->f_lock); 88 + file->f_ra.ra_pages = bdi->ra_pages; 89 + spin_lock(&file->f_lock); 90 + file->f_mode &= ~FMODE_RANDOM; 91 + spin_unlock(&file->f_lock); 84 92 break; 85 93 case POSIX_FADV_RANDOM: 86 - spin_lock(&f.file->f_lock); 87 - f.file->f_mode |= FMODE_RANDOM; 88 - spin_unlock(&f.file->f_lock); 94 + spin_lock(&file->f_lock); 95 + file->f_mode |= FMODE_RANDOM; 96 + spin_unlock(&file->f_lock); 89 97 break; 90 98 case POSIX_FADV_SEQUENTIAL: 91 - f.file->f_ra.ra_pages = bdi->ra_pages * 2; 92 - spin_lock(&f.file->f_lock); 93 - f.file->f_mode &= ~FMODE_RANDOM; 94 - spin_unlock(&f.file->f_lock); 99 + file->f_ra.ra_pages = 
bdi->ra_pages * 2; 100 + spin_lock(&file->f_lock); 101 + file->f_mode &= ~FMODE_RANDOM; 102 + spin_unlock(&file->f_lock); 95 103 break; 96 104 case POSIX_FADV_WILLNEED: 97 105 /* First and last PARTIAL page! */ ··· 107 115 * Ignore return value because fadvise() shall return 108 116 * success even if filesystem can't retrieve a hint, 109 117 */ 110 - force_page_cache_readahead(mapping, f.file, start_index, 111 - nrpages); 118 + force_page_cache_readahead(mapping, file, start_index, nrpages); 112 119 break; 113 120 case POSIX_FADV_NOREUSE: 114 121 break; ··· 174 183 } 175 184 break; 176 185 default: 177 - ret = -EINVAL; 186 + return -EINVAL; 178 187 } 179 - out: 188 + return 0; 189 + } 190 + 191 + int vfs_fadvise(struct file *file, loff_t offset, loff_t len, int advice) 192 + { 193 + if (file->f_op->fadvise) 194 + return file->f_op->fadvise(file, offset, len, advice); 195 + 196 + return generic_fadvise(file, offset, len, advice); 197 + } 198 + EXPORT_SYMBOL(vfs_fadvise); 199 + 200 + #ifdef CONFIG_ADVISE_SYSCALLS 201 + 202 + int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) 203 + { 204 + struct fd f = fdget(fd); 205 + int ret; 206 + 207 + if (!f.file) 208 + return -EBADF; 209 + 210 + ret = vfs_fadvise(f.file, offset, len, advice); 211 + 180 212 fdput(f); 181 213 return ret; 182 214 } ··· 216 202 return ksys_fadvise64_64(fd, offset, len, advice); 217 203 } 218 204 205 + #endif 219 206 #endif
+17 -28
mm/readahead.c
··· 20 20 #include <linux/file.h> 21 21 #include <linux/mm_inline.h> 22 22 #include <linux/blk-cgroup.h> 23 + #include <linux/fadvise.h> 23 24 24 25 #include "internal.h" 25 26 ··· 576 575 } 577 576 EXPORT_SYMBOL_GPL(page_cache_async_readahead); 578 577 579 - static ssize_t 580 - do_readahead(struct address_space *mapping, struct file *filp, 581 - pgoff_t index, unsigned long nr) 582 - { 583 - if (!mapping || !mapping->a_ops) 584 - return -EINVAL; 585 - 586 - /* 587 - * Readahead doesn't make sense for DAX inodes, but we don't want it 588 - * to report a failure either. Instead, we just return success and 589 - * don't do any work. 590 - */ 591 - if (dax_mapping(mapping)) 592 - return 0; 593 - 594 - return force_page_cache_readahead(mapping, filp, index, nr); 595 - } 596 - 597 578 ssize_t ksys_readahead(int fd, loff_t offset, size_t count) 598 579 { 599 580 ssize_t ret; ··· 583 600 584 601 ret = -EBADF; 585 602 f = fdget(fd); 586 - if (f.file) { 587 - if (f.file->f_mode & FMODE_READ) { 588 - struct address_space *mapping = f.file->f_mapping; 589 - pgoff_t start = offset >> PAGE_SHIFT; 590 - pgoff_t end = (offset + count - 1) >> PAGE_SHIFT; 591 - unsigned long len = end - start + 1; 592 - ret = do_readahead(mapping, f.file, start, len); 593 - } 594 - fdput(f); 595 - } 603 + if (!f.file || !(f.file->f_mode & FMODE_READ)) 604 + goto out; 605 + 606 + /* 607 + * The readahead() syscall is intended to run only on files 608 + * that can execute readahead. If readahead is not possible 609 + * on this file, then we must return -EINVAL. 610 + */ 611 + ret = -EINVAL; 612 + if (!f.file->f_mapping || !f.file->f_mapping->a_ops || 613 + !S_ISREG(file_inode(f.file)->i_mode)) 614 + goto out; 615 + 616 + ret = vfs_fadvise(f.file, offset, count, POSIX_FADV_WILLNEED); 617 + out: 618 + fdput(f); 596 619 return ret; 597 620 } 598 621