Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'for-5.16-deadlock-fix-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs fix from David Sterba:
"Fix for a deadlock when direct/buffered IO is done on a mmaped file
and a fault happens (details in the patch). There's an fstest
generic/647 that triggers the problem and makes testing hard"

* tag 'for-5.16-deadlock-fix-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
btrfs: fix deadlock due to page faults during direct IO reads and writes

+123 -16
+123 -16
fs/btrfs/file.c
··· 1912 1912 1913 1913 static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) 1914 1914 { 1915 + const bool is_sync_write = (iocb->ki_flags & IOCB_DSYNC); 1915 1916 struct file *file = iocb->ki_filp; 1916 1917 struct inode *inode = file_inode(file); 1917 1918 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 1918 1919 loff_t pos; 1919 1920 ssize_t written = 0; 1920 1921 ssize_t written_buffered; 1922 + size_t prev_left = 0; 1921 1923 loff_t endbyte; 1922 1924 ssize_t err; 1923 1925 unsigned int ilock_flags = 0; 1924 - struct iomap_dio *dio = NULL; 1925 1926 1926 1927 if (iocb->ki_flags & IOCB_NOWAIT) 1927 1928 ilock_flags |= BTRFS_ILOCK_TRY; ··· 1965 1964 goto buffered; 1966 1965 } 1967 1966 1968 - dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops, 1969 - 0, 0); 1967 + /* 1968 + * We remove IOCB_DSYNC so that we don't deadlock when iomap_dio_rw() 1969 + * calls generic_write_sync() (through iomap_dio_complete()), because 1970 + * that results in calling fsync (btrfs_sync_file()) which will try to 1971 + * lock the inode in exclusive/write mode. 1972 + */ 1973 + if (is_sync_write) 1974 + iocb->ki_flags &= ~IOCB_DSYNC; 1975 + 1976 + /* 1977 + * The iov_iter can be mapped to the same file range we are writing to. 1978 + * If that's the case, then we will deadlock in the iomap code, because 1979 + * it first calls our callback btrfs_dio_iomap_begin(), which will create 1980 + * an ordered extent, and after that it will fault in the pages that the 1981 + * iov_iter refers to. During the fault in we end up in the readahead 1982 + * pages code (starting at btrfs_readahead()), which will lock the range, 1983 + * find that ordered extent and then wait for it to complete (at 1984 + * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since 1985 + * obviously the ordered extent can never complete as we didn't submit 1986 + * yet the respective bio(s). 
This always happens when the buffer is 1987 + * memory mapped to the same file range, since the iomap DIO code always 1988 + * invalidates pages in the target file range (after starting and waiting 1989 + * for any writeback). 1990 + * 1991 + * So here we disable page faults in the iov_iter and then retry if we 1992 + * got -EFAULT, faulting in the pages before the retry. 1993 + */ 1994 + again: 1995 + from->nofault = true; 1996 + err = iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops, 1997 + IOMAP_DIO_PARTIAL, written); 1998 + from->nofault = false; 1999 + 2000 + /* No increment (+=) because iomap returns a cumulative value. */ 2001 + if (err > 0) 2002 + written = err; 2003 + 2004 + if (iov_iter_count(from) > 0 && (err == -EFAULT || err > 0)) { 2005 + const size_t left = iov_iter_count(from); 2006 + /* 2007 + * We have more data left to write. Try to fault in as many as 2008 + * possible of the remainder pages and retry. We do this without 2009 + * releasing and locking again the inode, to prevent races with 2010 + * truncate. 2011 + * 2012 + * Also, in case the iov refers to pages in the file range of the 2013 + * file we want to write to (due to a mmap), we could enter an 2014 + * infinite loop if we retry after faulting the pages in, since 2015 + * iomap will invalidate any pages in the range early on, before 2016 + * it tries to fault in the pages of the iov. So we keep track of 2017 + * how much was left of iov in the previous EFAULT and fallback 2018 + * to buffered IO in case we haven't made any progress. 
2019 + */ 2020 + if (left == prev_left) { 2021 + err = -ENOTBLK; 2022 + } else { 2023 + fault_in_iov_iter_readable(from, left); 2024 + prev_left = left; 2025 + goto again; 2026 + } 2027 + } 1970 2028 1971 2029 btrfs_inode_unlock(inode, ilock_flags); 1972 2030 1973 - if (IS_ERR_OR_NULL(dio)) { 1974 - err = PTR_ERR_OR_ZERO(dio); 1975 - if (err < 0 && err != -ENOTBLK) 1976 - goto out; 1977 - } else { 1978 - written = iomap_dio_complete(dio); 1979 - } 2031 + /* 2032 + * Add back IOCB_DSYNC. Our caller, btrfs_file_write_iter(), will do 2033 + * the fsync (call generic_write_sync()). 2034 + */ 2035 + if (is_sync_write) 2036 + iocb->ki_flags |= IOCB_DSYNC; 1980 2037 1981 - if (written < 0 || !iov_iter_count(from)) { 1982 - err = written; 2038 + /* If 'err' is -ENOTBLK then it means we must fallback to buffered IO. */ 2039 + if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from)) 1983 2040 goto out; 1984 - } 1985 2041 1986 2042 buffered: 1987 2043 pos = iocb->ki_pos; ··· 2063 2005 invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT, 2064 2006 endbyte >> PAGE_SHIFT); 2065 2007 out: 2066 - return written ? written : err; 2008 + return err < 0 ? err : written; 2067 2009 } 2068 2010 2069 2011 static ssize_t btrfs_file_write_iter(struct kiocb *iocb, ··· 3717 3659 static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to) 3718 3660 { 3719 3661 struct inode *inode = file_inode(iocb->ki_filp); 3662 + size_t prev_left = 0; 3663 + ssize_t read = 0; 3720 3664 ssize_t ret; 3721 3665 3722 3666 if (fsverity_active(inode)) ··· 3728 3668 return 0; 3729 3669 3730 3670 btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); 3671 + again: 3672 + /* 3673 + * This is similar to what we do for direct IO writes, see the comment 3674 + * at btrfs_direct_write(), but we also disable page faults in addition 3675 + * to disabling them only at the iov_iter level. 
This is because when 3676 + * reading from a hole or prealloc extent, iomap calls iov_iter_zero(), 3677 + * which can still trigger page fault ins despite having set ->nofault 3678 + * to true of our 'to' iov_iter. 3679 + * 3680 + * The difference to direct IO writes is that we deadlock when trying 3681 + * to lock the extent range in the inode's tree during the page reads 3682 + * triggered by the fault in (while for writes it is due to waiting for 3683 + * our own ordered extent). This is because for direct IO reads, 3684 + * btrfs_dio_iomap_begin() returns with the extent range locked, which 3685 + * is only unlocked in the endio callback (end_bio_extent_readpage()). 3686 + */ 3687 + pagefault_disable(); 3688 + to->nofault = true; 3731 3689 ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops, 3732 - 0, 0); 3690 + IOMAP_DIO_PARTIAL, read); 3691 + to->nofault = false; 3692 + pagefault_enable(); 3693 + 3694 + /* No increment (+=) because iomap returns a cumulative value. */ 3695 + if (ret > 0) 3696 + read = ret; 3697 + 3698 + if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) { 3699 + const size_t left = iov_iter_count(to); 3700 + 3701 + if (left == prev_left) { 3702 + /* 3703 + * We didn't make any progress since the last attempt, 3704 + * fallback to a buffered read for the remainder of the 3705 + * range. This is just to avoid any possibility of looping 3706 + * for too long. 3707 + */ 3708 + ret = read; 3709 + } else { 3710 + /* 3711 + * We made some progress since the last retry or this is 3712 + * the first time we are retrying. Fault in as many pages 3713 + * as possible and retry. 3714 + */ 3715 + fault_in_iov_iter_writeable(to, left); 3716 + prev_left = left; 3717 + goto again; 3718 + } 3719 + } 3733 3720 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); 3734 - return ret; 3721 + return ret < 0 ? ret : read; 3735 3722 } 3736 3723 3737 3724 static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)