Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'for-5.8-part2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs updates from David Sterba:
"This reverts the direct io port to iomap infrastructure of btrfs
merged in the first pull request. We found problems in invalidate page
that don't seem to be fixable as regressions or without changing iomap
code that would not affect other filesystems.

There are four reverts in total, but three of them are followup
cleanups needed to revert a43a67a2d715 cleanly. The result is the
buffer head based implementation of direct io.

Reverts are not great, but under current circumstances I don't see
better options"

* tag 'for-5.8-part2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
Revert "btrfs: switch to iomap_dio_rw() for dio"
Revert "fs: remove dio_end_io()"
Revert "btrfs: remove BTRFS_INODE_READDIO_NEED_LOCK"
Revert "btrfs: split btrfs_direct_IO to read and write part"

+286 -234
-1
fs/btrfs/Kconfig
···
 	select LZO_DECOMPRESS
 	select ZSTD_COMPRESS
 	select ZSTD_DECOMPRESS
-	select FS_IOMAP
 	select RAID6_PQ
 	select XOR_BLOCKS
 	select SRCU
+18
fs/btrfs/btrfs_inode.h
···
 	BTRFS_INODE_NEEDS_FULL_SYNC,
 	BTRFS_INODE_COPY_EVERYTHING,
 	BTRFS_INODE_IN_DELALLOC_LIST,
+	BTRFS_INODE_READDIO_NEED_LOCK,
 	BTRFS_INODE_HAS_PROPS,
 	BTRFS_INODE_SNAPSHOT_FLUSH,
 };
···
 	/* Array of checksums */
 	u8 csums[];
 };
+
+/*
+ * Disable DIO read nolock optimization, so new dio readers will be forced
+ * to grab i_mutex. It is used to avoid the endless truncate due to
+ * nonlocked dio read.
+ */
+static inline void btrfs_inode_block_unlocked_dio(struct btrfs_inode *inode)
+{
+	set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags);
+	smp_mb();
+}
+
+static inline void btrfs_inode_resume_unlocked_dio(struct btrfs_inode *inode)
+{
+	smp_mb__before_atomic();
+	clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags);
+}
 
 /* Array of bytes with variable length, hexadecimal format 0x1234 */
 #define CSUM_FMT "0x%*phN"
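The two helpers restored above pair set_bit() plus smp_mb() in the truncate path against inode_dio_begin() plus test_bit() in the DIO reader: either truncate observes the reader's elevated DIO count and waits, or the reader observes the flag and takes the locked path. A minimal userspace analogue using C11 sequentially consistent atomics; the names are illustrative, and the spin loop stands in for the sleeping inode_dio_wait():

/* Sketch of the pairing behind btrfs_inode_block_unlocked_dio():
 * truncate sets a flag and then waits for in-flight DIO; a reader
 * registers itself and then checks the flag. With seq_cst atomics,
 * at least one side sees the other, so truncate can never miss a
 * nonlocked reader. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int need_lock;	/* ~ BTRFS_INODE_READDIO_NEED_LOCK */
static atomic_int dio_count;	/* ~ inode->i_dio_count */

static void *dio_reader(void *arg)
{
	atomic_fetch_add(&dio_count, 1);	/* ~ inode_dio_begin() */
	if (atomic_load(&need_lock)) {
		/* Flag is set: back off and take the locked path. */
		atomic_fetch_sub(&dio_count, 1);
		puts("reader: taking locked path");
	} else {
		puts("reader: nonlocked read; truncate waits for us");
		atomic_fetch_sub(&dio_count, 1);	/* ~ inode_dio_end() */
	}
	return NULL;
}

static void *truncater(void *arg)
{
	atomic_store(&need_lock, 1);		/* ~ set_bit() + smp_mb() */
	while (atomic_load(&dio_count) > 0)	/* ~ inode_dio_wait() */
		;
	puts("truncate: no nonlocked readers in flight");
	atomic_store(&need_lock, 0);		/* ~ clear_bit() */
	return NULL;
}

int main(void)
{
	pthread_t r, t;

	pthread_create(&r, NULL, dio_reader, NULL);
	pthread_create(&t, NULL, truncater, NULL);
	pthread_join(r, NULL);
	pthread_join(t, NULL);
	return 0;
}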
-4
fs/btrfs/ctree.h
···
 #include <linux/dynamic_debug.h>
 #include <linux/refcount.h>
 #include <linux/crc32c.h>
-#include <linux/iomap.h>
 #include "extent-io-tree.h"
 #include "extent_io.h"
 #include "extent_map.h"
···
 void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
 					  u64 end, int uptodate);
 extern const struct dentry_operations btrfs_dentry_operations;
-ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
-extern const struct iomap_ops btrfs_dio_iomap_ops;
-extern const struct iomap_dio_ops btrfs_dops;
 
 /* ioctl.c */
 long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+6 -91
fs/btrfs/file.c
···
 	return num_written ? num_written : ret;
 }
 
-static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
-			       const struct iov_iter *iter, loff_t offset)
-{
-	const unsigned int blocksize_mask = fs_info->sectorsize - 1;
-
-	if (offset & blocksize_mask)
-		return -EINVAL;
-
-	if (iov_iter_alignment(iter) & blocksize_mask)
-		return -EINVAL;
-
-	return 0;
-}
-
-static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
+static ssize_t __btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file_inode(file);
-	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-	loff_t pos = iocb->ki_pos;
-	ssize_t written = 0;
+	loff_t pos;
+	ssize_t written;
 	ssize_t written_buffered;
 	loff_t endbyte;
 	int err;
-	size_t count = 0;
-	bool relock = false;
 
-	if (check_direct_IO(fs_info, from, pos))
-		goto buffered;
-
-	count = iov_iter_count(from);
-	/*
-	 * If the write DIO is beyond the EOF, we need update the isize, but it
-	 * is protected by i_mutex. So we can not unlock the i_mutex at this
-	 * case.
-	 */
-	if (pos + count <= inode->i_size) {
-		inode_unlock(inode);
-		relock = true;
-	} else if (iocb->ki_flags & IOCB_NOWAIT) {
-		return -EAGAIN;
-	}
-
-	down_read(&BTRFS_I(inode)->dio_sem);
-	written = iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dops,
-			       is_sync_kiocb(iocb));
-	up_read(&BTRFS_I(inode)->dio_sem);
-
-	if (relock)
-		inode_lock(inode);
+	written = generic_file_direct_write(iocb, from);
 
 	if (written < 0 || !iov_iter_count(from))
 		return written;
 
-buffered:
 	pos = iocb->ki_pos;
 	written_buffered = btrfs_buffered_write(iocb, from);
 	if (written_buffered < 0) {
···
 	atomic_inc(&BTRFS_I(inode)->sync_writers);
 
 	if (iocb->ki_flags & IOCB_DIRECT) {
-		num_written = btrfs_direct_write(iocb, from);
+		num_written = __btrfs_direct_write(iocb, from);
 	} else {
 		num_written = btrfs_buffered_write(iocb, from);
 		if (num_written > 0)
···
 	return generic_file_open(inode, filp);
 }
 
-static int check_direct_read(struct btrfs_fs_info *fs_info,
-			     const struct iov_iter *iter, loff_t offset)
-{
-	int ret;
-	int i, seg;
-
-	ret = check_direct_IO(fs_info, iter, offset);
-	if (ret < 0)
-		return ret;
-
-	for (seg = 0; seg < iter->nr_segs; seg++)
-		for (i = seg + 1; i < iter->nr_segs; i++)
-			if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
-				return -EINVAL;
-	return 0;
-}
-
-static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
-{
-	struct inode *inode = file_inode(iocb->ki_filp);
-	ssize_t ret;
-
-	if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos))
-		return 0;
-
-	inode_lock_shared(inode);
-	ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dops,
-			   is_sync_kiocb(iocb));
-	inode_unlock_shared(inode);
-	return ret;
-}
-
-static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
-{
-	ssize_t ret = 0;
-
-	if (iocb->ki_flags & IOCB_DIRECT) {
-		ret = btrfs_direct_read(iocb, to);
-		if (ret < 0)
-			return ret;
-	}
-
-	return generic_file_buffered_read(iocb, to, ret);
-}
-
 const struct file_operations btrfs_file_operations = {
 	.llseek		= btrfs_file_llseek,
-	.read_iter	= btrfs_file_read_iter,
+	.read_iter	= generic_file_read_iter,
 	.splice_read	= generic_file_splice_read,
 	.write_iter	= btrfs_file_write_iter,
 	.mmap		= btrfs_file_mmap,
+241 -138
fs/btrfs/inode.c
···
 
 #include <linux/kernel.h>
 #include <linux/bio.h>
+#include <linux/buffer_head.h>
 #include <linux/file.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
···
 
 struct btrfs_dio_data {
 	u64 reserve;
-	loff_t length;
-	ssize_t submitted;
-	struct extent_changeset *data_reserved;
+	u64 unsubmitted_oe_range_start;
+	u64 unsubmitted_oe_range_end;
+	int overwrite;
 };
 
 static const struct inode_operations btrfs_dir_inode_operations;
···
 
 	truncate_setsize(inode, newsize);
 
+	/* Disable nonlocked read DIO to avoid the endless truncate */
+	btrfs_inode_block_unlocked_dio(BTRFS_I(inode));
 	inode_dio_wait(inode);
+	btrfs_inode_resume_unlocked_dio(BTRFS_I(inode));
 
 	ret = btrfs_truncate(inode, newsize == oldsize);
 	if (ret && inode->i_nlink) {
···
 }
 
 static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
-			      struct extent_state **cached_state, bool writing)
+			      struct extent_state **cached_state, int writing)
 {
 	struct btrfs_ordered_extent *ordered;
 	int ret = 0;
···
 }
 
 
+static int btrfs_get_blocks_direct_read(struct extent_map *em,
+					struct buffer_head *bh_result,
+					struct inode *inode,
+					u64 start, u64 len)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+
+	if (em->block_start == EXTENT_MAP_HOLE ||
+	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+		return -ENOENT;
+
+	len = min(len, em->len - (start - em->start));
+
+	bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
+		inode->i_blkbits;
+	bh_result->b_size = len;
+	bh_result->b_bdev = fs_info->fs_devices->latest_bdev;
+	set_buffer_mapped(bh_result);
+
+	return 0;
+}
+
 static int btrfs_get_blocks_direct_write(struct extent_map **map,
+					 struct buffer_head *bh_result,
 					 struct inode *inode,
 					 struct btrfs_dio_data *dio_data,
 					 u64 start, u64 len)
···
 	}
 
 	/* this will cow the extent */
+	len = bh_result->b_size;
 	free_extent_map(em);
 	*map = em = btrfs_new_extent_direct(inode, start, len);
 	if (IS_ERR(em)) {
···
 	len = min(len, em->len - (start - em->start));
 
 skip_cow:
+	bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
+		inode->i_blkbits;
+	bh_result->b_size = len;
+	bh_result->b_bdev = fs_info->fs_devices->latest_bdev;
+	set_buffer_mapped(bh_result);
+
+	if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+		set_buffer_new(bh_result);
+
 	/*
 	 * Need to update the i_size under the extent lock so buffered
 	 * readers will get the updated i_size when we unlock.
 	 */
-	if (start + len > i_size_read(inode))
+	if (!dio_data->overwrite && start + len > i_size_read(inode))
 		i_size_write(inode, start + len);
 
+	WARN_ON(dio_data->reserve < len);
 	dio_data->reserve -= len;
+	dio_data->unsubmitted_oe_range_end = start + len;
+	current->journal_info = dio_data;
 out:
 	return ret;
 }
 
-static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
-		loff_t length, unsigned flags, struct iomap *iomap,
-		struct iomap *srcmap)
+static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
+				   struct buffer_head *bh_result, int create)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct extent_map *em;
 	struct extent_state *cached_state = NULL;
 	struct btrfs_dio_data *dio_data = NULL;
+	u64 start = iblock << inode->i_blkbits;
 	u64 lockstart, lockend;
-	const bool write = !!(flags & IOMAP_WRITE);
+	u64 len = bh_result->b_size;
 	int ret = 0;
-	u64 len = length;
-	bool unlock_extents = false;
 
-	if (!write)
+	if (!create)
 		len = min_t(u64, len, fs_info->sectorsize);
 
 	lockstart = start;
 	lockend = start + len - 1;
 
-	/*
-	 * The generic stuff only does filemap_write_and_wait_range, which
-	 * isn't enough if we've written compressed pages to this area, so we
-	 * need to flush the dirty pages again to make absolutely sure that any
-	 * outstanding dirty pages are on disk.
-	 */
-	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
-		     &BTRFS_I(inode)->runtime_flags))
-		ret = filemap_fdatawrite_range(inode->i_mapping, start,
-					       start + length - 1);
-
-	dio_data = kzalloc(sizeof(*dio_data), GFP_NOFS);
-	if (!dio_data)
-		return -ENOMEM;
-
-	dio_data->length = length;
-	if (write) {
-		dio_data->reserve = round_up(length, fs_info->sectorsize);
-		ret = btrfs_delalloc_reserve_space(inode,
-				&dio_data->data_reserved,
-				start, dio_data->reserve);
-		if (ret) {
-			extent_changeset_free(dio_data->data_reserved);
-			kfree(dio_data);
-			return ret;
-		}
+	if (current->journal_info) {
+		/*
+		 * Need to pull our outstanding extents and set journal_info to NULL so
+		 * that anything that needs to check if there's a transaction doesn't get
+		 * confused.
+		 */
+		dio_data = current->journal_info;
+		current->journal_info = NULL;
 	}
-	iomap->private = dio_data;
-
 
 	/*
 	 * If this errors out it's because we couldn't invalidate pagecache for
 	 * this range and we need to fallback to buffered.
 	 */
-	if (lock_extent_direct(inode, lockstart, lockend, &cached_state, write)) {
+	if (lock_extent_direct(inode, lockstart, lockend, &cached_state,
+			       create)) {
 		ret = -ENOTBLK;
 		goto err;
 	}
···
 		goto unlock_err;
 	}
 
-	len = min(len, em->len - (start - em->start));
-	if (write) {
-		ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
-						    start, len);
+	if (create) {
+		ret = btrfs_get_blocks_direct_write(&em, bh_result, inode,
+						    dio_data, start, len);
 		if (ret < 0)
 			goto unlock_err;
-		unlock_extents = true;
-		/* Recalc len in case the new em is smaller than requested */
-		len = min(len, em->len - (start - em->start));
+
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
+				     lockend, &cached_state);
 	} else {
+		ret = btrfs_get_blocks_direct_read(em, bh_result, inode,
+						   start, len);
+		/* Can be negative only if we read from a hole */
+		if (ret < 0) {
+			ret = 0;
+			free_extent_map(em);
+			goto unlock_err;
+		}
 		/*
 		 * We need to unlock only the end area that we aren't using.
 		 * The rest is going to be unlocked by the endio routine.
 		 */
-		lockstart = start + len;
-		if (lockstart < lockend)
-			unlock_extents = true;
+		lockstart = start + bh_result->b_size;
+		if (lockstart < lockend) {
+			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+					     lockstart, lockend, &cached_state);
+		} else {
+			free_extent_state(cached_state);
+		}
 	}
-
-	if (unlock_extents)
-		unlock_extent_cached(&BTRFS_I(inode)->io_tree,
-				     lockstart, lockend, &cached_state);
-	else
-		free_extent_state(cached_state);
-
-	/*
-	 * Translate extent map information to iomap.
-	 * We trim the extents (and move the addr) even though iomap code does
-	 * that, since we have locked only the parts we are performing I/O in.
-	 */
-	if ((em->block_start == EXTENT_MAP_HOLE) ||
-	    (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) {
-		iomap->addr = IOMAP_NULL_ADDR;
-		iomap->type = IOMAP_HOLE;
-	} else {
-		iomap->addr = em->block_start + (start - em->start);
-		iomap->type = IOMAP_MAPPED;
-	}
-	iomap->offset = start;
-	iomap->bdev = fs_info->fs_devices->latest_bdev;
-	iomap->length = len;
 
 	free_extent_map(em);
···
 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
 			     &cached_state);
 err:
-	if (dio_data) {
-		btrfs_delalloc_release_space(inode, dio_data->data_reserved,
-				start, dio_data->reserve, true);
-		btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->reserve);
-		extent_changeset_free(dio_data->data_reserved);
-		kfree(dio_data);
-	}
-	return ret;
-}
-
-static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
-		ssize_t written, unsigned flags, struct iomap *iomap)
-{
-	int ret = 0;
-	struct btrfs_dio_data *dio_data = iomap->private;
-	size_t submitted = dio_data->submitted;
-	const bool write = !!(flags & IOMAP_WRITE);
-
-	if (!write && (iomap->type == IOMAP_HOLE)) {
-		/* If reading from a hole, unlock and return */
-		unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1);
-		goto out;
-	}
-
-	if (submitted < length) {
-		pos += submitted;
-		length -= submitted;
-		if (write)
-			__endio_write_update_ordered(inode, pos, length, false);
-		else
-			unlock_extent(&BTRFS_I(inode)->io_tree, pos,
-					pos + length - 1);
-		ret = -ENOTBLK;
-	}
-
-	if (write) {
-		if (dio_data->reserve)
-			btrfs_delalloc_release_space(inode,
-					dio_data->data_reserved, pos,
-					dio_data->reserve, true);
-		btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->length);
-		extent_changeset_free(dio_data->data_reserved);
-	}
-out:
-	kfree(dio_data);
-	iomap->private = NULL;
-
+	if (dio_data)
+		current->journal_info = dio_data;
 	return ret;
 }
···
 			  dip->logical_offset + dip->bytes - 1);
 	}
 
-	bio_endio(dip->dio_bio);
+	dio_end_io(dip->dio_bio);
 	kfree(dip);
 }
···
 	dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
 	dip->dio_bio = dio_bio;
 	refcount_set(&dip->refs, 1);
+
+	if (write) {
+		struct btrfs_dio_data *dio_data = current->journal_info;
+
+		/*
+		 * Setting range start and end to the same value means that
+		 * no cleanup will happen in btrfs_direct_IO
+		 */
+		dio_data->unsubmitted_oe_range_end = dip->logical_offset +
+			dip->bytes;
+		dio_data->unsubmitted_oe_range_start =
+			dio_data->unsubmitted_oe_range_end;
+	}
 	return dip;
 }
 
-static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
-		struct bio *dio_bio, loff_t file_offset)
+static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
+				loff_t file_offset)
 {
 	const bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
 	const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
···
 	int ret;
 	blk_status_t status;
 	struct btrfs_io_geometry geom;
-	struct btrfs_dio_data *dio_data = iomap->private;
 
 	dip = btrfs_create_dio_private(dio_bio, inode, file_offset);
 	if (!dip) {
···
 				file_offset + dio_bio->bi_iter.bi_size - 1);
 		}
 		dio_bio->bi_status = BLK_STS_RESOURCE;
-		bio_endio(dio_bio);
-		return BLK_QC_T_NONE;
+		dio_end_io(dio_bio);
+		return;
 	}
 
 	if (!write && csum) {
···
 			goto out_err;
 		}
 
-		dio_data->submitted += clone_len;
 		clone_offset += clone_len;
 		start_sector += clone_len >> 9;
 		file_offset += clone_len;
 	} while (submit_len > 0);
-	return BLK_QC_T_NONE;
+	return;
 
 out_err:
 	dip->dio_bio->bi_status = status;
 	btrfs_dio_private_put(dip);
-	return BLK_QC_T_NONE;
 }
 
-const struct iomap_ops btrfs_dio_iomap_ops = {
-	.iomap_begin = btrfs_dio_iomap_begin,
-	.iomap_end = btrfs_dio_iomap_end,
-};
+static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
+			       const struct iov_iter *iter, loff_t offset)
+{
+	int seg;
+	int i;
+	unsigned int blocksize_mask = fs_info->sectorsize - 1;
+	ssize_t retval = -EINVAL;
 
-const struct iomap_dio_ops btrfs_dops = {
-	.submit_io = btrfs_submit_direct,
-};
+	if (offset & blocksize_mask)
+		goto out;
+
+	if (iov_iter_alignment(iter) & blocksize_mask)
+		goto out;
+
+	/* If this is a write we don't need to check anymore */
+	if (iov_iter_rw(iter) != READ || !iter_is_iovec(iter))
+		return 0;
+	/*
+	 * Check to make sure we don't have duplicate iov_base's in this
+	 * iovec, if so return EINVAL, otherwise we'll get csum errors
+	 * when reading back.
+	 */
+	for (seg = 0; seg < iter->nr_segs; seg++) {
+		for (i = seg + 1; i < iter->nr_segs; i++) {
+			if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
+				goto out;
+		}
+	}
+	retval = 0;
+out:
+	return retval;
+}
+
+static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+	struct btrfs_dio_data dio_data = { 0 };
+	struct extent_changeset *data_reserved = NULL;
+	loff_t offset = iocb->ki_pos;
+	size_t count = 0;
+	int flags = 0;
+	bool wakeup = true;
+	bool relock = false;
+	ssize_t ret;
+
+	if (check_direct_IO(fs_info, iter, offset))
+		return 0;
+
+	inode_dio_begin(inode);
+
+	/*
+	 * The generic stuff only does filemap_write_and_wait_range, which
+	 * isn't enough if we've written compressed pages to this area, so
+	 * we need to flush the dirty pages again to make absolutely sure
+	 * that any outstanding dirty pages are on disk.
+	 */
+	count = iov_iter_count(iter);
+	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+		     &BTRFS_I(inode)->runtime_flags))
+		filemap_fdatawrite_range(inode->i_mapping, offset,
+					 offset + count - 1);
+
+	if (iov_iter_rw(iter) == WRITE) {
+		/*
+		 * If the write DIO is beyond the EOF, we need update
+		 * the isize, but it is protected by i_mutex. So we can
+		 * not unlock the i_mutex at this case.
+		 */
+		if (offset + count <= inode->i_size) {
+			dio_data.overwrite = 1;
+			inode_unlock(inode);
+			relock = true;
+		} else if (iocb->ki_flags & IOCB_NOWAIT) {
+			ret = -EAGAIN;
+			goto out;
+		}
+		ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
+						   offset, count);
+		if (ret)
+			goto out;
+
+		/*
+		 * We need to know how many extents we reserved so that we can
+		 * do the accounting properly if we go over the number we
+		 * originally calculated. Abuse current->journal_info for this.
+		 */
+		dio_data.reserve = round_up(count,
+					    fs_info->sectorsize);
+		dio_data.unsubmitted_oe_range_start = (u64)offset;
+		dio_data.unsubmitted_oe_range_end = (u64)offset;
+		current->journal_info = &dio_data;
+		down_read(&BTRFS_I(inode)->dio_sem);
+	} else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
+			    &BTRFS_I(inode)->runtime_flags)) {
+		inode_dio_end(inode);
+		flags = DIO_LOCKING | DIO_SKIP_HOLES;
+		wakeup = false;
+	}
+
+	ret = __blockdev_direct_IO(iocb, inode,
+				   fs_info->fs_devices->latest_bdev,
+				   iter, btrfs_get_blocks_direct, NULL,
+				   btrfs_submit_direct, flags);
+	if (iov_iter_rw(iter) == WRITE) {
+		up_read(&BTRFS_I(inode)->dio_sem);
+		current->journal_info = NULL;
+		if (ret < 0 && ret != -EIOCBQUEUED) {
+			if (dio_data.reserve)
+				btrfs_delalloc_release_space(inode, data_reserved,
+					offset, dio_data.reserve, true);
+			/*
+			 * On error we might have left some ordered extents
+			 * without submitting corresponding bios for them, so
+			 * cleanup them up to avoid other tasks getting them
+			 * and waiting for them to complete forever.
+			 */
+			if (dio_data.unsubmitted_oe_range_start <
+			    dio_data.unsubmitted_oe_range_end)
+				__endio_write_update_ordered(inode,
+					dio_data.unsubmitted_oe_range_start,
+					dio_data.unsubmitted_oe_range_end -
+					dio_data.unsubmitted_oe_range_start,
+					false);
+		} else if (ret >= 0 && (size_t)ret < count)
+			btrfs_delalloc_release_space(inode, data_reserved,
+					offset, count - (size_t)ret, true);
+		btrfs_delalloc_release_extents(BTRFS_I(inode), count);
+	}
+out:
+	if (wakeup)
+		inode_dio_end(inode);
+	if (relock)
+		inode_lock(inode);
+
+	extent_changeset_free(data_reserved);
+	return ret;
+}
 
 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		__u64 start, __u64 len)
···
 	.writepage	= btrfs_writepage,
 	.writepages	= btrfs_writepages,
 	.readahead	= btrfs_readahead,
-	.direct_IO	= noop_direct_IO,
+	.direct_IO	= btrfs_direct_IO,
 	.invalidatepage = btrfs_invalidatepage,
 	.releasepage	= btrfs_releasepage,
 #ifdef CONFIG_MIGRATION
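A design note on the restored write path above: __blockdev_direct_IO() offers no private-data pointer for its get_block_t callback, which is why btrfs_direct_IO() smuggles its on-stack btrfs_dio_data through current->journal_info (the code comment itself calls this an abuse). A hedged sketch of the same pattern in plain C using a thread-local; all names here are illustrative, not kernel API:

/* The current->journal_info trick: pass per-call state to a callback
 * whose signature has no private-data argument. Illustrative names. */
#include <stdio.h>

struct dio_state {
	int reserve;	/* ~ btrfs_dio_data.reserve */
};

/* Stand-in for current->journal_info. */
static _Thread_local struct dio_state *journal_info;

/* Fixed callback signature, like get_block_t: no void *private. */
typedef int (*get_block_fn)(long block);

static int my_get_block(long block)
{
	struct dio_state *s = journal_info;	/* recover smuggled state */

	journal_info = NULL;	/* as btrfs_get_blocks_direct() does */
	if (s) {
		s->reserve -= 1;	/* account against the reservation */
		journal_info = s;	/* put it back for the next call */
	}
	return 0;
}

/* Stand-in for __blockdev_direct_IO: only knows the fixed signature. */
static void direct_io_engine(get_block_fn get_block, int nblocks)
{
	for (long b = 0; b < nblocks; b++)
		get_block(b);
}

int main(void)
{
	struct dio_state st = { .reserve = 8 };

	journal_info = &st;	/* current->journal_info = &dio_data; */
	direct_io_engine(my_get_block, 3);
	journal_info = NULL;	/* current->journal_info = NULL; */
	printf("reserve left: %d\n", st.reserve);
	return 0;
}

The trick only works because the callback runs on the same task that set the pointer, which is also why btrfs_get_blocks_direct() clears journal_info on entry so that transaction checks elsewhere are not confused.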
+19
fs/direct-io.c
···
 	spin_unlock_irqrestore(&dio->bio_lock, flags);
 }
 
+/**
+ * dio_end_io - handle the end io action for the given bio
+ * @bio: The direct io bio thats being completed
+ *
+ * This is meant to be called by any filesystem that uses their own dio_submit_t
+ * so that the DIO specific endio actions are dealt with after the filesystem
+ * has done it's completion work.
+ */
+void dio_end_io(struct bio *bio)
+{
+	struct dio *dio = bio->bi_private;
+
+	if (dio->is_async)
+		dio_bio_end_aio(bio);
+	else
+		dio_bio_end_io(bio);
+}
+EXPORT_SYMBOL_GPL(dio_end_io);
+
 static inline void
 dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
 	      struct block_device *bdev,
+2
include/linux/fs.h
···
 	DIO_SKIP_HOLES = 0x02,
 };
 
+void dio_end_io(struct bio *bio);
+
 ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 			     struct block_device *bdev, struct iov_iter *iter,
 			     get_block_t get_block,