Merge patch series "iomap: zero range folio batch support"

+85 -33

fs/iomap/buffered-io.c

··· 772 772 if (!mapping_large_folio_support(iter->inode->i_mapping)) 773 773 len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos)); 774 774 775 + if (iter->fbatch) { 776 + struct folio *folio = folio_batch_next(iter->fbatch); 777 + 778 + if (!folio) 779 + return NULL; 780 + 781 + /* 782 + * The folio mapping generally shouldn't have changed based on 783 + * fs locks, but be consistent with filemap lookup and retry 784 + * the iter if it does. 785 + */ 786 + folio_lock(folio); 787 + if (unlikely(folio->mapping != iter->inode->i_mapping)) { 788 + iter->iomap.flags |= IOMAP_F_STALE; 789 + folio_unlock(folio); 790 + return NULL; 791 + } 792 + 793 + folio_get(folio); 794 + return folio; 795 + } 796 + 775 797 if (write_ops && write_ops->get_folio) 776 798 return write_ops->get_folio(iter, pos, len); 777 799 return iomap_get_folio(iter, pos, len); ··· 848 826 size_t *poffset, u64 *plen) 849 827 { 850 828 const struct iomap *srcmap = iomap_iter_srcmap(iter); 851 - loff_t pos = iter->pos; 829 + loff_t pos; 852 830 u64 len = min_t(u64, SIZE_MAX, iomap_length(iter)); 853 831 struct folio *folio; 854 832 int status = 0; 855 833 856 834 len = min_not_zero(len, *plen); 857 - BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length); 858 - if (srcmap != &iter->iomap) 859 - BUG_ON(pos + len > srcmap->offset + srcmap->length); 835 + *foliop = NULL; 836 + *plen = 0; 860 837 861 838 if (fatal_signal_pending(current)) 862 839 return -EINTR; ··· 863 842 folio = __iomap_get_folio(iter, write_ops, len); 864 843 if (IS_ERR(folio)) 865 844 return PTR_ERR(folio); 845 + 846 + /* 847 + * No folio means we're done with a batch. We still have range to 848 + * process so return and let the caller iterate and refill the batch. 
849 + */ 850 + if (!folio) { 851 + WARN_ON_ONCE(!iter->fbatch); 852 + return 0; 853 + } 866 854 867 855 /* 868 856 * Now we have a locked folio, before we do anything with it we need to ··· 891 861 status = 0; 892 862 goto out_unlock; 893 863 } 864 + } 865 + 866 + /* 867 + * The folios in a batch may not be contiguous. If we've skipped 868 + * forward, advance the iter to the pos of the current folio. If the 869 + * folio starts beyond the end of the mapping, it may have been trimmed 870 + * since the lookup for whatever reason. Return a NULL folio to 871 + * terminate the op. 872 + */ 873 + if (folio_pos(folio) > iter->pos) { 874 + len = min_t(u64, folio_pos(folio) - iter->pos, 875 + iomap_length(iter)); 876 + status = iomap_iter_advance(iter, len); 877 + len = iomap_length(iter); 878 + if (status || !len) 879 + goto out_unlock; 894 880 } 895 881 896 882 pos = iomap_trim_folio_range(iter, folio, poffset, &len); ··· 1455 1409 if (iter->iomap.flags & IOMAP_F_STALE) 1456 1410 break; 1457 1411 1412 + /* a NULL folio means we're done with a folio batch */ 1413 + if (!folio) { 1414 + status = iomap_iter_advance_full(iter); 1415 + break; 1416 + } 1417 + 1458 1418 /* warn about zeroing folios beyond eof that won't write back */ 1459 1419 WARN_ON_ONCE(folio_pos(folio) > iter->inode->i_size); 1460 1420 ··· 1485 1433 return status; 1486 1434 } 1487 1435 1436 + loff_t 1437 + iomap_fill_dirty_folios( 1438 + struct iomap_iter *iter, 1439 + loff_t offset, 1440 + loff_t length) 1441 + { 1442 + struct address_space *mapping = iter->inode->i_mapping; 1443 + pgoff_t start = offset >> PAGE_SHIFT; 1444 + pgoff_t end = (offset + length - 1) >> PAGE_SHIFT; 1445 + 1446 + iter->fbatch = kmalloc(sizeof(struct folio_batch), GFP_KERNEL); 1447 + if (!iter->fbatch) 1448 + return offset + length; 1449 + folio_batch_init(iter->fbatch); 1450 + 1451 + filemap_get_folios_dirty(mapping, &start, end, iter->fbatch); 1452 + return (start << PAGE_SHIFT); 1453 + } 1454 + 
EXPORT_SYMBOL_GPL(iomap_fill_dirty_folios); 1455 + 1488 1456 int 1489 1457 iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, 1490 1458 const struct iomap_ops *ops, ··· 1518 1446 .private = private, 1519 1447 }; 1520 1448 struct address_space *mapping = inode->i_mapping; 1521 - unsigned int blocksize = i_blocksize(inode); 1522 - unsigned int off = pos & (blocksize - 1); 1523 - loff_t plen = min_t(loff_t, len, blocksize - off); 1524 1449 int ret; 1525 1450 bool range_dirty; 1526 - 1527 - /* 1528 - * Zero range can skip mappings that are zero on disk so long as 1529 - * pagecache is clean. If pagecache was dirty prior to zero range, the 1530 - * mapping converts on writeback completion and so must be zeroed. 1531 - * 1532 - * The simplest way to deal with this across a range is to flush 1533 - * pagecache and process the updated mappings. To avoid excessive 1534 - * flushing on partial eof zeroing, special case it to zero the 1535 - * unaligned start portion if already dirty in pagecache. 1536 - */ 1537 - if (off && 1538 - filemap_range_needs_writeback(mapping, pos, pos + plen - 1)) { 1539 - iter.len = plen; 1540 - while ((ret = iomap_iter(&iter, ops)) > 0) 1541 - iter.status = iomap_zero_iter(&iter, did_zero, 1542 - write_ops); 1543 - 1544 - iter.len = len - (iter.pos - pos); 1545 - if (ret || !iter.len) 1546 - return ret; 1547 - } 1548 1451 1549 1452 /* 1550 1453 * To avoid an unconditional flush, check pagecache state and only flush 1551 1454 * if dirty and the fs returns a mapping that might convert on 1552 1455 * writeback. 
1553 1456 */ 1554 - range_dirty = filemap_range_needs_writeback(inode->i_mapping, 1555 - iter.pos, iter.pos + iter.len - 1); 1457 + range_dirty = filemap_range_needs_writeback(mapping, iter.pos, 1458 + iter.pos + iter.len - 1); 1556 1459 while ((ret = iomap_iter(&iter, ops)) > 0) { 1557 1460 const struct iomap *srcmap = iomap_iter_srcmap(&iter); 1558 1461 1559 - if (srcmap->type == IOMAP_HOLE || 1560 - srcmap->type == IOMAP_UNWRITTEN) { 1462 + if (WARN_ON_ONCE(iter.fbatch && 1463 + srcmap->type != IOMAP_UNWRITTEN)) 1464 + return -EIO; 1465 + 1466 + if (!iter.fbatch && 1467 + (srcmap->type == IOMAP_HOLE || 1468 + srcmap->type == IOMAP_UNWRITTEN)) { 1561 1469 s64 status; 1562 1470 1563 1471 if (range_dirty) {

+6

fs/iomap/iter.c

··· 8 8 9 9 static inline void iomap_iter_reset_iomap(struct iomap_iter *iter) 10 10 { 11 + if (iter->fbatch) { 12 + folio_batch_release(iter->fbatch); 13 + kfree(iter->fbatch); 14 + iter->fbatch = NULL; 15 + } 16 + 11 17 iter->status = 0; 12 18 memset(&iter->iomap, 0, sizeof(iter->iomap)); 13 19 memset(&iter->srcmap, 0, sizeof(iter->srcmap));

+4 -2

fs/xfs/libxfs/xfs_errortag.h

··· 73 73 #define XFS_ERRTAG_WRITE_DELAY_MS 43 74 74 #define XFS_ERRTAG_EXCHMAPS_FINISH_ONE 44 75 75 #define XFS_ERRTAG_METAFILE_RESV_CRITICAL 45 76 - #define XFS_ERRTAG_MAX 46 76 + #define XFS_ERRTAG_FORCE_ZERO_RANGE 46 77 + #define XFS_ERRTAG_MAX 47 77 78 78 79 /* 79 80 * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. ··· 134 133 XFS_ERRTAG(WB_DELAY_MS, wb_delay_ms, 3000) \ 135 134 XFS_ERRTAG(WRITE_DELAY_MS, write_delay_ms, 3000) \ 136 135 XFS_ERRTAG(EXCHMAPS_FINISH_ONE, exchmaps_finish_one, 1) \ 137 - XFS_ERRTAG(METAFILE_RESV_CRITICAL, metafile_resv_crit, 4) 136 + XFS_ERRTAG(METAFILE_RESV_CRITICAL, metafile_resv_crit, 4) \ 137 + XFS_ERRTAG(FORCE_ZERO_RANGE, force_zero_range, 4) 138 138 #endif /* XFS_ERRTAG */ 139 139 140 140 #endif /* __XFS_ERRORTAG_H_ */

+22 -7

fs/xfs/xfs_file.c

··· 27 27 #include "xfs_file.h" 28 28 #include "xfs_aops.h" 29 29 #include "xfs_zone_alloc.h" 30 + #include "xfs_error.h" 31 + #include "xfs_errortag.h" 30 32 31 33 #include <linux/dax.h> 32 34 #include <linux/falloc.h> ··· 1256 1254 struct xfs_zone_alloc_ctx *ac) 1257 1255 { 1258 1256 struct inode *inode = file_inode(file); 1257 + struct xfs_inode *ip = XFS_I(inode); 1259 1258 unsigned int blksize = i_blocksize(inode); 1260 1259 loff_t new_size = 0; 1261 1260 int error; 1262 1261 1263 - trace_xfs_zero_file_space(XFS_I(inode)); 1262 + trace_xfs_zero_file_space(ip); 1264 1263 1265 1264 error = xfs_falloc_newsize(file, mode, offset, len, &new_size); 1266 1265 if (error) 1267 1266 return error; 1268 1267 1269 - error = xfs_free_file_space(XFS_I(inode), offset, len, ac); 1270 - if (error) 1271 - return error; 1268 + /* 1269 + * Zero range implements a full zeroing mechanism but is only used in 1270 + * limited situations. It is more efficient to allocate unwritten 1271 + * extents than to perform zeroing here, so use an errortag to randomly 1272 + * force zeroing on DEBUG kernels for added test coverage. 1273 + */ 1274 + if (XFS_TEST_ERROR(ip->i_mount, 1275 + XFS_ERRTAG_FORCE_ZERO_RANGE)) { 1276 + error = xfs_zero_range(ip, offset, len, ac, NULL); 1277 + } else { 1278 + error = xfs_free_file_space(ip, offset, len, ac); 1279 + if (error) 1280 + return error; 1272 1281 1273 - len = round_up(offset + len, blksize) - round_down(offset, blksize); 1274 - offset = round_down(offset, blksize); 1275 - error = xfs_alloc_file_space(XFS_I(inode), offset, len); 1282 + len = round_up(offset + len, blksize) - 1283 + round_down(offset, blksize); 1284 + offset = round_down(offset, blksize); 1285 + error = xfs_alloc_file_space(ip, offset, len); 1286 + } 1276 1287 if (error) 1277 1288 return error; 1278 1289 return xfs_falloc_setsize(file, new_size);

+30 -8

fs/xfs/xfs_iomap.c

··· 1702 1702 struct iomap *iomap, 1703 1703 struct iomap *srcmap) 1704 1704 { 1705 + struct iomap_iter *iter = container_of(iomap, struct iomap_iter, 1706 + iomap); 1705 1707 struct xfs_inode *ip = XFS_I(inode); 1706 1708 struct xfs_mount *mp = ip->i_mount; 1707 1709 xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); ··· 1769 1767 } 1770 1768 1771 1769 /* 1772 - * For zeroing, trim a delalloc extent that extends beyond the EOF 1773 - * block. If it starts beyond the EOF block, convert it to an 1770 + * For zeroing, trim extents that extend beyond the EOF block. If a 1771 + * delalloc extent starts beyond the EOF block, convert it to an 1774 1772 * unwritten extent. 1775 1773 */ 1776 - if ((flags & IOMAP_ZERO) && imap.br_startoff <= offset_fsb && 1777 - isnullstartblock(imap.br_startblock)) { 1774 + if (flags & IOMAP_ZERO) { 1778 1775 xfs_fileoff_t eof_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); 1776 + u64 end; 1779 1777 1780 - if (offset_fsb >= eof_fsb) 1778 + if (isnullstartblock(imap.br_startblock) && 1779 + offset_fsb >= eof_fsb) 1781 1780 goto convert_delay; 1782 - if (end_fsb > eof_fsb) { 1781 + if (offset_fsb < eof_fsb && end_fsb > eof_fsb) 1783 1782 end_fsb = eof_fsb; 1784 - xfs_trim_extent(&imap, offset_fsb, 1785 - end_fsb - offset_fsb); 1783 + 1784 + /* 1785 + * Look up dirty folios for unwritten mappings within EOF. 1786 + * Providing this bypasses the flush iomap uses to trigger 1787 + * extent conversion when unwritten mappings have dirty 1788 + * pagecache in need of zeroing. 1789 + * 1790 + * Trim the mapping to the end pos of the lookup, which in turn 1791 + * was trimmed to the end of the batch if it became full before 1792 + * the end of the mapping. 
1793 + */ 1794 + if (imap.br_state == XFS_EXT_UNWRITTEN && 1795 + offset_fsb < eof_fsb) { 1796 + loff_t len = min(count, 1797 + XFS_FSB_TO_B(mp, imap.br_blockcount)); 1798 + 1799 + end = iomap_fill_dirty_folios(iter, offset, len); 1800 + end_fsb = min_t(xfs_fileoff_t, end_fsb, 1801 + XFS_B_TO_FSB(mp, end)); 1786 1802 } 1803 + 1804 + xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb); 1787 1805 } 1788 1806 1789 1807 /*

+4

include/linux/iomap.h

··· 9 9 #include <linux/types.h> 10 10 #include <linux/mm_types.h> 11 11 #include <linux/blkdev.h> 12 + #include <linux/pagevec.h> 12 13 13 14 struct address_space; 14 15 struct fiemap_extent_info; ··· 243 242 unsigned flags; 244 243 struct iomap iomap; 245 244 struct iomap srcmap; 245 + struct folio_batch *fbatch; 246 246 void *private; 247 247 }; 248 248 ··· 352 350 int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, 353 351 const struct iomap_ops *ops, 354 352 const struct iomap_write_ops *write_ops); 353 + loff_t iomap_fill_dirty_folios(struct iomap_iter *iter, loff_t offset, 354 + loff_t length); 355 355 int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, 356 356 bool *did_zero, const struct iomap_ops *ops, 357 357 const struct iomap_write_ops *write_ops, void *private);

+2

include/linux/pagemap.h

··· 977 977 pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); 978 978 unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start, 979 979 pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch); 980 + unsigned filemap_get_folios_dirty(struct address_space *mapping, 981 + pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); 980 982 981 983 struct folio *read_cache_folio(struct address_space *, pgoff_t index, 982 984 filler_t *filler, struct file *file);

+58

mm/filemap.c

··· 2366 2366 } 2367 2367 EXPORT_SYMBOL(filemap_get_folios_tag); 2368 2368 2369 + /** 2370 + * filemap_get_folios_dirty - Get a batch of dirty folios 2371 + * @mapping: The address_space to search 2372 + * @start: The starting folio index 2373 + * @end: The final folio index (inclusive) 2374 + * @fbatch: The batch to fill 2375 + * 2376 + * filemap_get_folios_dirty() works exactly like filemap_get_folios(), except 2377 + * the returned folios are presumed to be dirty or undergoing writeback. Dirty 2378 + * state is presumed because we don't block on folio lock nor want to miss 2379 + * folios. Callers that need to can recheck state upon locking the folio. 2380 + * 2381 + * This may not return all dirty folios if the batch gets filled up. 2382 + * 2383 + * Return: The number of folios found. 2384 + * Also update @start to be positioned for traversal of the next folio. 2385 + */ 2386 + unsigned filemap_get_folios_dirty(struct address_space *mapping, pgoff_t *start, 2387 + pgoff_t end, struct folio_batch *fbatch) 2388 + { 2389 + XA_STATE(xas, &mapping->i_pages, *start); 2390 + struct folio *folio; 2391 + 2392 + rcu_read_lock(); 2393 + while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) { 2394 + if (xa_is_value(folio)) 2395 + continue; 2396 + if (folio_trylock(folio)) { 2397 + bool clean = !folio_test_dirty(folio) && 2398 + !folio_test_writeback(folio); 2399 + folio_unlock(folio); 2400 + if (clean) { 2401 + folio_put(folio); 2402 + continue; 2403 + } 2404 + } 2405 + if (!folio_batch_add(fbatch, folio)) { 2406 + unsigned long nr = folio_nr_pages(folio); 2407 + *start = folio->index + nr; 2408 + goto out; 2409 + } 2410 + } 2411 + /* 2412 + * We come here when there is no folio beyond @end. We take care to not 2413 + * overflow the index @start as it confuses some of the callers. This 2414 + * breaks the iteration when there is a folio at index -1 but that is 2415 + * already broke anyway. 
2416 + */ 2417 + if (end == (pgoff_t)-1) 2418 + *start = (pgoff_t)-1; 2419 + else 2420 + *start = end + 1; 2421 + out: 2422 + rcu_read_unlock(); 2423 + 2424 + return folio_batch_count(fbatch); 2425 + } 2426 + 2369 2427 /* 2370 2428 * CD/DVDs are error prone. When a medium error occurs, the driver may fail 2371 2429 * a _large_ part of the i/o request. Imagine the worst scenario:

Configure Feed