Merge tag 'xfs-merge-7.1' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

+4

Documentation/admin-guide/xfs.rst

··· 550 550 is limited by the capabilities of the backing zoned device, file system 551 551 size and the max_open_zones mount option. 552 552 553 + nr_open_zones (Min: 0 Default: Varies Max: UINTMAX) 554 + This read-only attribute exposes the current number of open zones 555 + used by the file system. 556 + 553 557 zonegc_low_space (Min: 0 Default: 0 Max: 100) 554 558 Define a percentage for how much of the unused space that GC should keep 555 559 available for writing. A high value will reclaim more of the space

+1 -5

fs/iomap/buffered-io.c

··· 1647 1647 while ((ret = iomap_iter(&iter, ops)) > 0) { 1648 1648 const struct iomap *srcmap = iomap_iter_srcmap(&iter); 1649 1649 1650 - if (WARN_ON_ONCE((iter.iomap.flags & IOMAP_F_FOLIO_BATCH) && 1651 - srcmap->type != IOMAP_UNWRITTEN)) 1652 - return -EIO; 1653 - 1654 1650 if (!(iter.iomap.flags & IOMAP_F_FOLIO_BATCH) && 1655 1651 (srcmap->type == IOMAP_HOLE || 1656 1652 srcmap->type == IOMAP_UNWRITTEN)) { 1657 1653 s64 status; 1658 1654 1659 - if (range_dirty) { 1655 + if (range_dirty && srcmap->type == IOMAP_UNWRITTEN) { 1660 1656 range_dirty = false; 1661 1657 status = iomap_zero_iter_flush_and_stale(&iter); 1662 1658 } else {

+2 -11

fs/xfs/libxfs/xfs_ag.c

··· 110 110 struct xfs_group *xg) 111 111 { 112 112 #ifdef __KERNEL__ 113 - struct xfs_perag *pag = to_perag(xg); 114 - 115 - cancel_delayed_work_sync(&pag->pag_blockgc_work); 116 - xfs_buf_cache_destroy(&pag->pag_bcache); 113 + cancel_delayed_work_sync(&to_perag(xg)->pag_blockgc_work); 117 114 #endif 118 115 } 119 116 ··· 232 235 INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); 233 236 #endif /* __KERNEL__ */ 234 237 235 - error = xfs_buf_cache_init(&pag->pag_bcache); 236 - if (error) 237 - goto out_free_perag; 238 - 239 238 /* 240 239 * Pre-calculated geometry 241 240 */ ··· 243 250 244 251 error = xfs_group_insert(mp, pag_group(pag), index, XG_TYPE_AG); 245 252 if (error) 246 - goto out_buf_cache_destroy; 253 + goto out_free_perag; 247 254 248 255 return 0; 249 256 250 - out_buf_cache_destroy: 251 - xfs_buf_cache_destroy(&pag->pag_bcache); 252 257 out_free_perag: 253 258 kfree(pag); 254 259 return error;

-2

fs/xfs/libxfs/xfs_ag.h

··· 85 85 int pag_ici_reclaimable; /* reclaimable inodes */ 86 86 unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */ 87 87 88 - struct xfs_buf_cache pag_bcache; 89 - 90 88 /* background prealloc block trimming */ 91 89 struct delayed_work pag_blockgc_work; 92 90 #endif /* __KERNEL__ */

+4 -1

fs/xfs/libxfs/xfs_fs.h

··· 995 995 __u32 rg_sick; /* o: sick things in ag */ 996 996 __u32 rg_checked; /* o: checked metadata in ag */ 997 997 __u32 rg_flags; /* i/o: flags for this ag */ 998 - __u32 rg_reserved[27]; /* o: zero */ 998 + __u32 rg_writepointer; /* o: write pointer block offset for zoned */ 999 + __u32 rg_reserved[26]; /* o: zero */ 999 1000 }; 1000 1001 #define XFS_RTGROUP_GEOM_SICK_SUPER (1U << 0) /* superblock */ 1001 1002 #define XFS_RTGROUP_GEOM_SICK_BITMAP (1U << 1) /* rtbitmap */ 1002 1003 #define XFS_RTGROUP_GEOM_SICK_SUMMARY (1U << 2) /* rtsummary */ 1003 1004 #define XFS_RTGROUP_GEOM_SICK_RMAPBT (1U << 3) /* reverse mappings */ 1004 1005 #define XFS_RTGROUP_GEOM_SICK_REFCNTBT (1U << 4) /* reference counts */ 1006 + 1007 + #define XFS_RTGROUP_GEOM_WRITEPOINTER (1U << 0) /* write pointer */ 1005 1008 1006 1009 /* Health monitor event domains */ 1007 1010

+82 -156

fs/xfs/xfs_buf.c

··· 31 31 * 32 32 * xfs_buf_stale: 33 33 * b_sema (caller holds) 34 - * b_lock 34 + * b_lockref.lock 35 35 * lru_lock 36 36 * 37 37 * xfs_buf_rele: 38 - * b_lock 38 + * b_lockref.lock 39 39 * lru_lock 40 40 * 41 41 * xfs_buftarg_drain_rele 42 42 * lru_lock 43 - * b_lock (trylock due to inversion) 43 + * b_lockref.lock (trylock due to inversion) 44 44 * 45 45 * xfs_buftarg_isolate 46 46 * lru_lock 47 - * b_lock (trylock due to inversion) 47 + * b_lockref.lock (trylock due to inversion) 48 48 */ 49 49 50 50 static void xfs_buf_submit(struct xfs_buf *bp); ··· 78 78 */ 79 79 bp->b_flags &= ~_XBF_DELWRI_Q; 80 80 81 - spin_lock(&bp->b_lock); 81 + spin_lock(&bp->b_lockref.lock); 82 82 atomic_set(&bp->b_lru_ref, 0); 83 - if (!(bp->b_state & XFS_BSTATE_DISPOSE) && 84 - (list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru))) 85 - bp->b_hold--; 86 - 87 - ASSERT(bp->b_hold >= 1); 88 - spin_unlock(&bp->b_lock); 83 + if (!__lockref_is_dead(&bp->b_lockref)) 84 + list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru); 85 + spin_unlock(&bp->b_lockref.lock); 89 86 } 90 87 91 88 static void ··· 274 277 * inserting into the hash table are safe (and will have to wait for 275 278 * the unlock to do anything non-trivial). 
276 279 */ 277 - bp->b_hold = 1; 280 + lockref_init(&bp->b_lockref); 278 281 sema_init(&bp->b_sema, 0); /* held, no waiters */ 279 - 280 - spin_lock_init(&bp->b_lock); 281 282 atomic_set(&bp->b_lru_ref, 1); 282 283 init_completion(&bp->b_iowait); 283 284 INIT_LIST_HEAD(&bp->b_lru); ··· 363 368 .obj_cmpfn = _xfs_buf_obj_cmp, 364 369 }; 365 370 366 - int 367 - xfs_buf_cache_init( 368 - struct xfs_buf_cache *bch) 369 - { 370 - return rhashtable_init(&bch->bc_hash, &xfs_buf_hash_params); 371 - } 372 - 373 - void 374 - xfs_buf_cache_destroy( 375 - struct xfs_buf_cache *bch) 376 - { 377 - rhashtable_destroy(&bch->bc_hash); 378 - } 379 - 380 371 static int 381 372 xfs_buf_map_verify( 382 373 struct xfs_buftarg *btp, ··· 418 437 return 0; 419 438 } 420 439 421 - static bool 422 - xfs_buf_try_hold( 423 - struct xfs_buf *bp) 424 - { 425 - spin_lock(&bp->b_lock); 426 - if (bp->b_hold == 0) { 427 - spin_unlock(&bp->b_lock); 428 - return false; 429 - } 430 - bp->b_hold++; 431 - spin_unlock(&bp->b_lock); 432 - return true; 433 - } 434 - 435 440 static inline int 436 441 xfs_buf_lookup( 437 - struct xfs_buf_cache *bch, 442 + struct xfs_buftarg *btp, 438 443 struct xfs_buf_map *map, 439 444 xfs_buf_flags_t flags, 440 445 struct xfs_buf **bpp) ··· 429 462 int error; 430 463 431 464 rcu_read_lock(); 432 - bp = rhashtable_lookup(&bch->bc_hash, map, xfs_buf_hash_params); 433 - if (!bp || !xfs_buf_try_hold(bp)) { 465 + bp = rhashtable_lookup(&btp->bt_hash, map, xfs_buf_hash_params); 466 + if (!bp || !lockref_get_not_dead(&bp->b_lockref)) { 434 467 rcu_read_unlock(); 435 468 return -ENOENT; 436 469 } ··· 454 487 static int 455 488 xfs_buf_find_insert( 456 489 struct xfs_buftarg *btp, 457 - struct xfs_buf_cache *bch, 458 490 struct xfs_perag *pag, 459 491 struct xfs_buf_map *cmap, 460 492 struct xfs_buf_map *map, ··· 473 507 new_bp->b_pag = pag; 474 508 475 509 rcu_read_lock(); 476 - bp = rhashtable_lookup_get_insert_fast(&bch->bc_hash, 510 + bp = 
rhashtable_lookup_get_insert_fast(&btp->bt_hash, 477 511 &new_bp->b_rhash_head, xfs_buf_hash_params); 478 512 if (IS_ERR(bp)) { 479 513 rcu_read_unlock(); 480 514 error = PTR_ERR(bp); 481 515 goto out_free_buf; 482 516 } 483 - if (bp && xfs_buf_try_hold(bp)) { 517 + if (bp && lockref_get_not_dead(&bp->b_lockref)) { 484 518 /* found an existing buffer */ 485 519 rcu_read_unlock(); 486 520 error = xfs_buf_find_lock(bp, flags); ··· 515 549 return xfs_perag_get(mp, xfs_daddr_to_agno(mp, map->bm_bn)); 516 550 } 517 551 518 - static inline struct xfs_buf_cache * 519 - xfs_buftarg_buf_cache( 520 - struct xfs_buftarg *btp, 521 - struct xfs_perag *pag) 522 - { 523 - if (pag) 524 - return &pag->pag_bcache; 525 - return btp->bt_cache; 526 - } 527 - 528 552 /* 529 553 * Assembles a buffer covering the specified range. The code is optimised for 530 554 * cache hits, as metadata intensive workloads will see 3 orders of magnitude ··· 528 572 xfs_buf_flags_t flags, 529 573 struct xfs_buf **bpp) 530 574 { 531 - struct xfs_buf_cache *bch; 532 575 struct xfs_perag *pag; 533 576 struct xfs_buf *bp = NULL; 534 577 struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn }; ··· 544 589 return error; 545 590 546 591 pag = xfs_buftarg_get_pag(btp, &cmap); 547 - bch = xfs_buftarg_buf_cache(btp, pag); 548 592 549 - error = xfs_buf_lookup(bch, &cmap, flags, &bp); 593 + error = xfs_buf_lookup(btp, &cmap, flags, &bp); 550 594 if (error && error != -ENOENT) 551 595 goto out_put_perag; 552 596 ··· 557 603 goto out_put_perag; 558 604 559 605 /* xfs_buf_find_insert() consumes the perag reference. 
*/ 560 - error = xfs_buf_find_insert(btp, bch, pag, &cmap, map, nmaps, 606 + error = xfs_buf_find_insert(btp, pag, &cmap, map, nmaps, 561 607 flags, &bp); 562 608 if (error) 563 609 return error; ··· 810 856 { 811 857 trace_xfs_buf_hold(bp, _RET_IP_); 812 858 813 - spin_lock(&bp->b_lock); 814 - bp->b_hold++; 815 - spin_unlock(&bp->b_lock); 859 + lockref_get(&bp->b_lockref); 816 860 } 817 861 818 862 static void 819 - xfs_buf_rele_uncached( 863 + xfs_buf_destroy( 820 864 struct xfs_buf *bp) 821 865 { 822 - ASSERT(list_empty(&bp->b_lru)); 866 + ASSERT(__lockref_is_dead(&bp->b_lockref)); 867 + ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); 823 868 824 - spin_lock(&bp->b_lock); 825 - if (--bp->b_hold) { 826 - spin_unlock(&bp->b_lock); 827 - return; 869 + if (!xfs_buf_is_uncached(bp)) { 870 + rhashtable_remove_fast(&bp->b_target->bt_hash, 871 + &bp->b_rhash_head, xfs_buf_hash_params); 872 + 873 + if (bp->b_pag) 874 + xfs_perag_put(bp->b_pag); 828 875 } 829 - spin_unlock(&bp->b_lock); 876 + 830 877 xfs_buf_free(bp); 831 - } 832 - 833 - static void 834 - xfs_buf_rele_cached( 835 - struct xfs_buf *bp) 836 - { 837 - struct xfs_buftarg *btp = bp->b_target; 838 - struct xfs_perag *pag = bp->b_pag; 839 - struct xfs_buf_cache *bch = xfs_buftarg_buf_cache(btp, pag); 840 - bool freebuf = false; 841 - 842 - trace_xfs_buf_rele(bp, _RET_IP_); 843 - 844 - spin_lock(&bp->b_lock); 845 - ASSERT(bp->b_hold >= 1); 846 - if (bp->b_hold > 1) { 847 - bp->b_hold--; 848 - goto out_unlock; 849 - } 850 - 851 - /* we are asked to drop the last reference */ 852 - if (atomic_read(&bp->b_lru_ref)) { 853 - /* 854 - * If the buffer is added to the LRU, keep the reference to the 855 - * buffer for the LRU and clear the (now stale) dispose list 856 - * state flag, else drop the reference. 
857 - */ 858 - if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru)) 859 - bp->b_state &= ~XFS_BSTATE_DISPOSE; 860 - else 861 - bp->b_hold--; 862 - } else { 863 - bp->b_hold--; 864 - /* 865 - * most of the time buffers will already be removed from the 866 - * LRU, so optimise that case by checking for the 867 - * XFS_BSTATE_DISPOSE flag indicating the last list the buffer 868 - * was on was the disposal list 869 - */ 870 - if (!(bp->b_state & XFS_BSTATE_DISPOSE)) { 871 - list_lru_del_obj(&btp->bt_lru, &bp->b_lru); 872 - } else { 873 - ASSERT(list_empty(&bp->b_lru)); 874 - } 875 - 876 - ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); 877 - rhashtable_remove_fast(&bch->bc_hash, &bp->b_rhash_head, 878 - xfs_buf_hash_params); 879 - if (pag) 880 - xfs_perag_put(pag); 881 - freebuf = true; 882 - } 883 - 884 - out_unlock: 885 - spin_unlock(&bp->b_lock); 886 - 887 - if (freebuf) 888 - xfs_buf_free(bp); 889 878 } 890 879 891 880 /* ··· 839 942 struct xfs_buf *bp) 840 943 { 841 944 trace_xfs_buf_rele(bp, _RET_IP_); 842 - if (xfs_buf_is_uncached(bp)) 843 - xfs_buf_rele_uncached(bp); 844 - else 845 - xfs_buf_rele_cached(bp); 945 + 946 + if (lockref_put_or_lock(&bp->b_lockref)) 947 + return; 948 + if (!--bp->b_lockref.count) { 949 + if (xfs_buf_is_uncached(bp) || !atomic_read(&bp->b_lru_ref)) 950 + goto kill; 951 + list_lru_add_obj(&bp->b_target->bt_lru, &bp->b_lru); 952 + } 953 + spin_unlock(&bp->b_lockref.lock); 954 + return; 955 + 956 + kill: 957 + lockref_mark_dead(&bp->b_lockref); 958 + list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru); 959 + spin_unlock(&bp->b_lockref.lock); 960 + 961 + xfs_buf_destroy(bp); 846 962 } 847 963 848 964 /* ··· 1164 1254 1165 1255 /* 1166 1256 * To simulate an I/O failure, the buffer must be locked and held with at least 1167 - * three references. The LRU reference is dropped by the stale call. The buf 1168 - * item reference is dropped via ioend processing. 
The third reference is owned 1169 - * by the caller and is dropped on I/O completion if the buffer is XBF_ASYNC. 1257 + * two references. 1258 + * 1259 + * The buf item reference is dropped via ioend processing. The second reference 1260 + * is owned by the caller and is dropped on I/O completion if the buffer is 1261 + * XBF_ASYNC. 1170 1262 */ 1171 1263 void 1172 1264 xfs_buf_ioend_fail( ··· 1424 1512 struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru); 1425 1513 struct list_head *dispose = arg; 1426 1514 1427 - if (!spin_trylock(&bp->b_lock)) 1515 + if (!spin_trylock(&bp->b_lockref.lock)) 1428 1516 return LRU_SKIP; 1429 - if (bp->b_hold > 1) { 1517 + if (bp->b_lockref.count > 0) { 1430 1518 /* need to wait, so skip it this pass */ 1431 - spin_unlock(&bp->b_lock); 1519 + spin_unlock(&bp->b_lockref.lock); 1432 1520 trace_xfs_buf_drain_buftarg(bp, _RET_IP_); 1433 1521 return LRU_SKIP; 1434 1522 } 1435 1523 1436 - /* 1437 - * clear the LRU reference count so the buffer doesn't get 1438 - * ignored in xfs_buf_rele(). 1439 - */ 1440 - atomic_set(&bp->b_lru_ref, 0); 1441 - bp->b_state |= XFS_BSTATE_DISPOSE; 1524 + lockref_mark_dead(&bp->b_lockref); 1442 1525 list_lru_isolate_move(lru, item, dispose); 1443 - spin_unlock(&bp->b_lock); 1526 + spin_unlock(&bp->b_lockref.lock); 1444 1527 return LRU_REMOVED; 1445 1528 } 1446 1529 ··· 1488 1581 "Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!", 1489 1582 (long long)xfs_buf_daddr(bp)); 1490 1583 } 1491 - xfs_buf_rele(bp); 1584 + xfs_buf_destroy(bp); 1492 1585 } 1493 1586 if (loop++ != 0) 1494 1587 delay(100); ··· 1517 1610 struct list_head *dispose = arg; 1518 1611 1519 1612 /* 1520 - * we are inverting the lru lock/bp->b_lock here, so use a trylock. 1521 - * If we fail to get the lock, just skip it. 1613 + * We are inverting the lru lock vs bp->b_lockref.lock order here, so 1614 + * use a trylock. If we fail to get the lock, just skip the buffer. 
1522 1615 */ 1523 - if (!spin_trylock(&bp->b_lock)) 1616 + if (!spin_trylock(&bp->b_lockref.lock)) 1524 1617 return LRU_SKIP; 1618 + 1619 + /* 1620 + * If the buffer is in use, remove it from the LRU for now. We can't 1621 + * free it while someone is using it, and we should also not count 1622 + * eviction passes for it, just as if it hadn't been added to the LRU 1623 + * yet. 1624 + */ 1625 + if (bp->b_lockref.count > 0) { 1626 + list_lru_isolate(lru, &bp->b_lru); 1627 + spin_unlock(&bp->b_lockref.lock); 1628 + return LRU_REMOVED; 1629 + } 1630 + 1525 1631 /* 1526 1632 * Decrement the b_lru_ref count unless the value is already 1527 1633 * zero. If the value is already zero, we need to reclaim the 1528 1634 * buffer, otherwise it gets another trip through the LRU. 1529 1635 */ 1530 1636 if (atomic_add_unless(&bp->b_lru_ref, -1, 0)) { 1531 - spin_unlock(&bp->b_lock); 1637 + spin_unlock(&bp->b_lockref.lock); 1532 1638 return LRU_ROTATE; 1533 1639 } 1534 1640 1535 - bp->b_state |= XFS_BSTATE_DISPOSE; 1641 + lockref_mark_dead(&bp->b_lockref); 1536 1642 list_lru_isolate_move(lru, item, dispose); 1537 - spin_unlock(&bp->b_lock); 1643 + spin_unlock(&bp->b_lockref.lock); 1538 1644 return LRU_REMOVED; 1539 1645 } 1540 1646 ··· 1567 1647 struct xfs_buf *bp; 1568 1648 bp = list_first_entry(&dispose, struct xfs_buf, b_lru); 1569 1649 list_del_init(&bp->b_lru); 1570 - xfs_buf_rele(bp); 1650 + xfs_buf_destroy(bp); 1571 1651 } 1572 1652 1573 1653 return freed; ··· 1590 1670 ASSERT(percpu_counter_sum(&btp->bt_readahead_count) == 0); 1591 1671 percpu_counter_destroy(&btp->bt_readahead_count); 1592 1672 list_lru_destroy(&btp->bt_lru); 1673 + rhashtable_destroy(&btp->bt_hash); 1593 1674 } 1594 1675 1595 1676 void ··· 1685 1764 ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ, 1686 1765 DEFAULT_RATELIMIT_BURST); 1687 1766 1688 - if (list_lru_init(&btp->bt_lru)) 1767 + if (rhashtable_init(&btp->bt_hash, &xfs_buf_hash_params)) 1689 1768 return -ENOMEM; 1769 + if 
(list_lru_init(&btp->bt_lru)) 1770 + goto out_destroy_hash; 1690 1771 if (percpu_counter_init(&btp->bt_readahead_count, 0, GFP_KERNEL)) 1691 1772 goto out_destroy_lru; 1692 1773 ··· 1706 1783 percpu_counter_destroy(&btp->bt_readahead_count); 1707 1784 out_destroy_lru: 1708 1785 list_lru_destroy(&btp->bt_lru); 1786 + out_destroy_hash: 1787 + rhashtable_destroy(&btp->bt_hash); 1709 1788 return -ENOMEM; 1710 1789 } 1711 1790 ··· 1756 1831 return btp; 1757 1832 1758 1833 error_free: 1834 + fs_put_dax(btp->bt_daxdev, mp); 1759 1835 kfree(btp); 1760 1836 return ERR_PTR(error); 1761 1837 }

+3 -17

fs/xfs/xfs_buf.h

··· 14 14 #include <linux/dax.h> 15 15 #include <linux/uio.h> 16 16 #include <linux/list_lru.h> 17 + #include <linux/lockref.h> 17 18 18 19 extern struct kmem_cache *xfs_buf_cache; 19 20 ··· 70 69 { XBF_TRYLOCK, "TRYLOCK" } 71 70 72 71 /* 73 - * Internal state flags. 74 - */ 75 - #define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */ 76 - 77 - struct xfs_buf_cache { 78 - struct rhashtable bc_hash; 79 - }; 80 - 81 - int xfs_buf_cache_init(struct xfs_buf_cache *bch); 82 - void xfs_buf_cache_destroy(struct xfs_buf_cache *bch); 83 - 84 - /* 85 72 * The xfs_buftarg contains 2 notions of "sector size" - 86 73 * 87 74 * 1) The metadata sector size, which is the minimum unit and ··· 106 117 unsigned int bt_awu_min; 107 118 unsigned int bt_awu_max; 108 119 109 - /* built-in cache, if we're not using the perag one */ 110 - struct xfs_buf_cache bt_cache[]; 120 + struct rhashtable bt_hash; 111 121 }; 112 122 113 123 struct xfs_buf_map { ··· 147 159 148 160 xfs_daddr_t b_rhash_key; /* buffer cache index */ 149 161 int b_length; /* size of buffer in BBs */ 150 - unsigned int b_hold; /* reference count */ 162 + struct lockref b_lockref; /* refcount + lock */ 151 163 atomic_t b_lru_ref; /* lru reclaim ref count */ 152 164 xfs_buf_flags_t b_flags; /* status flags */ 153 165 struct semaphore b_sema; /* semaphore for lockables */ ··· 157 169 * bt_lru_lock and not by b_sema 158 170 */ 159 171 struct list_head b_lru; /* lru list */ 160 - spinlock_t b_lock; /* internal state lock */ 161 - unsigned int b_state; /* internal state flags */ 162 172 wait_queue_head_t b_waiters; /* unpin waiters */ 163 173 struct list_head b_list; 164 174 struct xfs_perag *b_pag;

+2 -9

fs/xfs/xfs_buf_mem.c

··· 58 58 struct xfs_buftarg *btp; 59 59 int error; 60 60 61 - btp = kzalloc_flex(*btp, bt_cache, 1); 61 + btp = kzalloc_obj(*btp); 62 62 if (!btp) 63 63 return -ENOMEM; 64 64 ··· 81 81 /* ensure all writes are below EOF to avoid pagecache zeroing */ 82 82 i_size_write(inode, inode->i_sb->s_maxbytes); 83 83 84 - error = xfs_buf_cache_init(btp->bt_cache); 85 - if (error) 86 - goto out_file; 87 - 88 84 /* Initialize buffer target */ 89 85 btp->bt_mount = mp; 90 86 btp->bt_dev = (dev_t)-1U; ··· 91 95 92 96 error = xfs_init_buftarg(btp, XMBUF_BLOCKSIZE, descr); 93 97 if (error) 94 - goto out_bcache; 98 + goto out_file; 95 99 96 100 trace_xmbuf_create(btp); 97 101 98 102 *btpp = btp; 99 103 return 0; 100 104 101 - out_bcache: 102 - xfs_buf_cache_destroy(btp->bt_cache); 103 105 out_file: 104 106 fput(file); 105 107 out_free_btp: ··· 116 122 trace_xmbuf_free(btp); 117 123 118 124 xfs_destroy_buftarg(btp); 119 - xfs_buf_cache_destroy(btp->bt_cache); 120 125 fput(btp->bt_file); 121 126 kfree(btp); 122 127 }

+2 -2

fs/xfs/xfs_extent_busy.c

··· 690 690 container_of(l2, struct xfs_extent_busy, list); 691 691 s32 diff; 692 692 693 - diff = b1->group->xg_gno - b2->group->xg_gno; 693 + diff = cmp_int(b1->group->xg_gno, b2->group->xg_gno); 694 694 if (!diff) 695 - diff = b1->bno - b2->bno; 695 + diff = cmp_int(b1->bno, b2->bno); 696 696 return diff; 697 697 } 698 698

+1 -1

fs/xfs/xfs_extfree_item.c

··· 387 387 struct xfs_extent_free_item *ra = xefi_entry(a); 388 388 struct xfs_extent_free_item *rb = xefi_entry(b); 389 389 390 - return ra->xefi_group->xg_gno - rb->xefi_group->xg_gno; 390 + return cmp_int(ra->xefi_group->xg_gno, rb->xefi_group->xg_gno); 391 391 } 392 392 393 393 /* Log a free extent to the intent item. */

+86 -26

fs/xfs/xfs_file.c

··· 560 560 flags, ac); 561 561 } 562 562 563 + /* 564 + * We need to lock the test/set EOF update as we can be racing with 565 + * other IO completions here to update the EOF. Failing to serialise 566 + * here can result in EOF moving backwards and Bad Things Happen when 567 + * that occurs. 568 + * 569 + * As IO completion only ever extends EOF, we can do an unlocked check 570 + * here to avoid taking the spinlock. If we land within the current EOF, 571 + * then we do not need to do an extending update at all, and we don't 572 + * need to take the lock to check this. If we race with an update moving 573 + * EOF, then we'll either still be beyond EOF and need to take the lock, 574 + * or we'll be within EOF and we don't need to take it at all. 575 + */ 576 + static int 577 + xfs_dio_endio_set_isize( 578 + struct inode *inode, 579 + loff_t offset, 580 + ssize_t size) 581 + { 582 + struct xfs_inode *ip = XFS_I(inode); 583 + 584 + if (offset + size <= i_size_read(inode)) 585 + return 0; 586 + 587 + spin_lock(&ip->i_flags_lock); 588 + if (offset + size <= i_size_read(inode)) { 589 + spin_unlock(&ip->i_flags_lock); 590 + return 0; 591 + } 592 + 593 + i_size_write(inode, offset + size); 594 + spin_unlock(&ip->i_flags_lock); 595 + 596 + return xfs_setfilesize(ip, offset, size); 597 + } 598 + 599 + static int 600 + xfs_zoned_dio_write_end_io( 601 + struct kiocb *iocb, 602 + ssize_t size, 603 + int error, 604 + unsigned flags) 605 + { 606 + struct inode *inode = file_inode(iocb->ki_filp); 607 + struct xfs_inode *ip = XFS_I(inode); 608 + unsigned int nofs_flag; 609 + 610 + ASSERT(!(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW))); 611 + 612 + trace_xfs_end_io_direct_write(ip, iocb->ki_pos, size); 613 + 614 + if (xfs_is_shutdown(ip->i_mount)) 615 + return -EIO; 616 + 617 + if (error || !size) 618 + return error; 619 + 620 + XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size); 621 + 622 + nofs_flag = memalloc_nofs_save(); 623 + error = xfs_dio_endio_set_isize(inode, iocb->ki_pos, 
size); 624 + memalloc_nofs_restore(nofs_flag); 625 + 626 + return error; 627 + } 628 + 563 629 static int 564 630 xfs_dio_write_end_io( 565 631 struct kiocb *iocb, ··· 638 572 loff_t offset = iocb->ki_pos; 639 573 unsigned int nofs_flag; 640 574 641 - ASSERT(!xfs_is_zoned_inode(ip) || 642 - !(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW))); 575 + ASSERT(!xfs_is_zoned_inode(ip)); 643 576 644 577 trace_xfs_end_io_direct_write(ip, offset, size); 645 578 ··· 688 623 * with the on-disk inode size being outside the in-core inode size. We 689 624 * have no other method of updating EOF for AIO, so always do it here 690 625 * if necessary. 691 - * 692 - * We need to lock the test/set EOF update as we can be racing with 693 - * other IO completions here to update the EOF. Failing to serialise 694 - * here can result in EOF moving backwards and Bad Things Happen when 695 - * that occurs. 696 - * 697 - * As IO completion only ever extends EOF, we can do an unlocked check 698 - * here to avoid taking the spinlock. If we land within the current EOF, 699 - * then we do not need to do an extending update at all, and we don't 700 - * need to take the lock to check this. If we race with an update moving 701 - * EOF, then we'll either still be beyond EOF and need to take the lock, 702 - * or we'll be within EOF and we don't need to take it at all. 
703 626 */ 704 - if (offset + size <= i_size_read(inode)) 705 - goto out; 706 - 707 - spin_lock(&ip->i_flags_lock); 708 - if (offset + size > i_size_read(inode)) { 709 - i_size_write(inode, offset + size); 710 - spin_unlock(&ip->i_flags_lock); 711 - error = xfs_setfilesize(ip, offset, size); 712 - } else { 713 - spin_unlock(&ip->i_flags_lock); 714 - } 627 + error = xfs_dio_endio_set_isize(inode, offset, size); 715 628 716 629 out: 717 630 memalloc_nofs_restore(nofs_flag); ··· 731 688 static const struct iomap_dio_ops xfs_dio_zoned_write_ops = { 732 689 .bio_set = &iomap_ioend_bioset, 733 690 .submit_io = xfs_dio_zoned_submit_io, 734 - .end_io = xfs_dio_write_end_io, 691 + .end_io = xfs_zoned_dio_write_end_io, 735 692 }; 736 693 737 694 /* ··· 1305 1262 /* Offset should be less than i_size */ 1306 1263 if (offset >= isize) 1307 1264 return -EINVAL; 1265 + 1266 + /* 1267 + * Let writeback clean up EOF folio state before we bump i_size. The 1268 + * insert flushes before it starts shifting and under certain 1269 + * circumstances we can write back blocks that should technically be 1270 + * considered post-eof (and thus should not be submitted for writeback). 1271 + * 1272 + * For example, a large, dirty folio that spans EOF and is backed by 1273 + * post-eof COW fork preallocation can cause block remap into the data 1274 + * fork. This shifts back out beyond EOF, but creates an unexpectedly 1275 + * written post-eof block. The insert is going to flush, unmap and 1276 + * cancel prealloc across this whole range, so flush EOF now before we 1277 + * bump i_size to provide consistent behavior. 1278 + */ 1279 + error = filemap_write_and_wait_range(inode->i_mapping, isize, isize); 1280 + if (error) 1281 + return error; 1308 1282 1309 1283 error = xfs_falloc_setsize(file, isize + len); 1310 1284 if (error)

+19

fs/xfs/xfs_ioctl.c

··· 37 37 #include "xfs_ioctl.h" 38 38 #include "xfs_xattr.h" 39 39 #include "xfs_rtbitmap.h" 40 + #include "xfs_rtrmap_btree.h" 40 41 #include "xfs_file.h" 41 42 #include "xfs_exchrange.h" 42 43 #include "xfs_handle.h" 43 44 #include "xfs_rtgroup.h" 44 45 #include "xfs_healthmon.h" 45 46 #include "xfs_verify_media.h" 47 + #include "xfs_zone_priv.h" 48 + #include "xfs_zone_alloc.h" 46 49 47 50 #include <linux/mount.h> 48 51 #include <linux/fileattr.h> ··· 416 413 { 417 414 struct xfs_rtgroup *rtg; 418 415 struct xfs_rtgroup_geometry rgeo; 416 + xfs_rgblock_t highest_rgbno; 419 417 int error; 420 418 421 419 if (copy_from_user(&rgeo, arg, sizeof(rgeo))) ··· 436 432 xfs_rtgroup_put(rtg); 437 433 if (error) 438 434 return error; 435 + 436 + if (xfs_has_zoned(mp)) { 437 + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); 438 + if (rtg->rtg_open_zone) { 439 + rgeo.rg_writepointer = rtg->rtg_open_zone->oz_allocated; 440 + } else { 441 + highest_rgbno = xfs_rtrmap_highest_rgbno(rtg); 442 + if (highest_rgbno == NULLRGBLOCK) 443 + rgeo.rg_writepointer = 0; 444 + else 445 + rgeo.rg_writepointer = highest_rgbno + 1; 446 + } 447 + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP); 448 + rgeo.rg_flags |= XFS_RTGROUP_GEOM_WRITEPOINTER; 449 + } 439 450 440 451 if (copy_to_user(arg, &rgeo, sizeof(rgeo))) 441 452 return -EFAULT;

+112 -34

fs/xfs/xfs_iomap.c

··· 1593 1593 { 1594 1594 struct iomap_iter *iter = 1595 1595 container_of(iomap, struct iomap_iter, iomap); 1596 + struct address_space *mapping = inode->i_mapping; 1596 1597 struct xfs_zone_alloc_ctx *ac = iter->private; 1597 1598 struct xfs_inode *ip = XFS_I(inode); 1598 1599 struct xfs_mount *mp = ip->i_mount; ··· 1618 1617 if (error) 1619 1618 return error; 1620 1619 1620 + restart: 1621 1621 error = xfs_ilock_for_iomap(ip, flags, &lockmode); 1622 1622 if (error) 1623 1623 return error; ··· 1656 1654 &smap)) 1657 1655 smap.br_startoff = end_fsb; /* fake hole until EOF */ 1658 1656 if (smap.br_startoff > offset_fsb) { 1659 - /* 1660 - * We never need to allocate blocks for zeroing a hole. 1661 - */ 1662 - if (flags & IOMAP_ZERO) { 1663 - xfs_hole_to_iomap(ip, iomap, offset_fsb, 1664 - smap.br_startoff); 1665 - goto out_unlock; 1666 - } 1667 1657 end_fsb = min(end_fsb, smap.br_startoff); 1668 1658 } else { 1669 1659 end_fsb = min(end_fsb, ··· 1686 1692 end_fsb = min(end_fsb, got.br_startoff); 1687 1693 count_fsb = min3(end_fsb - offset_fsb, XFS_MAX_BMBT_EXTLEN, 1688 1694 XFS_B_TO_FSB(mp, 1024 * PAGE_SIZE)); 1695 + 1696 + /* 1697 + * When zeroing, don't allocate blocks for holes as they are already 1698 + * zeroes, but we need to ensure that no extents exist in both the data 1699 + * and COW fork to ensure this really is a hole. 1700 + * 1701 + * A window exists where we might observe a hole in both forks with 1702 + * valid data in cache. Writeback removes the COW fork blocks on 1703 + * submission but doesn't remap into the data fork until completion. If 1704 + * the data fork was previously a hole, we'll fail to zero. Until we 1705 + * find a way to avoid this transient state, check for dirty pagecache 1706 + * and flush to wait on blocks to land in the data fork. 
1707 + */ 1708 + if ((flags & IOMAP_ZERO) && srcmap->type == IOMAP_HOLE) { 1709 + if (filemap_range_needs_writeback(mapping, offset, 1710 + offset + count - 1)) { 1711 + xfs_iunlock(ip, lockmode); 1712 + error = filemap_write_and_wait_range(mapping, offset, 1713 + offset + count - 1); 1714 + if (error) 1715 + return error; 1716 + goto restart; 1717 + } 1718 + 1719 + xfs_hole_to_iomap(ip, iomap, offset_fsb, end_fsb); 1720 + goto out_unlock; 1721 + } 1689 1722 1690 1723 /* 1691 1724 * The block reservation is supposed to cover all blocks that the ··· 1788 1767 struct xfs_mount *mp = ip->i_mount; 1789 1768 xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); 1790 1769 xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, count); 1770 + xfs_fileoff_t cow_fsb = NULLFILEOFF; 1771 + xfs_fileoff_t eof_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); 1791 1772 struct xfs_bmbt_irec imap, cmap; 1792 1773 struct xfs_iext_cursor icur, ccur; 1793 1774 xfs_fsblock_t prealloc_blocks = 0; ··· 1834 1811 goto out_unlock; 1835 1812 1836 1813 /* 1837 - * Search the data fork first to look up our source mapping. We 1838 - * always need the data fork map, as we have to return it to the 1839 - * iomap code so that the higher level write code can read data in to 1840 - * perform read-modify-write cycles for unaligned writes. 1814 + * Search the data fork first to look up our source mapping. We always 1815 + * need the data fork map, as we have to return it to the iomap code so 1816 + * that the higher level write code can read data in to perform 1817 + * read-modify-write cycles for unaligned writes. 1818 + * 1819 + * Then search the COW fork extent list even if we did not find a data 1820 + * fork extent. This serves two purposes: first this implements the 1821 + * speculative preallocation using cowextsize, so that we also unshare 1822 + * blocks adjacent to shared blocks instead of just the shared blocks 1823 + * themselves. 
Second the lookup in the extent list is generally faster 1824 + * than going out to the shared extent tree. 1841 1825 */ 1842 1826 eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap); 1843 1827 if (eof) 1844 1828 imap.br_startoff = end_fsb; /* fake hole until the end */ 1829 + if (xfs_is_cow_inode(ip)) { 1830 + if (!ip->i_cowfp) { 1831 + ASSERT(!xfs_is_reflink_inode(ip)); 1832 + xfs_ifork_init_cow(ip); 1833 + } 1834 + cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, 1835 + &ccur, &cmap); 1836 + if (!cow_eof) 1837 + cow_fsb = cmap.br_startoff; 1838 + } 1845 1839 1846 - /* We never need to allocate blocks for zeroing or unsharing a hole. */ 1847 - if ((flags & (IOMAP_UNSHARE | IOMAP_ZERO)) && 1848 - imap.br_startoff > offset_fsb) { 1840 + /* We never need to allocate blocks for unsharing a hole. */ 1841 + if ((flags & IOMAP_UNSHARE) && imap.br_startoff > offset_fsb) { 1849 1842 xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff); 1850 1843 goto out_unlock; 1844 + } 1845 + 1846 + /* 1847 + * We may need to zero over a hole in the data fork if it's fronted by 1848 + * COW blocks and dirty pagecache. Scan such file ranges for dirty 1849 + * cache and fill the iomap batch with folios that need zeroing. 
1850 + */ 1851 + if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) { 1852 + loff_t start, end; 1853 + unsigned int fbatch_count; 1854 + 1855 + imap.br_blockcount = imap.br_startoff - offset_fsb; 1856 + imap.br_startoff = offset_fsb; 1857 + imap.br_startblock = HOLESTARTBLOCK; 1858 + imap.br_state = XFS_EXT_NORM; 1859 + 1860 + if (cow_fsb == NULLFILEOFF) 1861 + goto found_imap; 1862 + if (cow_fsb > offset_fsb) { 1863 + xfs_trim_extent(&imap, offset_fsb, 1864 + cow_fsb - offset_fsb); 1865 + goto found_imap; 1866 + } 1867 + 1868 + /* no zeroing beyond eof, so split at the boundary */ 1869 + if (offset_fsb >= eof_fsb) 1870 + goto found_imap; 1871 + if (offset_fsb < eof_fsb && end_fsb > eof_fsb) 1872 + xfs_trim_extent(&imap, offset_fsb, 1873 + eof_fsb - offset_fsb); 1874 + 1875 + /* COW fork blocks overlap the hole */ 1876 + xfs_trim_extent(&imap, offset_fsb, 1877 + cmap.br_startoff + cmap.br_blockcount - offset_fsb); 1878 + start = XFS_FSB_TO_B(mp, imap.br_startoff); 1879 + end = XFS_FSB_TO_B(mp, imap.br_startoff + imap.br_blockcount); 1880 + fbatch_count = iomap_fill_dirty_folios(iter, &start, end, 1881 + &iomap_flags); 1882 + xfs_trim_extent(&imap, offset_fsb, 1883 + XFS_B_TO_FSB(mp, start) - offset_fsb); 1884 + 1885 + /* 1886 + * Report the COW mapping if we have folios to zero. Otherwise 1887 + * ignore the COW blocks as preallocation and report a hole. 1888 + */ 1889 + if (fbatch_count) { 1890 + xfs_trim_extent(&cmap, imap.br_startoff, 1891 + imap.br_blockcount); 1892 + imap.br_startoff = end_fsb; /* fake hole */ 1893 + goto found_cow; 1894 + } 1895 + goto found_imap; 1851 1896 } 1852 1897 1853 1898 /* ··· 1924 1833 * unwritten extent. 
1925 1834 */ 1926 1835 if (flags & IOMAP_ZERO) { 1927 - xfs_fileoff_t eof_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); 1928 - 1929 1836 if (isnullstartblock(imap.br_startblock) && 1930 1837 offset_fsb >= eof_fsb) 1931 1838 goto convert_delay; ··· 1956 1867 } 1957 1868 1958 1869 /* 1959 - * Search the COW fork extent list even if we did not find a data fork 1960 - * extent. This serves two purposes: first this implements the 1961 - * speculative preallocation using cowextsize, so that we also unshare 1962 - * block adjacent to shared blocks instead of just the shared blocks 1963 - * themselves. Second the lookup in the extent list is generally faster 1964 - * than going out to the shared extent tree. 1870 + * Now that we've handled any operation specific special cases, at this 1871 + * point we can report a COW mapping if found. 1965 1872 */ 1966 - if (xfs_is_cow_inode(ip)) { 1967 - if (!ip->i_cowfp) { 1968 - ASSERT(!xfs_is_reflink_inode(ip)); 1969 - xfs_ifork_init_cow(ip); 1970 - } 1971 - cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, 1972 - &ccur, &cmap); 1973 - if (!cow_eof && cmap.br_startoff <= offset_fsb) { 1974 - trace_xfs_reflink_cow_found(ip, &cmap); 1975 - goto found_cow; 1976 - } 1873 + if (xfs_is_cow_inode(ip) && 1874 + !cow_eof && cmap.br_startoff <= offset_fsb) { 1875 + trace_xfs_reflink_cow_found(ip, &cmap); 1876 + goto found_cow; 1977 1877 } 1978 1878 1979 1879 if (imap.br_startoff <= offset_fsb) {

+11 -28

fs/xfs/xfs_iops.c

··· 901 901 902 902 /* 903 903 * Truncate file. Must have write permission and not be a directory. 904 - * 905 - * Caution: The caller of this function is responsible for calling 906 - * setattr_prepare() or otherwise verifying the change is fine. 907 904 */ 908 - STATIC int 909 - xfs_setattr_size( 905 + int 906 + xfs_vn_setattr_size( 910 907 struct mnt_idmap *idmap, 911 908 struct dentry *dentry, 912 - struct xfs_inode *ip, 913 909 struct iattr *iattr) 914 910 { 911 + struct inode *inode = d_inode(dentry); 912 + struct xfs_inode *ip = XFS_I(inode); 915 913 struct xfs_mount *mp = ip->i_mount; 916 - struct inode *inode = VFS_I(ip); 917 - xfs_off_t oldsize, newsize; 914 + xfs_off_t oldsize = inode->i_size; 915 + xfs_off_t newsize = iattr->ia_size; 918 916 struct xfs_trans *tp; 919 917 int error; 920 918 uint lock_flags = 0; ··· 925 927 ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| 926 928 ATTR_MTIME_SET|ATTR_TIMES_SET)) == 0); 927 929 928 - oldsize = inode->i_size; 929 - newsize = iattr->ia_size; 930 + trace_xfs_setattr(ip); 931 + 932 + error = xfs_vn_change_ok(idmap, dentry, iattr); 933 + if (error) 934 + return error; 930 935 931 936 /* 932 937 * Short circuit the truncate case for zero length files. 
··· 1110 1109 xfs_inode_clear_eofblocks_tag(ip); 1111 1110 } 1112 1111 1113 - ASSERT(!(iattr->ia_valid & (ATTR_UID | ATTR_GID))); 1114 1112 setattr_copy(idmap, inode, iattr); 1115 1113 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 1116 1114 ··· 1127 1127 out_trans_cancel: 1128 1128 xfs_trans_cancel(tp); 1129 1129 goto out_unlock; 1130 - } 1131 - 1132 - int 1133 - xfs_vn_setattr_size( 1134 - struct mnt_idmap *idmap, 1135 - struct dentry *dentry, 1136 - struct iattr *iattr) 1137 - { 1138 - struct xfs_inode *ip = XFS_I(d_inode(dentry)); 1139 - int error; 1140 - 1141 - trace_xfs_setattr(ip); 1142 - 1143 - error = xfs_vn_change_ok(idmap, dentry, iattr); 1144 - if (error) 1145 - return error; 1146 - return xfs_setattr_size(idmap, dentry, ip, iattr); 1147 1130 } 1148 1131 1149 1132 STATIC int

+36 -39

fs/xfs/xfs_mount.c

··· 44 44 #include "xfs_healthmon.h" 45 45 46 46 static DEFINE_MUTEX(xfs_uuid_table_mutex); 47 - static int xfs_uuid_table_size; 48 - static uuid_t *xfs_uuid_table; 47 + static DEFINE_XARRAY_ALLOC(xfs_uuid_table); 48 + 49 + static uuid_t * 50 + xfs_uuid_search( 51 + uuid_t *new_uuid) 52 + { 53 + unsigned long index = 0; 54 + uuid_t *uuid; 55 + 56 + xa_for_each(&xfs_uuid_table, index, uuid) { 57 + if (uuid_equal(uuid, new_uuid)) 58 + return uuid; 59 + } 60 + return NULL; 61 + } 62 + 63 + static void 64 + xfs_uuid_delete( 65 + uuid_t *uuid, 66 + unsigned int index) 67 + { 68 + ASSERT(uuid_equal(xa_load(&xfs_uuid_table, index), uuid)); 69 + xa_erase(&xfs_uuid_table, index); 70 + } 49 71 50 72 void 51 73 xfs_uuid_table_free(void) 52 74 { 53 - if (xfs_uuid_table_size == 0) 54 - return; 55 - kfree(xfs_uuid_table); 56 - xfs_uuid_table = NULL; 57 - xfs_uuid_table_size = 0; 75 + ASSERT(xa_empty(&xfs_uuid_table)); 76 + xa_destroy(&xfs_uuid_table); 58 77 } 59 78 60 79 /* ··· 85 66 struct xfs_mount *mp) 86 67 { 87 68 uuid_t *uuid = &mp->m_sb.sb_uuid; 88 - int hole, i; 69 + int ret; 89 70 90 71 /* Publish UUID in struct super_block */ 91 72 super_set_uuid(mp->m_super, uuid->b, sizeof(*uuid)); ··· 99 80 } 100 81 101 82 mutex_lock(&xfs_uuid_table_mutex); 102 - for (i = 0, hole = -1; i < xfs_uuid_table_size; i++) { 103 - if (uuid_is_null(&xfs_uuid_table[i])) { 104 - hole = i; 105 - continue; 106 - } 107 - if (uuid_equal(uuid, &xfs_uuid_table[i])) 108 - goto out_duplicate; 83 + if (unlikely(xfs_uuid_search(uuid))) { 84 + xfs_warn(mp, "Filesystem has duplicate UUID %pU - can't mount", 85 + uuid); 86 + mutex_unlock(&xfs_uuid_table_mutex); 87 + return -EINVAL; 109 88 } 110 89 111 - if (hole < 0) { 112 - xfs_uuid_table = krealloc(xfs_uuid_table, 113 - (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table), 114 - GFP_KERNEL | __GFP_NOFAIL); 115 - hole = xfs_uuid_table_size++; 116 - } 117 - xfs_uuid_table[hole] = *uuid; 90 + ret = xa_alloc(&xfs_uuid_table, &mp->m_uuid_table_index, uuid, 91 
+ xa_limit_32b, GFP_KERNEL); 118 92 mutex_unlock(&xfs_uuid_table_mutex); 119 - 120 - return 0; 121 - 122 - out_duplicate: 123 - mutex_unlock(&xfs_uuid_table_mutex); 124 - xfs_warn(mp, "Filesystem has duplicate UUID %pU - can't mount", uuid); 125 - return -EINVAL; 93 + return ret; 126 94 } 127 95 128 96 STATIC void ··· 117 111 struct xfs_mount *mp) 118 112 { 119 113 uuid_t *uuid = &mp->m_sb.sb_uuid; 120 - int i; 121 114 122 115 if (xfs_has_nouuid(mp)) 123 116 return; 124 117 125 118 mutex_lock(&xfs_uuid_table_mutex); 126 - for (i = 0; i < xfs_uuid_table_size; i++) { 127 - if (uuid_is_null(&xfs_uuid_table[i])) 128 - continue; 129 - if (!uuid_equal(uuid, &xfs_uuid_table[i])) 130 - continue; 131 - memset(&xfs_uuid_table[i], 0, sizeof(uuid_t)); 132 - break; 133 - } 134 - ASSERT(i < xfs_uuid_table_size); 119 + xfs_uuid_delete(uuid, mp->m_uuid_table_index); 135 120 mutex_unlock(&xfs_uuid_table_mutex); 136 121 } 137 122

+3

fs/xfs/xfs_mount.h

··· 346 346 347 347 /* Private data referring to a health monitor object. */ 348 348 struct xfs_healthmon __rcu *m_healthmon; 349 + 350 + /* Index of uuid record in the uuid xarray. */ 351 + unsigned int m_uuid_table_index; 349 352 } xfs_mount_t; 350 353 351 354 #define M_IGEO(mp) (&(mp)->m_ino_geo)

+42 -1

fs/xfs/xfs_qm_syscalls.c

··· 391 391 return error; 392 392 } 393 393 394 + /* 395 + * Fill out the default quota limits for an ID that has no dquot on disk. 396 + * Returns 0 if default limits are configured 397 + * and were filled in, -ENOENT otherwise. 398 + */ 399 + static int 400 + xfs_qm_scall_getquota_fill_defaults( 401 + struct xfs_mount *mp, 402 + xfs_dqtype_t type, 403 + struct qc_dqblk *dst) 404 + { 405 + struct xfs_def_quota *defq; 406 + 407 + defq = xfs_get_defquota(mp->m_quotainfo, type); 408 + 409 + if (!defq->blk.soft && !defq->blk.hard && 410 + !defq->ino.soft && !defq->ino.hard && 411 + !defq->rtb.soft && !defq->rtb.hard) { 412 + return -ENOENT; 413 + } 414 + 415 + memset(dst, 0, sizeof(*dst)); 416 + dst->d_spc_softlimit = XFS_FSB_TO_B(mp, defq->blk.soft); 417 + dst->d_spc_hardlimit = XFS_FSB_TO_B(mp, defq->blk.hard); 418 + dst->d_ino_softlimit = defq->ino.soft; 419 + dst->d_ino_hardlimit = defq->ino.hard; 420 + dst->d_rt_spc_softlimit = XFS_FSB_TO_B(mp, defq->rtb.soft); 421 + dst->d_rt_spc_hardlimit = XFS_FSB_TO_B(mp, defq->rtb.hard); 422 + 423 + return 0; 424 + } 425 + 394 426 /* Fill out the quota context. */ 395 427 static void 396 428 xfs_qm_scall_getquota_fill_qc( ··· 483 451 * set doalloc. If it doesn't exist, we'll get ENOENT back. 484 452 */ 485 453 error = xfs_qm_dqget(mp, id, type, false, &dqp); 486 - if (error) 454 + if (error) { 455 + /* 456 + * If there is no dquot on disk and default limits are 457 + * configured, return them with zero usage so that 458 + * unprivileged users can see what limits apply to them. 459 + */ 460 + if (error == -ENOENT && id != 0 && 461 + !xfs_qm_scall_getquota_fill_defaults(mp, type, dst)) 462 + return 0; 487 463 return error; 464 + } 488 465 489 466 /* 490 467 * If everything's NULL, this dquot doesn't quite exist as far as

+1 -1

fs/xfs/xfs_refcount_item.c

··· 266 266 struct xfs_refcount_intent *ra = ci_entry(a); 267 267 struct xfs_refcount_intent *rb = ci_entry(b); 268 268 269 - return ra->ri_group->xg_gno - rb->ri_group->xg_gno; 269 + return cmp_int(ra->ri_group->xg_gno, rb->ri_group->xg_gno); 270 270 } 271 271 272 272 /* Log refcount updates in the intent item. */

+1 -1

fs/xfs/xfs_rmap_item.c

··· 267 267 struct xfs_rmap_intent *ra = ri_entry(a); 268 268 struct xfs_rmap_intent *rb = ri_entry(b); 269 269 270 - return ra->ri_group->xg_gno - rb->ri_group->xg_gno; 270 + return cmp_int(ra->ri_group->xg_gno, rb->ri_group->xg_gno); 271 271 } 272 272 273 273 /* Log rmap updates in the intent item. */

+19 -1

fs/xfs/xfs_sysfs.c

··· 13 13 #include "xfs_log.h" 14 14 #include "xfs_log_priv.h" 15 15 #include "xfs_mount.h" 16 + #include "xfs_zone_priv.h" 16 17 #include "xfs_zones.h" 18 + #include "xfs_zone_alloc.h" 17 19 18 20 struct xfs_sysfs_attr { 19 21 struct attribute attr; ··· 721 719 XFS_SYSFS_ATTR_RO(max_open_zones); 722 720 723 721 static ssize_t 722 + nr_open_zones_show( 723 + struct kobject *kobj, 724 + char *buf) 725 + { 726 + struct xfs_zone_info *zi = zoned_to_mp(kobj)->m_zone_info; 727 + 728 + return sysfs_emit(buf, "%u\n", READ_ONCE(zi->zi_nr_open_zones)); 729 + } 730 + XFS_SYSFS_ATTR_RO(nr_open_zones); 731 + 732 + static ssize_t 724 733 zonegc_low_space_store( 725 734 struct kobject *kobj, 726 735 const char *buf, 727 736 size_t count) 728 737 { 738 + struct xfs_mount *mp = zoned_to_mp(kobj); 729 739 int ret; 730 740 unsigned int val; 731 741 ··· 748 734 if (val > 100) 749 735 return -EINVAL; 750 736 751 - zoned_to_mp(kobj)->m_zonegc_low_space = val; 737 + if (mp->m_zonegc_low_space != val) { 738 + mp->m_zonegc_low_space = val; 739 + xfs_zone_gc_wakeup(mp); 740 + } 752 741 753 742 return count; 754 743 } ··· 768 751 769 752 static struct attribute *xfs_zoned_attrs[] = { 770 753 ATTR_LIST(max_open_zones), 754 + ATTR_LIST(nr_open_zones), 771 755 ATTR_LIST(zonegc_low_space), 772 756 NULL, 773 757 };

+7 -5

fs/xfs/xfs_trace.h

··· 394 394 DEFINE_ZONE_EVENT(xfs_zone_opened); 395 395 DEFINE_ZONE_EVENT(xfs_zone_reset); 396 396 DEFINE_ZONE_EVENT(xfs_zone_gc_target_opened); 397 + DEFINE_ZONE_EVENT(xfs_zone_gc_target_stolen); 397 398 398 399 TRACE_EVENT(xfs_zone_free_blocks, 399 400 TP_PROTO(struct xfs_rtgroup *rtg, xfs_rgblock_t rgbno, ··· 462 461 DEFINE_ZONE_ALLOC_EVENT(xfs_zone_record_blocks); 463 462 DEFINE_ZONE_ALLOC_EVENT(xfs_zone_skip_blocks); 464 463 DEFINE_ZONE_ALLOC_EVENT(xfs_zone_alloc_blocks); 464 + DEFINE_ZONE_ALLOC_EVENT(xfs_zone_spurious_open); 465 465 466 466 TRACE_EVENT(xfs_zone_gc_select_victim, 467 467 TP_PROTO(struct xfs_rtgroup *rtg, unsigned int bucket), ··· 742 740 __entry->dev = bp->b_target->bt_dev; 743 741 __entry->bno = xfs_buf_daddr(bp); 744 742 __entry->nblks = bp->b_length; 745 - __entry->hold = bp->b_hold; 743 + __entry->hold = bp->b_lockref.count; 746 744 __entry->pincount = atomic_read(&bp->b_pin_count); 747 745 __entry->lockval = bp->b_sema.count; 748 746 __entry->flags = bp->b_flags; ··· 816 814 __entry->bno = xfs_buf_daddr(bp); 817 815 __entry->length = bp->b_length; 818 816 __entry->flags = flags; 819 - __entry->hold = bp->b_hold; 817 + __entry->hold = bp->b_lockref.count; 820 818 __entry->pincount = atomic_read(&bp->b_pin_count); 821 819 __entry->lockval = bp->b_sema.count; 822 820 __entry->caller_ip = caller_ip; ··· 860 858 __entry->dev = bp->b_target->bt_dev; 861 859 __entry->bno = xfs_buf_daddr(bp); 862 860 __entry->length = bp->b_length; 863 - __entry->hold = bp->b_hold; 861 + __entry->hold = bp->b_lockref.count; 864 862 __entry->pincount = atomic_read(&bp->b_pin_count); 865 863 __entry->lockval = bp->b_sema.count; 866 864 __entry->error = error; ··· 904 902 __entry->buf_bno = xfs_buf_daddr(bip->bli_buf); 905 903 __entry->buf_len = bip->bli_buf->b_length; 906 904 __entry->buf_flags = bip->bli_buf->b_flags; 907 - __entry->buf_hold = bip->bli_buf->b_hold; 905 + __entry->buf_hold = bip->bli_buf->b_lockref.count; 908 906 __entry->buf_pincount = 
atomic_read(&bip->bli_buf->b_pin_count); 909 907 __entry->buf_lockval = bip->bli_buf->b_sema.count; 910 908 __entry->li_flags = bip->bli_item.li_flags; ··· 5208 5206 __entry->xfino = file_inode(xfbt->target->bt_file)->i_ino; 5209 5207 __entry->bno = xfs_buf_daddr(bp); 5210 5208 __entry->nblks = bp->b_length; 5211 - __entry->hold = bp->b_hold; 5209 + __entry->hold = bp->b_lockref.count; 5212 5210 __entry->pincount = atomic_read(&bp->b_pin_count); 5213 5211 __entry->lockval = bp->b_sema.count; 5214 5212 __entry->flags = bp->b_flags;

+143 -51

fs/xfs/xfs_zone_alloc.c

··· 174 174 WRITE_ONCE(rtg->rtg_open_zone, NULL); 175 175 176 176 spin_lock(&zi->zi_open_zones_lock); 177 - if (oz->oz_is_gc) { 178 - ASSERT(current == zi->zi_gc_thread); 179 - zi->zi_open_gc_zone = NULL; 180 - } else { 177 + if (oz->oz_is_gc) 178 + zi->zi_nr_open_gc_zones--; 179 + else 181 180 zi->zi_nr_open_zones--; 182 - list_del_init(&oz->oz_entry); 183 - } 181 + list_del_init(&oz->oz_entry); 184 182 spin_unlock(&zi->zi_open_zones_lock); 185 - xfs_open_zone_put(oz); 186 183 187 - wake_up_all(&zi->zi_zone_wait); 184 + if (oz->oz_is_gc) 185 + wake_up_process(zi->zi_gc_thread); 186 + else 187 + wake_up_all(&zi->zi_zone_wait); 188 + 188 189 if (used < rtg_blocks(rtg)) 189 190 xfs_zone_account_reclaimable(rtg, rtg_blocks(rtg) - used); 191 + xfs_open_zone_put(oz); 190 192 } 191 193 192 - static void 193 - xfs_zone_record_blocks( 194 - struct xfs_trans *tp, 194 + static inline void 195 + xfs_zone_inc_written( 195 196 struct xfs_open_zone *oz, 196 - xfs_fsblock_t fsbno, 197 197 xfs_filblks_t len) 198 198 { 199 - struct xfs_mount *mp = tp->t_mountp; 200 - struct xfs_rtgroup *rtg = oz->oz_rtg; 201 - struct xfs_inode *rmapip = rtg_rmap(rtg); 199 + xfs_assert_ilocked(rtg_rmap(oz->oz_rtg), XFS_ILOCK_EXCL); 202 200 203 - trace_xfs_zone_record_blocks(oz, xfs_rtb_to_rgbno(mp, fsbno), len); 204 - 205 - xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); 206 - xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP); 207 - rmapip->i_used_blocks += len; 208 - ASSERT(rmapip->i_used_blocks <= rtg_blocks(rtg)); 209 201 oz->oz_written += len; 210 - if (oz->oz_written == rtg_blocks(rtg)) 202 + if (oz->oz_written == rtg_blocks(oz->oz_rtg)) 211 203 xfs_open_zone_mark_full(oz); 212 - xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE); 213 204 } 214 205 215 206 /* ··· 218 227 trace_xfs_zone_skip_blocks(oz, 0, len); 219 228 220 229 xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); 221 - oz->oz_written += len; 222 - if (oz->oz_written == rtg_blocks(rtg)) 223 - xfs_open_zone_mark_full(oz); 230 + xfs_zone_inc_written(oz, 
len); 224 231 xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP); 225 232 226 233 xfs_add_frextents(rtg_mount(rtg), len); ··· 233 244 xfs_fsblock_t old_startblock) 234 245 { 235 246 struct xfs_bmbt_irec data; 247 + struct xfs_rtgroup *rtg = oz->oz_rtg; 248 + struct xfs_inode *rmapip = rtg_rmap(rtg); 236 249 int nmaps = 1; 237 250 int error; 238 251 ··· 293 302 } 294 303 } 295 304 296 - xfs_zone_record_blocks(tp, oz, new->br_startblock, new->br_blockcount); 305 + trace_xfs_zone_record_blocks(oz, 306 + xfs_rtb_to_rgbno(tp->t_mountp, new->br_startblock), 307 + new->br_blockcount); 308 + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); 309 + xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP); 310 + rmapip->i_used_blocks += new->br_blockcount; 311 + ASSERT(rmapip->i_used_blocks <= rtg_blocks(rtg)); 312 + xfs_zone_inc_written(oz, new->br_blockcount); 313 + xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE); 297 314 298 315 /* Map the new blocks into the data fork. */ 299 316 xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, new); ··· 559 560 struct xfs_open_zone *oz, 560 561 unsigned int goodness) 561 562 { 563 + if (oz->oz_is_gc) 564 + return false; 565 + 562 566 if (oz->oz_allocated == rtg_blocks(oz->oz_rtg)) 563 567 return false; 564 568 ··· 683 681 if (oz) 684 682 goto out_unlock; 685 683 686 - if (pack_tight) 684 + if (pack_tight) { 687 685 oz = xfs_select_open_zone_mru(zi, write_hint); 688 - if (oz) 689 - goto out_unlock; 686 + if (oz) 687 + goto out_unlock; 688 + } 690 689 691 690 /* 692 691 * See if we can open a new zone and use that so that data for different ··· 698 695 goto out_unlock; 699 696 700 697 /* 701 - * Try to find an zone that is an ok match to colocate data with. 698 + * Try to find a zone that is an ok match to colocate data with. 
699 */ 700 oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_OK); 701 if (oz) ··· 1235 1232 kfree(zi); 1236 1233 } 1237 1234 1235 + static int 1236 + xfs_report_zones( 1237 + struct xfs_mount *mp, 1238 + struct xfs_init_zones *iz) 1239 + { 1240 + struct xfs_rtgroup *rtg = NULL; 1241 + 1242 + while ((rtg = xfs_rtgroup_next(mp, rtg))) { 1243 + xfs_rgblock_t write_pointer; 1244 + int error; 1245 + 1246 + error = xfs_query_write_pointer(iz, rtg, &write_pointer); 1247 + if (!error) 1248 + error = xfs_init_zone(iz, rtg, write_pointer); 1249 + if (error) { 1250 + xfs_rtgroup_rele(rtg); 1251 + return error; 1252 + } 1253 + } 1254 + 1255 + return 0; 1256 + } 1257 + 1258 + static inline bool 1259 + xfs_zone_is_conv( 1260 + struct xfs_rtgroup *rtg) 1261 + { 1262 + return !bdev_zone_is_seq(rtg_mount(rtg)->m_rtdev_targp->bt_bdev, 1263 + xfs_gbno_to_daddr(rtg_group(rtg), 0)); 1264 + } 1265 + 1266 + static struct xfs_open_zone * 1267 + xfs_find_fullest_conventional_open_zone( 1268 + struct xfs_mount *mp) 1269 + { 1270 + struct xfs_zone_info *zi = mp->m_zone_info; 1271 + struct xfs_open_zone *found = NULL, *oz; 1272 + 1273 + spin_lock(&zi->zi_open_zones_lock); 1274 + list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) { 1275 + if (!xfs_zone_is_conv(oz->oz_rtg)) 1276 + continue; 1277 + if (!found || oz->oz_allocated > found->oz_allocated) 1278 + found = oz; 1279 + } 1280 + spin_unlock(&zi->zi_open_zones_lock); 1281 + 1282 + return found; 1283 + } 1284 + 1285 + /* 1286 + * Find the fullest conventional zones and remove them from the open zone pool 1287 + * until we are at the open zone limit. 1288 + * 1289 + * We can end up with spurious "open" zones when the last blocks in a fully 1290 + * written zone were invalidated as there is no write pointer for conventional 1291 + * zones. 
1292 + * 1293 + * If we are still over the limit when there is no conventional open zone left, 1294 + * the user overrode the max open zones limit using the max_open_zones mount 1295 + * option and we should fail. 1296 + */ 1297 + static int 1298 + xfs_finish_spurious_open_zones( 1299 + struct xfs_mount *mp, 1300 + struct xfs_init_zones *iz) 1301 + { 1302 + struct xfs_zone_info *zi = mp->m_zone_info; 1303 + 1304 + while (zi->zi_nr_open_zones > mp->m_max_open_zones) { 1305 + struct xfs_open_zone *oz; 1306 + xfs_filblks_t adjust; 1307 + 1308 + oz = xfs_find_fullest_conventional_open_zone(mp); 1309 + if (!oz) { 1310 + xfs_err(mp, 1311 + "too many open zones for max_open_zones limit (%u/%u)", 1312 + zi->zi_nr_open_zones, mp->m_max_open_zones); 1313 + return -EINVAL; 1314 + } 1315 + 1316 + xfs_rtgroup_lock(oz->oz_rtg, XFS_RTGLOCK_RMAP); 1317 + adjust = rtg_blocks(oz->oz_rtg) - oz->oz_written; 1318 + trace_xfs_zone_spurious_open(oz, oz->oz_written, adjust); 1319 + oz->oz_written = rtg_blocks(oz->oz_rtg); 1320 + xfs_open_zone_mark_full(oz); 1321 + xfs_rtgroup_unlock(oz->oz_rtg, XFS_RTGLOCK_RMAP); 1322 + iz->available -= adjust; 1323 + iz->reclaimable += adjust; 1324 + } 1325 + 1326 + return 0; 1327 + } 1328 + 1238 1329 int 1239 1330 xfs_mount_zones( 1240 1331 struct xfs_mount *mp) ··· 1337 1240 .zone_capacity = mp->m_groups[XG_TYPE_RTG].blocks, 1338 1241 .zone_size = xfs_rtgroup_raw_size(mp), 1339 1242 }; 1340 - struct xfs_rtgroup *rtg = NULL; 1341 1243 int error; 1342 1244 1343 1245 if (!mp->m_rtdev_targp) { ··· 1366 1270 if (!mp->m_zone_info) 1367 1271 return -ENOMEM; 1368 1272 1369 - xfs_info(mp, "%u zones of %u blocks (%u max open zones)", 1370 - mp->m_sb.sb_rgcount, iz.zone_capacity, mp->m_max_open_zones); 1371 - trace_xfs_zones_mount(mp); 1273 + error = xfs_report_zones(mp, &iz); 1274 + if (error) 1275 + goto out_free_zone_info; 1276 + 1277 + error = xfs_finish_spurious_open_zones(mp, &iz); 1278 + if (error) 1279 + goto out_free_zone_info; 1280 + 1281 + 
xfs_set_freecounter(mp, XC_FREE_RTAVAILABLE, iz.available); 1282 + xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, 1283 + iz.available + iz.reclaimable); 1372 1284 1373 1285 /* 1374 1286 * The writeback code switches between inodes regularly to provide ··· 1402 1298 XFS_FSB_TO_B(mp, min(iz.zone_capacity, XFS_MAX_BMBT_EXTLEN)) >> 1403 1299 PAGE_SHIFT; 1404 1300 1405 - while ((rtg = xfs_rtgroup_next(mp, rtg))) { 1406 - xfs_rgblock_t write_pointer; 1407 - 1408 - error = xfs_query_write_pointer(&iz, rtg, &write_pointer); 1409 - if (!error) 1410 - error = xfs_init_zone(&iz, rtg, write_pointer); 1411 - if (error) { 1412 - xfs_rtgroup_rele(rtg); 1413 - goto out_free_zone_info; 1414 - } 1415 - } 1416 - 1417 - xfs_set_freecounter(mp, XC_FREE_RTAVAILABLE, iz.available); 1418 - xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, 1419 - iz.available + iz.reclaimable); 1420 - 1421 1301 /* 1422 1302 * The user may configure GC to free up a percentage of unused blocks. 1423 1303 * By default this is 0. GC will always trigger at the minimum level ··· 1412 1324 error = xfs_zone_gc_mount(mp); 1413 1325 if (error) 1414 1326 goto out_free_zone_info; 1327 + 1328 + xfs_info(mp, "%u zones of %u blocks (%u max open zones)", 1329 + mp->m_sb.sb_rgcount, iz.zone_capacity, mp->m_max_open_zones); 1330 + trace_xfs_zones_mount(mp); 1415 1331 return 0; 1416 1332 1417 1333 out_free_zone_info:

+4

fs/xfs/xfs_zone_alloc.h

··· 51 51 void xfs_unmount_zones(struct xfs_mount *mp); 52 52 void xfs_zone_gc_start(struct xfs_mount *mp); 53 53 void xfs_zone_gc_stop(struct xfs_mount *mp); 54 + void xfs_zone_gc_wakeup(struct xfs_mount *mp); 54 55 #else 55 56 static inline int xfs_mount_zones(struct xfs_mount *mp) 56 57 { ··· 64 63 { 65 64 } 66 65 static inline void xfs_zone_gc_stop(struct xfs_mount *mp) 66 + { 67 + } 68 + static inline void xfs_zone_gc_wakeup(struct xfs_mount *mp) 67 69 { 68 70 } 69 71 #endif /* CONFIG_XFS_RT */

+146 -131

fs/xfs/xfs_zone_gc.c

··· 125 125 */ 126 126 struct xfs_zone_gc_data { 127 127 struct xfs_mount *mp; 128 + struct xfs_open_zone *oz; 128 129 129 130 /* bioset used to allocate the gc_bios */ 130 131 struct bio_set bio_set; ··· 171 170 s64 available, free, threshold; 172 171 s32 remainder; 173 172 173 + /* If we have no reclaimable blocks, running GC is useless. */ 174 174 if (!xfs_zoned_have_reclaimable(mp->m_zone_info)) 175 175 return false; 176 176 177 + /* 178 + * In order to avoid file fragmentation as much as possible, we should 179 + * make sure that we can open enough zones. So trigger GC if the number 180 + * of blocks immediately available for writes is lower than the total 181 + * number of blocks from all possible open zones. 182 + */ 177 183 available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE); 178 - 179 184 if (available < 180 185 xfs_rtgs_to_rfsbs(mp, mp->m_max_open_zones - XFS_OPEN_GC_ZONES)) 181 186 return true; 182 187 183 - free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS); 188 + /* 189 + * For cases where the user wants to be more aggressive with GC, 190 + * the sysfs attribute zonegc_low_space may be set to a non zero value, 191 + * to indicate that GC should try to maintain at least zonegc_low_space 192 + * percent of the free space to be directly available for writing. Check 193 + * this here. 
194 + */ 195 + if (!mp->m_zonegc_low_space) 196 + return false; 184 197 198 + free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS); 185 199 threshold = div_s64_rem(free, 100, &remainder); 186 200 threshold = threshold * mp->m_zonegc_low_space + 187 201 remainder * div_s64(mp->m_zonegc_low_space, 100); 188 202 189 - if (available < threshold) 190 - return true; 191 - 192 - return false; 203 + return available < threshold; 193 204 } 194 205 195 206 static struct xfs_zone_gc_data * ··· 375 362 } 376 363 377 364 static bool 378 - xfs_zone_gc_iter_next( 365 + xfs_zone_gc_iter_irec( 379 366 struct xfs_mount *mp, 380 367 struct xfs_zone_gc_iter *iter, 381 368 struct xfs_rmap_irec *chunk_rec, ··· 383 370 { 384 371 struct xfs_rmap_irec *irec; 385 372 int error; 386 - 387 - if (!iter->victim_rtg) 388 - return false; 389 373 390 374 retry: 391 375 if (iter->rec_idx == iter->rec_count) { ··· 525 515 return true; 526 516 } 527 517 528 - static struct xfs_open_zone * 529 - xfs_zone_gc_steal_open( 530 - struct xfs_zone_info *zi) 518 + static int 519 + xfs_zone_gc_steal_open_zone( 520 + struct xfs_zone_gc_data *data) 531 521 { 522 + struct xfs_zone_info *zi = data->mp->m_zone_info; 532 523 struct xfs_open_zone *oz, *found = NULL; 533 524 534 525 spin_lock(&zi->zi_open_zones_lock); ··· 537 526 if (!found || oz->oz_allocated < found->oz_allocated) 538 527 found = oz; 539 528 } 540 - 541 - if (found) { 542 - found->oz_is_gc = true; 543 - list_del_init(&found->oz_entry); 544 - zi->zi_nr_open_zones--; 529 + if (!found) { 530 + spin_unlock(&zi->zi_open_zones_lock); 531 + return -EIO; 545 532 } 546 533 534 + trace_xfs_zone_gc_target_stolen(found->oz_rtg); 535 + found->oz_is_gc = true; 536 + zi->zi_nr_open_zones--; 537 + zi->zi_nr_open_gc_zones++; 547 538 spin_unlock(&zi->zi_open_zones_lock); 548 - return found; 549 - } 550 539 551 - static struct xfs_open_zone * 552 - xfs_zone_gc_select_target( 553 - struct xfs_mount *mp) 554 - { 555 - struct xfs_zone_info *zi = mp->m_zone_info; 556 - 
struct xfs_open_zone *oz = zi->zi_open_gc_zone; 557 - 558 - /* 559 - * We need to wait for pending writes to finish. 560 - */ 561 - if (oz && oz->oz_written < rtg_blocks(oz->oz_rtg)) 562 - return NULL; 563 - 564 - ASSERT(zi->zi_nr_open_zones <= 565 - mp->m_max_open_zones - XFS_OPEN_GC_ZONES); 566 - oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true); 567 - if (oz) 568 - trace_xfs_zone_gc_target_opened(oz->oz_rtg); 569 - spin_lock(&zi->zi_open_zones_lock); 570 - zi->zi_open_gc_zone = oz; 571 - spin_unlock(&zi->zi_open_zones_lock); 572 - return oz; 540 + atomic_inc(&found->oz_ref); 541 + data->oz = found; 542 + return 0; 573 543 } 574 544 575 545 /* 576 - * Ensure we have a valid open zone to write the GC data to. 577 - * 578 - * If the current target zone has space keep writing to it, else first wait for 579 - * all pending writes and then pick a new one. 546 + * Ensure we have a valid open zone to write to. 580 547 */ 581 - static struct xfs_open_zone * 582 - xfs_zone_gc_ensure_target( 583 - struct xfs_mount *mp) 548 + static bool 549 + xfs_zone_gc_select_target( 550 + struct xfs_zone_gc_data *data) 584 551 { 585 - struct xfs_open_zone *oz = mp->m_zone_info->zi_open_gc_zone; 552 + struct xfs_zone_info *zi = data->mp->m_zone_info; 586 553 587 - if (!oz || oz->oz_allocated == rtg_blocks(oz->oz_rtg)) 588 - return xfs_zone_gc_select_target(mp); 589 - return oz; 554 + if (data->oz) { 555 + /* 556 + * If we have space available, just keep using the existing 557 + * zone. 558 + */ 559 + if (data->oz->oz_allocated < rtg_blocks(data->oz->oz_rtg)) 560 + return true; 561 + 562 + /* 563 + * Wait for all writes to the current zone to finish before 564 + * picking a new one. 565 + */ 566 + if (data->oz->oz_written < rtg_blocks(data->oz->oz_rtg)) 567 + return false; 568 + 569 + xfs_open_zone_put(data->oz); 570 + } 571 + 572 + /* 573 + * Open a new zone when there is none currently in use. 
574 + */ 575 + ASSERT(zi->zi_nr_open_zones <= 576 + data->mp->m_max_open_zones - XFS_OPEN_GC_ZONES); 577 + data->oz = xfs_open_zone(data->mp, WRITE_LIFE_NOT_SET, true); 578 + if (!data->oz) 579 + return false; 580 + trace_xfs_zone_gc_target_opened(data->oz->oz_rtg); 581 + atomic_inc(&data->oz->oz_ref); 582 + spin_lock(&zi->zi_open_zones_lock); 583 + zi->zi_nr_open_gc_zones++; 584 + list_add_tail(&data->oz->oz_entry, &zi->zi_open_zones); 585 + spin_unlock(&zi->zi_open_zones_lock); 586 + return true; 590 587 } 591 588 592 589 static void ··· 609 590 wake_up_process(data->mp->m_zone_info->zi_gc_thread); 610 591 } 611 592 612 - static struct xfs_open_zone * 593 + static bool 613 594 xfs_zone_gc_alloc_blocks( 614 595 struct xfs_zone_gc_data *data, 615 596 xfs_extlen_t *count_fsb, ··· 617 598 bool *is_seq) 618 599 { 619 600 struct xfs_mount *mp = data->mp; 620 - struct xfs_open_zone *oz; 621 - 622 - oz = xfs_zone_gc_ensure_target(mp); 623 - if (!oz) 624 - return NULL; 601 + struct xfs_open_zone *oz = data->oz; 625 602 626 603 *count_fsb = min(*count_fsb, XFS_B_TO_FSB(mp, data->scratch_available)); 627 604 ··· 639 624 spin_unlock(&mp->m_sb_lock); 640 625 641 626 if (!*count_fsb) 642 - return NULL; 627 + return false; 643 628 644 629 *daddr = xfs_gbno_to_daddr(rtg_group(oz->oz_rtg), 0); 645 630 *is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr); ··· 647 632 *daddr += XFS_FSB_TO_BB(mp, oz->oz_allocated); 648 633 oz->oz_allocated += *count_fsb; 649 634 atomic_inc(&oz->oz_ref); 650 - return oz; 635 + return true; 651 636 } 652 637 653 638 static void ··· 673 658 } 674 659 675 660 static bool 661 + xfs_zone_gc_can_start_chunk( 662 + struct xfs_zone_gc_data *data) 663 + { 664 + 665 + if (xfs_is_shutdown(data->mp)) 666 + return false; 667 + if (!data->scratch_available) 668 + return false; 669 + 670 + if (!data->iter.victim_rtg) { 671 + if (kthread_should_stop() || kthread_should_park()) 672 + return false; 673 + if (!xfs_zoned_need_gc(data->mp)) 674 + return false; 
675 + if (!xfs_zone_gc_select_victim(data)) 676 + return false; 677 + } 678 + 679 + return xfs_zone_gc_select_target(data); 680 + } 681 + 682 + static bool 676 683 xfs_zone_gc_start_chunk( 677 684 struct xfs_zone_gc_data *data) 678 685 { 679 686 struct xfs_zone_gc_iter *iter = &data->iter; 680 687 struct xfs_mount *mp = data->mp; 681 688 struct block_device *bdev = mp->m_rtdev_targp->bt_bdev; 682 - struct xfs_open_zone *oz; 683 689 struct xfs_rmap_irec irec; 684 690 struct xfs_gc_bio *chunk; 685 691 struct xfs_inode *ip; ··· 708 672 xfs_daddr_t daddr; 709 673 bool is_seq; 710 674 711 - if (xfs_is_shutdown(mp)) 675 + if (!xfs_zone_gc_can_start_chunk(data)) 712 676 return false; 713 677 714 - if (!xfs_zone_gc_iter_next(mp, iter, &irec, &ip)) 678 + set_current_state(TASK_RUNNING); 679 + if (!xfs_zone_gc_iter_irec(mp, iter, &irec, &ip)) 715 680 return false; 716 - oz = xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr, 717 - &is_seq); 718 - if (!oz) { 681 + 682 + if (!xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr, 683 + &is_seq)) { 719 684 xfs_irele(ip); 720 685 return false; 721 686 } ··· 736 699 chunk->new_daddr = daddr; 737 700 chunk->is_seq = is_seq; 738 701 chunk->data = data; 739 - chunk->oz = oz; 702 + chunk->oz = data->oz; 740 703 chunk->victim_rtg = iter->victim_rtg; 741 704 atomic_inc(&rtg_group(chunk->victim_rtg)->xg_active_ref); 742 705 atomic_inc(&chunk->victim_rtg->rtg_gccount); ··· 1022 985 } while (next); 1023 986 } 1024 987 1025 - static bool 1026 - xfs_zone_gc_should_start_new_work( 1027 - struct xfs_zone_gc_data *data) 1028 - { 1029 - struct xfs_open_zone *oz; 1030 - 1031 - if (xfs_is_shutdown(data->mp)) 1032 - return false; 1033 - if (!data->scratch_available) 1034 - return false; 1035 - 1036 - oz = xfs_zone_gc_ensure_target(data->mp); 1037 - if (!oz || oz->oz_allocated == rtg_blocks(oz->oz_rtg)) 1038 - return false; 1039 - 1040 - if (!data->iter.victim_rtg) { 1041 - if (kthread_should_stop() || kthread_should_park()) 1042 - 
return false; 1043 - if (!xfs_zoned_need_gc(data->mp)) 1044 - return false; 1045 - if (!xfs_zone_gc_select_victim(data)) 1046 - return false; 1047 - } 1048 - 1049 - return true; 1050 - } 1051 - 1052 988 /* 1053 989 * Handle the work to read and write data for GC and to reset the zones, 1054 990 * including handling all completions. ··· 1071 1061 } 1072 1062 blk_finish_plug(&plug); 1073 1063 1074 - if (xfs_zone_gc_should_start_new_work(data)) { 1075 - set_current_state(TASK_RUNNING); 1076 - blk_start_plug(&plug); 1077 - while (xfs_zone_gc_start_chunk(data)) 1078 - ; 1079 - blk_finish_plug(&plug); 1080 - } 1064 + blk_start_plug(&plug); 1065 + while (xfs_zone_gc_start_chunk(data)) 1066 + ; 1067 + blk_finish_plug(&plug); 1081 1068 } 1082 1069 1083 1070 /* ··· 1134 1127 } 1135 1128 xfs_clear_zonegc_running(mp); 1136 1129 1130 + if (data->oz) 1131 + xfs_open_zone_put(data->oz); 1137 1132 if (data->iter.victim_rtg) 1138 1133 xfs_rtgroup_rele(data->iter.victim_rtg); 1139 1134 ··· 1160 1151 kthread_park(mp->m_zone_info->zi_gc_thread); 1161 1152 } 1162 1153 1154 + void 1155 + xfs_zone_gc_wakeup( 1156 + struct xfs_mount *mp) 1157 + { 1158 + struct super_block *sb = mp->m_super; 1159 + 1160 + /* 1161 + * If we are unmounting the file system we must not try to 1162 + * wake gc as m_zone_info might have been freed already. 
1163 + */ 1164 + if (down_read_trylock(&sb->s_umount)) { 1165 + if (!xfs_is_readonly(mp)) 1166 + wake_up_process(mp->m_zone_info->zi_gc_thread); 1167 + up_read(&sb->s_umount); 1168 + } 1169 + } 1170 + 1163 1171 int 1164 1172 xfs_zone_gc_mount( 1165 1173 struct xfs_mount *mp) 1166 1174 { 1167 1175 struct xfs_zone_info *zi = mp->m_zone_info; 1168 1176 struct xfs_zone_gc_data *data; 1169 - struct xfs_open_zone *oz; 1170 1177 int error; 1171 1178 1179 + data = xfs_zone_gc_data_alloc(mp); 1180 + if (!data) 1181 + return -ENOMEM; 1182 + 1172 1183 /* 1173 - * If there are no free zones available for GC, pick the open zone with 1184 + * If there are no free zones available for GC, or the number of open 1185 + * zones has reached the open zone limit, pick the open zone with 1174 1186 * the least used space to GC into. This should only happen after an 1175 - * unclean shutdown near ENOSPC while GC was ongoing. 1176 - * 1177 - * We also need to do this for the first gc zone allocation if we 1178 - * unmounted while at the open limit. 1187 + * unclean shutdown while GC was ongoing. Otherwise a GC zone will 1188 + * be selected from the free zone pool on demand. 
1179 1189 */ 1180 1190 if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE) || 1181 - zi->zi_nr_open_zones == mp->m_max_open_zones) 1182 - oz = xfs_zone_gc_steal_open(zi); 1183 - else 1184 - oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true); 1185 - if (!oz) { 1186 - xfs_warn(mp, "unable to allocate a zone for gc"); 1187 - error = -EIO; 1188 - goto out; 1189 - } 1190 - 1191 - trace_xfs_zone_gc_target_opened(oz->oz_rtg); 1192 - zi->zi_open_gc_zone = oz; 1193 - 1194 - data = xfs_zone_gc_data_alloc(mp); 1195 - if (!data) { 1196 - error = -ENOMEM; 1197 - goto out_put_gc_zone; 1191 + zi->zi_nr_open_zones >= mp->m_max_open_zones) { 1192 + error = xfs_zone_gc_steal_open_zone(data); 1193 + if (error) { 1194 + xfs_warn(mp, "unable to steal an open zone for gc"); 1195 + goto out_free_gc_data; 1196 + } 1198 1197 } 1199 1198 1200 1199 zi->zi_gc_thread = kthread_create(xfs_zoned_gcd, data, ··· 1210 1193 if (IS_ERR(zi->zi_gc_thread)) { 1211 1194 xfs_warn(mp, "unable to create zone gc thread"); 1212 1195 error = PTR_ERR(zi->zi_gc_thread); 1213 - goto out_free_gc_data; 1196 + goto out_put_oz; 1214 1197 } 1215 1198 1216 1199 /* xfs_zone_gc_start will unpark for rw mounts */ 1217 1200 kthread_park(zi->zi_gc_thread); 1218 1201 return 0; 1219 1202 1203 + out_put_oz: 1204 + if (data->oz) 1205 + xfs_open_zone_put(data->oz); 1220 1206 out_free_gc_data: 1221 1207 kfree(data); 1222 - out_put_gc_zone: 1223 - xfs_open_zone_put(zi->zi_open_gc_zone); 1224 - out: 1225 1208 return error; 1226 1209 } 1227 1210 ··· 1232 1215 struct xfs_zone_info *zi = mp->m_zone_info; 1233 1216 1234 1217 kthread_stop(zi->zi_gc_thread); 1235 - if (zi->zi_open_gc_zone) 1236 - xfs_open_zone_put(zi->zi_open_gc_zone); 1237 1218 }

+14 -9

fs/xfs/xfs_zone_info.c

··· 30 30 struct seq_file *m, 31 31 struct xfs_open_zone *oz) 32 32 { 33 - seq_printf(m, "\t zone %d, wp %u, written %u, used %u, hint %s\n", 33 + seq_printf(m, "\t zone %d, wp %u, written %u, used %u, hint %s %s\n", 34 34 rtg_rgno(oz->oz_rtg), 35 35 oz->oz_allocated, oz->oz_written, 36 36 rtg_rmap(oz->oz_rtg)->i_used_blocks, 37 - xfs_write_hint_to_str(oz->oz_write_hint)); 37 + xfs_write_hint_to_str(oz->oz_write_hint), 38 + oz->oz_is_gc ? "(GC)" : ""); 38 39 } 39 40 40 41 static void ··· 59 58 spin_unlock(&zi->zi_used_buckets_lock); 60 59 61 60 full = mp->m_sb.sb_rgcount; 62 - if (zi->zi_open_gc_zone) 63 - full--; 64 61 full -= zi->zi_nr_open_zones; 62 + full -= zi->zi_nr_open_gc_zones; 65 63 full -= atomic_read(&zi->zi_nr_free_zones); 66 64 full -= reclaimable; 67 65 ··· 90 90 seq_printf(m, "\tRT GC required: %d\n", 91 91 xfs_zoned_need_gc(mp)); 92 92 93 + seq_printf(m, "\ttotal number of zones: %u\n", 94 + mp->m_sb.sb_rgcount); 93 95 seq_printf(m, "\tfree zones: %d\n", atomic_read(&zi->zi_nr_free_zones)); 94 - seq_puts(m, "\topen zones:\n"); 96 + 95 97 spin_lock(&zi->zi_open_zones_lock); 98 + seq_printf(m, "\tmax open zones: %u\n", 99 + mp->m_max_open_zones); 100 + seq_printf(m, "\tnr open zones: %u\n", 101 + zi->zi_nr_open_zones); 102 + seq_printf(m, "\tnr open GC zones: %u\n", 103 + zi->zi_nr_open_gc_zones); 104 + seq_puts(m, "\topen zones:\n"); 96 105 list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) 97 106 xfs_show_open_zone(m, oz); 98 - if (zi->zi_open_gc_zone) { 99 - seq_puts(m, "\topen gc zone:\n"); 100 - xfs_show_open_zone(m, zi->zi_open_gc_zone); 101 - } 102 107 spin_unlock(&zi->zi_open_zones_lock); 103 108 seq_puts(m, "\tused blocks distribution (fully written zones):\n"); 104 109 xfs_show_full_zone_used_distribution(m, mp);

+3 -12

fs/xfs/xfs_zone_priv.h

··· 32 32 */ 33 33 enum rw_hint oz_write_hint; 34 34 35 - /* 36 - * Is this open zone used for garbage collection? There can only be a 37 - * single open GC zone, which is pointed to by zi_open_gc_zone in 38 - * struct xfs_zone_info. Constant over the life time of an open zone. 39 - */ 35 + /* Is this open zone used for garbage collection? */ 40 36 bool oz_is_gc; 41 37 42 38 /* ··· 64 68 spinlock_t zi_open_zones_lock; 65 69 struct list_head zi_open_zones; 66 70 unsigned int zi_nr_open_zones; 71 + unsigned int zi_nr_open_gc_zones; 67 72 68 73 /* 69 74 * Free zone search cursor and number of free zones: ··· 78 81 wait_queue_head_t zi_zone_wait; 79 82 80 83 /* 81 - * Pointer to the GC thread, and the current open zone used by GC 82 - * (if any). 83 - * 84 - * zi_open_gc_zone is mostly private to the GC thread, but can be read 85 - * for debugging from other threads, in which case zi_open_zones_lock 86 - * must be taken to access it. 84 + * Pointer to the GC thread. 87 85 */ 88 86 struct task_struct *zi_gc_thread; 89 - struct xfs_open_zone *zi_open_gc_zone; 90 87 91 88 /* 92 89 * List of zones that need a reset:

Configure Feed

Configure Feed