Merge tag 'xfs-fixes-6.15-rc3' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

+50

Documentation/admin-guide/xfs.rst

··· 124 124 controls the size of each buffer and so is also relevant to 125 125 this case. 126 126 127 + lifetime (default) or nolifetime 128 + Enable data placement based on write life time hints provided 129 + by the user. This turns on co-allocation of data of similar 130 + life times when statistically favorable to reduce garbage 131 + collection cost. 132 + 133 + These options are only available for zoned rt file systems. 134 + 127 135 logbsize=value 128 136 Set the size of each in-memory log buffer. The size may be 129 137 specified in bytes, or in kilobytes with a "k" suffix. ··· 150 142 section, and a real-time section. The real-time section is 151 143 optional, and the log section can be separate from the data 152 144 section or contained within it. 145 + 146 + max_open_zones=value 147 + Specify the max number of zones to keep open for writing on a 148 + zoned rt device. Many open zones aid file data separation 149 + but may impact performance on HDDs. 150 + 151 + If ``max_open_zones`` is not specified, the value is determined 152 + by the capabilities and the size of the zoned rt device. 153 153 154 154 noalign 155 155 Data allocations will not be aligned at stripe unit ··· 558 542 nice Relative priority of scheduling the threads. These are the 559 543 same nice levels that can be applied to userspace processes. 560 544 ============ =========== 545 + 546 + Zoned Filesystems 547 + ================= 548 + 549 + For zoned file systems, the following attribute is exposed in: 550 + 551 + /sys/fs/xfs/<dev>/zoned/ 552 + 553 + max_open_zones (Min: 1 Default: Varies Max: UINTMAX) 554 + This read-only attribute exposes the maximum number of open zones 555 + available for data placement. The value is determined at mount time and 556 + is limited by the capabilities of the backing zoned device, file system 557 + size and the max_open_zones mount option. 
558 + 559 + Zoned Filesystems 560 + ================= 561 + 562 + For zoned file systems, the following attributes are exposed in: 563 + 564 + /sys/fs/xfs/<dev>/zoned/ 565 + 566 + max_open_zones (Min: 1 Default: Varies Max: UINTMAX) 567 + This read-only attribute exposes the maximum number of open zones 568 + available for data placement. The value is determined at mount time and 569 + is limited by the capabilities of the backing zoned device, file system 570 + size and the max_open_zones mount option. 571 + 572 + zonegc_low_space (Min: 0 Default: 0 Max: 100) 573 + Define the percentage of the unused space that GC should keep 574 + available for writing. A high value will reclaim more of the space 575 + occupied by unused blocks, creating a larger buffer against write 576 + bursts at the cost of increased write amplification. Regardless 577 + of this value, garbage collection will always aim to free a minimum 578 + number of blocks to keep max_open_zones open for data placement purposes.

+1

fs/xfs/xfs_buf.c

··· 105 105 { 106 106 unsigned int size = BBTOB(bp->b_length); 107 107 108 + might_sleep(); 108 109 trace_xfs_buf_free(bp, _RET_IP_); 109 110 110 111 ASSERT(list_empty(&bp->b_lru));

+1 -1

fs/xfs/xfs_buf_mem.c

··· 165 165 folio_set_dirty(folio); 166 166 folio_unlock(folio); 167 167 168 - bp->b_addr = folio_address(folio); 168 + bp->b_addr = folio_address(folio) + offset_in_folio(folio, pos); 169 169 return 0; 170 170 } 171 171

+1 -2

fs/xfs/xfs_dquot.c

··· 1186 1186 if (test_bit(XFS_LI_IN_AIL, &lip->li_flags) && 1187 1187 (lip->li_lsn == qlip->qli_flush_lsn || 1188 1188 test_bit(XFS_LI_FAILED, &lip->li_flags))) { 1189 - 1190 1189 spin_lock(&ailp->ail_lock); 1191 - xfs_clear_li_failed(lip); 1190 + clear_bit(XFS_LI_FAILED, &lip->li_flags); 1192 1191 if (lip->li_lsn == qlip->qli_flush_lsn) { 1193 1192 /* xfs_ail_update_finish() drops the AIL lock */ 1194 1193 tail_lsn = xfs_ail_delete_one(ailp, lip);

+33 -18

fs/xfs/xfs_fsmap.c

··· 876 876 const struct xfs_fsmap *keys, 877 877 struct xfs_getfsmap_info *info) 878 878 { 879 + struct xfs_fsmap key0 = *keys; /* struct copy */ 879 880 struct xfs_mount *mp = tp->t_mountp; 880 881 struct xfs_rtgroup *rtg = NULL; 881 882 struct xfs_btree_cur *bt_cur = NULL; ··· 888 887 int error = 0; 889 888 890 889 eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart + mp->m_sb.sb_rblocks); 891 - if (keys[0].fmr_physical >= eofs) 890 + if (key0.fmr_physical >= eofs) 892 891 return 0; 893 892 893 + /* 894 + * On zoned filesystems with an internal rt volume, the volume comes 895 + * immediately after the end of the data volume. However, the 896 + * xfs_rtblock_t address space is relative to the start of the data 897 + * device, which means that the first @rtstart fsblocks do not actually 898 + * point anywhere. If a fsmap query comes in with the low key starting 899 + * below @rtstart, report it as "owned by filesystem". 900 + */ 894 901 rtstart_daddr = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart); 895 - if (keys[0].fmr_physical < rtstart_daddr) { 902 + if (xfs_has_zoned(mp) && key0.fmr_physical < rtstart_daddr) { 896 903 struct xfs_fsmap_irec frec = { 897 904 .owner = XFS_RMAP_OWN_FS, 898 905 .len_daddr = rtstart_daddr, 899 906 }; 900 907 901 - /* Adjust the low key if we are continuing from where we left off. */ 902 - if (keys[0].fmr_length > 0) { 903 - info->low_daddr = keys[0].fmr_physical + keys[0].fmr_length; 904 - return 0; 908 + /* 909 + * Adjust the start of the query range if we're picking up from 910 + * a previous round, and only emit the record if we haven't 911 + * already gone past. 
912 + */ 913 + key0.fmr_physical += key0.fmr_length; 914 + if (key0.fmr_physical < rtstart_daddr) { 915 + error = xfs_getfsmap_helper(tp, info, &frec); 916 + if (error) 917 + return error; 918 + 919 + key0.fmr_physical = rtstart_daddr; 905 920 } 906 921 907 - /* Fabricate an rmap entry for space occupied by the data dev */ 908 - error = xfs_getfsmap_helper(tp, info, &frec); 909 - if (error) 910 - return error; 922 + /* Zero the other fields to avoid further adjustments. */ 923 + key0.fmr_owner = 0; 924 + key0.fmr_offset = 0; 925 + key0.fmr_length = 0; 911 926 } 912 927 913 - start_rtb = xfs_daddr_to_rtb(mp, rtstart_daddr + keys[0].fmr_physical); 914 - end_rtb = xfs_daddr_to_rtb(mp, rtstart_daddr + 915 - min(eofs - 1, keys[1].fmr_physical)); 916 - 928 + start_rtb = xfs_daddr_to_rtb(mp, key0.fmr_physical); 929 + end_rtb = xfs_daddr_to_rtb(mp, min(eofs - 1, keys[1].fmr_physical)); 917 930 info->missing_owner = XFS_FMR_OWN_FREE; 918 931 919 932 /* ··· 935 920 * low to the fsmap low key and max out the high key to the end 936 921 * of the rtgroup. 937 922 */ 938 - info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset); 939 - error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]); 923 + info->low.rm_offset = XFS_BB_TO_FSBT(mp, key0.fmr_offset); 924 + error = xfs_fsmap_owner_to_rmap(&info->low, &key0); 940 925 if (error) 941 926 return error; 942 - info->low.rm_blockcount = XFS_BB_TO_FSBT(mp, keys[0].fmr_length); 943 - xfs_getfsmap_set_irec_flags(&info->low, &keys[0]); 927 + info->low.rm_blockcount = XFS_BB_TO_FSBT(mp, key0.fmr_length); 928 + xfs_getfsmap_set_irec_flags(&info->low, &key0); 944 929 945 930 /* Adjust the low key if we are continuing from where we left off. */ 946 931 if (info->low.rm_blockcount == 0) {

-6

fs/xfs/xfs_inode_item.c

··· 1089 1089 * state. Whilst the inode is in the AIL, it should have a valid buffer 1090 1090 * pointer for push operations to access - it is only safe to remove the 1091 1091 * inode from the buffer once it has been removed from the AIL. 1092 - * 1093 - * We also clear the failed bit before removing the item from the AIL 1094 - * as xfs_trans_ail_delete()->xfs_clear_li_failed() will release buffer 1095 - * references the inode item owns and needs to hold until we've fully 1096 - * aborted the inode log item and detached it from the buffer. 1097 1092 */ 1098 - clear_bit(XFS_LI_FAILED, &iip->ili_item.li_flags); 1099 1093 xfs_trans_ail_delete(&iip->ili_item, 0); 1100 1094 1101 1095 /*

+1 -1

fs/xfs/xfs_log.c

··· 2888 2888 * 2889 2889 * 1. the current iclog is active and has no data; the previous iclog 2890 2890 * is in the active or dirty state. 2891 - * 2. the current iclog is drity, and the previous iclog is in the 2891 + * 2. the current iclog is dirty, and the previous iclog is in the 2892 2892 * active or dirty state. 2893 2893 * 2894 2894 * We may sleep if:

+1

fs/xfs/xfs_mount.h

··· 229 229 bool m_finobt_nores; /* no per-AG finobt resv. */ 230 230 bool m_update_sb; /* sb needs update in mount */ 231 231 unsigned int m_max_open_zones; 232 + unsigned int m_zonegc_low_space; 232 233 233 234 /* 234 235 * Bitsets of per-fs metadata that have been checked and/or are sick.

+32

fs/xfs/xfs_sysfs.c

··· 718 718 } 719 719 XFS_SYSFS_ATTR_RO(max_open_zones); 720 720 721 + static ssize_t 722 + zonegc_low_space_store( 723 + struct kobject *kobj, 724 + const char *buf, 725 + size_t count) 726 + { 727 + int ret; 728 + unsigned int val; 729 + 730 + ret = kstrtouint(buf, 0, &val); 731 + if (ret) 732 + return ret; 733 + 734 + if (val > 100) 735 + return -EINVAL; 736 + 737 + zoned_to_mp(kobj)->m_zonegc_low_space = val; 738 + 739 + return count; 740 + } 741 + 742 + static ssize_t 743 + zonegc_low_space_show( 744 + struct kobject *kobj, 745 + char *buf) 746 + { 747 + return sysfs_emit(buf, "%u\n", 748 + zoned_to_mp(kobj)->m_zonegc_low_space); 749 + } 750 + XFS_SYSFS_ATTR_RW(zonegc_low_space); 751 + 721 752 static struct attribute *xfs_zoned_attrs[] = { 722 753 ATTR_LIST(max_open_zones), 754 + ATTR_LIST(zonegc_low_space), 723 755 NULL, 724 756 }; 725 757 ATTRIBUTE_GROUPS(xfs_zoned);

+2 -3

fs/xfs/xfs_trans_ail.c

··· 909 909 return; 910 910 } 911 911 912 - /* xfs_ail_update_finish() drops the AIL lock */ 913 - xfs_clear_li_failed(lip); 912 + clear_bit(XFS_LI_FAILED, &lip->li_flags); 914 913 tail_lsn = xfs_ail_delete_one(ailp, lip); 915 - xfs_ail_update_finish(ailp, tail_lsn); 914 + xfs_ail_update_finish(ailp, tail_lsn); /* drops the AIL lock */ 916 915 } 917 916 918 917 int

-28

fs/xfs/xfs_trans_priv.h

··· 167 167 } 168 168 #endif 169 169 170 - static inline void 171 - xfs_clear_li_failed( 172 - struct xfs_log_item *lip) 173 - { 174 - struct xfs_buf *bp = lip->li_buf; 175 - 176 - ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags)); 177 - lockdep_assert_held(&lip->li_ailp->ail_lock); 178 - 179 - if (test_and_clear_bit(XFS_LI_FAILED, &lip->li_flags)) { 180 - lip->li_buf = NULL; 181 - xfs_buf_rele(bp); 182 - } 183 - } 184 - 185 - static inline void 186 - xfs_set_li_failed( 187 - struct xfs_log_item *lip, 188 - struct xfs_buf *bp) 189 - { 190 - lockdep_assert_held(&lip->li_ailp->ail_lock); 191 - 192 - if (!test_and_set_bit(XFS_LI_FAILED, &lip->li_flags)) { 193 - xfs_buf_hold(bp); 194 - lip->li_buf = bp; 195 - } 196 - } 197 - 198 170 #endif /* __XFS_TRANS_PRIV_H__ */

+7

fs/xfs/xfs_zone_alloc.c

··· 1201 1201 xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, 1202 1202 iz.available + iz.reclaimable); 1203 1203 1204 + /* 1205 + * The user may configure GC to free up a percentage of unused blocks. 1206 + * By default this is 0. GC will always trigger at the minimum level 1207 + * for keeping max_open_zones available for data placement. 1208 + */ 1209 + mp->m_zonegc_low_space = 0; 1210 + 1204 1211 error = xfs_zone_gc_mount(mp); 1205 1212 if (error) 1206 1213 goto out_free_zone_info;

+14 -2

fs/xfs/xfs_zone_gc.c

··· 162 162 163 163 /* 164 164 * We aim to keep enough zones free in stock to fully use the open zone limit 165 - * for data placement purposes. 165 + * for data placement purposes. Additionally, the m_zonegc_low_space tunable 166 + * can be set to make sure a fraction of the unused blocks are available for 167 + * writing. 166 168 */ 167 169 bool 168 170 xfs_zoned_need_gc( 169 171 struct xfs_mount *mp) 170 172 { 173 + s64 available, free; 174 + 171 175 if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE)) 172 176 return false; 173 - if (xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE) < 177 + 178 + available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE); 179 + 180 + if (available < 174 181 mp->m_groups[XG_TYPE_RTG].blocks * 175 182 (mp->m_max_open_zones - XFS_OPEN_GC_ZONES)) 176 183 return true; 184 + 185 + free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS); 186 + if (available < mult_frac(free, mp->m_zonegc_low_space, 100)) 187 + return true; 188 + 177 189 return false; 178 190 } 179 191

Configure Feed

Configure Feed