Merge tag 'xfs-fixes-6.18-rc3' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

+10 -1

fs/xfs/Kconfig

··· 119 119 120 120 See the xfs man page in section 5 for additional information. 121 121 122 + This option is mandatory to support zoned block devices. For these 123 + devices, the realtime subvolume must be backed by a zoned block 124 + device and a regular block device used as the main device (for 125 + metadata). If the zoned block device is a host-managed SMR hard-disk 126 + containing conventional zones at the beginning of its address space, 127 + XFS will use the disk conventional zones as the main device and the 128 + remaining sequential write required zones as the backing storage for 129 + the realtime subvolume. 130 + 122 131 If unsure, say N. 123 132 124 133 config XFS_DRAIN_INTENTS ··· 165 156 bool "XFS online metadata check usage data collection" 166 157 default y 167 158 depends on XFS_ONLINE_SCRUB 168 - select DEBUG_FS 159 + depends on DEBUG_FS 169 160 help 170 161 If you say Y here, the kernel will gather usage data about 171 162 the online metadata check subsystem. This includes the number

+31 -3

fs/xfs/scrub/nlinks.c

··· 376 376 return error; 377 377 } 378 378 379 + static uint 380 + xchk_nlinks_ilock_dir( 381 + struct xfs_inode *ip) 382 + { 383 + uint lock_mode = XFS_ILOCK_SHARED; 384 + 385 + /* 386 + * We're going to scan the directory entries, so we must be ready to 387 + * pull the data fork mappings into memory if they aren't already. 388 + */ 389 + if (xfs_need_iread_extents(&ip->i_df)) 390 + lock_mode = XFS_ILOCK_EXCL; 391 + 392 + /* 393 + * We're going to scan the parent pointers, so we must be ready to 394 + * pull the attr fork mappings into memory if they aren't already. 395 + */ 396 + if (xfs_has_parent(ip->i_mount) && xfs_inode_has_attr_fork(ip) && 397 + xfs_need_iread_extents(&ip->i_af)) 398 + lock_mode = XFS_ILOCK_EXCL; 399 + 400 + /* 401 + * Take the IOLOCK so that other threads cannot start a directory 402 + * update while we're scanning. 403 + */ 404 + lock_mode |= XFS_IOLOCK_SHARED; 405 + xfs_ilock(ip, lock_mode); 406 + return lock_mode; 407 + } 408 + 379 409 /* Walk a directory to bump the observed link counts of the children. */ 380 410 STATIC int 381 411 xchk_nlinks_collect_dir( ··· 424 394 return 0; 425 395 426 396 /* Prevent anyone from changing this directory while we walk it. */ 427 - xfs_ilock(dp, XFS_IOLOCK_SHARED); 428 - lock_mode = xfs_ilock_data_map_shared(dp); 397 + lock_mode = xchk_nlinks_ilock_dir(dp); 429 398 430 399 /* 431 400 * The dotdot entry of an unlinked directory still points to the last ··· 481 452 xchk_iscan_abort(&xnc->collect_iscan); 482 453 out_unlock: 483 454 xfs_iunlock(dp, lock_mode); 484 - xfs_iunlock(dp, XFS_IOLOCK_SHARED); 485 455 return error; 486 456 } 487 457

+1 -1

fs/xfs/xfs_buf.c

··· 1751 1751 const char *descr) 1752 1752 { 1753 1753 /* The maximum size of the buftarg is only known once the sb is read. */ 1754 - btp->bt_nr_sectors = (xfs_daddr_t)-1; 1754 + btp->bt_nr_sectors = XFS_BUF_DADDR_MAX; 1755 1755 1756 1756 /* Set up device logical sector size mask */ 1757 1757 btp->bt_logical_sectorsize = logical_sectorsize;

+1

fs/xfs/xfs_buf.h

··· 22 22 */ 23 23 struct xfs_buf; 24 24 25 + #define XFS_BUF_DADDR_MAX ((xfs_daddr_t) S64_MAX) 25 26 #define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL)) 26 27 27 28 #define XBF_READ (1u << 0) /* buffer intended for reading from device */

-1

fs/xfs/xfs_mount.h

··· 236 236 bool m_update_sb; /* sb needs update in mount */ 237 237 unsigned int m_max_open_zones; 238 238 unsigned int m_zonegc_low_space; 239 - struct xfs_mru_cache *m_zone_cache; /* Inode to open zone cache */ 240 239 241 240 /* max_atomic_write mount option value */ 242 241 unsigned long long m_awu_max_bytes;

+42 -11

fs/xfs/xfs_super.c

··· 102 102 * Table driven mount option parser. 103 103 */ 104 104 enum { 105 - Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev, 105 + Op_deprecated, Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev, 106 106 Opt_wsync, Opt_noalign, Opt_swalloc, Opt_sunit, Opt_swidth, Opt_nouuid, 107 107 Opt_grpid, Opt_nogrpid, Opt_bsdgroups, Opt_sysvgroups, 108 108 Opt_allocsize, Opt_norecovery, Opt_inode64, Opt_inode32, ··· 114 114 Opt_lifetime, Opt_nolifetime, Opt_max_atomic_write, 115 115 }; 116 116 117 + #define fsparam_dead(NAME) \ 118 + __fsparam(NULL, (NAME), Op_deprecated, fs_param_deprecated, NULL) 119 + 117 120 static const struct fs_parameter_spec xfs_fs_parameters[] = { 121 + /* 122 + * These mount options were supposed to be deprecated in September 2025 123 + * but the deprecation warning was buggy, so not all users were 124 + * notified. The deprecation is now obnoxiously loud and postponed to 125 + * September 2030. 126 + */ 127 + fsparam_dead("attr2"), 128 + fsparam_dead("noattr2"), 129 + fsparam_dead("ikeep"), 130 + fsparam_dead("noikeep"), 131 + 118 132 fsparam_u32("logbufs", Opt_logbufs), 119 133 fsparam_string("logbsize", Opt_logbsize), 120 134 fsparam_string("logdev", Opt_logdev), ··· 800 786 801 787 truncate_inode_pages_final(&inode->i_data); 802 788 clear_inode(inode); 789 + 790 + if (IS_ENABLED(CONFIG_XFS_RT) && 791 + S_ISREG(inode->i_mode) && inode->i_private) { 792 + xfs_open_zone_put(inode->i_private); 793 + inode->i_private = NULL; 794 + } 803 795 } 804 796 805 797 static void ··· 1393 1373 static inline void 1394 1374 xfs_fs_warn_deprecated( 1395 1375 struct fs_context *fc, 1396 - struct fs_parameter *param, 1397 - uint64_t flag, 1398 - bool value) 1376 + struct fs_parameter *param) 1399 1377 { 1400 - /* Don't print the warning if reconfiguring and current mount point 1401 - * already had the flag set 1378 + /* 1379 + * Always warn about someone passing in a deprecated mount option. 1380 + * Previously we wouldn't print the warning if we were reconfiguring 1381 + * and current mount point already had the flag set, but that was not 1382 + * the right thing to do. 1383 + * 1384 + * Many distributions mount the root filesystem with no options in the 1385 + * initramfs and rely on mount -a to remount the root fs with the 1386 + * options in fstab. However, the old behavior meant that there would 1387 + * never be a warning about deprecated mount options for the root fs in 1388 + * /etc/fstab. On a single-fs system, that means no warning at all. 1389 + * 1390 + * Compounding this problem are distribution scripts that copy 1391 + * /proc/mounts to fstab, which means that we can't remove mount 1392 + * options unless we're 100% sure they have only ever been advertised 1393 + * in /proc/mounts in response to explicitly provided mount options. 1402 1394 */ 1403 - if ((fc->purpose & FS_CONTEXT_FOR_RECONFIGURE) && 1404 - !!(XFS_M(fc->root->d_sb)->m_features & flag) == value) 1405 - return; 1406 1395 xfs_warn(fc->s_fs_info, "%s mount option is deprecated.", param->key); 1407 1396 } 1408 1397 ··· 1437 1408 return opt; 1438 1409 1439 1410 switch (opt) { 1411 + case Op_deprecated: 1412 + xfs_fs_warn_deprecated(fc, param); 1413 + return 0; 1440 1414 case Opt_logbufs: 1441 1415 parsing_mp->m_logbufs = result.uint_32; 1442 1416 return 0; ··· 1560 1528 xfs_mount_set_dax_mode(parsing_mp, result.uint_32); 1561 1529 return 0; 1562 1530 #endif 1563 - /* Following mount options will be removed in September 2025 */ 1564 1531 case Opt_max_open_zones: 1565 1532 parsing_mp->m_max_open_zones = result.uint_32; 1566 1533 return 0; ··· 2252 2221 struct xfs_mount *mp; 2253 2222 int i; 2254 2223 2255 - mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL | __GFP_NOFAIL); 2224 + mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL); 2256 2225 if (!mp) 2257 2226 return -ENOMEM; 2258 2227

+60 -88

fs/xfs/xfs_zone_alloc.c

··· 26 26 #include "xfs_trace.h" 27 27 #include "xfs_mru_cache.h" 28 28 29 + static void 30 + xfs_open_zone_free_rcu( 31 + struct callback_head *cb) 32 + { 33 + struct xfs_open_zone *oz = container_of(cb, typeof(*oz), oz_rcu); 34 + 35 + xfs_rtgroup_rele(oz->oz_rtg); 36 + kfree(oz); 37 + } 38 + 29 39 void 30 40 xfs_open_zone_put( 31 41 struct xfs_open_zone *oz) 32 42 { 33 - if (atomic_dec_and_test(&oz->oz_ref)) { 34 - xfs_rtgroup_rele(oz->oz_rtg); 35 - kfree(oz); 36 - } 43 + if (atomic_dec_and_test(&oz->oz_ref)) 44 + call_rcu(&oz->oz_rcu, xfs_open_zone_free_rcu); 37 45 } 38 46 39 47 static inline uint32_t ··· 622 614 } 623 615 624 616 /* 625 - * Try to pack inodes that are written back after they were closed tight instead 626 - * of trying to open new zones for them or spread them to the least recently 627 - * used zone. This optimizes the data layout for workloads that untar or copy 628 - * a lot of small files. Right now this does not separate multiple such 617 + * Try to tightly pack small files that are written back after they were closed 618 + * instead of trying to open new zones for them or spread them to the least 619 + * recently used zone. This optimizes the data layout for workloads that untar 620 + * or copy a lot of small files. Right now this does not separate multiple such 629 621 * streams. 630 622 */ 631 623 static inline bool xfs_zoned_pack_tight(struct xfs_inode *ip) 632 624 { 625 + struct xfs_mount *mp = ip->i_mount; 626 + size_t zone_capacity = 627 + XFS_FSB_TO_B(mp, mp->m_groups[XG_TYPE_RTG].blocks); 628 + 629 + /* 630 + * Do not pack write files that are already using a full zone to avoid 631 + * fragmentation. 632 + */ 633 + if (i_size_read(VFS_I(ip)) >= zone_capacity) 634 + return false; 635 + 633 636 return !inode_is_open_for_write(VFS_I(ip)) && 634 637 !(ip->i_diflags & XFS_DIFLAG_APPEND); 635 638 } ··· 765 746 } 766 747 767 748 /* 768 - * Cache the last zone written to for an inode so that it is considered first 769 - * for subsequent writes. 770 - */ 771 - struct xfs_zone_cache_item { 772 - struct xfs_mru_cache_elem mru; 773 - struct xfs_open_zone *oz; 774 - }; 775 - 776 - static inline struct xfs_zone_cache_item * 777 - xfs_zone_cache_item(struct xfs_mru_cache_elem *mru) 778 - { 779 - return container_of(mru, struct xfs_zone_cache_item, mru); 780 - } 781 - 782 - static void 783 - xfs_zone_cache_free_func( 784 - void *data, 785 - struct xfs_mru_cache_elem *mru) 786 - { 787 - struct xfs_zone_cache_item *item = xfs_zone_cache_item(mru); 788 - 789 - xfs_open_zone_put(item->oz); 790 - kfree(item); 791 - } 792 - 793 - /* 794 749 * Check if we have a cached last open zone available for the inode and 795 750 * if yes return a reference to it. 796 751 */ 797 752 static struct xfs_open_zone * 798 - xfs_cached_zone( 799 - struct xfs_mount *mp, 800 - struct xfs_inode *ip) 753 + xfs_get_cached_zone( 754 + struct xfs_inode *ip) 801 755 { 802 - struct xfs_mru_cache_elem *mru; 803 - struct xfs_open_zone *oz; 756 + struct xfs_open_zone *oz; 804 757 805 - mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino); 806 - if (!mru) 807 - return NULL; 808 - oz = xfs_zone_cache_item(mru)->oz; 758 + rcu_read_lock(); 759 + oz = VFS_I(ip)->i_private; 809 760 if (oz) { 810 761 /* 811 762 * GC only steals open zones at mount time, so no GC zones 812 763 * should end up in the cache. 813 764 */ 814 765 ASSERT(!oz->oz_is_gc); 815 - ASSERT(atomic_read(&oz->oz_ref) > 0); 816 - atomic_inc(&oz->oz_ref); 766 + if (!atomic_inc_not_zero(&oz->oz_ref)) 767 + oz = NULL; 817 768 } 818 - xfs_mru_cache_done(mp->m_zone_cache); 769 + rcu_read_unlock(); 770 + 819 771 return oz; 820 772 } 821 773 822 774 /* 823 - * Update the last used zone cache for a given inode. 775 + * Stash our zone in the inode so that is is reused for future allocations. 824 776 * 825 - * The caller must have a reference on the open zone. 777 + * The open_zone structure will be pinned until either the inode is freed or 778 + * until the cached open zone is replaced with a different one because the 779 + * current one was full when we tried to use it. This means we keep any 780 + * open zone around forever as long as any inode that used it for the last 781 + * write is cached, which slightly increases the memory use of cached inodes 782 + * that were every written to, but significantly simplifies the cached zone 783 + * lookup. Because the open_zone is clearly marked as full when all data 784 + * in the underlying RTG was written, the caching is always safe. 826 785 */ 827 786 static void 828 - xfs_zone_cache_create_association( 829 - struct xfs_inode *ip, 830 - struct xfs_open_zone *oz) 787 + xfs_set_cached_zone( 788 + struct xfs_inode *ip, 789 + struct xfs_open_zone *oz) 831 790 { 832 - struct xfs_mount *mp = ip->i_mount; 833 - struct xfs_zone_cache_item *item = NULL; 834 - struct xfs_mru_cache_elem *mru; 791 + struct xfs_open_zone *old_oz; 835 792 836 - ASSERT(atomic_read(&oz->oz_ref) > 0); 837 793 atomic_inc(&oz->oz_ref); 838 - 839 - mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino); 840 - if (mru) { 841 - /* 842 - * If we have an association already, update it to point to the 843 - * new zone. 844 - */ 845 - item = xfs_zone_cache_item(mru); 846 - xfs_open_zone_put(item->oz); 847 - item->oz = oz; 848 - xfs_mru_cache_done(mp->m_zone_cache); 849 - return; 850 - } 851 - 852 - item = kmalloc(sizeof(*item), GFP_KERNEL); 853 - if (!item) { 854 - xfs_open_zone_put(oz); 855 - return; 856 - } 857 - item->oz = oz; 858 - xfs_mru_cache_insert(mp->m_zone_cache, ip->i_ino, &item->mru); 794 + old_oz = xchg(&VFS_I(ip)->i_private, oz); 795 + if (old_oz) 796 + xfs_open_zone_put(old_oz); 859 797 } 860 798 861 799 static void ··· 856 880 * the inode is still associated with a zone and use that if so. 857 881 */ 858 882 if (!*oz) 859 - *oz = xfs_cached_zone(mp, ip); 883 + *oz = xfs_get_cached_zone(ip); 860 884 861 885 if (!*oz) { 862 886 select_zone: 863 887 *oz = xfs_select_zone(mp, write_hint, pack_tight); 864 888 if (!*oz) 865 889 goto out_error; 866 - 867 - xfs_zone_cache_create_association(ip, *oz); 890 + xfs_set_cached_zone(ip, *oz); 868 891 } 869 892 870 893 alloc_len = xfs_zone_alloc_blocks(*oz, XFS_B_TO_FSB(mp, ioend->io_size), ··· 941 966 xfs_open_zone_put(oz); 942 967 } 943 968 spin_unlock(&zi->zi_open_zones_lock); 969 + 970 + /* 971 + * Wait for all open zones to be freed so that they drop the group 972 + * references: 973 + */ 974 + rcu_barrier(); 944 975 } 945 976 946 977 struct xfs_init_zones { ··· 1260 1279 error = xfs_zone_gc_mount(mp); 1261 1280 if (error) 1262 1281 goto out_free_zone_info; 1263 - 1264 - /* 1265 - * Set up a mru cache to track inode to open zone for data placement 1266 - * purposes. The magic values for group count and life time is the 1267 - * same as the defaults for file streams, which seems sane enough. 1268 - */ 1269 - xfs_mru_cache_create(&mp->m_zone_cache, mp, 1270 - 5000, 10, xfs_zone_cache_free_func); 1271 1282 return 0; 1272 1283 1273 1284 out_free_zone_info: ··· 1273 1300 { 1274 1301 xfs_zone_gc_unmount(mp); 1275 1302 xfs_free_zone_info(mp->m_zone_info); 1276 - xfs_mru_cache_destroy(mp->m_zone_cache); 1277 1303 }

+46 -35

fs/xfs/xfs_zone_gc.c

··· 491 491 struct xfs_rtgroup *victim_rtg = NULL; 492 492 unsigned int bucket; 493 493 494 - if (xfs_is_shutdown(mp)) 495 - return false; 496 - 497 - if (iter->victim_rtg) 498 - return true; 499 - 500 - /* 501 - * Don't start new work if we are asked to stop or park. 502 - */ 503 - if (kthread_should_stop() || kthread_should_park()) 504 - return false; 505 - 506 - if (!xfs_zoned_need_gc(mp)) 507 - return false; 508 - 509 494 spin_lock(&zi->zi_used_buckets_lock); 510 495 for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) { 511 496 victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket); ··· 960 975 } while (next); 961 976 } 962 977 978 + static bool 979 + xfs_zone_gc_should_start_new_work( 980 + struct xfs_zone_gc_data *data) 981 + { 982 + if (xfs_is_shutdown(data->mp)) 983 + return false; 984 + if (!xfs_zone_gc_space_available(data)) 985 + return false; 986 + 987 + if (!data->iter.victim_rtg) { 988 + if (kthread_should_stop() || kthread_should_park()) 989 + return false; 990 + if (!xfs_zoned_need_gc(data->mp)) 991 + return false; 992 + if (!xfs_zone_gc_select_victim(data)) 993 + return false; 994 + } 995 + 996 + return true; 997 + } 998 + 963 999 /* 964 1000 * Handle the work to read and write data for GC and to reset the zones, 965 1001 * including handling all completions. ··· 988 982 * Note that the order of the chunks is preserved so that we don't undo the 989 983 * optimal order established by xfs_zone_gc_query(). 990 984 */ 991 - static bool 985 + static void 992 986 xfs_zone_gc_handle_work( 993 987 struct xfs_zone_gc_data *data) 994 988 { ··· 1002 996 zi->zi_reset_list = NULL; 1003 997 spin_unlock(&zi->zi_reset_list_lock); 1004 998 1005 - if (!xfs_zone_gc_select_victim(data) || 1006 - !xfs_zone_gc_space_available(data)) { 1007 - if (list_empty(&data->reading) && 1008 - list_empty(&data->writing) && 1009 - list_empty(&data->resetting) && 1010 - !reset_list) 1011 - return false; 1012 - } 1013 - 1014 - __set_current_state(TASK_RUNNING); 1015 - try_to_freeze(); 1016 - 1017 - if (reset_list) 999 + if (reset_list) { 1000 + set_current_state(TASK_RUNNING); 1018 1001 xfs_zone_gc_reset_zones(data, reset_list); 1002 + } 1019 1003 1020 1004 list_for_each_entry_safe(chunk, next, &data->resetting, entry) { 1021 1005 if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE) 1022 1006 break; 1007 + set_current_state(TASK_RUNNING); 1023 1008 xfs_zone_gc_finish_reset(chunk); 1024 1009 } 1025 1010 1026 1011 list_for_each_entry_safe(chunk, next, &data->writing, entry) { 1027 1012 if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE) 1028 1013 break; 1014 + set_current_state(TASK_RUNNING); 1029 1015 xfs_zone_gc_finish_chunk(chunk); 1030 1016 } 1031 1017 ··· 1025 1027 list_for_each_entry_safe(chunk, next, &data->reading, entry) { 1026 1028 if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE) 1027 1029 break; 1030 + set_current_state(TASK_RUNNING); 1028 1031 xfs_zone_gc_write_chunk(chunk); 1029 1032 } 1030 1033 blk_finish_plug(&plug); 1031 1034 1032 - blk_start_plug(&plug); 1033 - while (xfs_zone_gc_start_chunk(data)) 1034 - ; 1035 - blk_finish_plug(&plug); 1036 - return true; 1035 + if (xfs_zone_gc_should_start_new_work(data)) { 1036 + set_current_state(TASK_RUNNING); 1037 + blk_start_plug(&plug); 1038 + while (xfs_zone_gc_start_chunk(data)) 1039 + ; 1040 + blk_finish_plug(&plug); 1041 + } 1037 1042 } 1038 1043 1039 1044 /* ··· 1060 1059 for (;;) { 1061 1060 set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE); 1062 1061 xfs_set_zonegc_running(mp); 1063 - if (xfs_zone_gc_handle_work(data)) 1062 + 1063 + xfs_zone_gc_handle_work(data); 1064 + 1065 + /* 1066 + * Only sleep if nothing set the state to running. Else check for 1067 + * work again as someone might have queued up more work and woken 1068 + * us in the meantime. 1069 + */ 1070 + if (get_current_state() == TASK_RUNNING) { 1071 + try_to_freeze(); 1064 1072 continue; 1073 + } 1065 1074 1066 1075 if (list_empty(&data->reading) && 1067 1076 list_empty(&data->writing) &&

+2

fs/xfs/xfs_zone_priv.h

··· 44 44 * the life time of an open zone. 45 45 */ 46 46 struct xfs_rtgroup *oz_rtg; 47 + 48 + struct rcu_head oz_rcu; 47 49 }; 48 50 49 51 /*

Configure Feed

Configure Feed