Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'xfs-fixes-6.18-rc3' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

Pull xfs fixes from Carlos Maiolino:
"The main highlight here is a fix for a bug brought in by the removal
of the attr2 mount option: some installations might actually have
'attr2' explicitly configured in fstab, which prevents the system from
booting because the rootfs cannot be remounted as RW.

Besides that, there are a couple of fixes to the zonefs implementation,
a change making XFS_ONLINE_SCRUB_STATS depend on DEBUG_FS (it was a
select before), and some other minor changes"

* tag 'xfs-fixes-6.18-rc3' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux:
xfs: fix locking in xchk_nlinks_collect_dir
xfs: loudly complain about defunct mount options
xfs: always warn about deprecated mount options
xfs: don't set bt_nr_sectors to a negative number
xfs: don't use __GFP_NOFAIL in xfs_init_fs_context
xfs: cache open zone in inode->i_private
xfs: avoid busy loops in GCD
xfs: XFS_ONLINE_SCRUB_STATS should depend on DEBUG_FS
xfs: do not tightly pack-write large files
xfs: Improve CONFIG_XFS_RT Kconfig help

+193 -140
+10 -1
fs/xfs/Kconfig
··· 119 119 120 120 See the xfs man page in section 5 for additional information. 121 121 122 + This option is mandatory to support zoned block devices. For these 123 + devices, the realtime subvolume must be backed by a zoned block 124 + device and a regular block device used as the main device (for 125 + metadata). If the zoned block device is a host-managed SMR hard-disk 126 + containing conventional zones at the beginning of its address space, 127 + XFS will use the disk conventional zones as the main device and the 128 + remaining sequential write required zones as the backing storage for 129 + the realtime subvolume. 130 + 122 131 If unsure, say N. 123 132 124 133 config XFS_DRAIN_INTENTS ··· 165 156 bool "XFS online metadata check usage data collection" 166 157 default y 167 158 depends on XFS_ONLINE_SCRUB 168 - select DEBUG_FS 159 + depends on DEBUG_FS 169 160 help 170 161 If you say Y here, the kernel will gather usage data about 171 162 the online metadata check subsystem. This includes the number
+31 -3
fs/xfs/scrub/nlinks.c
··· 376 376 return error; 377 377 } 378 378 379 + static uint 380 + xchk_nlinks_ilock_dir( 381 + struct xfs_inode *ip) 382 + { 383 + uint lock_mode = XFS_ILOCK_SHARED; 384 + 385 + /* 386 + * We're going to scan the directory entries, so we must be ready to 387 + * pull the data fork mappings into memory if they aren't already. 388 + */ 389 + if (xfs_need_iread_extents(&ip->i_df)) 390 + lock_mode = XFS_ILOCK_EXCL; 391 + 392 + /* 393 + * We're going to scan the parent pointers, so we must be ready to 394 + * pull the attr fork mappings into memory if they aren't already. 395 + */ 396 + if (xfs_has_parent(ip->i_mount) && xfs_inode_has_attr_fork(ip) && 397 + xfs_need_iread_extents(&ip->i_af)) 398 + lock_mode = XFS_ILOCK_EXCL; 399 + 400 + /* 401 + * Take the IOLOCK so that other threads cannot start a directory 402 + * update while we're scanning. 403 + */ 404 + lock_mode |= XFS_IOLOCK_SHARED; 405 + xfs_ilock(ip, lock_mode); 406 + return lock_mode; 407 + } 408 + 379 409 /* Walk a directory to bump the observed link counts of the children. */ 380 410 STATIC int 381 411 xchk_nlinks_collect_dir( ··· 424 394 return 0; 425 395 426 396 /* Prevent anyone from changing this directory while we walk it. */ 427 - xfs_ilock(dp, XFS_IOLOCK_SHARED); 428 - lock_mode = xfs_ilock_data_map_shared(dp); 397 + lock_mode = xchk_nlinks_ilock_dir(dp); 429 398 430 399 /* 431 400 * The dotdot entry of an unlinked directory still points to the last ··· 481 452 xchk_iscan_abort(&xnc->collect_iscan); 482 453 out_unlock: 483 454 xfs_iunlock(dp, lock_mode); 484 - xfs_iunlock(dp, XFS_IOLOCK_SHARED); 485 455 return error; 486 456 } 487 457
+1 -1
fs/xfs/xfs_buf.c
··· 1751 1751 const char *descr) 1752 1752 { 1753 1753 /* The maximum size of the buftarg is only known once the sb is read. */ 1754 - btp->bt_nr_sectors = (xfs_daddr_t)-1; 1754 + btp->bt_nr_sectors = XFS_BUF_DADDR_MAX; 1755 1755 1756 1756 /* Set up device logical sector size mask */ 1757 1757 btp->bt_logical_sectorsize = logical_sectorsize;
+1
fs/xfs/xfs_buf.h
··· 22 22 */ 23 23 struct xfs_buf; 24 24 25 + #define XFS_BUF_DADDR_MAX ((xfs_daddr_t) S64_MAX) 25 26 #define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL)) 26 27 27 28 #define XBF_READ (1u << 0) /* buffer intended for reading from device */
-1
fs/xfs/xfs_mount.h
··· 236 236 bool m_update_sb; /* sb needs update in mount */ 237 237 unsigned int m_max_open_zones; 238 238 unsigned int m_zonegc_low_space; 239 - struct xfs_mru_cache *m_zone_cache; /* Inode to open zone cache */ 240 239 241 240 /* max_atomic_write mount option value */ 242 241 unsigned long long m_awu_max_bytes;
+42 -11
fs/xfs/xfs_super.c
··· 102 102 * Table driven mount option parser. 103 103 */ 104 104 enum { 105 - Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev, 105 + Op_deprecated, Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev, 106 106 Opt_wsync, Opt_noalign, Opt_swalloc, Opt_sunit, Opt_swidth, Opt_nouuid, 107 107 Opt_grpid, Opt_nogrpid, Opt_bsdgroups, Opt_sysvgroups, 108 108 Opt_allocsize, Opt_norecovery, Opt_inode64, Opt_inode32, ··· 114 114 Opt_lifetime, Opt_nolifetime, Opt_max_atomic_write, 115 115 }; 116 116 117 + #define fsparam_dead(NAME) \ 118 + __fsparam(NULL, (NAME), Op_deprecated, fs_param_deprecated, NULL) 119 + 117 120 static const struct fs_parameter_spec xfs_fs_parameters[] = { 121 + /* 122 + * These mount options were supposed to be deprecated in September 2025 123 + * but the deprecation warning was buggy, so not all users were 124 + * notified. The deprecation is now obnoxiously loud and postponed to 125 + * September 2030. 126 + */ 127 + fsparam_dead("attr2"), 128 + fsparam_dead("noattr2"), 129 + fsparam_dead("ikeep"), 130 + fsparam_dead("noikeep"), 131 + 118 132 fsparam_u32("logbufs", Opt_logbufs), 119 133 fsparam_string("logbsize", Opt_logbsize), 120 134 fsparam_string("logdev", Opt_logdev), ··· 800 786 801 787 truncate_inode_pages_final(&inode->i_data); 802 788 clear_inode(inode); 789 + 790 + if (IS_ENABLED(CONFIG_XFS_RT) && 791 + S_ISREG(inode->i_mode) && inode->i_private) { 792 + xfs_open_zone_put(inode->i_private); 793 + inode->i_private = NULL; 794 + } 803 795 } 804 796 805 797 static void ··· 1393 1373 static inline void 1394 1374 xfs_fs_warn_deprecated( 1395 1375 struct fs_context *fc, 1396 - struct fs_parameter *param, 1397 - uint64_t flag, 1398 - bool value) 1376 + struct fs_parameter *param) 1399 1377 { 1400 - /* Don't print the warning if reconfiguring and current mount point 1401 - * already had the flag set 1378 + /* 1379 + * Always warn about someone passing in a deprecated mount option. 
1380 + * Previously we wouldn't print the warning if we were reconfiguring 1381 + * and current mount point already had the flag set, but that was not 1382 + * the right thing to do. 1383 + * 1384 + * Many distributions mount the root filesystem with no options in the 1385 + * initramfs and rely on mount -a to remount the root fs with the 1386 + * options in fstab. However, the old behavior meant that there would 1387 + * never be a warning about deprecated mount options for the root fs in 1388 + * /etc/fstab. On a single-fs system, that means no warning at all. 1389 + * 1390 + * Compounding this problem are distribution scripts that copy 1391 + * /proc/mounts to fstab, which means that we can't remove mount 1392 + * options unless we're 100% sure they have only ever been advertised 1393 + * in /proc/mounts in response to explicitly provided mount options. 1402 1394 */ 1403 - if ((fc->purpose & FS_CONTEXT_FOR_RECONFIGURE) && 1404 - !!(XFS_M(fc->root->d_sb)->m_features & flag) == value) 1405 - return; 1406 1395 xfs_warn(fc->s_fs_info, "%s mount option is deprecated.", param->key); 1407 1396 } 1408 1397 ··· 1437 1408 return opt; 1438 1409 1439 1410 switch (opt) { 1411 + case Op_deprecated: 1412 + xfs_fs_warn_deprecated(fc, param); 1413 + return 0; 1440 1414 case Opt_logbufs: 1441 1415 parsing_mp->m_logbufs = result.uint_32; 1442 1416 return 0; ··· 1560 1528 xfs_mount_set_dax_mode(parsing_mp, result.uint_32); 1561 1529 return 0; 1562 1530 #endif 1563 - /* Following mount options will be removed in September 2025 */ 1564 1531 case Opt_max_open_zones: 1565 1532 parsing_mp->m_max_open_zones = result.uint_32; 1566 1533 return 0; ··· 2252 2221 struct xfs_mount *mp; 2253 2222 int i; 2254 2223 2255 - mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL | __GFP_NOFAIL); 2224 + mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL); 2256 2225 if (!mp) 2257 2226 return -ENOMEM; 2258 2227
+60 -88
fs/xfs/xfs_zone_alloc.c
··· 26 26 #include "xfs_trace.h" 27 27 #include "xfs_mru_cache.h" 28 28 29 + static void 30 + xfs_open_zone_free_rcu( 31 + struct callback_head *cb) 32 + { 33 + struct xfs_open_zone *oz = container_of(cb, typeof(*oz), oz_rcu); 34 + 35 + xfs_rtgroup_rele(oz->oz_rtg); 36 + kfree(oz); 37 + } 38 + 29 39 void 30 40 xfs_open_zone_put( 31 41 struct xfs_open_zone *oz) 32 42 { 33 - if (atomic_dec_and_test(&oz->oz_ref)) { 34 - xfs_rtgroup_rele(oz->oz_rtg); 35 - kfree(oz); 36 - } 43 + if (atomic_dec_and_test(&oz->oz_ref)) 44 + call_rcu(&oz->oz_rcu, xfs_open_zone_free_rcu); 37 45 } 38 46 39 47 static inline uint32_t ··· 622 614 } 623 615 624 616 /* 625 - * Try to pack inodes that are written back after they were closed tight instead 626 - * of trying to open new zones for them or spread them to the least recently 627 - * used zone. This optimizes the data layout for workloads that untar or copy 628 - * a lot of small files. Right now this does not separate multiple such 617 + * Try to tightly pack small files that are written back after they were closed 618 + * instead of trying to open new zones for them or spread them to the least 619 + * recently used zone. This optimizes the data layout for workloads that untar 620 + * or copy a lot of small files. Right now this does not separate multiple such 629 621 * streams. 630 622 */ 631 623 static inline bool xfs_zoned_pack_tight(struct xfs_inode *ip) 632 624 { 625 + struct xfs_mount *mp = ip->i_mount; 626 + size_t zone_capacity = 627 + XFS_FSB_TO_B(mp, mp->m_groups[XG_TYPE_RTG].blocks); 628 + 629 + /* 630 + * Do not pack write files that are already using a full zone to avoid 631 + * fragmentation. 
632 + */ 633 + if (i_size_read(VFS_I(ip)) >= zone_capacity) 634 + return false; 635 + 633 636 return !inode_is_open_for_write(VFS_I(ip)) && 634 637 !(ip->i_diflags & XFS_DIFLAG_APPEND); 635 638 } ··· 765 746 } 766 747 767 748 /* 768 - * Cache the last zone written to for an inode so that it is considered first 769 - * for subsequent writes. 770 - */ 771 - struct xfs_zone_cache_item { 772 - struct xfs_mru_cache_elem mru; 773 - struct xfs_open_zone *oz; 774 - }; 775 - 776 - static inline struct xfs_zone_cache_item * 777 - xfs_zone_cache_item(struct xfs_mru_cache_elem *mru) 778 - { 779 - return container_of(mru, struct xfs_zone_cache_item, mru); 780 - } 781 - 782 - static void 783 - xfs_zone_cache_free_func( 784 - void *data, 785 - struct xfs_mru_cache_elem *mru) 786 - { 787 - struct xfs_zone_cache_item *item = xfs_zone_cache_item(mru); 788 - 789 - xfs_open_zone_put(item->oz); 790 - kfree(item); 791 - } 792 - 793 - /* 794 749 * Check if we have a cached last open zone available for the inode and 795 750 * if yes return a reference to it. 796 751 */ 797 752 static struct xfs_open_zone * 798 - xfs_cached_zone( 799 - struct xfs_mount *mp, 800 - struct xfs_inode *ip) 753 + xfs_get_cached_zone( 754 + struct xfs_inode *ip) 801 755 { 802 - struct xfs_mru_cache_elem *mru; 803 - struct xfs_open_zone *oz; 756 + struct xfs_open_zone *oz; 804 757 805 - mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino); 806 - if (!mru) 807 - return NULL; 808 - oz = xfs_zone_cache_item(mru)->oz; 758 + rcu_read_lock(); 759 + oz = VFS_I(ip)->i_private; 809 760 if (oz) { 810 761 /* 811 762 * GC only steals open zones at mount time, so no GC zones 812 763 * should end up in the cache. 
813 764 */ 814 765 ASSERT(!oz->oz_is_gc); 815 - ASSERT(atomic_read(&oz->oz_ref) > 0); 816 - atomic_inc(&oz->oz_ref); 766 + if (!atomic_inc_not_zero(&oz->oz_ref)) 767 + oz = NULL; 817 768 } 818 - xfs_mru_cache_done(mp->m_zone_cache); 769 + rcu_read_unlock(); 770 + 819 771 return oz; 820 772 } 821 773 822 774 /* 823 - * Update the last used zone cache for a given inode. 775 + * Stash our zone in the inode so that is is reused for future allocations. 824 776 * 825 - * The caller must have a reference on the open zone. 777 + * The open_zone structure will be pinned until either the inode is freed or 778 + * until the cached open zone is replaced with a different one because the 779 + * current one was full when we tried to use it. This means we keep any 780 + * open zone around forever as long as any inode that used it for the last 781 + * write is cached, which slightly increases the memory use of cached inodes 782 + * that were every written to, but significantly simplifies the cached zone 783 + * lookup. Because the open_zone is clearly marked as full when all data 784 + * in the underlying RTG was written, the caching is always safe. 826 785 */ 827 786 static void 828 - xfs_zone_cache_create_association( 829 - struct xfs_inode *ip, 830 - struct xfs_open_zone *oz) 787 + xfs_set_cached_zone( 788 + struct xfs_inode *ip, 789 + struct xfs_open_zone *oz) 831 790 { 832 - struct xfs_mount *mp = ip->i_mount; 833 - struct xfs_zone_cache_item *item = NULL; 834 - struct xfs_mru_cache_elem *mru; 791 + struct xfs_open_zone *old_oz; 835 792 836 - ASSERT(atomic_read(&oz->oz_ref) > 0); 837 793 atomic_inc(&oz->oz_ref); 838 - 839 - mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino); 840 - if (mru) { 841 - /* 842 - * If we have an association already, update it to point to the 843 - * new zone. 
844 - */ 845 - item = xfs_zone_cache_item(mru); 846 - xfs_open_zone_put(item->oz); 847 - item->oz = oz; 848 - xfs_mru_cache_done(mp->m_zone_cache); 849 - return; 850 - } 851 - 852 - item = kmalloc(sizeof(*item), GFP_KERNEL); 853 - if (!item) { 854 - xfs_open_zone_put(oz); 855 - return; 856 - } 857 - item->oz = oz; 858 - xfs_mru_cache_insert(mp->m_zone_cache, ip->i_ino, &item->mru); 794 + old_oz = xchg(&VFS_I(ip)->i_private, oz); 795 + if (old_oz) 796 + xfs_open_zone_put(old_oz); 859 797 } 860 798 861 799 static void ··· 856 880 * the inode is still associated with a zone and use that if so. 857 881 */ 858 882 if (!*oz) 859 - *oz = xfs_cached_zone(mp, ip); 883 + *oz = xfs_get_cached_zone(ip); 860 884 861 885 if (!*oz) { 862 886 select_zone: 863 887 *oz = xfs_select_zone(mp, write_hint, pack_tight); 864 888 if (!*oz) 865 889 goto out_error; 866 - 867 - xfs_zone_cache_create_association(ip, *oz); 890 + xfs_set_cached_zone(ip, *oz); 868 891 } 869 892 870 893 alloc_len = xfs_zone_alloc_blocks(*oz, XFS_B_TO_FSB(mp, ioend->io_size), ··· 941 966 xfs_open_zone_put(oz); 942 967 } 943 968 spin_unlock(&zi->zi_open_zones_lock); 969 + 970 + /* 971 + * Wait for all open zones to be freed so that they drop the group 972 + * references: 973 + */ 974 + rcu_barrier(); 944 975 } 945 976 946 977 struct xfs_init_zones { ··· 1260 1279 error = xfs_zone_gc_mount(mp); 1261 1280 if (error) 1262 1281 goto out_free_zone_info; 1263 - 1264 - /* 1265 - * Set up a mru cache to track inode to open zone for data placement 1266 - * purposes. The magic values for group count and life time is the 1267 - * same as the defaults for file streams, which seems sane enough. 1268 - */ 1269 - xfs_mru_cache_create(&mp->m_zone_cache, mp, 1270 - 5000, 10, xfs_zone_cache_free_func); 1271 1282 return 0; 1272 1283 1273 1284 out_free_zone_info: ··· 1273 1300 { 1274 1301 xfs_zone_gc_unmount(mp); 1275 1302 xfs_free_zone_info(mp->m_zone_info); 1276 - xfs_mru_cache_destroy(mp->m_zone_cache); 1277 1303 }
+46 -35
fs/xfs/xfs_zone_gc.c
··· 491 491 struct xfs_rtgroup *victim_rtg = NULL; 492 492 unsigned int bucket; 493 493 494 - if (xfs_is_shutdown(mp)) 495 - return false; 496 - 497 - if (iter->victim_rtg) 498 - return true; 499 - 500 - /* 501 - * Don't start new work if we are asked to stop or park. 502 - */ 503 - if (kthread_should_stop() || kthread_should_park()) 504 - return false; 505 - 506 - if (!xfs_zoned_need_gc(mp)) 507 - return false; 508 - 509 494 spin_lock(&zi->zi_used_buckets_lock); 510 495 for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) { 511 496 victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket); ··· 960 975 } while (next); 961 976 } 962 977 978 + static bool 979 + xfs_zone_gc_should_start_new_work( 980 + struct xfs_zone_gc_data *data) 981 + { 982 + if (xfs_is_shutdown(data->mp)) 983 + return false; 984 + if (!xfs_zone_gc_space_available(data)) 985 + return false; 986 + 987 + if (!data->iter.victim_rtg) { 988 + if (kthread_should_stop() || kthread_should_park()) 989 + return false; 990 + if (!xfs_zoned_need_gc(data->mp)) 991 + return false; 992 + if (!xfs_zone_gc_select_victim(data)) 993 + return false; 994 + } 995 + 996 + return true; 997 + } 998 + 963 999 /* 964 1000 * Handle the work to read and write data for GC and to reset the zones, 965 1001 * including handling all completions. ··· 988 982 * Note that the order of the chunks is preserved so that we don't undo the 989 983 * optimal order established by xfs_zone_gc_query(). 
990 984 */ 991 - static bool 985 + static void 992 986 xfs_zone_gc_handle_work( 993 987 struct xfs_zone_gc_data *data) 994 988 { ··· 1002 996 zi->zi_reset_list = NULL; 1003 997 spin_unlock(&zi->zi_reset_list_lock); 1004 998 1005 - if (!xfs_zone_gc_select_victim(data) || 1006 - !xfs_zone_gc_space_available(data)) { 1007 - if (list_empty(&data->reading) && 1008 - list_empty(&data->writing) && 1009 - list_empty(&data->resetting) && 1010 - !reset_list) 1011 - return false; 1012 - } 1013 - 1014 - __set_current_state(TASK_RUNNING); 1015 - try_to_freeze(); 1016 - 1017 - if (reset_list) 999 + if (reset_list) { 1000 + set_current_state(TASK_RUNNING); 1018 1001 xfs_zone_gc_reset_zones(data, reset_list); 1002 + } 1019 1003 1020 1004 list_for_each_entry_safe(chunk, next, &data->resetting, entry) { 1021 1005 if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE) 1022 1006 break; 1007 + set_current_state(TASK_RUNNING); 1023 1008 xfs_zone_gc_finish_reset(chunk); 1024 1009 } 1025 1010 1026 1011 list_for_each_entry_safe(chunk, next, &data->writing, entry) { 1027 1012 if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE) 1028 1013 break; 1014 + set_current_state(TASK_RUNNING); 1029 1015 xfs_zone_gc_finish_chunk(chunk); 1030 1016 } 1031 1017 ··· 1025 1027 list_for_each_entry_safe(chunk, next, &data->reading, entry) { 1026 1028 if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE) 1027 1029 break; 1030 + set_current_state(TASK_RUNNING); 1028 1031 xfs_zone_gc_write_chunk(chunk); 1029 1032 } 1030 1033 blk_finish_plug(&plug); 1031 1034 1032 - blk_start_plug(&plug); 1033 - while (xfs_zone_gc_start_chunk(data)) 1034 - ; 1035 - blk_finish_plug(&plug); 1036 - return true; 1035 + if (xfs_zone_gc_should_start_new_work(data)) { 1036 + set_current_state(TASK_RUNNING); 1037 + blk_start_plug(&plug); 1038 + while (xfs_zone_gc_start_chunk(data)) 1039 + ; 1040 + blk_finish_plug(&plug); 1041 + } 1037 1042 } 1038 1043 1039 1044 /* ··· 1060 1059 for (;;) { 1061 1060 set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE); 
1062 1061 xfs_set_zonegc_running(mp); 1063 - if (xfs_zone_gc_handle_work(data)) 1062 + 1063 + xfs_zone_gc_handle_work(data); 1064 + 1065 + /* 1066 + * Only sleep if nothing set the state to running. Else check for 1067 + * work again as someone might have queued up more work and woken 1068 + * us in the meantime. 1069 + */ 1070 + if (get_current_state() == TASK_RUNNING) { 1071 + try_to_freeze(); 1064 1072 continue; 1073 + } 1065 1074 1066 1075 if (list_empty(&data->reading) && 1067 1076 list_empty(&data->writing) &&
+2
fs/xfs/xfs_zone_priv.h
··· 44 44 * the life time of an open zone. 45 45 */ 46 46 struct xfs_rtgroup *oz_rtg; 47 + 48 + struct rcu_head oz_rcu; 47 49 }; 48 50 49 51 /*