Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

btrfs: zoned: limit number of zones reclaimed in flush_space()

Limit the number of zones reclaimed in flush_space()'s RECLAIM_ZONES
state.

This prevents possibly long-running reclaim sweeps from blocking other tasks
in the system while the system is already under pressure, which causes
those tasks to hang.

An example of this can be seen here, triggered by fstests generic/551:

generic/551 [ 27.042349] run fstests generic/551 at 2026-02-27 11:05:30
BTRFS: device fsid 78c16e29-20d9-4c8e-bc04-7ba431be38ff devid 1 transid 8 /dev/vdb (254:16) scanned by mount (806)
BTRFS info (device vdb): first mount of filesystem 78c16e29-20d9-4c8e-bc04-7ba431be38ff
BTRFS info (device vdb): using crc32c checksum algorithm
BTRFS info (device vdb): host-managed zoned block device /dev/vdb, 64 zones of 268435456 bytes
BTRFS info (device vdb): zoned mode enabled with zone size 268435456
BTRFS info (device vdb): checking UUID tree
BTRFS info (device vdb): enabling free space tree
INFO: task kworker/u38:1:90 blocked for more than 120 seconds.
Not tainted 7.0.0-rc1+ #345
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
task:kworker/u38:1 state:D stack:0 pid:90 tgid:90 ppid:2 task_flags:0x4208060 flags:0x00080000
Workqueue: events_unbound btrfs_async_reclaim_data_space
Call Trace:
<TASK>
__schedule+0x34f/0xe70
schedule+0x41/0x140
schedule_timeout+0xa3/0x110
? mark_held_locks+0x40/0x70
? lockdep_hardirqs_on_prepare+0xd8/0x1c0
? trace_hardirqs_on+0x18/0x100
? lockdep_hardirqs_on+0x84/0x130
? _raw_spin_unlock_irq+0x33/0x50
wait_for_completion+0xa4/0x150
? __flush_work+0x24c/0x550
__flush_work+0x339/0x550
? __pfx_wq_barrier_func+0x10/0x10
? wait_for_completion+0x39/0x150
flush_space+0x243/0x660
? find_held_lock+0x2b/0x80
? kvm_sched_clock_read+0x11/0x20
? local_clock_noinstr+0x17/0x110
? local_clock+0x15/0x30
? lock_release+0x1b7/0x4b0
do_async_reclaim_data_space+0xe8/0x160
btrfs_async_reclaim_data_space+0x19/0x30
process_one_work+0x20a/0x5f0
? lock_is_held_type+0xcd/0x130
worker_thread+0x1e2/0x3c0
? __pfx_worker_thread+0x10/0x10
kthread+0x103/0x150
? __pfx_kthread+0x10/0x10
ret_from_fork+0x20d/0x320
? __pfx_kthread+0x10/0x10
ret_from_fork_asm+0x1a/0x30
</TASK>

Showing all locks held in the system:
1 lock held by khungtaskd/67:
#0: ffffffff824d58e0 (rcu_read_lock){....}-{1:3}, at: debug_show_all_locks+0x3d/0x194
2 locks held by kworker/u38:1/90:
#0: ffff8881000aa158 ((wq_completion)events_unbound){+.+.}-{0:0}, at: process_one_work+0x3c4/0x5f0
#1: ffffc90000c17e58 ((work_completion)(&fs_info->async_data_reclaim_work)){+.+.}-{0:0}, at: process_one_work+0x1c0/0x5f0
5 locks held by kworker/u39:1/191:
#0: ffff8881000aa158 ((wq_completion)events_unbound){+.+.}-{0:0}, at: process_one_work+0x3c4/0x5f0
#1: ffffc90000dfbe58 ((work_completion)(&fs_info->reclaim_bgs_work)){+.+.}-{0:0}, at: process_one_work+0x1c0/0x5f0
#2: ffff888101da0420 (sb_writers#9){.+.+}-{0:0}, at: process_one_work+0x20a/0x5f0
#3: ffff88811040a648 (&fs_info->reclaim_bgs_lock){+.+.}-{4:4}, at: btrfs_reclaim_bgs_work+0x1de/0x770
#4: ffff888110408a18 (&fs_info->cleaner_mutex){+.+.}-{4:4}, at: btrfs_relocate_block_group+0x95a/0x20f0
1 lock held by aio-dio-write-v/980:
#0: ffff888110093008 (&sb->s_type->i_mutex_key#15){++++}-{4:4}, at: btrfs_inode_lock+0x51/0xb0

=============================================

To prevent these long-running reclaims from blocking the system, only
reclaim 5 block groups in the RECLAIM_ZONES state of flush_space(). Also,
as these reclaims are now bounded, it becomes possible to call
btrfs_reclaim_block_groups() synchronously, eliminating the need
to place the reclaim task on a workqueue and then flush the workqueue
again.

Reviewed-by: Boris Burkov <boris@bur.io>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>

authored by

Johannes Thumshirn and committed by
David Sterba
ad0c23c9 bd0ffde1

+14 -6
+9 -4
fs/btrfs/block-group.c
··· 1909 1909 return true; 1910 1910 } 1911 1911 1912 - static int btrfs_reclaim_block_group(struct btrfs_block_group *bg) 1912 + static int btrfs_reclaim_block_group(struct btrfs_block_group *bg, int *reclaimed) 1913 1913 { 1914 1914 struct btrfs_fs_info *fs_info = bg->fs_info; 1915 1915 struct btrfs_space_info *space_info = bg->space_info; ··· 2036 2036 if (space_info->total_bytes < old_total) 2037 2037 btrfs_set_periodic_reclaim_ready(space_info, true); 2038 2038 spin_unlock(&space_info->lock); 2039 + if (!ret) 2040 + (*reclaimed)++; 2039 2041 2040 2042 return ret; 2041 2043 } 2042 2044 2043 - static void btrfs_reclaim_block_groups(struct btrfs_fs_info *fs_info) 2045 + void btrfs_reclaim_block_groups(struct btrfs_fs_info *fs_info, unsigned int limit) 2044 2046 { 2045 2047 struct btrfs_block_group *bg; 2046 2048 struct btrfs_space_info *space_info; 2047 2049 LIST_HEAD(retry_list); 2050 + int reclaimed = 0; 2048 2051 2049 2052 if (!btrfs_should_reclaim(fs_info)) 2050 2053 return; ··· 2083 2080 2084 2081 space_info = bg->space_info; 2085 2082 spin_unlock(&fs_info->unused_bgs_lock); 2086 - ret = btrfs_reclaim_block_group(bg); 2083 + ret = btrfs_reclaim_block_group(bg, &reclaimed); 2087 2084 2088 2085 if (ret && !READ_ONCE(space_info->periodic_reclaim)) 2089 2086 btrfs_link_bg_list(bg, &retry_list); ··· 2102 2099 if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) 2103 2100 goto end; 2104 2101 spin_lock(&fs_info->unused_bgs_lock); 2102 + if (reclaimed >= limit) 2103 + break; 2105 2104 } 2106 2105 spin_unlock(&fs_info->unused_bgs_lock); 2107 2106 mutex_unlock(&fs_info->reclaim_bgs_lock); ··· 2119 2114 struct btrfs_fs_info *fs_info = 2120 2115 container_of(work, struct btrfs_fs_info, reclaim_bgs_work); 2121 2116 2122 - btrfs_reclaim_block_groups(fs_info); 2117 + btrfs_reclaim_block_groups(fs_info, -1); 2123 2118 } 2124 2119 2125 2120 void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
+1
fs/btrfs/block-group.h
··· 350 350 struct btrfs_chunk_map *map); 351 351 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info); 352 352 void btrfs_mark_bg_unused(struct btrfs_block_group *bg); 353 + void btrfs_reclaim_block_groups(struct btrfs_fs_info *fs_info, unsigned int limit); 353 354 void btrfs_reclaim_bgs_work(struct work_struct *work); 354 355 void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info); 355 356 void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg);
+4 -2
fs/btrfs/space-info.c
··· 212 212 213 213 #define BTRFS_UNALLOC_BLOCK_GROUP_TARGET (10ULL) 214 214 215 + #define BTRFS_ZONED_SYNC_RECLAIM_BATCH (5) 216 + 215 217 /* 216 218 * Calculate chunk size depending on volume type (regular or zoned). 217 219 */ ··· 920 918 if (btrfs_is_zoned(fs_info)) { 921 919 btrfs_reclaim_sweep(fs_info); 922 920 btrfs_delete_unused_bgs(fs_info); 923 - btrfs_reclaim_bgs(fs_info); 924 - flush_work(&fs_info->reclaim_bgs_work); 921 + btrfs_reclaim_block_groups(fs_info, 922 + BTRFS_ZONED_SYNC_RECLAIM_BATCH); 925 923 ASSERT(current->journal_info == NULL); 926 924 ret = btrfs_commit_current_transaction(root); 927 925 } else {