Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'for-6.10-rc7-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs fixes from David Sterba:
"Fix a regression in extent map shrinker behaviour.

In the past weeks we got reports from users about huge latency
spikes or freezes. This was bisected to the newly added shrinker of
extent maps (it was added to fix a build-up of the structures in
memory).

I'm assuming the freezes would hit many users after release, so I'd
like to get this merged now so it's in 6.10. Although the diff is
not small, the changes are relatively straightforward; the reporters
verified the fixes and we did testing on our side.

The fixes:

- adjust behaviour under memory pressure: check lock and scheduling
conditions and bail out if needed

- synchronize tracking of the scanning progress so inode ranges are
not skipped and work is not duplicated

- do a delayed iput when scanning a root so that evicting an inode
does not slow things down when there is a lot of dirty data; this
also fixes a lockdep warning, as a deadlock could occur when
writing the dirty data required starting a transaction"

* tag 'for-6.10-rc7-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
btrfs: avoid races when tracking progress for extent map shrinking
btrfs: stop extent map shrinker if reschedule is needed
btrfs: use delayed iput during extent map shrinking
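
The first of these fixes boils down to one rule: a shrinker that can run from memory allocation paths must never wait on a contended lock or keep going once a reschedule is pending; it bails out and lets a later pass resume. A minimal userspace sketch of that "trylock or skip" shape, assuming nothing beyond POSIX threads (scan_tree and tree_lock are made-up names; the kernel code in the diff below uses write_trylock(), need_resched() and rwlock_needbreak() instead):

/* Minimal sketch of the "trylock or skip" pattern (illustrative only;
 * a pthreads rwlock stands in for the kernel's rwlock_t). */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t tree_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Returns the number of entries dropped, or 0 if the lock was busy. */
static long scan_tree(long *scanned, long nr_to_scan)
{
        long nr_dropped = 0;

        /*
         * Never wait for a contended lock: another task is either about
         * to do IO on this structure or already shrinking it, so skip.
         */
        if (pthread_rwlock_trywrlock(&tree_lock) != 0)
                return 0;

        while (*scanned < nr_to_scan) {
                (*scanned)++;
                nr_dropped++;   /* stand-in for dropping one extent map */
        }

        pthread_rwlock_unlock(&tree_lock);
        return nr_dropped;
}

int main(void)
{
        long scanned = 0;

        printf("dropped %ld\n", scan_tree(&scanned, 8));
        return 0;
}

Compile with cc -pthread; the point is only the early return on a busy lock, not the scanning itself.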

4 files changed, +107 -37

fs/btrfs/disk-io.c (+2)
···
         if (ret)
                 return ret;
 
+        spin_lock_init(&fs_info->extent_map_shrinker_lock);
+
         ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
         if (ret)
                 return ret;
fs/btrfs/extent_map.c (+94 -29)
···
         return ret;
 }
 
-static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_to_scan)
+struct btrfs_em_shrink_ctx {
+        long nr_to_scan;
+        long scanned;
+        u64 last_ino;
+        u64 last_root;
+};
+
+static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx)
 {
         const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info);
         struct extent_map_tree *tree = &inode->extent_tree;
···
         if (!down_read_trylock(&inode->i_mmap_lock))
                 return 0;
 
-        write_lock(&tree->lock);
+        /*
+         * We want to be fast because we can be called from any path trying to
+         * allocate memory, so if the lock is busy we don't want to spend time
+         * waiting for it - either some task is about to do IO for the inode or
+         * we may have another task shrinking extent maps, here in this code, so
+         * skip this inode.
+         */
+        if (!write_trylock(&tree->lock)) {
+                up_read(&inode->i_mmap_lock);
+                return 0;
+        }
+
         node = rb_first_cached(&tree->map);
         while (node) {
                 struct extent_map *em;
 
                 em = rb_entry(node, struct extent_map, rb_node);
                 node = rb_next(node);
-                (*scanned)++;
+                ctx->scanned++;
 
                 if (em->flags & EXTENT_FLAG_PINNED)
                         goto next;
···
                 free_extent_map(em);
                 nr_dropped++;
next:
-                if (*scanned >= nr_to_scan)
+                if (ctx->scanned >= ctx->nr_to_scan)
                         break;
 
                 /*
-                 * Restart if we had to reschedule, and any extent maps that were
-                 * pinned before may have become unpinned after we released the
-                 * lock and took it again.
+                 * Stop if we need to reschedule or there's contention on the
+                 * lock. This is to avoid slowing other tasks trying to take the
+                 * lock and because the shrinker might be called during a memory
+                 * allocation path and we want to avoid taking a very long time
+                 * and slowing down all sorts of tasks.
                 */
-                if (cond_resched_rwlock_write(&tree->lock))
-                        node = rb_first_cached(&tree->map);
+                if (need_resched() || rwlock_needbreak(&tree->lock))
+                        break;
         }
         write_unlock(&tree->lock);
         up_read(&inode->i_mmap_lock);
···
         return nr_dropped;
 }
 
-static long btrfs_scan_root(struct btrfs_root *root, long *scanned, long nr_to_scan)
+static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx)
 {
-        struct btrfs_fs_info *fs_info = root->fs_info;
         struct btrfs_inode *inode;
         long nr_dropped = 0;
-        u64 min_ino = fs_info->extent_map_shrinker_last_ino + 1;
+        u64 min_ino = ctx->last_ino + 1;
 
         inode = btrfs_find_first_inode(root, min_ino);
         while (inode) {
-                nr_dropped += btrfs_scan_inode(inode, scanned, nr_to_scan);
+                nr_dropped += btrfs_scan_inode(inode, ctx);
 
                 min_ino = btrfs_ino(inode) + 1;
-                fs_info->extent_map_shrinker_last_ino = btrfs_ino(inode);
-                iput(&inode->vfs_inode);
+                ctx->last_ino = btrfs_ino(inode);
+                btrfs_add_delayed_iput(inode);
 
-                if (*scanned >= nr_to_scan)
+                if (ctx->scanned >= ctx->nr_to_scan)
                         break;
 
-                cond_resched();
+                /*
+                 * We may be called from memory allocation paths, so we don't
+                 * want to take too much time and slow down tasks.
+                 */
+                if (need_resched())
+                        break;
+
                 inode = btrfs_find_first_inode(root, min_ino);
         }
 
···
                 * inode if there is one or we will find out this was the last
                 * one and move to the next root.
                 */
-                fs_info->extent_map_shrinker_last_root = btrfs_root_id(root);
+                ctx->last_root = btrfs_root_id(root);
         } else {
                 /*
                 * No more inodes in this root, set extent_map_shrinker_last_ino to 0 so
                 * that when processing the next root we start from its first inode.
                 */
-                fs_info->extent_map_shrinker_last_ino = 0;
-                fs_info->extent_map_shrinker_last_root = btrfs_root_id(root) + 1;
+                ctx->last_ino = 0;
+                ctx->last_root = btrfs_root_id(root) + 1;
         }
 
         return nr_dropped;
···
 
 long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
 {
-        const u64 start_root_id = fs_info->extent_map_shrinker_last_root;
-        u64 next_root_id = start_root_id;
+        struct btrfs_em_shrink_ctx ctx;
+        u64 start_root_id;
+        u64 next_root_id;
         bool cycled = false;
         long nr_dropped = 0;
-        long scanned = 0;
+
+        ctx.scanned = 0;
+        ctx.nr_to_scan = nr_to_scan;
+
+        /*
+         * In case we have multiple tasks running this shrinker, make the next
+         * one start from the next inode in case it starts before we finish.
+         */
+        spin_lock(&fs_info->extent_map_shrinker_lock);
+        ctx.last_ino = fs_info->extent_map_shrinker_last_ino;
+        fs_info->extent_map_shrinker_last_ino++;
+        ctx.last_root = fs_info->extent_map_shrinker_last_root;
+        spin_unlock(&fs_info->extent_map_shrinker_lock);
+
+        start_root_id = ctx.last_root;
+        next_root_id = ctx.last_root;
 
         if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) {
                 s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
 
-                trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan, nr);
+                trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan,
+                                                           nr, ctx.last_root,
+                                                           ctx.last_ino);
         }
 
-        while (scanned < nr_to_scan) {
+        /*
+         * We may be called from memory allocation paths, so we don't want to
+         * take too much time and slow down tasks, so stop if we need to
+         * reschedule.
+         */
+        while (ctx.scanned < ctx.nr_to_scan && !need_resched()) {
                 struct btrfs_root *root;
                 unsigned long count;
 
···
                 spin_unlock(&fs_info->fs_roots_radix_lock);
                 if (start_root_id > 0 && !cycled) {
                         next_root_id = 0;
-                        fs_info->extent_map_shrinker_last_root = 0;
-                        fs_info->extent_map_shrinker_last_ino = 0;
+                        ctx.last_root = 0;
+                        ctx.last_ino = 0;
                         cycled = true;
                         continue;
                 }
···
                         continue;
 
                 if (is_fstree(btrfs_root_id(root)))
-                        nr_dropped += btrfs_scan_root(root, &scanned, nr_to_scan);
+                        nr_dropped += btrfs_scan_root(root, &ctx);
 
                 btrfs_put_root(root);
         }
 
+        /*
+         * In case of multiple tasks running this extent map shrinking code this
+         * isn't perfect but it's simple and silences things like KCSAN. It's
+         * not possible to know which task made more progress because we can
+         * cycle back to the first root and first inode if it's not the first
+         * time the shrinker ran, see the above logic. Also a task that started
+         * later may finish earlier than another task and made less progress. So
+         * make this simple and update to the progress of the last task that
+         * finished, with the occasional possibility of having two consecutive
+         * runs of the shrinker process the same inodes.
+         */
+        spin_lock(&fs_info->extent_map_shrinker_lock);
+        fs_info->extent_map_shrinker_last_ino = ctx.last_ino;
+        fs_info->extent_map_shrinker_last_root = ctx.last_root;
+        spin_unlock(&fs_info->extent_map_shrinker_lock);
+
         if (trace_btrfs_extent_map_shrinker_scan_exit_enabled()) {
                 s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
 
-                trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, nr);
+                trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped,
+                                                          nr, ctx.last_root,
+                                                          ctx.last_ino);
         }
 
         return nr_dropped;
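
The race fix at the heart of the extent_map.c changes above is a shared-cursor pattern: snapshot the global progress cursor under the new spinlock into a task-local btrfs_em_shrink_ctx, bump it so a concurrently starting shrinker begins at the next inode, scan using only the local copy, and publish the final position once at the end. A minimal userspace sketch of the same idea (illustrative only; shrink, cursor_lock and the globals are made-up names, and a pthreads mutex stands in for the kernel spinlock):

/* Snapshot a shared cursor into a private context, work locally,
 * publish the result at the end (last writer wins). */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

struct shrink_ctx {
        long nr_to_scan;
        long scanned;
        uint64_t last_ino;
        uint64_t last_root;
};

static pthread_mutex_t cursor_lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t shared_last_ino;
static uint64_t shared_last_root;

static void shrink(long nr_to_scan)
{
        struct shrink_ctx ctx = { .nr_to_scan = nr_to_scan };

        /*
         * Snapshot the cursor and bump it, so a shrinker that starts
         * before we finish begins at the next inode, not the same one.
         */
        pthread_mutex_lock(&cursor_lock);
        ctx.last_ino = shared_last_ino++;
        ctx.last_root = shared_last_root;
        pthread_mutex_unlock(&cursor_lock);

        while (ctx.scanned < ctx.nr_to_scan) {
                ctx.scanned++;
                ctx.last_ino++; /* stand-in for scanning one inode */
        }

        /* Publish the progress of the last task to finish. */
        pthread_mutex_lock(&cursor_lock);
        shared_last_ino = ctx.last_ino;
        shared_last_root = ctx.last_root;
        pthread_mutex_unlock(&cursor_lock);
}

int main(void)
{
        shrink(4);
        printf("cursor at ino %llu\n", (unsigned long long)shared_last_ino);
        return 0;
}

As the comment in the kernel diff says, the final write-back is deliberately last-writer-wins: with concurrent shrinkers, two consecutive runs may occasionally rescan the same inodes, trading a little precision for simplicity and a KCSAN-clean data-race story.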
fs/btrfs/fs.h (+1)
···
         s32 delalloc_batch;
 
         struct percpu_counter evictable_extent_maps;
+        spinlock_t extent_map_shrinker_lock;
         u64 extent_map_shrinker_last_root;
         u64 extent_map_shrinker_last_ino;
 
include/trace/events/btrfs.h (+10 -8)
···
 
 TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter,
 
-        TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_to_scan, long nr),
+        TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_to_scan, long nr,
+                 u64 last_root_id, u64 last_ino),
 
-        TP_ARGS(fs_info, nr_to_scan, nr),
+        TP_ARGS(fs_info, nr_to_scan, nr, last_root_id, last_ino),
 
         TP_STRUCT__entry_btrfs(
                 __field( long, nr_to_scan )
···
         TP_fast_assign_btrfs(fs_info,
                 __entry->nr_to_scan = nr_to_scan;
                 __entry->nr = nr;
-                __entry->last_root_id = fs_info->extent_map_shrinker_last_root;
-                __entry->last_ino = fs_info->extent_map_shrinker_last_ino;
+                __entry->last_root_id = last_root_id;
+                __entry->last_ino = last_ino;
         ),
 
         TP_printk_btrfs("nr_to_scan=%ld nr=%ld last_root=%llu(%s) last_ino=%llu",
···
 
 TRACE_EVENT(btrfs_extent_map_shrinker_scan_exit,
 
-        TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_dropped, long nr),
+        TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_dropped, long nr,
+                 u64 last_root_id, u64 last_ino),
 
-        TP_ARGS(fs_info, nr_dropped, nr),
+        TP_ARGS(fs_info, nr_dropped, nr, last_root_id, last_ino),
 
         TP_STRUCT__entry_btrfs(
                 __field( long, nr_dropped )
···
         TP_fast_assign_btrfs(fs_info,
                 __entry->nr_dropped = nr_dropped;
                 __entry->nr = nr;
-                __entry->last_root_id = fs_info->extent_map_shrinker_last_root;
-                __entry->last_ino = fs_info->extent_map_shrinker_last_ino;
+                __entry->last_root_id = last_root_id;
+                __entry->last_ino = last_ino;
         ),
 
         TP_printk_btrfs("nr_dropped=%ld nr=%ld last_root=%llu(%s) last_ino=%llu",
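
The tracepoint change in this last file follows directly from the cursor rework: the events now log the calling task's own snapshot instead of re-reading fs_info->extent_map_shrinker_last_root and ->last_ino, which another task could be updating at that moment. The same idea in a plain C sketch (illustrative only; trace_scan_exit is a made-up stand-in for the tracepoint, not a kernel API):

#include <stdint.h>
#include <stdio.h>

/*
 * Take the values the caller already holds rather than reading shared
 * state inside the trace function, so the logged cursor is the one this
 * task actually used and never a concurrent update from another task.
 */
static void trace_scan_exit(long nr_dropped, long nr,
                            uint64_t last_root, uint64_t last_ino)
{
        printf("nr_dropped=%ld nr=%ld last_root=%llu last_ino=%llu\n",
               nr_dropped, nr,
               (unsigned long long)last_root,
               (unsigned long long)last_ino);
}

int main(void)
{
        /* The caller passes its private snapshot, as the diff does with
         * ctx.last_root and ctx.last_ino. */
        trace_scan_exit(3, 42, 5, 257);
        return 0;
}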