Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

btrfs: prevent direct reclaim during compressed readahead

Under memory pressure, direct reclaim can kick in during compressed
readahead. This puts the associated task into D-state. Then shrink_lruvec()
disables interrupts when acquiring the LRU lock. Under heavy pressure,
we've observed reclaim can run long enough that the CPU becomes prone to
CSD lock stalls since it cannot service incoming IPIs. Although the CSD
lock stalls are the worst case scenario, we have found many more subtle
occurrences of this latency on the order of seconds, over a minute in some
cases.

Prevent direct reclaim during compressed readahead. This is achieved by
using different GFP flags at key points when the bio is marked for
readahead.

There are two functions that allocate during compressed readahead:
btrfs_alloc_compr_folio() and add_ra_bio_pages(). Both currently use
GFP_NOFS which includes __GFP_DIRECT_RECLAIM.

For the internal API call btrfs_alloc_compr_folio(), the signature changes
to accept an additional gfp_t parameter. At the readahead call site, it
gets flags similar to GFP_NOFS but stripped of __GFP_DIRECT_RECLAIM.
__GFP_NOWARN is added since these allocations are allowed to fail. Demand
reads still use full GFP_NOFS and will enter reclaim if needed. All other
existing call sites of btrfs_alloc_compr_folio() now explicitly pass
GFP_NOFS to retain their current behavior.

add_ra_bio_pages() gains a bool parameter which allows callers to specify
whether they want to allow direct reclaim or not. In either case, the
__GFP_NOWARN flag is added unconditionally since the allocations are
speculative.

There has been some previous work done on reducing the number of calls
made to add_ra_bio_pages() [0]. This patch is complementary: where that
patch reduces call frequency, this patch reduces the latency associated
with those calls.

[0] https://lore.kernel.org/linux-btrfs/656838ec1232314a2657716e59f4f15a8eadba64.1751492111.git.boris@bur.io/

Reviewed-by: Mark Harmstone <mark@harmstone.com>
Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: JP Kobryn (Meta) <jp.kobryn@linux.dev>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>

authored by

JP Kobryn (Meta) and committed by
David Sterba
7ae37b2c 30d537f7

+45 -18
+34 -7
fs/btrfs/compression.c
··· 180 180 /* 181 181 * Common wrappers for page allocation from compression wrappers 182 182 */ 183 - struct folio *btrfs_alloc_compr_folio(struct btrfs_fs_info *fs_info) 183 + struct folio *btrfs_alloc_compr_folio(struct btrfs_fs_info *fs_info, gfp_t gfp) 184 184 { 185 185 struct folio *folio = NULL; 186 186 ··· 200 200 return folio; 201 201 202 202 alloc: 203 - return folio_alloc(GFP_NOFS, fs_info->block_min_order); 203 + return folio_alloc(gfp, fs_info->block_min_order); 204 204 } 205 205 206 206 void btrfs_free_compr_folio(struct folio *folio) ··· 368 368 static noinline int add_ra_bio_pages(struct inode *inode, 369 369 u64 compressed_end, 370 370 struct compressed_bio *cb, 371 - int *memstall, unsigned long *pflags) 371 + int *memstall, unsigned long *pflags, 372 + bool direct_reclaim) 372 373 { 373 374 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 374 375 pgoff_t end_index; ··· 377 376 u64 cur = cb->orig_bbio->file_offset + orig_bio->bi_iter.bi_size; 378 377 u64 isize = i_size_read(inode); 379 378 int ret; 379 + gfp_t constraint_gfp, cache_gfp; 380 380 struct folio *folio; 381 381 struct extent_map *em; 382 382 struct address_space *mapping = inode->i_mapping; ··· 406 404 return 0; 407 405 408 406 end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; 407 + 408 + /* 409 + * Avoid direct reclaim when the caller does not allow it. Since 410 + * add_ra_bio_pages() is always speculative, suppress allocation warnings 411 + * in either case. 
412 + */ 413 + if (!direct_reclaim) { 414 + constraint_gfp = ~(__GFP_FS | __GFP_DIRECT_RECLAIM) | __GFP_NOWARN; 415 + cache_gfp = (GFP_NOFS & ~__GFP_DIRECT_RECLAIM) | __GFP_NOWARN; 416 + } else { 417 + constraint_gfp = (~__GFP_FS) | __GFP_NOWARN; 418 + cache_gfp = GFP_NOFS | __GFP_NOWARN; 419 + } 409 420 410 421 while (cur < compressed_end) { 411 422 pgoff_t page_end; ··· 449 434 continue; 450 435 } 451 436 452 - folio = filemap_alloc_folio(mapping_gfp_constraint(mapping, ~__GFP_FS), 437 + folio = filemap_alloc_folio(mapping_gfp_constraint(mapping, constraint_gfp), 453 438 0, NULL); 454 439 if (!folio) 455 440 break; 456 441 457 - if (filemap_add_folio(mapping, folio, pg_index, GFP_NOFS)) { 442 + if (filemap_add_folio(mapping, folio, pg_index, cache_gfp)) { 458 443 /* There is already a page, skip to page end */ 459 444 cur += folio_size(folio); 460 445 folio_put(folio); ··· 547 532 unsigned int compressed_len; 548 533 const u32 min_folio_size = btrfs_min_folio_size(fs_info); 549 534 u64 file_offset = bbio->file_offset; 535 + gfp_t gfp; 550 536 u64 em_len; 551 537 u64 em_start; 552 538 struct extent_map *em; 553 539 unsigned long pflags; 554 540 int memstall = 0; 555 541 int ret; 542 + 543 + /* 544 + * If this is a readahead bio, prevent direct reclaim. This is done to 545 + * avoid stalling on speculative allocations when memory pressure is 546 + * high. The demand fault will retry with GFP_NOFS and enter direct 547 + * reclaim if needed. 
548 + */ 549 + if (bbio->bio.bi_opf & REQ_RAHEAD) 550 + gfp = (GFP_NOFS & ~__GFP_DIRECT_RECLAIM) | __GFP_NOWARN; 551 + else 552 + gfp = GFP_NOFS; 556 553 557 554 /* we need the actual starting offset of this extent in the file */ 558 555 read_lock(&em_tree->lock); ··· 596 569 struct folio *folio; 597 570 u32 cur_len = min(compressed_len - i * min_folio_size, min_folio_size); 598 571 599 - folio = btrfs_alloc_compr_folio(fs_info); 572 + folio = btrfs_alloc_compr_folio(fs_info, gfp); 600 573 if (!folio) { 601 574 ret = -ENOMEM; 602 575 goto out_free_bio; ··· 612 585 ASSERT(cb->bbio.bio.bi_iter.bi_size == compressed_len); 613 586 614 587 add_ra_bio_pages(&inode->vfs_inode, em_start + em_len, cb, &memstall, 615 - &pflags); 588 + &pflags, !(bbio->bio.bi_opf & REQ_RAHEAD)); 616 589 617 590 cb->len = bbio->bio.bi_iter.bi_size; 618 591 cb->bbio.bio.bi_iter.bi_sector = bbio->bio.bi_iter.bi_sector;
+1 -1
fs/btrfs/compression.h
··· 98 98 99 99 int btrfs_compress_str2level(unsigned int type, const char *str, int *level_ret); 100 100 101 - struct folio *btrfs_alloc_compr_folio(struct btrfs_fs_info *fs_info); 101 + struct folio *btrfs_alloc_compr_folio(struct btrfs_fs_info *fs_info, gfp_t gfp); 102 102 void btrfs_free_compr_folio(struct folio *folio); 103 103 104 104 struct workspace_manager {
+1 -1
fs/btrfs/inode.c
··· 9980 9980 size_t bytes = min(min_folio_size, iov_iter_count(from)); 9981 9981 char *kaddr; 9982 9982 9983 - folio = btrfs_alloc_compr_folio(fs_info); 9983 + folio = btrfs_alloc_compr_folio(fs_info, GFP_NOFS); 9984 9984 if (!folio) { 9985 9985 ret = -ENOMEM; 9986 9986 goto out_cb;
+3 -3
fs/btrfs/lzo.c
··· 202 202 ASSERT((old_size >> sectorsize_bits) == (old_size + LZO_LEN - 1) >> sectorsize_bits); 203 203 204 204 if (!*out_folio) { 205 - *out_folio = btrfs_alloc_compr_folio(fs_info); 205 + *out_folio = btrfs_alloc_compr_folio(fs_info, GFP_NOFS); 206 206 if (!*out_folio) 207 207 return -ENOMEM; 208 208 } ··· 229 229 return -E2BIG; 230 230 231 231 if (!*out_folio) { 232 - *out_folio = btrfs_alloc_compr_folio(fs_info); 232 + *out_folio = btrfs_alloc_compr_folio(fs_info, GFP_NOFS); 233 233 if (!*out_folio) 234 234 return -ENOMEM; 235 235 } ··· 280 280 ASSERT(bio->bi_iter.bi_size == 0); 281 281 ASSERT(len); 282 282 283 - folio_out = btrfs_alloc_compr_folio(fs_info); 283 + folio_out = btrfs_alloc_compr_folio(fs_info, GFP_NOFS); 284 284 if (!folio_out) 285 285 return -ENOMEM; 286 286
+3 -3
fs/btrfs/zlib.c
··· 172 172 workspace->strm.total_in = 0; 173 173 workspace->strm.total_out = 0; 174 174 175 - out_folio = btrfs_alloc_compr_folio(fs_info); 175 + out_folio = btrfs_alloc_compr_folio(fs_info, GFP_NOFS); 176 176 if (out_folio == NULL) { 177 177 ret = -ENOMEM; 178 178 goto out; ··· 254 254 goto out; 255 255 } 256 256 257 - out_folio = btrfs_alloc_compr_folio(fs_info); 257 + out_folio = btrfs_alloc_compr_folio(fs_info, GFP_NOFS); 258 258 if (out_folio == NULL) { 259 259 ret = -ENOMEM; 260 260 goto out; ··· 291 291 goto out; 292 292 } 293 293 /* Get another folio for the stream end. */ 294 - out_folio = btrfs_alloc_compr_folio(fs_info); 294 + out_folio = btrfs_alloc_compr_folio(fs_info, GFP_NOFS); 295 295 if (out_folio == NULL) { 296 296 ret = -ENOMEM; 297 297 goto out;
+3 -3
fs/btrfs/zstd.c
··· 437 437 workspace->in_buf.size = btrfs_calc_input_length(in_folio, end, start); 438 438 439 439 /* Allocate and map in the output buffer. */ 440 - out_folio = btrfs_alloc_compr_folio(fs_info); 440 + out_folio = btrfs_alloc_compr_folio(fs_info, GFP_NOFS); 441 441 if (out_folio == NULL) { 442 442 ret = -ENOMEM; 443 443 goto out; ··· 480 480 goto out; 481 481 } 482 482 483 - out_folio = btrfs_alloc_compr_folio(fs_info); 483 + out_folio = btrfs_alloc_compr_folio(fs_info, GFP_NOFS); 484 484 if (out_folio == NULL) { 485 485 ret = -ENOMEM; 486 486 goto out; ··· 553 553 ret = -E2BIG; 554 554 goto out; 555 555 } 556 - out_folio = btrfs_alloc_compr_folio(fs_info); 556 + out_folio = btrfs_alloc_compr_folio(fs_info, GFP_NOFS); 557 557 if (out_folio == NULL) { 558 558 ret = -ENOMEM; 559 559 goto out;