Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

fs: Move metadata bhs tracking to a separate struct

Instead of tracking metadata bhs for a mapping using i_private_list and
i_private_lock create a dedicated mapping_metadata_bhs struct for it.
So far this struct is embedded in address_space but that will be
switched for per-fs private inode parts later in the series. This also
changes the locking from bdev mapping's i_private_lock to a new lock
embedded in mapping_metadata_bhs to untangle the i_private_lock locking
for maintaining lists of metadata bhs and the locking for looking up /
reclaiming bdev's buffer heads. The locking in remove_assoc_map() gets
more complex due to this but overall this looks like a reasonable
tradeoff.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
Link: https://patch.msgid.link/20260326095354.16340-72-jack@suse.cz
Signed-off-by: Christian Brauner <brauner@kernel.org>

authored by

Jan Kara and committed by
Christian Brauner
521bea7c 8fed8176

+72 -71
+63 -71
fs/buffer.c
··· 469 469 * 470 470 * The functions mark_buffer_dirty_inode(), fsync_inode_buffers(), 471 471 * inode_has_buffers() and invalidate_inode_buffers() are provided for the 472 - * management of a list of dependent buffers at ->i_mapping->i_private_list. 472 + * management of a list of dependent buffers in mapping_metadata_bhs struct. 473 473 * 474 - * Locking is a little subtle: try_to_free_buffers() will remove buffers 475 - * from their controlling inode's queue when they are being freed. But 476 - * try_to_free_buffers() will be operating against the *blockdev* mapping 477 - * at the time, not against the S_ISREG file which depends on those buffers. 478 - * So the locking for i_private_list is via the i_private_lock in the address_space 479 - * which backs the buffers. Which is different from the address_space 480 - * against which the buffers are listed. So for a particular address_space, 481 - * mapping->i_private_lock does *not* protect mapping->i_private_list! In fact, 482 - * mapping->i_private_list will always be protected by the backing blockdev's 483 - * ->i_private_lock. 484 - * 485 - * Which introduces a requirement: all buffers on an address_space's 486 - * ->i_private_list must be from the same address_space: the blockdev's. 487 - * 488 - * address_spaces which do not place buffers at ->i_private_list via these 489 - * utility functions are free to use i_private_lock and i_private_list for 490 - * whatever they want. The only requirement is that list_empty(i_private_list) 491 - * be true at clear_inode() time. 492 - * 493 - * FIXME: clear_inode should not call invalidate_inode_buffers(). The 494 - * filesystems should do that. invalidate_inode_buffers() should just go 495 - * BUG_ON(!list_empty). 474 + * The locking is a little subtle: The list of buffer heads is protected by 475 + * the lock in mapping_metadata_bhs so functions coming from bdev mapping 476 + * (such as try_to_free_buffers()) need to safely get to mapping_metadata_bhs 477 + * using RCU, grab the lock, verify we didn't race with somebody detaching the 478 + * bh / moving it to different inode and only then proceeding. 496 479 * 497 480 * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should 498 481 * take an address_space, not an inode. And it should be called ··· 492 509 * b_inode back. 493 510 */ 494 511 495 - /* 496 - * The buffer's backing address_space's i_private_lock must be held 497 - */ 498 - static void __remove_assoc_queue(struct buffer_head *bh) 512 + static void __remove_assoc_queue(struct mapping_metadata_bhs *mmb, 513 + struct buffer_head *bh) 499 514 { 515 + lockdep_assert_held(&mmb->lock); 500 516 list_del_init(&bh->b_assoc_buffers); 501 517 WARN_ON(!bh->b_assoc_map); 502 518 bh->b_assoc_map = NULL; 503 519 } 504 520 521 + static void remove_assoc_queue(struct buffer_head *bh) 522 + { 523 + struct address_space *mapping; 524 + struct mapping_metadata_bhs *mmb; 525 + 526 + /* 527 + * The locking dance is ugly here. We need to acquire the lock 528 + * protecting the metadata bh list while possibly racing with bh 529 + * being removed from the list or moved to a different one. We 530 + * use RCU to pin mapping_metadata_bhs in memory to 531 + * opportunistically acquire the lock and then recheck the bh 532 + * didn't move under us. 533 + */ 534 + while (bh->b_assoc_map) { 535 + rcu_read_lock(); 536 + mapping = READ_ONCE(bh->b_assoc_map); 537 + if (mapping) { 538 + mmb = &mapping->i_metadata_bhs; 539 + spin_lock(&mmb->lock); 540 + if (bh->b_assoc_map == mapping) 541 + __remove_assoc_queue(mmb, bh); 542 + spin_unlock(&mmb->lock); 543 + } 544 + rcu_read_unlock(); 545 + } 546 + } 547 + 505 548 int inode_has_buffers(struct inode *inode) 506 549 { 507 - return !list_empty(&inode->i_data.i_private_list); 550 + return !list_empty(&inode->i_data.i_metadata_bhs.list); 508 551 } 509 552 EXPORT_SYMBOL_GPL(inode_has_buffers); 510 553 ··· 538 529 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers 539 530 * @mapping: the mapping which wants those buffers written 540 531 * 541 - * Starts I/O against the buffers at mapping->i_private_list, and waits upon 532 + * Starts I/O against the buffers at mapping->i_metadata_bhs and waits upon 542 533 * that I/O. Basically, this is a convenience function for fsync(). @mapping 543 534 * is a file or directory which needs those buffers to be written for a 544 535 * successful fsync(). ··· 557 548 */ 558 549 int sync_mapping_buffers(struct address_space *mapping) 559 550 { 560 - struct address_space *buffer_mapping = 561 - mapping->host->i_sb->s_bdev->bd_mapping; 551 + struct mapping_metadata_bhs *mmb = &mapping->i_metadata_bhs; 562 552 struct buffer_head *bh; 563 553 int err = 0; 564 554 struct blk_plug plug; 565 555 LIST_HEAD(tmp); 566 556 567 - if (list_empty(&mapping->i_private_list)) 557 + if (list_empty(&mmb->list)) 568 558 return 0; 569 559 570 560 blk_start_plug(&plug); 571 561 572 - spin_lock(&buffer_mapping->i_private_lock); 573 - while (!list_empty(&mapping->i_private_list)) { 574 - bh = BH_ENTRY(mapping->i_private_list.next); 562 + spin_lock(&mmb->lock); 563 + while (!list_empty(&mmb->list)) { 564 + bh = BH_ENTRY(mmb->list.next); 575 565 WARN_ON_ONCE(bh->b_assoc_map != mapping); 576 - __remove_assoc_queue(bh); 566 + __remove_assoc_queue(mmb, bh); 577 567 /* Avoid race with mark_buffer_dirty_inode() which does 578 568 * a lockless check and we rely on seeing the dirty bit */ 579 569 smp_mb(); ··· 581 573 bh->b_assoc_map = mapping; 582 574 if (buffer_dirty(bh)) { 583 575 get_bh(bh); 584 - spin_unlock(&buffer_mapping->i_private_lock); 576 + spin_unlock(&mmb->lock); 585 577 /* 586 578 * Ensure any pending I/O completes so that 587 579 * write_dirty_buffer() actually writes the ··· 598 590 * through sync_buffer(). 599 591 */ 600 592 brelse(bh); 601 - spin_lock(&buffer_mapping->i_private_lock); 593 + spin_lock(&mmb->lock); 602 594 } 603 595 } 604 596 } 605 597 606 - spin_unlock(&buffer_mapping->i_private_lock); 598 + spin_unlock(&mmb->lock); 607 599 blk_finish_plug(&plug); 608 - spin_lock(&buffer_mapping->i_private_lock); 600 + spin_lock(&mmb->lock); 609 601 610 602 while (!list_empty(&tmp)) { 611 603 bh = BH_ENTRY(tmp.prev); 612 604 get_bh(bh); 613 - __remove_assoc_queue(bh); 605 + __remove_assoc_queue(mmb, bh); 614 606 /* Avoid race with mark_buffer_dirty_inode() which does 615 607 * a lockless check and we rely on seeing the dirty bit */ 616 608 smp_mb(); 617 609 if (buffer_dirty(bh)) { 618 - list_add(&bh->b_assoc_buffers, 619 - &mapping->i_private_list); 610 + list_add(&bh->b_assoc_buffers, &mmb->list); 620 611 bh->b_assoc_map = mapping; 621 612 } 622 - spin_unlock(&buffer_mapping->i_private_lock); 613 + spin_unlock(&mmb->lock); 623 614 wait_on_buffer(bh); 624 615 if (!buffer_uptodate(bh)) 625 616 err = -EIO; 626 617 brelse(bh); 627 - spin_lock(&buffer_mapping->i_private_lock); 618 + spin_lock(&mmb->lock); 628 619 } 629 - spin_unlock(&buffer_mapping->i_private_lock); 620 + spin_unlock(&mmb->lock); 630 621 return err; 631 622 } 632 623 EXPORT_SYMBOL(sync_mapping_buffers); ··· 722 715 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) 723 716 { 724 717 struct address_space *mapping = inode->i_mapping; 725 - struct address_space *buffer_mapping = bh->b_folio->mapping; 726 718 727 719 mark_buffer_dirty(bh); 728 720 if (!bh->b_assoc_map) { 729 - spin_lock(&buffer_mapping->i_private_lock); 721 + spin_lock(&mapping->i_metadata_bhs.lock); 730 722 list_move_tail(&bh->b_assoc_buffers, 731 - &mapping->i_private_list); 723 + &mapping->i_metadata_bhs.list); 732 724 bh->b_assoc_map = mapping; 733 - spin_unlock(&buffer_mapping->i_private_lock); 725 + spin_unlock(&mapping->i_metadata_bhs.lock); 734 726 } 735 727 } 736 728 EXPORT_SYMBOL(mark_buffer_dirty_inode); ··· 802 796 * Invalidate any and all dirty buffers on a given inode. We are 803 797 * probably unmounting the fs, but that doesn't mean we have already 804 798 * done a sync(). Just drop the buffers from the inode list. 805 - * 806 - * NOTE: we take the inode's blockdev's mapping's i_private_lock. Which 807 - * assumes that all the buffers are against the blockdev. 808 799 */ 809 800 void invalidate_inode_buffers(struct inode *inode) 810 801 { 811 802 if (inode_has_buffers(inode)) { 812 - struct address_space *mapping = &inode->i_data; 813 - struct list_head *list = &mapping->i_private_list; 814 - struct address_space *buffer_mapping = 815 - mapping->host->i_sb->s_bdev->bd_mapping; 803 + struct mapping_metadata_bhs *mmb = &inode->i_data.i_metadata_bhs; 816 804 817 - spin_lock(&buffer_mapping->i_private_lock); 818 - while (!list_empty(list)) 819 - __remove_assoc_queue(BH_ENTRY(list->next)); 820 - spin_unlock(&buffer_mapping->i_private_lock); 805 + spin_lock(&mmb->lock); 806 + while (!list_empty(&mmb->list)) 807 + __remove_assoc_queue(mmb, BH_ENTRY(mmb->list.next)); 808 + spin_unlock(&mmb->lock); 821 809 } 822 810 } 823 811 EXPORT_SYMBOL(invalidate_inode_buffers); ··· 1155 1155 void __bforget(struct buffer_head *bh) 1156 1156 { 1157 1157 clear_buffer_dirty(bh); 1158 - if (bh->b_assoc_map) { 1159 - struct address_space *buffer_mapping = bh->b_folio->mapping; 1160 - 1161 - spin_lock(&buffer_mapping->i_private_lock); 1162 - list_del_init(&bh->b_assoc_buffers); 1163 - bh->b_assoc_map = NULL; 1164 - spin_unlock(&buffer_mapping->i_private_lock); 1165 - } 1158 + remove_assoc_queue(bh); 1166 1159 __brelse(bh); 1167 1160 } 1168 1161 EXPORT_SYMBOL(__bforget); ··· 2803 2810 do { 2804 2811 struct buffer_head *next = bh->b_this_page; 2805 2812 2806 - if (bh->b_assoc_map) 2807 - __remove_assoc_queue(bh); 2813 + remove_assoc_queue(bh); 2808 2814 bh = next; 2809 2815 } while (bh != head); 2810 2816 *buffers_to_free = head;
+2
fs/inode.c
··· 483 483 init_rwsem(&mapping->i_mmap_rwsem); 484 484 INIT_LIST_HEAD(&mapping->i_private_list); 485 485 spin_lock_init(&mapping->i_private_lock); 486 + spin_lock_init(&mapping->i_metadata_bhs.lock); 487 + INIT_LIST_HEAD(&mapping->i_metadata_bhs.list); 486 488 mapping->i_mmap = RB_ROOT_CACHED; 487 489 } 488 490
+7
include/linux/fs.h
··· 445 445 446 446 extern const struct address_space_operations empty_aops; 447 447 448 + /* Structure for tracking metadata buffer heads associated with the mapping */ 449 + struct mapping_metadata_bhs { 450 + spinlock_t lock; /* Lock protecting bh list */ 451 + struct list_head list; /* The list of bhs (b_assoc_buffers) */ 452 + }; 453 + 448 454 /** 449 455 * struct address_space - Contents of a cacheable, mappable object. 450 456 * @host: Owner, either the inode or the block_device. ··· 490 484 errseq_t wb_err; 491 485 spinlock_t i_private_lock; 492 486 struct list_head i_private_list; 487 + struct mapping_metadata_bhs i_metadata_bhs; 493 488 struct rw_semaphore i_mmap_rwsem; 494 489 } __attribute__((aligned(sizeof(long)))) __randomize_layout; 495 490 /*