Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

writeback: Avoid contention on wb->list_lock when switching inodes

There can be multiple inode switch works that are trying to switch
inodes to / from the same wb. This can happen in particular when a
cgroup that owns many (thousands of) inodes exits and we need to switch
them all. In this case several inode_switch_wbs_work_fn() instances will
just be spinning on the same wb->list_lock while only one of them makes
forward progress. This wastes CPU cycles and quickly leads to softlockup
reports and an unusable system.

Instead of running several inode_switch_wbs_work_fn() instances in
parallel switching to the same wb and contending on wb->list_lock, run
just one work item per wb and manage a queue of isw items switching to
this wb.

Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jan Kara <jack@suse.cz>

authored by

Jan Kara and committed by
Christian Brauner
e1b849cf 8f5ae30d

+74 -36
+63 -36
fs/fs-writeback.c
··· 368 368 } 369 369 370 370 struct inode_switch_wbs_context { 371 - struct rcu_work work; 371 + /* List of queued switching contexts for the wb */ 372 + struct llist_node list; 372 373 373 374 /* 374 375 * Multiple inodes can be switched at once. The switching procedure ··· 379 378 * array embedded into struct inode_switch_wbs_context. Otherwise 380 379 * an inode could be left in a non-consistent state. 381 380 */ 382 - struct bdi_writeback *new_wb; 383 381 struct inode *inodes[]; 384 382 }; 385 383 ··· 486 486 return switched; 487 487 } 488 488 489 - static void inode_switch_wbs_work_fn(struct work_struct *work) 489 + static void process_inode_switch_wbs(struct bdi_writeback *new_wb, 490 + struct inode_switch_wbs_context *isw) 490 491 { 491 - struct inode_switch_wbs_context *isw = 492 - container_of(to_rcu_work(work), struct inode_switch_wbs_context, work); 493 492 struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]); 494 493 struct bdi_writeback *old_wb = isw->inodes[0]->i_wb; 495 - struct bdi_writeback *new_wb = isw->new_wb; 496 494 unsigned long nr_switched = 0; 497 495 struct inode **inodep; 498 496 ··· 541 543 atomic_dec(&isw_nr_in_flight); 542 544 } 543 545 546 + void inode_switch_wbs_work_fn(struct work_struct *work) 547 + { 548 + struct bdi_writeback *new_wb = container_of(work, struct bdi_writeback, 549 + switch_work); 550 + struct inode_switch_wbs_context *isw, *next_isw; 551 + struct llist_node *list; 552 + 553 + /* 554 + * Grab out reference to wb so that it cannot get freed under us 555 + * after we process all the isw items. 556 + */ 557 + wb_get(new_wb); 558 + while (1) { 559 + list = llist_del_all(&new_wb->switch_wbs_ctxs); 560 + /* Nothing to do? */ 561 + if (!list) 562 + break; 563 + /* 564 + * In addition to synchronizing among switchers, I_WB_SWITCH 565 + * tells the RCU protected stat update paths to grab the i_page 566 + * lock so that stat transfer can synchronize against them. 
567 + * Let's continue after I_WB_SWITCH is guaranteed to be 568 + * visible. 569 + */ 570 + synchronize_rcu(); 571 + 572 + llist_for_each_entry_safe(isw, next_isw, list, list) 573 + process_inode_switch_wbs(new_wb, isw); 574 + } 575 + wb_put(new_wb); 576 + } 577 + 544 578 static bool inode_prepare_wbs_switch(struct inode *inode, 545 579 struct bdi_writeback *new_wb) 546 580 { ··· 602 572 return true; 603 573 } 604 574 575 + static void wb_queue_isw(struct bdi_writeback *wb, 576 + struct inode_switch_wbs_context *isw) 577 + { 578 + if (llist_add(&isw->list, &wb->switch_wbs_ctxs)) 579 + queue_work(isw_wq, &wb->switch_work); 580 + } 581 + 605 582 /** 606 583 * inode_switch_wbs - change the wb association of an inode 607 584 * @inode: target inode ··· 622 585 struct backing_dev_info *bdi = inode_to_bdi(inode); 623 586 struct cgroup_subsys_state *memcg_css; 624 587 struct inode_switch_wbs_context *isw; 588 + struct bdi_writeback *new_wb = NULL; 625 589 626 590 /* noop if seems to be already in progress */ 627 591 if (inode->i_state & I_WB_SWITCH) ··· 647 609 if (!memcg_css) 648 610 goto out_free; 649 611 650 - isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); 612 + new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); 651 613 css_put(memcg_css); 652 - if (!isw->new_wb) 614 + if (!new_wb) 653 615 goto out_free; 654 616 655 - if (!inode_prepare_wbs_switch(inode, isw->new_wb)) 617 + if (!inode_prepare_wbs_switch(inode, new_wb)) 656 618 goto out_free; 657 619 658 620 isw->inodes[0] = inode; 659 621 660 - /* 661 - * In addition to synchronizing among switchers, I_WB_SWITCH tells 662 - * the RCU protected stat update paths to grab the i_page 663 - * lock so that stat transfer can synchronize against them. 664 - * Let's continue after I_WB_SWITCH is guaranteed to be visible. 
665 - */ 666 - INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn); 667 - queue_rcu_work(isw_wq, &isw->work); 622 + wb_queue_isw(new_wb, isw); 668 623 return; 669 624 670 625 out_free: 671 626 atomic_dec(&isw_nr_in_flight); 672 - if (isw->new_wb) 673 - wb_put(isw->new_wb); 627 + if (new_wb) 628 + wb_put(new_wb); 674 629 kfree(isw); 675 630 } 676 631 677 - static bool isw_prepare_wbs_switch(struct inode_switch_wbs_context *isw, 632 + static bool isw_prepare_wbs_switch(struct bdi_writeback *new_wb, 633 + struct inode_switch_wbs_context *isw, 678 634 struct list_head *list, int *nr) 679 635 { 680 636 struct inode *inode; 681 637 682 638 list_for_each_entry(inode, list, i_io_list) { 683 - if (!inode_prepare_wbs_switch(inode, isw->new_wb)) 639 + if (!inode_prepare_wbs_switch(inode, new_wb)) 684 640 continue; 685 641 686 642 isw->inodes[*nr] = inode; ··· 698 666 { 699 667 struct cgroup_subsys_state *memcg_css; 700 668 struct inode_switch_wbs_context *isw; 669 + struct bdi_writeback *new_wb; 701 670 int nr; 702 671 bool restart = false; 703 672 ··· 711 678 712 679 for (memcg_css = wb->memcg_css->parent; memcg_css; 713 680 memcg_css = memcg_css->parent) { 714 - isw->new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL); 715 - if (isw->new_wb) 681 + new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL); 682 + if (new_wb) 716 683 break; 717 684 } 718 - if (unlikely(!isw->new_wb)) 719 - isw->new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */ 685 + if (unlikely(!new_wb)) 686 + new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */ 720 687 721 688 nr = 0; 722 689 spin_lock(&wb->list_lock); ··· 728 695 * bandwidth restrictions, as writeback of inode metadata is not 729 696 * accounted for. 
730 697 */ 731 - restart = isw_prepare_wbs_switch(isw, &wb->b_attached, &nr); 698 + restart = isw_prepare_wbs_switch(new_wb, isw, &wb->b_attached, &nr); 732 699 if (!restart) 733 - restart = isw_prepare_wbs_switch(isw, &wb->b_dirty_time, &nr); 700 + restart = isw_prepare_wbs_switch(new_wb, isw, &wb->b_dirty_time, 701 + &nr); 734 702 spin_unlock(&wb->list_lock); 735 703 736 704 /* no attached inodes? bail out */ 737 705 if (nr == 0) { 738 706 atomic_dec(&isw_nr_in_flight); 739 - wb_put(isw->new_wb); 707 + wb_put(new_wb); 740 708 kfree(isw); 741 709 return restart; 742 710 } 743 711 744 - /* 745 - * In addition to synchronizing among switchers, I_WB_SWITCH tells 746 - * the RCU protected stat update paths to grab the i_page 747 - * lock so that stat transfer can synchronize against them. 748 - * Let's continue after I_WB_SWITCH is guaranteed to be visible. 749 - */ 750 - INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn); 751 - queue_rcu_work(isw_wq, &isw->work); 712 + wb_queue_isw(new_wb, isw); 752 713 753 714 return restart; 754 715 }
+4
include/linux/backing-dev-defs.h
··· 152 152 struct list_head blkcg_node; /* anchored at blkcg->cgwb_list */ 153 153 struct list_head b_attached; /* attached inodes, protected by list_lock */ 154 154 struct list_head offline_node; /* anchored at offline_cgwbs */ 155 + struct work_struct switch_work; /* work used to perform inode switching 156 + * to this wb */ 157 + struct llist_head switch_wbs_ctxs; /* queued contexts for 158 + * writeback switching */ 155 159 156 160 union { 157 161 struct work_struct release_work;
+2
include/linux/writeback.h
··· 265 265 bio_associate_blkg_from_css(bio, wbc->wb->blkcg_css); 266 266 } 267 267 268 + void inode_switch_wbs_work_fn(struct work_struct *work); 269 + 268 270 #else /* CONFIG_CGROUP_WRITEBACK */ 269 271 270 272 static inline void inode_attach_wb(struct inode *inode, struct folio *folio)
+5
mm/backing-dev.c
··· 633 633 wb_exit(wb); 634 634 bdi_put(bdi); 635 635 WARN_ON_ONCE(!list_empty(&wb->b_attached)); 636 + WARN_ON_ONCE(work_pending(&wb->switch_work)); 636 637 call_rcu(&wb->rcu, cgwb_free_rcu); 637 638 } 638 639 ··· 710 709 wb->memcg_css = memcg_css; 711 710 wb->blkcg_css = blkcg_css; 712 711 INIT_LIST_HEAD(&wb->b_attached); 712 + INIT_WORK(&wb->switch_work, inode_switch_wbs_work_fn); 713 + init_llist_head(&wb->switch_wbs_ctxs); 713 714 INIT_WORK(&wb->release_work, cgwb_release_workfn); 714 715 set_bit(WB_registered, &wb->state); 715 716 bdi_get(bdi); ··· 842 839 if (!ret) { 843 840 bdi->wb.memcg_css = &root_mem_cgroup->css; 844 841 bdi->wb.blkcg_css = blkcg_root_css; 842 + INIT_WORK(&bdi->wb.switch_work, inode_switch_wbs_work_fn); 843 + init_llist_head(&bdi->wb.switch_wbs_ctxs); 845 844 } 846 845 return ret; 847 846 }