mm/thp: fix deferred split unqueue naming and locking

+26 -9

mm/huge_memory.c

··· 3588 3588 return split_huge_page_to_list_to_order(&folio->page, list, ret); 3589 3589 } 3590 3590 3591 - void __folio_undo_large_rmappable(struct folio *folio) 3591 + /* 3592 + * __folio_unqueue_deferred_split() is not to be called directly: 3593 + * the folio_unqueue_deferred_split() inline wrapper in mm/internal.h 3594 + * limits its calls to those folios which may have a _deferred_list for 3595 + * queueing THP splits, and that list is (racily observed to be) non-empty. 3596 + * 3597 + * It is unsafe to call folio_unqueue_deferred_split() until folio refcount is 3598 + * zero: because even when split_queue_lock is held, a non-empty _deferred_list 3599 + * might be in use on deferred_split_scan()'s unlocked on-stack list. 3600 + * 3601 + * If memory cgroups are enabled, split_queue_lock is in the mem_cgroup: it is 3602 + * therefore important to unqueue deferred split before changing folio memcg. 3603 + */ 3604 + bool __folio_unqueue_deferred_split(struct folio *folio) 3592 3605 { 3593 3606 struct deferred_split *ds_queue; 3594 3607 unsigned long flags; 3608 + bool unqueued = false; 3609 + 3610 + WARN_ON_ONCE(folio_ref_count(folio)); 3611 + WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg(folio)); 3595 3612 3596 3613 ds_queue = get_deferred_split_queue(folio); 3597 3614 spin_lock_irqsave(&ds_queue->split_queue_lock, flags); ··· 3620 3603 MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); 3621 3604 } 3622 3605 list_del_init(&folio->_deferred_list); 3606 + unqueued = true; 3623 3607 } 3624 3608 spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); 3609 + 3610 + return unqueued; /* useful for debug warnings */ 3625 3611 } 3626 3612 3627 3613 /* partially_mapped=false won't clear PG_partially_mapped folio flag */ ··· 3647 3627 return; 3648 3628 3649 3629 /* 3650 - * The try_to_unmap() in page reclaim path might reach here too, 3651 - * this may cause a race condition to corrupt deferred split queue. 3652 - * And, if page reclaim is already handling the same folio, it is 3653 - * unnecessary to handle it again in shrinker. 3654 - * 3655 - * Check the swapcache flag to determine if the folio is being 3656 - * handled by page reclaim since THP swap would add the folio into 3657 - * swap cache before calling try_to_unmap(). 3630 + * Exclude swapcache: originally to avoid a corrupt deferred split 3631 + * queue. Nowadays that is fully prevented by mem_cgroup_swapout(); 3632 + * but if page reclaim is already handling the same folio, it is 3633 + * unnecessary to handle it again in the shrinker, so excluding 3634 + * swapcache here may still be a useful optimization. 3658 3635 */ 3659 3636 if (folio_test_swapcache(folio)) 3660 3637 return;

+5 -5

mm/internal.h

··· 639 639 #endif 640 640 } 641 641 642 - void __folio_undo_large_rmappable(struct folio *folio); 643 - static inline void folio_undo_large_rmappable(struct folio *folio) 642 + bool __folio_unqueue_deferred_split(struct folio *folio); 643 + static inline bool folio_unqueue_deferred_split(struct folio *folio) 644 644 { 645 645 if (folio_order(folio) <= 1 || !folio_test_large_rmappable(folio)) 646 - return; 646 + return false; 647 647 648 648 /* 649 649 * At this point, there is no one trying to add the folio to ··· 651 651 * to check without acquiring the split_queue_lock. 652 652 */ 653 653 if (data_race(list_empty(&folio->_deferred_list))) 654 - return; 654 + return false; 655 655 656 - __folio_undo_large_rmappable(folio); 656 + return __folio_unqueue_deferred_split(folio); 657 657 } 658 658 659 659 static inline struct folio *page_rmappable_folio(struct page *page)

+25

mm/memcontrol-v1.c

··· 848 848 css_get(&to->css); 849 849 css_put(&from->css); 850 850 851 + /* Warning should never happen, so don't worry about refcount non-0 */ 852 + WARN_ON_ONCE(folio_unqueue_deferred_split(folio)); 851 853 folio->memcg_data = (unsigned long)to; 852 854 853 855 __folio_memcg_unlock(from); ··· 1219 1217 enum mc_target_type target_type; 1220 1218 union mc_target target; 1221 1219 struct folio *folio; 1220 + bool tried_split_before = false; 1222 1221 1222 + retry_pmd: 1223 1223 ptl = pmd_trans_huge_lock(pmd, vma); 1224 1224 if (ptl) { 1225 1225 if (mc.precharge < HPAGE_PMD_NR) { ··· 1231 1227 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); 1232 1228 if (target_type == MC_TARGET_PAGE) { 1233 1229 folio = target.folio; 1230 + /* 1231 + * Deferred split queue locking depends on memcg, 1232 + * and unqueue is unsafe unless folio refcount is 0: 1233 + * split or skip if on the queue? first try to split. 1234 + */ 1235 + if (!list_empty(&folio->_deferred_list)) { 1236 + spin_unlock(ptl); 1237 + if (!tried_split_before) 1238 + split_folio(folio); 1239 + folio_unlock(folio); 1240 + folio_put(folio); 1241 + if (tried_split_before) 1242 + return 0; 1243 + tried_split_before = true; 1244 + goto retry_pmd; 1245 + } 1246 + /* 1247 + * So long as that pmd lock is held, the folio cannot 1248 + * be racily added to the _deferred_list, because 1249 + * __folio_remove_rmap() will find !partially_mapped. 1250 + */ 1234 1251 if (folio_isolate_lru(folio)) { 1235 1252 if (!mem_cgroup_move_account(folio, true, 1236 1253 mc.from, mc.to)) {

+5 -3

mm/memcontrol.c

··· 4629 4629 struct obj_cgroup *objcg; 4630 4630 4631 4631 VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); 4632 - VM_BUG_ON_FOLIO(folio_order(folio) > 1 && 4633 - !folio_test_hugetlb(folio) && 4634 - !list_empty(&folio->_deferred_list), folio); 4635 4632 4636 4633 /* 4637 4634 * Nobody should be changing or seriously looking at ··· 4675 4678 ug->nr_memory += nr_pages; 4676 4679 ug->pgpgout++; 4677 4680 4681 + WARN_ON_ONCE(folio_unqueue_deferred_split(folio)); 4678 4682 folio->memcg_data = 0; 4679 4683 } 4680 4684 ··· 4787 4789 4788 4790 /* Transfer the charge and the css ref */ 4789 4791 commit_charge(new, memcg); 4792 + 4793 + /* Warning should never happen, so don't worry about refcount non-0 */ 4794 + WARN_ON_ONCE(folio_unqueue_deferred_split(old)); 4790 4795 old->memcg_data = 0; 4791 4796 } 4792 4797 ··· 4976 4975 VM_BUG_ON_FOLIO(oldid, folio); 4977 4976 mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); 4978 4977 4978 + folio_unqueue_deferred_split(folio); 4979 4979 folio->memcg_data = 0; 4980 4980 4981 4981 if (!mem_cgroup_is_root(memcg))

+2 -2

mm/migrate.c

··· 490 490 folio_test_large_rmappable(folio)) { 491 491 if (!folio_ref_freeze(folio, expected_count)) 492 492 return -EAGAIN; 493 - folio_undo_large_rmappable(folio); 493 + folio_unqueue_deferred_split(folio); 494 494 folio_ref_unfreeze(folio, expected_count); 495 495 } 496 496 ··· 515 515 } 516 516 517 517 /* Take off deferred split queue while frozen and memcg set */ 518 - folio_undo_large_rmappable(folio); 518 + folio_unqueue_deferred_split(folio); 519 519 520 520 /* 521 521 * Now we know that no one else is looking at the folio:

-1

mm/page_alloc.c

··· 2681 2681 unsigned long pfn = folio_pfn(folio); 2682 2682 unsigned int order = folio_order(folio); 2683 2683 2684 - folio_undo_large_rmappable(folio); 2685 2684 if (!free_pages_prepare(&folio->page, order)) 2686 2685 continue; 2687 2686 /*

+2 -2

mm/swap.c

··· 121 121 } 122 122 123 123 page_cache_release(folio); 124 - folio_undo_large_rmappable(folio); 124 + folio_unqueue_deferred_split(folio); 125 125 mem_cgroup_uncharge(folio); 126 126 free_unref_page(&folio->page, folio_order(folio)); 127 127 } ··· 988 988 free_huge_folio(folio); 989 989 continue; 990 990 } 991 - folio_undo_large_rmappable(folio); 991 + folio_unqueue_deferred_split(folio); 992 992 __page_cache_release(folio, &lruvec, &flags); 993 993 994 994 if (j != i)

+2 -2

mm/vmscan.c

··· 1476 1476 */ 1477 1477 nr_reclaimed += nr_pages; 1478 1478 1479 - folio_undo_large_rmappable(folio); 1479 + folio_unqueue_deferred_split(folio); 1480 1480 if (folio_batch_add(&free_folios, folio) == 0) { 1481 1481 mem_cgroup_uncharge_folios(&free_folios); 1482 1482 try_to_unmap_flush(); ··· 1864 1864 if (unlikely(folio_put_testzero(folio))) { 1865 1865 __folio_clear_lru_flags(folio); 1866 1866 1867 - folio_undo_large_rmappable(folio); 1867 + folio_unqueue_deferred_split(folio); 1868 1868 if (folio_batch_add(&free_folios, folio) == 0) { 1869 1869 spin_unlock_irq(&lruvec->lru_lock); 1870 1870 mem_cgroup_uncharge_folios(&free_folios);

Configure Feed

Configure Feed