mm/hugetlb_vmemmap: batch HVO work when demoting

Batch the HVO work, including de-HVO of the source and HVO of the
destination hugeTLB folios, to speed up demotion.

After commit bd225530a4c7 ("mm/hugetlb_vmemmap: fix race with speculative
PFN walkers"), each request of HVO or de-HVO, batched or not, invokes
synchronize_rcu() once. For example, when not batched, demoting one 1GB
hugeTLB folio to 512 2MB hugeTLB folios invokes synchronize_rcu() 513
times (1 de-HVO plus 512 HVO requests), whereas when batched, only twice
(1 de-HVO plus 1 HVO request). And the performance difference between the
two cases is significant, e.g.,

echo 2048kB >/sys/kernel/mm/hugepages/hugepages-1048576kB/demote_size
time echo 100 >/sys/kernel/mm/hugepages/hugepages-1048576kB/demote

Before this patch:
real 8m58.158s
user 0m0.009s
sys 0m5.900s

After this patch:
real 0m0.900s
user 0m0.000s
sys 0m0.851s

Note that this patch changes the behavior of the `demote` interface when
de-HVO fails. Before, the interface aborted immediately upon failure; now,
it tries to finish the entire batch, meaning it can make extra progress if
the rest of the batch contains folios that do not need de-HVO.

Link: https://lkml.kernel.org/r/20240812224823.3914837-1-yuzhao@google.com
Fixes: bd225530a4c7 ("mm/hugetlb_vmemmap: fix race with speculative PFN walkers")
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reviewed-by: Muchun Song <muchun.song@linux.dev>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Yu Zhao and committed by

Andrew Morton 2 years ago c0f398c3 67b9a353

+107 -79

1 changed file

expand all

hugetlb.c

+107 -79

mm/hugetlb.c

··· 3921 3921 return 0; 3922 3922 } 3923 3923 3924 - static int demote_free_hugetlb_folio(struct hstate *h, struct folio *folio) 3924 + static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst, 3925 + struct list_head *src_list) 3925 3926 { 3926 - int i, nid = folio_nid(folio); 3927 - struct hstate *target_hstate; 3928 - struct page *subpage; 3929 - struct folio *inner_folio; 3930 - int rc = 0; 3927 + long rc; 3928 + struct folio *folio, *next; 3929 + LIST_HEAD(dst_list); 3930 + LIST_HEAD(ret_list); 3931 3931 3932 - target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order); 3933 - 3934 - remove_hugetlb_folio(h, folio, false); 3935 - spin_unlock_irq(&hugetlb_lock); 3936 - 3937 - /* 3938 - * If vmemmap already existed for folio, the remove routine above would 3939 - * have cleared the hugetlb folio flag. Hence the folio is technically 3940 - * no longer a hugetlb folio. hugetlb_vmemmap_restore_folio can only be 3941 - * passed hugetlb folios and will BUG otherwise. 3942 - */ 3943 - if (folio_test_hugetlb(folio)) { 3944 - rc = hugetlb_vmemmap_restore_folio(h, folio); 3945 - if (rc) { 3946 - /* Allocation of vmemmmap failed, we can not demote folio */ 3947 - spin_lock_irq(&hugetlb_lock); 3948 - add_hugetlb_folio(h, folio, false); 3949 - return rc; 3950 - } 3951 - } 3952 - 3953 - /* 3954 - * Use destroy_compound_hugetlb_folio_for_demote for all huge page 3955 - * sizes as it will not ref count folios. 3956 - */ 3957 - destroy_compound_hugetlb_folio_for_demote(folio, huge_page_order(h)); 3932 + rc = hugetlb_vmemmap_restore_folios(src, src_list, &ret_list); 3933 + list_splice_init(&ret_list, src_list); 3958 3934 3959 3935 /* 3960 3936 * Taking target hstate mutex synchronizes with set_max_huge_pages. 3961 3937 * Without the mutex, pages added to target hstate could be marked 3962 3938 * as surplus. 3963 3939 * 3964 - * Note that we already hold h->resize_lock. To prevent deadlock, 3940 + * Note that we already hold src->resize_lock. 
To prevent deadlock, 3965 3941 * use the convention of always taking larger size hstate mutex first. 3966 3942 */ 3967 - mutex_lock(&target_hstate->resize_lock); 3968 - for (i = 0; i < pages_per_huge_page(h); 3969 - i += pages_per_huge_page(target_hstate)) { 3970 - subpage = folio_page(folio, i); 3971 - inner_folio = page_folio(subpage); 3972 - if (hstate_is_gigantic(target_hstate)) 3973 - prep_compound_gigantic_folio_for_demote(inner_folio, 3974 - target_hstate->order); 3975 - else 3976 - prep_compound_page(subpage, target_hstate->order); 3977 - folio_change_private(inner_folio, NULL); 3978 - prep_new_hugetlb_folio(target_hstate, inner_folio, nid); 3979 - free_huge_folio(inner_folio); 3980 - } 3981 - mutex_unlock(&target_hstate->resize_lock); 3943 + mutex_lock(&dst->resize_lock); 3982 3944 3983 - spin_lock_irq(&hugetlb_lock); 3945 + list_for_each_entry_safe(folio, next, src_list, lru) { 3946 + int i; 3947 + 3948 + if (folio_test_hugetlb_vmemmap_optimized(folio)) 3949 + continue; 3950 + 3951 + list_del(&folio->lru); 3952 + /* 3953 + * Use destroy_compound_hugetlb_folio_for_demote for all huge page 3954 + * sizes as it will not ref count folios. 
3955 + */ 3956 + destroy_compound_hugetlb_folio_for_demote(folio, huge_page_order(src)); 3957 + 3958 + for (i = 0; i < pages_per_huge_page(src); i += pages_per_huge_page(dst)) { 3959 + struct page *page = folio_page(folio, i); 3960 + 3961 + if (hstate_is_gigantic(dst)) 3962 + prep_compound_gigantic_folio_for_demote(page_folio(page), 3963 + dst->order); 3964 + else 3965 + prep_compound_page(page, dst->order); 3966 + set_page_private(page, 0); 3967 + 3968 + init_new_hugetlb_folio(dst, page_folio(page)); 3969 + list_add(&page->lru, &dst_list); 3970 + } 3971 + } 3972 + 3973 + prep_and_add_allocated_folios(dst, &dst_list); 3974 + 3975 + mutex_unlock(&dst->resize_lock); 3976 + 3977 + return rc; 3978 + } 3979 + 3980 + static long demote_pool_huge_page(struct hstate *src, nodemask_t *nodes_allowed, 3981 + unsigned long nr_to_demote) 3982 + __must_hold(&hugetlb_lock) 3983 + { 3984 + int nr_nodes, node; 3985 + struct hstate *dst; 3986 + long rc = 0; 3987 + long nr_demoted = 0; 3988 + 3989 + lockdep_assert_held(&hugetlb_lock); 3990 + 3991 + /* We should never get here if no demote order */ 3992 + if (!src->demote_order) { 3993 + pr_warn("HugeTLB: NULL demote order passed to demote_pool_huge_page.\n"); 3994 + return -EINVAL; /* internal error */ 3995 + } 3996 + dst = size_to_hstate(PAGE_SIZE << src->demote_order); 3997 + 3998 + for_each_node_mask_to_free(src, nr_nodes, node, nodes_allowed) { 3999 + LIST_HEAD(list); 4000 + struct folio *folio, *next; 4001 + 4002 + list_for_each_entry_safe(folio, next, &src->hugepage_freelists[node], lru) { 4003 + if (folio_test_hwpoison(folio)) 4004 + continue; 4005 + 4006 + remove_hugetlb_folio(src, folio, false); 4007 + list_add(&folio->lru, &list); 4008 + 4009 + if (++nr_demoted == nr_to_demote) 4010 + break; 4011 + } 4012 + 4013 + spin_unlock_irq(&hugetlb_lock); 4014 + 4015 + rc = demote_free_hugetlb_folios(src, dst, &list); 4016 + 4017 + spin_lock_irq(&hugetlb_lock); 4018 + 4019 + list_for_each_entry_safe(folio, next, &list, lru) { 4020 + 
list_del(&folio->lru); 4021 + add_hugetlb_folio(src, folio, false); 4022 + 4023 + nr_demoted--; 4024 + } 4025 + 4026 + if (rc < 0 || nr_demoted == nr_to_demote) 4027 + break; 4028 + } 3984 4029 3985 4030 /* 3986 4031 * Not absolutely necessary, but for consistency update max_huge_pages 3987 4032 * based on pool changes for the demoted page. 3988 4033 */ 3989 - h->max_huge_pages--; 3990 - target_hstate->max_huge_pages += 3991 - pages_per_huge_page(h) / pages_per_huge_page(target_hstate); 4034 + src->max_huge_pages -= nr_demoted; 4035 + dst->max_huge_pages += nr_demoted << (huge_page_order(src) - huge_page_order(dst)); 3992 4036 3993 - return rc; 3994 - } 4037 + if (rc < 0) 4038 + return rc; 3995 4039 3996 - static int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed) 3997 - __must_hold(&hugetlb_lock) 3998 - { 3999 - int nr_nodes, node; 4000 - struct folio *folio; 4001 - 4002 - lockdep_assert_held(&hugetlb_lock); 4003 - 4004 - /* We should never get here if no demote order */ 4005 - if (!h->demote_order) { 4006 - pr_warn("HugeTLB: NULL demote order passed to demote_pool_huge_page.\n"); 4007 - return -EINVAL; /* internal error */ 4008 - } 4009 - 4010 - for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { 4011 - list_for_each_entry(folio, &h->hugepage_freelists[node], lru) { 4012 - if (folio_test_hwpoison(folio)) 4013 - continue; 4014 - return demote_free_hugetlb_folio(h, folio); 4015 - } 4016 - } 4017 - 4040 + if (nr_demoted) 4041 + return nr_demoted; 4018 4042 /* 4019 4043 * Only way to get here is if all pages on free lists are poisoned. 4020 4044 * Return -EBUSY so that caller will not retry. ··· 4273 4249 spin_lock_irq(&hugetlb_lock); 4274 4250 4275 4251 while (nr_demote) { 4252 + long rc; 4253 + 4276 4254 /* 4277 4255 * Check for available pages to demote each time thorough the 4278 4256 * loop as demote_pool_huge_page will drop hugetlb_lock. 
··· 4287 4261 if (!nr_available) 4288 4262 break; 4289 4263 4290 - err = demote_pool_huge_page(h, n_mask); 4291 - if (err) 4264 + rc = demote_pool_huge_page(h, n_mask, nr_demote); 4265 + if (rc < 0) { 4266 + err = rc; 4292 4267 break; 4268 + } 4293 4269 4294 - nr_demote--; 4270 + nr_demote -= rc; 4295 4271 } 4296 4272 4297 4273 spin_unlock_irq(&hugetlb_lock);

Configure Feed

Configure Feed