Revert "mm, thp: consolidate THP gfp handling into alloc_hugepage_direct_gfpmask"

This reverts commit 89c83fb539f95491be80cdd5158e6f0ce329e317.

This should have been done as part of 2f0799a0ffc0 ("mm, thp: restore
node-local hugepage allocations"). The movement of the thp allocation
policy from alloc_pages_vma() to alloc_hugepage_direct_gfpmask() was
intended to only set __GFP_THISNODE for mempolicies that are not
MPOL_BIND whereas the revert could set this regardless of mempolicy.

While the check for MPOL_BIND between alloc_hugepage_direct_gfpmask()
and alloc_pages_vma() was racy, that check has been removed since the
revert. What is left is the possibility of using __GFP_THISNODE in
policy_node() when it is unexpected, because the special handling for
hugepages in alloc_pages_vma() was removed as part of the consolidation.

Secondly, prior to 89c83fb539f9, alloc_pages_vma() implemented a somewhat
different policy for hugepage allocations, which were allocated through
alloc_hugepage_vma(). For hugepage allocations, if the allocating
process's node is in the set of allowed nodes, allocate with
__GFP_THISNODE for that node (for MPOL_PREFERRED, use that node with
__GFP_THISNODE instead). In 89c83fb539f9 this was changed for
shmem_alloc_hugepage() to allow fallback to other nodes, as it was for
new_page() in mm/mempolicy.c, which is functionally different behavior
and removes the requirement to only allocate hugepages locally.

So this commit does a full revert of 89c83fb539f9 instead of the partial
revert that was done in 2f0799a0ffc0. The result is the same thp
allocation policy for 4.20 that was in 4.19.

Fixes: 89c83fb539f9 ("mm, thp: consolidate THP gfp handling into alloc_hugepage_direct_gfpmask")
Fixes: 2f0799a0ffc0 ("mm, thp: restore node-local hugepage allocations")
Signed-off-by: David Rientjes <rientjes@google.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

David Rientjes and committed by

Linus Torvalds 7 years ago 356ff8a9 5f179793

+51 -22

4 changed files

expand all

include

linux

gfp.h

huge_memory.c

mempolicy.c

shmem.c

+8 -4

include/linux/gfp.h

··· 510 510 } 511 511 extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, 512 512 struct vm_area_struct *vma, unsigned long addr, 513 - int node); 513 + int node, bool hugepage); 514 + #define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ 515 + alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true) 514 516 #else 515 517 #define alloc_pages(gfp_mask, order) \ 516 518 alloc_pages_node(numa_node_id(), gfp_mask, order) 517 - #define alloc_pages_vma(gfp_mask, order, vma, addr, node)\ 519 + #define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\ 520 + alloc_pages(gfp_mask, order) 521 + #define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ 518 522 alloc_pages(gfp_mask, order) 519 523 #endif 520 524 #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) 521 525 #define alloc_page_vma(gfp_mask, vma, addr) \ 522 - alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id()) 526 + alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false) 523 527 #define alloc_page_vma_node(gfp_mask, vma, addr, node) \ 524 - alloc_pages_vma(gfp_mask, 0, vma, addr, node) 528 + alloc_pages_vma(gfp_mask, 0, vma, addr, node, false) 525 529 526 530 extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); 527 531 extern unsigned long get_zeroed_page(gfp_t gfp_mask);

+13 -14

mm/huge_memory.c

··· 629 629 * available 630 630 * never: never stall for any thp allocation 631 631 */ 632 - static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma, unsigned long addr) 632 + static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma) 633 633 { 634 634 const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE); 635 - const gfp_t gfp_mask = GFP_TRANSHUGE_LIGHT | __GFP_THISNODE; 636 635 637 636 /* Always do synchronous compaction */ 638 637 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) 639 - return GFP_TRANSHUGE | __GFP_THISNODE | 640 - (vma_madvised ? 0 : __GFP_NORETRY); 638 + return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY); 641 639 642 640 /* Kick kcompactd and fail quickly */ 643 641 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) 644 - return gfp_mask | __GFP_KSWAPD_RECLAIM; 642 + return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM; 645 643 646 644 /* Synchronous compaction if madvised, otherwise kick kcompactd */ 647 645 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) 648 - return gfp_mask | (vma_madvised ? __GFP_DIRECT_RECLAIM : 649 - __GFP_KSWAPD_RECLAIM); 646 + return GFP_TRANSHUGE_LIGHT | 647 + (vma_madvised ? __GFP_DIRECT_RECLAIM : 648 + __GFP_KSWAPD_RECLAIM); 650 649 651 650 /* Only do synchronous compaction if madvised */ 652 651 if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) 653 - return gfp_mask | (vma_madvised ? __GFP_DIRECT_RECLAIM : 0); 652 + return GFP_TRANSHUGE_LIGHT | 653 + (vma_madvised ? __GFP_DIRECT_RECLAIM : 0); 654 654 655 - return gfp_mask; 655 + return GFP_TRANSHUGE_LIGHT; 656 656 } 657 657 658 658 /* Caller must hold page table lock. 
*/ ··· 724 724 pte_free(vma->vm_mm, pgtable); 725 725 return ret; 726 726 } 727 - gfp = alloc_hugepage_direct_gfpmask(vma, haddr); 728 - page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, vma, haddr, numa_node_id()); 727 + gfp = alloc_hugepage_direct_gfpmask(vma); 728 + page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); 729 729 if (unlikely(!page)) { 730 730 count_vm_event(THP_FAULT_FALLBACK); 731 731 return VM_FAULT_FALLBACK; ··· 1295 1295 alloc: 1296 1296 if (transparent_hugepage_enabled(vma) && 1297 1297 !transparent_hugepage_debug_cow()) { 1298 - huge_gfp = alloc_hugepage_direct_gfpmask(vma, haddr); 1299 - new_page = alloc_pages_vma(huge_gfp, HPAGE_PMD_ORDER, vma, 1300 - haddr, numa_node_id()); 1298 + huge_gfp = alloc_hugepage_direct_gfpmask(vma); 1299 + new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER); 1301 1300 } else 1302 1301 new_page = NULL; 1303 1302

+29 -3

mm/mempolicy.c

··· 1116 1116 } else if (PageTransHuge(page)) { 1117 1117 struct page *thp; 1118 1118 1119 - thp = alloc_pages_vma(GFP_TRANSHUGE, HPAGE_PMD_ORDER, vma, 1120 - address, numa_node_id()); 1119 + thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address, 1120 + HPAGE_PMD_ORDER); 1121 1121 if (!thp) 1122 1122 return NULL; 1123 1123 prep_transhuge_page(thp); ··· 2011 2011 * @vma: Pointer to VMA or NULL if not available. 2012 2012 * @addr: Virtual Address of the allocation. Must be inside the VMA. 2013 2013 * @node: Which node to prefer for allocation (modulo policy). 2014 + * @hugepage: for hugepages try only the preferred node if possible 2014 2015 * 2015 2016 * This function allocates a page from the kernel page pool and applies 2016 2017 * a NUMA policy associated with the VMA or the current process. ··· 2022 2021 */ 2023 2022 struct page * 2024 2023 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, 2025 - unsigned long addr, int node) 2024 + unsigned long addr, int node, bool hugepage) 2026 2025 { 2027 2026 struct mempolicy *pol; 2028 2027 struct page *page; ··· 2038 2037 mpol_cond_put(pol); 2039 2038 page = alloc_page_interleave(gfp, order, nid); 2040 2039 goto out; 2040 + } 2041 + 2042 + if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) { 2043 + int hpage_node = node; 2044 + 2045 + /* 2046 + * For hugepage allocation and non-interleave policy which 2047 + * allows the current node (or other explicitly preferred 2048 + * node) we only try to allocate from the current/preferred 2049 + * node and don't fall back to other nodes, as the cost of 2050 + * remote accesses would likely offset THP benefits. 2051 + * 2052 + * If the policy is interleave, or does not allow the current 2053 + * node in its nodemask, we allocate the standard way. 
2054 + */ 2055 + if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL)) 2056 + hpage_node = pol->v.preferred_node; 2057 + 2058 + nmask = policy_nodemask(gfp, pol); 2059 + if (!nmask || node_isset(hpage_node, *nmask)) { 2060 + mpol_cond_put(pol); 2061 + page = __alloc_pages_node(hpage_node, 2062 + gfp | __GFP_THISNODE, order); 2063 + goto out; 2064 + } 2041 2065 } 2042 2066 2043 2067 nmask = policy_nodemask(gfp, pol);

+1 -1

mm/shmem.c

··· 1439 1439 1440 1440 shmem_pseudo_vma_init(&pvma, info, hindex); 1441 1441 page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN, 1442 - HPAGE_PMD_ORDER, &pvma, 0, numa_node_id()); 1442 + HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true); 1443 1443 shmem_pseudo_vma_destroy(&pvma); 1444 1444 if (page) 1445 1445 prep_transhuge_page(page);

Configure Feed

Configure Feed