Merge tag 'mm-hotfixes-stable-2022-12-02' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

tjh.dev / kernel

fork

Configure Feed

Issues Pull Requests Commits Tags

Feed URL

Select the types of activity you want to include in your feed.

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

fork

Configure Feed

Issues Pull Requests Commits Tags

Feed URL

Select the types of activity you want to include in your feed.

Merge tag 'mm-hotfixes-stable-2022-12-02' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Pull misc hotfixes from Andrew Morton:
"15 hotfixes, 11 marked cc:stable.

Only three or four of the latter address post-6.0 issues, which is
hopefully a sign that things are converging"

* tag 'mm-hotfixes-stable-2022-12-02' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm:
revert "kbuild: fix -Wimplicit-function-declaration in license_is_gpl_compatible"
Kconfig.debug: provide a little extra FRAME_WARN leeway when KASAN is enabled
drm/amdgpu: temporarily disable broken Clang builds due to blown stack-frame
mm/khugepaged: invoke MMU notifiers in shmem/file collapse paths
mm/khugepaged: fix GUP-fast interaction by sending IPI
mm/khugepaged: take the right locks for page table retraction
mm: migrate: fix THP's mapcount on isolation
mm: introduce arch_has_hw_nonleaf_pmd_young()
mm: add dummy pmd_young() for architectures not having it
mm/damon/sysfs: fix wrong empty schemes assumption under online tuning in damon_sysfs_set_schemes()
tools/vm/slabinfo-gnuplot: use "grep -E" instead of "egrep"
nilfs2: fix NULL pointer dereference in nilfs_palloc_commit_free_entry()
hugetlb: don't delete vma_lock in hugetlb MADV_DONTNEED processing
madvise: use zap_page_range_single for madvise dontneed
mm: replace VM_WARN_ON to pr_warn if the node is offline with __GFP_THISNODE

Linus Torvalds 3 years ago bdaa78c6 6647e76a

+245 -71

23 changed files

expand all collapse all

arch

loongarch

include

asm

pgtable.h

mips

include

asm

pgtable.h

riscv

include

asm

pgtable.h

s390

include

asm

pgtable.h

sparc

include

asm

pgtable_64.h

x86

include

asm

pgtable.h

drivers

gpu

drm

amd

display

Kconfig

nilfs2

dat.c

include

asm-generic

tlb.h

linux

gfp.h

license.h

mm.h

pgtable.h

lib

Kconfig.debug

compaction.c

damon

sysfs.c

hugetlb.c

khugepaged.c

madvise.c

memory.c

mmu_gather.c

vmscan.c

tools

slabinfo-gnuplot.sh

arch/loongarch/include/asm/pgtable.h

reviewed

··· 490 490 return pmd; 491 491 } 492 492 493 493 + #define pmd_young pmd_young 493 494 static inline int pmd_young(pmd_t pmd) 494 495 { 495 496 return !!(pmd_val(pmd) & _PAGE_ACCESSED);

arch/mips/include/asm/pgtable.h

reviewed

··· 622 622 return pmd; 623 623 } 624 624 625 625 + #define pmd_young pmd_young 625 626 static inline int pmd_young(pmd_t pmd) 626 627 { 627 628 return !!(pmd_val(pmd) & _PAGE_ACCESSED);

arch/riscv/include/asm/pgtable.h

reviewed

··· 600 600 return pte_dirty(pmd_pte(pmd)); 601 601 } 602 602 603 603 + #define pmd_young pmd_young 603 604 static inline int pmd_young(pmd_t pmd) 604 605 { 605 606 return pte_young(pmd_pte(pmd));

arch/s390/include/asm/pgtable.h

reviewed

··· 763 763 return (pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY) != 0; 764 764 } 765 765 766 766 + #define pmd_young pmd_young 766 767 static inline int pmd_young(pmd_t pmd) 767 768 { 768 769 return (pmd_val(pmd) & _SEGMENT_ENTRY_YOUNG) != 0;

arch/sparc/include/asm/pgtable_64.h

reviewed

··· 693 693 return pte_dirty(pte); 694 694 } 695 695 696 696 + #define pmd_young pmd_young 696 697 static inline unsigned long pmd_young(pmd_t pmd) 697 698 { 698 699 pte_t pte = __pte(pmd_val(pmd));

arch/x86/include/asm/pgtable.h

reviewed

··· 139 139 return pmd_flags(pmd) & _PAGE_DIRTY; 140 140 } 141 141 142 142 + #define pmd_young pmd_young 142 143 static inline int pmd_young(pmd_t pmd) 143 144 { 144 145 return pmd_flags(pmd) & _PAGE_ACCESSED; ··· 1438 1437 { 1439 1438 return true; 1440 1439 } 1440 1440 + 1441 1441 + #ifdef CONFIG_XEN_PV 1442 1442 + #define arch_has_hw_nonleaf_pmd_young arch_has_hw_nonleaf_pmd_young 1443 1443 + static inline bool arch_has_hw_nonleaf_pmd_young(void) 1444 1444 + { 1445 1445 + return !cpu_feature_enabled(X86_FEATURE_XENPV); 1446 1446 + } 1447 1447 + #endif 1441 1448 1442 1449 #ifdef CONFIG_PAGE_TABLE_CHECK 1443 1450 static inline bool pte_user_accessible_page(pte_t pte)

drivers/gpu/drm/amd/display/Kconfig

reviewed

··· 5 5 config DRM_AMD_DC 6 6 bool "AMD DC - Enable new display engine" 7 7 default y 8 8 + depends on BROKEN || !CC_IS_CLANG || X86_64 || SPARC64 || ARM64 8 9 select SND_HDA_COMPONENT if SND_HDA_CORE 9 10 select DRM_AMD_DC_DCN if (X86 || PPC_LONG_DOUBLE_128) 10 11 help 11 12 Choose this option if you want to use the new display engine 12 13 support for AMDGPU. This adds required support for Vega and 13 14 Raven ASICs. 15 15 + 16 16 + calculate_bandwidth() is presently broken on all !(X86_64 || SPARC64 || ARM64) 17 17 + architectures built with Clang (all released versions), whereby the stack 18 18 + frame gets blown up to well over 5k. This would cause an immediate kernel 19 19 + panic on most architectures. We'll revert this when the following bug report 20 20 + has been resolved: https://github.com/llvm/llvm-project/issues/41896. 14 21 15 22 config DRM_AMD_DC_DCN 16 23 def_bool n

fs/nilfs2/dat.c

reviewed

··· 111 111 kunmap_atomic(kaddr); 112 112 113 113 nilfs_dat_commit_entry(dat, req); 114 114 + 115 115 + if (unlikely(req->pr_desc_bh == NULL || req->pr_bitmap_bh == NULL)) { 116 116 + nilfs_error(dat->i_sb, 117 117 + "state inconsistency probably due to duplicate use of vblocknr = %llu", 118 118 + (unsigned long long)req->pr_entry_nr); 119 119 + return; 120 120 + } 114 121 nilfs_palloc_commit_free_entry(dat, req); 115 122 } 116 123

include/asm-generic/tlb.h

reviewed

··· 222 222 #define tlb_needs_table_invalidate() (true) 223 223 #endif 224 224 225 225 + void tlb_remove_table_sync_one(void); 226 226 + 225 227 #else 226 228 227 229 #ifdef tlb_needs_table_invalidate 228 230 #error tlb_needs_table_invalidate() requires MMU_GATHER_RCU_TABLE_FREE 229 231 #endif 232 232 + 233 233 + static inline void tlb_remove_table_sync_one(void) { } 230 234 231 235 #endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */ 232 236

+16 -2

include/linux/gfp.h

reviewed

··· 210 210 return __alloc_pages_bulk(gfp, nid, NULL, nr_pages, NULL, page_array); 211 211 } 212 212 213 213 + static inline void warn_if_node_offline(int this_node, gfp_t gfp_mask) 214 214 + { 215 215 + gfp_t warn_gfp = gfp_mask & (__GFP_THISNODE|__GFP_NOWARN); 216 216 + 217 217 + if (warn_gfp != (__GFP_THISNODE|__GFP_NOWARN)) 218 218 + return; 219 219 + 220 220 + if (node_online(this_node)) 221 221 + return; 222 222 + 223 223 + pr_warn("%pGg allocation from offline node %d\n", &gfp_mask, this_node); 224 224 + dump_stack(); 225 225 + } 226 226 + 213 227 /* 214 228 * Allocate pages, preferring the node given as nid. The node must be valid and 215 229 * online. For more general interface, see alloc_pages_node(). ··· 232 218 __alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order) 233 219 { 234 220 VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); 235 235 - VM_WARN_ON((gfp_mask & __GFP_THISNODE) && !node_online(nid)); 221 221 + warn_if_node_offline(nid, gfp_mask); 236 222 237 223 return __alloc_pages(gfp_mask, order, nid, NULL); 238 224 } ··· 241 227 struct folio *__folio_alloc_node(gfp_t gfp, unsigned int order, int nid) 242 228 { 243 229 VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); 244 244 - VM_WARN_ON((gfp & __GFP_THISNODE) && !node_online(nid)); 230 230 + warn_if_node_offline(nid, gfp); 245 231 246 232 return __folio_alloc(gfp, order, nid, NULL); 247 233 }

-2

include/linux/license.h

reviewed

··· 2 2 #ifndef __LICENSE_H 3 3 #define __LICENSE_H 4 4 5 5 - #include <linux/string.h> 6 6 - 7 5 static inline int license_is_gpl_compatible(const char *license) 8 6 { 9 7 return (strcmp(license, "GPL") == 0

+21 -8

include/linux/mm.h

reviewed

··· 1852 1852 __show_free_areas(flags, nodemask, MAX_NR_ZONES - 1); 1853 1853 } 1854 1854 1855 1855 + /* 1856 1856 + * Parameter block passed down to zap_pte_range in exceptional cases. 1857 1857 + */ 1858 1858 + struct zap_details { 1859 1859 + struct folio *single_folio; /* Locked folio to be unmapped */ 1860 1860 + bool even_cows; /* Zap COWed private pages too? */ 1861 1861 + zap_flags_t zap_flags; /* Extra flags for zapping */ 1862 1862 + }; 1863 1863 + 1864 1864 + /* 1865 1865 + * Whether to drop the pte markers, for example, the uffd-wp information for 1866 1866 + * file-backed memory. This should only be specified when we will completely 1867 1867 + * drop the page in the mm, either by truncation or unmapping of the vma. By 1868 1868 + * default, the flag is not set. 1869 1869 + */ 1870 1870 + #define ZAP_FLAG_DROP_MARKER ((__force zap_flags_t) BIT(0)) 1871 1871 + /* Set in unmap_vmas() to indicate a final unmap call. Only used by hugetlb */ 1872 1872 + #define ZAP_FLAG_UNMAP ((__force zap_flags_t) BIT(1)) 1873 1873 + 1855 1874 #ifdef CONFIG_MMU 1856 1875 extern bool can_do_mlock(void); 1857 1876 #else ··· 1888 1869 unsigned long size); 1889 1870 void zap_page_range(struct vm_area_struct *vma, unsigned long address, 1890 1871 unsigned long size); 1872 1872 + void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, 1873 1873 + unsigned long size, struct zap_details *details); 1891 1874 void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, 1892 1875 struct vm_area_struct *start_vma, unsigned long start, 1893 1876 unsigned long end); ··· 3487 3466 return 0; 3488 3467 } 3489 3468 #endif 3490 3490 - 3491 3491 - /* 3492 3492 - * Whether to drop the pte markers, for example, the uffd-wp information for 3493 3493 - * file-backed memory. This should only be specified when we will completely 3494 3494 - * drop the page in the mm, either by truncation or unmapping of the vma. By 3495 3495 - * default, the flag is not set. 
3496 3496 - */ 3497 3497 - #define ZAP_FLAG_DROP_MARKER ((__force zap_flags_t) BIT(0)) 3498 3469 3499 3470 #endif /* _LINUX_MM_H */

+18

include/linux/pgtable.h

reviewed

··· 165 165 return pmd_none(*pmd) ? NULL : pte_offset_kernel(pmd, vaddr); 166 166 } 167 167 168 168 + #ifndef pmd_young 169 169 + static inline int pmd_young(pmd_t pmd) 170 170 + { 171 171 + return 0; 172 172 + } 173 173 + #endif 174 174 + 168 175 #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS 169 176 extern int ptep_set_access_flags(struct vm_area_struct *vma, 170 177 unsigned long address, pte_t *ptep, ··· 265 258 return 0; 266 259 } 267 260 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 261 261 + #endif 262 262 + 263 263 + #ifndef arch_has_hw_nonleaf_pmd_young 264 264 + /* 265 265 + * Return whether the accessed bit in non-leaf PMD entries is supported on the 266 266 + * local CPU. 267 267 + */ 268 268 + static inline bool arch_has_hw_nonleaf_pmd_young(void) 269 269 + { 270 270 + return IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG); 271 271 + } 268 272 #endif 269 273 270 274 #ifndef arch_has_hw_pte_young

lib/Kconfig.debug

reviewed

··· 399 399 default 2048 if GCC_PLUGIN_LATENT_ENTROPY 400 400 default 2048 if PARISC 401 401 default 1536 if (!64BIT && XTENSA) 402 402 + default 1280 if KASAN && !64BIT 402 403 default 1024 if !64BIT 403 404 default 2048 if 64BIT 404 405 help

+16 -16

mm/compaction.c

reviewed

··· 985 985 } 986 986 987 987 /* 988 988 - * Migration will fail if an anonymous page is pinned in memory, 989 989 - * so avoid taking lru_lock and isolating it unnecessarily in an 990 990 - * admittedly racy check. 991 991 - */ 992 992 - mapping = page_mapping(page); 993 993 - if (!mapping && page_count(page) > page_mapcount(page)) 994 994 - goto isolate_fail; 995 995 - 996 996 - /* 997 997 - * Only allow to migrate anonymous pages in GFP_NOFS context 998 998 - * because those do not depend on fs locks. 999 999 - */ 1000 1000 - if (!(cc->gfp_mask & __GFP_FS) && mapping) 1001 1001 - goto isolate_fail; 1002 1002 - 1003 1003 - /* 1004 988 * Be careful not to clear PageLRU until after we're 1005 989 * sure the page is not being freed elsewhere -- the 1006 990 * page release code relies on it. 1007 991 */ 1008 992 if (unlikely(!get_page_unless_zero(page))) 1009 993 goto isolate_fail; 994 994 + 995 995 + /* 996 996 + * Migration will fail if an anonymous page is pinned in memory, 997 997 + * so avoid taking lru_lock and isolating it unnecessarily in an 998 998 + * admittedly racy check. 999 999 + */ 1000 1000 + mapping = page_mapping(page); 1001 1001 + if (!mapping && (page_count(page) - 1) > total_mapcount(page)) 1002 1002 + goto isolate_fail_put; 1003 1003 + 1004 1004 + /* 1005 1005 + * Only allow to migrate anonymous pages in GFP_NOFS context 1006 1006 + * because those do not depend on fs locks. 1007 1007 + */ 1008 1008 + if (!(cc->gfp_mask & __GFP_FS) && mapping) 1009 1009 + goto isolate_fail_put; 1010 1010 1011 1011 /* Only take pages on LRU: a check now makes later tests safe */ 1012 1012 if (!PageLRU(page))

+44 -2

mm/damon/sysfs.c

reviewed

··· 2283 2283 &wmarks); 2284 2284 } 2285 2285 2286 2286 + static void damon_sysfs_update_scheme(struct damos *scheme, 2287 2287 + struct damon_sysfs_scheme *sysfs_scheme) 2288 2288 + { 2289 2289 + struct damon_sysfs_access_pattern *access_pattern = 2290 2290 + sysfs_scheme->access_pattern; 2291 2291 + struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas; 2292 2292 + struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights; 2293 2293 + struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks; 2294 2294 + 2295 2295 + scheme->pattern.min_sz_region = access_pattern->sz->min; 2296 2296 + scheme->pattern.max_sz_region = access_pattern->sz->max; 2297 2297 + scheme->pattern.min_nr_accesses = access_pattern->nr_accesses->min; 2298 2298 + scheme->pattern.max_nr_accesses = access_pattern->nr_accesses->max; 2299 2299 + scheme->pattern.min_age_region = access_pattern->age->min; 2300 2300 + scheme->pattern.max_age_region = access_pattern->age->max; 2301 2301 + 2302 2302 + scheme->action = sysfs_scheme->action; 2303 2303 + 2304 2304 + scheme->quota.ms = sysfs_quotas->ms; 2305 2305 + scheme->quota.sz = sysfs_quotas->sz; 2306 2306 + scheme->quota.reset_interval = sysfs_quotas->reset_interval_ms; 2307 2307 + scheme->quota.weight_sz = sysfs_weights->sz; 2308 2308 + scheme->quota.weight_nr_accesses = sysfs_weights->nr_accesses; 2309 2309 + scheme->quota.weight_age = sysfs_weights->age; 2310 2310 + 2311 2311 + scheme->wmarks.metric = sysfs_wmarks->metric; 2312 2312 + scheme->wmarks.interval = sysfs_wmarks->interval_us; 2313 2313 + scheme->wmarks.high = sysfs_wmarks->high; 2314 2314 + scheme->wmarks.mid = sysfs_wmarks->mid; 2315 2315 + scheme->wmarks.low = sysfs_wmarks->low; 2316 2316 + } 2317 2317 + 2286 2318 static int damon_sysfs_set_schemes(struct damon_ctx *ctx, 2287 2319 struct damon_sysfs_schemes *sysfs_schemes) 2288 2320 { 2289 2289 - int i; 2321 2321 + struct damos *scheme, *next; 2322 2322 + int i = 0; 2290 2323 2291 2291 - for (i = 0; i < 
sysfs_schemes->nr; i++) { 2324 2324 + damon_for_each_scheme_safe(scheme, next, ctx) { 2325 2325 + if (i < sysfs_schemes->nr) 2326 2326 + damon_sysfs_update_scheme(scheme, 2327 2327 + sysfs_schemes->schemes_arr[i]); 2328 2328 + else 2329 2329 + damon_destroy_scheme(scheme); 2330 2330 + i++; 2331 2331 + } 2332 2332 + 2333 2333 + for (; i < sysfs_schemes->nr; i++) { 2292 2334 struct damos *scheme, *next; 2293 2335 2294 2336 scheme = damon_sysfs_mk_scheme(sysfs_schemes->schemes_arr[i]);

+16 -11

mm/hugetlb.c

reviewed

··· 5206 5206 5207 5207 __unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags); 5208 5208 5209 5209 - /* 5210 5210 - * Unlock and free the vma lock before releasing i_mmap_rwsem. When 5211 5211 - * the vma_lock is freed, this makes the vma ineligible for pmd 5212 5212 - * sharing. And, i_mmap_rwsem is required to set up pmd sharing. 5213 5213 - * This is important as page tables for this unmapped range will 5214 5214 - * be asynchronously deleted. If the page tables are shared, there 5215 5215 - * will be issues when accessed by someone else. 5216 5216 - */ 5217 5217 - __hugetlb_vma_unlock_write_free(vma); 5218 5218 - 5219 5219 - i_mmap_unlock_write(vma->vm_file->f_mapping); 5209 5209 + if (zap_flags & ZAP_FLAG_UNMAP) { /* final unmap */ 5210 5210 + /* 5211 5211 + * Unlock and free the vma lock before releasing i_mmap_rwsem. 5212 5212 + * When the vma_lock is freed, this makes the vma ineligible 5213 5213 + * for pmd sharing. And, i_mmap_rwsem is required to set up 5214 5214 + * pmd sharing. This is important as page tables for this 5215 5215 + * unmapped range will be asynchronously deleted. If the page 5216 5216 + * tables are shared, there will be issues when accessed by 5217 5217 + * someone else. 5218 5218 + */ 5219 5219 + __hugetlb_vma_unlock_write_free(vma); 5220 5220 + i_mmap_unlock_write(vma->vm_file->f_mapping); 5221 5221 + } else { 5222 5222 + i_mmap_unlock_write(vma->vm_file->f_mapping); 5223 5223 + hugetlb_vma_unlock_write(vma); 5224 5224 + } 5220 5225 } 5221 5226 5222 5227 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,

+58 -4

mm/khugepaged.c

reviewed

··· 1051 1051 _pmd = pmdp_collapse_flush(vma, address, pmd); 1052 1052 spin_unlock(pmd_ptl); 1053 1053 mmu_notifier_invalidate_range_end(&range); 1054 1054 + tlb_remove_table_sync_one(); 1054 1055 1055 1056 spin_lock(pte_ptl); 1056 1057 result = __collapse_huge_page_isolate(vma, address, pte, cc, ··· 1380 1379 return SCAN_SUCCEED; 1381 1380 } 1382 1381 1382 1382 + /* 1383 1383 + * A note about locking: 1384 1384 + * Trying to take the page table spinlocks would be useless here because those 1385 1385 + * are only used to synchronize: 1386 1386 + * 1387 1387 + * - modifying terminal entries (ones that point to a data page, not to another 1388 1388 + * page table) 1389 1389 + * - installing *new* non-terminal entries 1390 1390 + * 1391 1391 + * Instead, we need roughly the same kind of protection as free_pgtables() or 1392 1392 + * mm_take_all_locks() (but only for a single VMA): 1393 1393 + * The mmap lock together with this VMA's rmap locks covers all paths towards 1394 1394 + * the page table entries we're messing with here, except for hardware page 1395 1395 + * table walks and lockless_pages_from_mm(). 1396 1396 + */ 1383 1397 static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *vma, 1384 1398 unsigned long addr, pmd_t *pmdp) 1385 1399 { 1386 1386 - spinlock_t *ptl; 1387 1400 pmd_t pmd; 1401 1401 + struct mmu_notifier_range range; 1388 1402 1389 1403 mmap_assert_write_locked(mm); 1390 1390 - ptl = pmd_lock(vma->vm_mm, pmdp); 1404 1404 + if (vma->vm_file) 1405 1405 + lockdep_assert_held_write(&vma->vm_file->f_mapping->i_mmap_rwsem); 1406 1406 + /* 1407 1407 + * All anon_vmas attached to the VMA have the same root and are 1408 1408 + * therefore locked by the same lock. 
1409 1409 + */ 1410 1410 + if (vma->anon_vma) 1411 1411 + lockdep_assert_held_write(&vma->anon_vma->root->rwsem); 1412 1412 + 1413 1413 + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm, addr, 1414 1414 + addr + HPAGE_PMD_SIZE); 1415 1415 + mmu_notifier_invalidate_range_start(&range); 1391 1416 pmd = pmdp_collapse_flush(vma, addr, pmdp); 1392 1392 - spin_unlock(ptl); 1417 1417 + tlb_remove_table_sync_one(); 1418 1418 + mmu_notifier_invalidate_range_end(&range); 1393 1419 mm_dec_nr_ptes(mm); 1394 1420 page_table_check_pte_clear_range(mm, addr, pmd); 1395 1421 pte_free(mm, pmd_pgtable(pmd)); ··· 1467 1439 if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false)) 1468 1440 return SCAN_VMA_CHECK; 1469 1441 1442 1442 + /* 1443 1443 + * Symmetry with retract_page_tables(): Exclude MAP_PRIVATE mappings 1444 1444 + * that got written to. Without this, we'd have to also lock the 1445 1445 + * anon_vma if one exists. 1446 1446 + */ 1447 1447 + if (vma->anon_vma) 1448 1448 + return SCAN_VMA_CHECK; 1449 1449 + 1470 1450 /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ 1471 1451 if (userfaultfd_wp(vma)) 1472 1452 return SCAN_PTE_UFFD_WP; ··· 1508 1472 goto drop_hpage; 1509 1473 } 1510 1474 1475 1475 + /* 1476 1476 + * We need to lock the mapping so that from here on, only GUP-fast and 1477 1477 + * hardware page walks can access the parts of the page tables that 1478 1478 + * we're operating on. 1479 1479 + * See collapse_and_free_pmd(). 1480 1480 + */ 1481 1481 + i_mmap_lock_write(vma->vm_file->f_mapping); 1482 1482 + 1483 1483 + /* 1484 1484 + * This spinlock should be unnecessary: Nobody else should be accessing 1485 1485 + * the page tables under spinlock protection here, only 1486 1486 + * lockless_pages_from_mm() and the hardware page walker can access page 1487 1487 + * tables while all the high-level locks are held in write mode. 
1488 1488 + */ 1511 1489 start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); 1512 1490 result = SCAN_FAIL; 1513 1491 ··· 1576 1526 /* step 4: remove pte entries */ 1577 1527 collapse_and_free_pmd(mm, vma, haddr, pmd); 1578 1528 1529 1529 + i_mmap_unlock_write(vma->vm_file->f_mapping); 1530 1530 + 1579 1531 maybe_install_pmd: 1580 1532 /* step 5: install pmd entry */ 1581 1533 result = install_pmd ··· 1591 1539 1592 1540 abort: 1593 1541 pte_unmap_unlock(start_pte, ptl); 1542 1542 + i_mmap_unlock_write(vma->vm_file->f_mapping); 1594 1543 goto drop_hpage; 1595 1544 } 1596 1545 ··· 1648 1595 * An alternative would be drop the check, but check that page 1649 1596 * table is clear before calling pmdp_collapse_flush() under 1650 1597 * ptl. It has higher chance to recover THP for the VMA, but 1651 1651 - * has higher cost too. 1598 1598 + * has higher cost too. It would also probably require locking 1599 1599 + * the anon_vma. 1652 1600 */ 1653 1601 if (vma->anon_vma) { 1654 1602 result = SCAN_PAGE_ANON;

+3 -3

mm/madvise.c

reviewed

··· 772 772 * Application no longer needs these pages. If the pages are dirty, 773 773 * it's OK to just throw them away. The app will be more careful about 774 774 * data it wants to keep. Be sure to free swap resources too. The 775 775 - * zap_page_range call sets things up for shrink_active_list to actually free 776 776 - * these pages later if no one else has touched them in the meantime, 775 775 + * zap_page_range_single call sets things up for shrink_active_list to actually 776 776 + * free these pages later if no one else has touched them in the meantime, 777 777 * although we could add these pages to a global reuse list for 778 778 * shrink_active_list to pick up before reclaiming other pages. 779 779 * ··· 790 790 static long madvise_dontneed_single_vma(struct vm_area_struct *vma, 791 791 unsigned long start, unsigned long end) 792 792 { 793 793 - zap_page_range(vma, start, end - start); 793 793 + zap_page_range_single(vma, start, end - start, NULL); 794 794 return 0; 795 795 } 796 796

+12 -13

mm/memory.c

reviewed

··· 1341 1341 return ret; 1342 1342 } 1343 1343 1344 1344 - /* 1345 1345 - * Parameter block passed down to zap_pte_range in exceptional cases. 1346 1346 - */ 1347 1347 - struct zap_details { 1348 1348 - struct folio *single_folio; /* Locked folio to be unmapped */ 1349 1349 - bool even_cows; /* Zap COWed private pages too? */ 1350 1350 - zap_flags_t zap_flags; /* Extra flags for zapping */ 1351 1351 - }; 1352 1352 - 1353 1344 /* Whether we should zap all COWed (private) pages too */ 1354 1345 static inline bool should_zap_cows(struct zap_details *details) 1355 1346 { ··· 1711 1720 { 1712 1721 struct mmu_notifier_range range; 1713 1722 struct zap_details details = { 1714 1714 - .zap_flags = ZAP_FLAG_DROP_MARKER, 1723 1723 + .zap_flags = ZAP_FLAG_DROP_MARKER | ZAP_FLAG_UNMAP, 1715 1724 /* Careful - we need to zap private pages too! */ 1716 1725 .even_cows = true, 1717 1726 }; ··· 1765 1774 * 1766 1775 * The range must fit into one VMA. 1767 1776 */ 1768 1768 - static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, 1777 1777 + void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, 1769 1778 unsigned long size, struct zap_details *details) 1770 1779 { 1780 1780 + const unsigned long end = address + size; 1771 1781 struct mmu_notifier_range range; 1772 1782 struct mmu_gather tlb; 1773 1783 1774 1784 lru_add_drain(); 1775 1785 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, 1776 1776 - address, address + size); 1786 1786 + address, end); 1787 1787 + if (is_vm_hugetlb_page(vma)) 1788 1788 + adjust_range_if_pmd_sharing_possible(vma, &range.start, 1789 1789 + &range.end); 1777 1790 tlb_gather_mmu(&tlb, vma->vm_mm); 1778 1791 update_hiwater_rss(vma->vm_mm); 1779 1792 mmu_notifier_invalidate_range_start(&range); 1780 1780 - unmap_single_vma(&tlb, vma, address, range.end, details); 1793 1793 + /* 1794 1794 + * unmap 'address-end' not 'range.start-range.end' as range 1795 1795 + * could have been 
expanded for hugetlb pmd sharing. 1796 1796 + */ 1797 1797 + unmap_single_vma(&tlb, vma, address, end, details); 1781 1798 mmu_notifier_invalidate_range_end(&range); 1782 1799 tlb_finish_mmu(&tlb); 1783 1800 }

+1 -3

mm/mmu_gather.c

reviewed

··· 153 153 /* Simply deliver the interrupt */ 154 154 } 155 155 156 156 - static void tlb_remove_table_sync_one(void) 156 156 + void tlb_remove_table_sync_one(void) 157 157 { 158 158 /* 159 159 * This isn't an RCU grace period and hence the page-tables cannot be ··· 176 176 } 177 177 178 178 #else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */ 179 179 - 180 180 - static void tlb_remove_table_sync_one(void) { } 181 179 182 180 static void tlb_remove_table_free(struct mmu_table_batch *batch) 183 181 {

+5 -5

mm/vmscan.c

reviewed

··· 3987 3987 goto next; 3988 3988 3989 3989 if (!pmd_trans_huge(pmd[i])) { 3990 3990 - if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && 3990 3990 + if (arch_has_hw_nonleaf_pmd_young() && 3991 3991 get_cap(LRU_GEN_NONLEAF_YOUNG)) 3992 3992 pmdp_test_and_clear_young(vma, addr, pmd + i); 3993 3993 goto next; ··· 4085 4085 #endif 4086 4086 walk->mm_stats[MM_NONLEAF_TOTAL]++; 4087 4087 4088 4088 - #ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG 4089 4089 - if (get_cap(LRU_GEN_NONLEAF_YOUNG)) { 4088 4088 + if (arch_has_hw_nonleaf_pmd_young() && 4089 4089 + get_cap(LRU_GEN_NONLEAF_YOUNG)) { 4090 4090 if (!pmd_young(val)) 4091 4091 continue; 4092 4092 4093 4093 walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); 4094 4094 } 4095 4095 - #endif 4095 4095 + 4096 4096 if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i)) 4097 4097 continue; 4098 4098 ··· 5392 5392 if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK)) 5393 5393 caps |= BIT(LRU_GEN_MM_WALK); 5394 5394 5395 5395 - if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && get_cap(LRU_GEN_NONLEAF_YOUNG)) 5395 5395 + if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG)) 5396 5396 caps |= BIT(LRU_GEN_NONLEAF_YOUNG); 5397 5397 5398 5398 return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps);

+2 -2

tools/vm/slabinfo-gnuplot.sh

reviewed

··· 150 150 let lines=3 151 151 out=`basename "$in"`"-slabs-by-loss" 152 152 `cat "$in" | grep -A "$lines" 'Slabs sorted by loss' |\ 153 153 - egrep -iv '\-\-|Name|Slabs'\ 153 153 + grep -E -iv '\-\-|Name|Slabs'\ 154 154 | awk '{print $1" "$4+$2*$3" "$4}' > "$out"` 155 155 if [ $? -eq 0 ]; then 156 156 do_slabs_plotting "$out" ··· 159 159 let lines=3 160 160 out=`basename "$in"`"-slabs-by-size" 161 161 `cat "$in" | grep -A "$lines" 'Slabs sorted by size' |\ 162 162 - egrep -iv '\-\-|Name|Slabs'\ 162 162 + grep -E -iv '\-\-|Name|Slabs'\ 163 163 | awk '{print $1" "$4" "$4-$2*$3}' > "$out"` 164 164 if [ $? -eq 0 ]; then 165 165 do_slabs_plotting "$out"