Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

mm: make vma cache SLAB_TYPESAFE_BY_RCU

To enable SLAB_TYPESAFE_BY_RCU for vma cache we need to ensure that
object reuse before RCU grace period is over will be detected by
lock_vma_under_rcu().

Current checks are sufficient as long as vma is detached before it is
freed. The only place this is not currently happening is in exit_mmap().
Add the missing vma_mark_detached() in exit_mmap().

Another issue which might trick lock_vma_under_rcu() during vma reuse is
vm_area_dup(), which copies the entire content of the vma into a new one,
overwriting new vma's vm_refcnt and temporarily making it appear as
attached. This might trick a racing lock_vma_under_rcu() into operating on a
reused vma if it found the vma before it got reused. To prevent this
situation, we should ensure that vm_refcnt stays in the detached state (0)
when it is copied and advances to the attached state only after it is added
into the vma tree. Introduce vm_area_init_from() which preserves new
vma's vm_refcnt and use it in vm_area_dup(). Since all vmas are in
detached state with no current readers when they are freed,
lock_vma_under_rcu() will not be able to take vm_refcnt after vma got
detached even if vma is reused. vma_mark_attached() is modified to
include a release fence to ensure all stores to the vma happen before
vm_refcnt gets initialized.

Finally, make vm_area_cachep SLAB_TYPESAFE_BY_RCU. This will facilitate
vm_area_struct reuse and will minimize the number of call_rcu() calls.

[surenb@google.com: remove atomic_set_release() usage in tools/]
Link: https://lkml.kernel.org/r/20250217054351.2973666-1-surenb@google.com
Link: https://lkml.kernel.org/r/20250213224655.1680278-18-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Tested-by: Shivank Garg <shivankg@amd.com>
Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com
Cc: Christian Brauner <brauner@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jann Horn <jannh@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Klara Modin <klarasmodin@gmail.com>
Cc: Liam R. Howlett <Liam.Howlett@Oracle.com>
Cc: Lokesh Gidra <lokeshgidra@google.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Mateusz Guzik <mjguzik@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Minchan Kim <minchan@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: "Paul E . McKenney" <paulmck@kernel.org>
Cc: Peter Xu <peterx@redhat.com>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Sourav Panda <souravpanda@google.com>
Cc: Wei Yang <richard.weiyang@gmail.com>
Cc: Will Deacon <will@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Suren Baghdasaryan and committed by
Andrew Morton
31041385 e49510bf

+70 -56
+1 -3
include/linux/mm.h
··· 258 258 struct vm_area_struct *vm_area_alloc(struct mm_struct *); 259 259 struct vm_area_struct *vm_area_dup(struct vm_area_struct *); 260 260 void vm_area_free(struct vm_area_struct *); 261 - /* Use only if VMA has no other users */ 262 - void __vm_area_free(struct vm_area_struct *vma); 263 261 264 262 #ifndef CONFIG_MMU 265 263 extern struct rb_root nommu_region_tree; ··· 888 890 { 889 891 vma_assert_write_locked(vma); 890 892 vma_assert_detached(vma); 891 - refcount_set(&vma->vm_refcnt, 1); 893 + refcount_set_release(&vma->vm_refcnt, 1); 892 894 } 893 895 894 896 void vma_mark_detached(struct vm_area_struct *vma);
+10 -3
include/linux/mm_types.h
··· 575 575 typedef unsigned long vm_flags_t; 576 576 577 577 /* 578 + * freeptr_t represents a SLUB freelist pointer, which might be encoded 579 + * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled. 580 + */ 581 + typedef struct { unsigned long v; } freeptr_t; 582 + 583 + /* 578 584 * A region containing a mapping of a non-memory backed file under NOMMU 579 585 * conditions. These are held in a global tree and are pinned by the VMAs that 580 586 * map parts of them. ··· 683 677 * 684 678 * Only explicitly marked struct members may be accessed by RCU readers before 685 679 * getting a stable reference. 680 + * 681 + * WARNING: when adding new members, please update vm_area_init_from() to copy 682 + * them during vm_area_struct content duplication. 686 683 */ 687 684 struct vm_area_struct { 688 685 /* The first cache line has the info for VMA tree walking. */ ··· 696 687 unsigned long vm_start; 697 688 unsigned long vm_end; 698 689 }; 699 - #ifdef CONFIG_PER_VMA_LOCK 700 - struct rcu_head vm_rcu; /* Used for deferred freeing. */ 701 - #endif 690 + freeptr_t vm_freeptr; /* Pointer used by SLAB_TYPESAFE_BY_RCU */ 702 691 }; 703 692 704 693 /*
-6
include/linux/slab.h
··· 244 244 #endif 245 245 246 246 /* 247 - * freeptr_t represents a SLUB freelist pointer, which might be encoded 248 - * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled. 249 - */ 250 - typedef struct { unsigned long v; } freeptr_t; 251 - 252 - /* 253 247 * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests. 254 248 * 255 249 * Dereferencing ZERO_SIZE_PTR will lead to a distinct access fault.
+46 -27
kernel/fork.c
··· 449 449 return vma; 450 450 } 451 451 452 + static void vm_area_init_from(const struct vm_area_struct *src, 453 + struct vm_area_struct *dest) 454 + { 455 + dest->vm_mm = src->vm_mm; 456 + dest->vm_ops = src->vm_ops; 457 + dest->vm_start = src->vm_start; 458 + dest->vm_end = src->vm_end; 459 + dest->anon_vma = src->anon_vma; 460 + dest->vm_pgoff = src->vm_pgoff; 461 + dest->vm_file = src->vm_file; 462 + dest->vm_private_data = src->vm_private_data; 463 + vm_flags_init(dest, src->vm_flags); 464 + memcpy(&dest->vm_page_prot, &src->vm_page_prot, 465 + sizeof(dest->vm_page_prot)); 466 + /* 467 + * src->shared.rb may be modified concurrently when called from 468 + * dup_mmap(), but the clone will reinitialize it. 469 + */ 470 + data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared))); 471 + memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx, 472 + sizeof(dest->vm_userfaultfd_ctx)); 473 + #ifdef CONFIG_ANON_VMA_NAME 474 + dest->anon_name = src->anon_name; 475 + #endif 476 + #ifdef CONFIG_SWAP 477 + memcpy(&dest->swap_readahead_info, &src->swap_readahead_info, 478 + sizeof(dest->swap_readahead_info)); 479 + #endif 480 + #ifndef CONFIG_MMU 481 + dest->vm_region = src->vm_region; 482 + #endif 483 + #ifdef CONFIG_NUMA 484 + dest->vm_policy = src->vm_policy; 485 + #endif 486 + } 487 + 452 488 struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) 453 489 { 454 490 struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); ··· 494 458 495 459 ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); 496 460 ASSERT_EXCLUSIVE_WRITER(orig->vm_file); 497 - /* 498 - * orig->shared.rb may be modified concurrently, but the clone 499 - * will be reinitialized. 
500 - */ 501 - data_race(memcpy(new, orig, sizeof(*new))); 461 + vm_area_init_from(orig, new); 502 462 vma_lock_init(new, true); 503 463 INIT_LIST_HEAD(&new->anon_vma_chain); 504 464 vma_numab_state_init(new); ··· 503 471 return new; 504 472 } 505 473 506 - void __vm_area_free(struct vm_area_struct *vma) 474 + void vm_area_free(struct vm_area_struct *vma) 507 475 { 508 476 /* The vma should be detached while being destroyed. */ 509 477 vma_assert_detached(vma); 510 478 vma_numab_state_free(vma); 511 479 free_anon_vma_name(vma); 512 480 kmem_cache_free(vm_area_cachep, vma); 513 - } 514 - 515 - #ifdef CONFIG_PER_VMA_LOCK 516 - static void vm_area_free_rcu_cb(struct rcu_head *head) 517 - { 518 - struct vm_area_struct *vma = container_of(head, struct vm_area_struct, 519 - vm_rcu); 520 - 521 - __vm_area_free(vma); 522 - } 523 - #endif 524 - 525 - void vm_area_free(struct vm_area_struct *vma) 526 - { 527 - #ifdef CONFIG_PER_VMA_LOCK 528 - call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb); 529 - #else 530 - __vm_area_free(vma); 531 - #endif 532 481 } 533 482 534 483 static void account_kernel_stack(struct task_struct *tsk, int account) ··· 3169 3156 3170 3157 void __init proc_caches_init(void) 3171 3158 { 3159 + struct kmem_cache_args args = { 3160 + .use_freeptr_offset = true, 3161 + .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr), 3162 + }; 3163 + 3172 3164 sighand_cachep = kmem_cache_create("sighand_cache", 3173 3165 sizeof(struct sighand_struct), 0, 3174 3166 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| ··· 3190 3172 sizeof(struct fs_struct), 0, 3191 3173 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, 3192 3174 NULL); 3193 - vm_area_cachep = KMEM_CACHE(vm_area_struct, 3194 - SLAB_HWCACHE_ALIGN|SLAB_NO_MERGE|SLAB_PANIC| 3175 + vm_area_cachep = kmem_cache_create("vm_area_struct", 3176 + sizeof(struct vm_area_struct), &args, 3177 + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| 3195 3178 SLAB_ACCOUNT); 3196 3179 mmap_init(); 3197 3180 
nsproxy_cache_init();
+2 -1
mm/mmap.c
··· 1305 1305 do { 1306 1306 if (vma->vm_flags & VM_ACCOUNT) 1307 1307 nr_accounted += vma_pages(vma); 1308 - remove_vma(vma, /* unreachable = */ true); 1308 + vma_mark_detached(vma); 1309 + remove_vma(vma); 1309 1310 count++; 1310 1311 cond_resched(); 1311 1312 vma = vma_next(&vmi);
+3 -8
mm/vma.c
··· 420 420 /* 421 421 * Close a vm structure and free it. 422 422 */ 423 - void remove_vma(struct vm_area_struct *vma, bool unreachable) 423 + void remove_vma(struct vm_area_struct *vma) 424 424 { 425 425 might_sleep(); 426 426 vma_close(vma); 427 427 if (vma->vm_file) 428 428 fput(vma->vm_file); 429 429 mpol_put(vma_policy(vma)); 430 - if (unreachable) { 431 - vma_mark_detached(vma); 432 - __vm_area_free(vma); 433 - } else { 434 - vm_area_free(vma); 435 - } 430 + vm_area_free(vma); 436 431 } 437 432 438 433 /* ··· 1213 1218 /* Remove and clean up vmas */ 1214 1219 mas_set(mas_detach, 0); 1215 1220 mas_for_each(mas_detach, vma, ULONG_MAX) 1216 - remove_vma(vma, /* unreachable = */ false); 1221 + remove_vma(vma); 1217 1222 1218 1223 vm_unacct_memory(vms->nr_accounted); 1219 1224 validate_mm(mm);
+1 -1
mm/vma.h
··· 218 218 unsigned long start, size_t len, struct list_head *uf, 219 219 bool unlock); 220 220 221 - void remove_vma(struct vm_area_struct *vma, bool unreachable); 221 + void remove_vma(struct vm_area_struct *vma); 222 222 223 223 void unmap_region(struct ma_state *mas, struct vm_area_struct *vma, 224 224 struct vm_area_struct *prev, struct vm_area_struct *next);
+5
tools/include/linux/refcount.h
··· 60 60 atomic_set(&r->refs, n); 61 61 } 62 62 63 + static inline void refcount_set_release(refcount_t *r, unsigned int n) 64 + { 65 + atomic_set(&r->refs, n); 66 + } 67 + 63 68 static inline unsigned int refcount_read(const refcount_t *r) 64 69 { 65 70 return atomic_read(&r->refs);
+2 -7
tools/testing/vma/vma_internal.h
··· 476 476 { 477 477 vma_assert_write_locked(vma); 478 478 vma_assert_detached(vma); 479 - refcount_set(&vma->vm_refcnt, 1); 479 + refcount_set_release(&vma->vm_refcnt, 1); 480 480 } 481 481 482 482 static inline void vma_mark_detached(struct vm_area_struct *vma) ··· 696 696 { 697 697 } 698 698 699 - static inline void __vm_area_free(struct vm_area_struct *vma) 700 - { 701 - free(vma); 702 - } 703 - 704 699 static inline void vm_area_free(struct vm_area_struct *vma) 705 700 { 706 - __vm_area_free(vma); 701 + free(vma); 707 702 } 708 703 709 704 static inline void lru_add_drain(void)