mm: move per-vma lock into vm_area_struct

Back when per-vma locks were introduced, vm_lock was moved out of
vm_area_struct in [1] because of the performance regression caused by
false cacheline sharing. Recent investigation [2] revealed that the
regression is limited to a rather old Broadwell microarchitecture and
even there it can be mitigated by disabling adjacent cacheline
prefetching, see [3].

Splitting a single logical structure into multiple ones leads to more
complicated management, extra pointer dereferences and overall less
maintainable code. When that split-away part is a lock, it complicates
things even further. With no performance benefits, there are no reasons
for this split. Merging the vm_lock back into vm_area_struct also allows
vm_area_struct to use SLAB_TYPESAFE_BY_RCU later in this patchset. Move
vm_lock back into vm_area_struct, aligning it at the cacheline boundary
and changing the cache to be cacheline-aligned as well. With a kernel
compiled using defconfig, this causes VMA memory consumption to grow from
160 (vm_area_struct) + 40 (vm_lock) bytes to 256 bytes:

slabinfo before:
<name> ... <objsize> <objperslab> <pagesperslab> : ...
vma_lock ... 40 102 1 : ...
vm_area_struct ... 160 51 2 : ...

slabinfo after moving vm_lock:
<name> ... <objsize> <objperslab> <pagesperslab> : ...
vm_area_struct ... 256 32 2 : ...

Aggregate VMA memory consumption per 1000 VMAs grows from 50 to 64 pages,
which is 5.5MB per 100000 VMAs. Note that the size of this structure is
dependent on the kernel configuration and typically the original size is
higher than 160 bytes. Therefore these calculations are close to the
worst case scenario. A more realistic vm_area_struct usage before this
change is:

<name> ... <objsize> <objperslab> <pagesperslab> : ...
vma_lock ... 40 102 1 : ...
vm_area_struct ... 176 46 2 : ...

Aggregate VMA memory consumption per 1000 VMAs grows from 54 to 64 pages,
which is 3.9MB per 100000 VMAs. This memory consumption growth can be
addressed later by optimizing the vm_lock.

[1] https://lore.kernel.org/all/20230227173632.3292573-34-surenb@google.com/
[2] https://lore.kernel.org/all/ZsQyI%2F087V34JoIt@xsang-OptiPlex-9020/
[3] https://lore.kernel.org/all/CAJuCfpEisU8Lfe96AYJDZ+OM4NoPmnw9bP53cT_kbfP_pR+-2g@mail.gmail.com/

Link: https://lkml.kernel.org/r/20250213224655.1680278-3-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Tested-by: Shivank Garg <shivankg@amd.com>
Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com
Cc: Christian Brauner <brauner@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jann Horn <jannh@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Klara Modin <klarasmodin@gmail.com>
Cc: Lokesh Gidra <lokeshgidra@google.com>
Cc: Mateusz Guzik <mjguzik@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Minchan Kim <minchan@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: "Paul E . McKenney" <paulmck@kernel.org>
Cc: Peter Xu <peterx@redhat.com>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Sourav Panda <souravpanda@google.com>
Cc: Wei Yang <richard.weiyang@gmail.com>
Cc: Will Deacon <will@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Suren Baghdasaryan and committed by

Andrew Morton 1 year ago 7b6218ae b2ae5fcc

+32 -84

4 changed files

expand all

include

linux

mm.h

mm_types.h

kernel

fork.c

tools

testing

vma

vma_internal.h

+16 -12

include/linux/mm.h

··· 697 697 #endif /* CONFIG_NUMA_BALANCING */ 698 698 699 699 #ifdef CONFIG_PER_VMA_LOCK 700 + static inline void vma_lock_init(struct vm_area_struct *vma) 701 + { 702 + init_rwsem(&vma->vm_lock.lock); 703 + vma->vm_lock_seq = UINT_MAX; 704 + } 705 + 700 706 /* 701 707 * Try to read-lock a vma. The function is allowed to occasionally yield false 702 708 * locked result to avoid performance overhead, in which case we fall back to ··· 720 714 if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence)) 721 715 return false; 722 716 723 - if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0)) 717 + if (unlikely(down_read_trylock(&vma->vm_lock.lock) == 0)) 724 718 return false; 725 719 726 720 /* ··· 735 729 * This pairs with RELEASE semantics in vma_end_write_all(). 736 730 */ 737 731 if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) { 738 - up_read(&vma->vm_lock->lock); 732 + up_read(&vma->vm_lock.lock); 739 733 return false; 740 734 } 741 735 return true; ··· 750 744 static inline void vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass) 751 745 { 752 746 mmap_assert_locked(vma->vm_mm); 753 - down_read_nested(&vma->vm_lock->lock, subclass); 747 + down_read_nested(&vma->vm_lock.lock, subclass); 754 748 } 755 749 756 750 /* ··· 762 756 static inline void vma_start_read_locked(struct vm_area_struct *vma) 763 757 { 764 758 mmap_assert_locked(vma->vm_mm); 765 - down_read(&vma->vm_lock->lock); 759 + down_read(&vma->vm_lock.lock); 766 760 } 767 761 768 762 static inline void vma_end_read(struct vm_area_struct *vma) 769 763 { 770 764 rcu_read_lock(); /* keeps vma alive till the end of up_read */ 771 - up_read(&vma->vm_lock->lock); 765 + up_read(&vma->vm_lock.lock); 772 766 rcu_read_unlock(); 773 767 } 774 768 ··· 797 791 if (__is_vma_write_locked(vma, &mm_lock_seq)) 798 792 return; 799 793 800 - down_write(&vma->vm_lock->lock); 794 + down_write(&vma->vm_lock.lock); 801 795 /* 802 796 * We should use 
WRITE_ONCE() here because we can have concurrent reads 803 797 * from the early lockless pessimistic check in vma_start_read(). ··· 805 799 * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy. 806 800 */ 807 801 WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); 808 - up_write(&vma->vm_lock->lock); 802 + up_write(&vma->vm_lock.lock); 809 803 } 810 804 811 805 static inline void vma_assert_write_locked(struct vm_area_struct *vma) ··· 817 811 818 812 static inline void vma_assert_locked(struct vm_area_struct *vma) 819 813 { 820 - if (!rwsem_is_locked(&vma->vm_lock->lock)) 814 + if (!rwsem_is_locked(&vma->vm_lock.lock)) 821 815 vma_assert_write_locked(vma); 822 816 } 823 817 ··· 850 844 851 845 #else /* CONFIG_PER_VMA_LOCK */ 852 846 847 + static inline void vma_lock_init(struct vm_area_struct *vma) {} 853 848 static inline bool vma_start_read(struct vm_area_struct *vma) 854 849 { return false; } 855 850 static inline void vma_end_read(struct vm_area_struct *vma) {} ··· 885 878 886 879 extern const struct vm_operations_struct vma_dummy_vm_ops; 887 880 888 - /* 889 - * WARNING: vma_init does not initialize vma->vm_lock. 890 - * Use vm_area_alloc()/vm_area_free() if vma needs locking. 891 - */ 892 881 static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) 893 882 { 894 883 memset(vma, 0, sizeof(*vma)); ··· 893 890 INIT_LIST_HEAD(&vma->anon_vma_chain); 894 891 vma_mark_detached(vma, false); 895 892 vma_numab_state_init(vma); 893 + vma_lock_init(vma); 896 894 } 897 895 898 896 /* Use when VMA is not part of the VMA tree and needs no locking */

+4 -2

include/linux/mm_types.h

··· 730 730 * slowpath. 731 731 */ 732 732 unsigned int vm_lock_seq; 733 - /* Unstable RCU readers are allowed to read this. */ 734 - struct vma_lock *vm_lock; 735 733 #endif 736 734 737 735 /* ··· 782 784 struct vma_numab_state *numab_state; /* NUMA Balancing state */ 783 785 #endif 784 786 struct vm_userfaultfd_ctx vm_userfaultfd_ctx; 787 + #ifdef CONFIG_PER_VMA_LOCK 788 + /* Unstable RCU readers are allowed to read this. */ 789 + struct vma_lock vm_lock ____cacheline_aligned_in_smp; 790 + #endif 785 791 } __randomize_layout; 786 792 787 793 #ifdef CONFIG_NUMA

+5 -44

kernel/fork.c

··· 436 436 /* SLAB cache for mm_struct structures (tsk->mm) */ 437 437 static struct kmem_cache *mm_cachep; 438 438 439 - #ifdef CONFIG_PER_VMA_LOCK 440 - 441 - /* SLAB cache for vm_area_struct.lock */ 442 - static struct kmem_cache *vma_lock_cachep; 443 - 444 - static bool vma_lock_alloc(struct vm_area_struct *vma) 445 - { 446 - vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL); 447 - if (!vma->vm_lock) 448 - return false; 449 - 450 - init_rwsem(&vma->vm_lock->lock); 451 - vma->vm_lock_seq = UINT_MAX; 452 - 453 - return true; 454 - } 455 - 456 - static inline void vma_lock_free(struct vm_area_struct *vma) 457 - { 458 - kmem_cache_free(vma_lock_cachep, vma->vm_lock); 459 - } 460 - 461 - #else /* CONFIG_PER_VMA_LOCK */ 462 - 463 - static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; } 464 - static inline void vma_lock_free(struct vm_area_struct *vma) {} 465 - 466 - #endif /* CONFIG_PER_VMA_LOCK */ 467 - 468 439 struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) 469 440 { 470 441 struct vm_area_struct *vma; ··· 445 474 return NULL; 446 475 447 476 vma_init(vma, mm); 448 - if (!vma_lock_alloc(vma)) { 449 - kmem_cache_free(vm_area_cachep, vma); 450 - return NULL; 451 - } 452 477 453 478 return vma; 454 479 } ··· 463 496 * will be reinitialized. 464 497 */ 465 498 data_race(memcpy(new, orig, sizeof(*new))); 466 - if (!vma_lock_alloc(new)) { 467 - kmem_cache_free(vm_area_cachep, new); 468 - return NULL; 469 - } 499 + vma_lock_init(new); 470 500 INIT_LIST_HEAD(&new->anon_vma_chain); 471 501 vma_numab_state_init(new); 472 502 dup_anon_vma_name(orig, new); ··· 475 511 { 476 512 vma_numab_state_free(vma); 477 513 free_anon_vma_name(vma); 478 - vma_lock_free(vma); 479 514 kmem_cache_free(vm_area_cachep, vma); 480 515 } 481 516 ··· 485 522 vm_rcu); 486 523 487 524 /* The vma should not be locked while being destroyed. 
*/ 488 - VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma); 525 + VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock.lock), vma); 489 526 __vm_area_free(vma); 490 527 } 491 528 #endif ··· 3163 3200 sizeof(struct fs_struct), 0, 3164 3201 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, 3165 3202 NULL); 3166 - 3167 - vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); 3168 - #ifdef CONFIG_PER_VMA_LOCK 3169 - vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT); 3170 - #endif 3203 + vm_area_cachep = KMEM_CACHE(vm_area_struct, 3204 + SLAB_HWCACHE_ALIGN|SLAB_NO_MERGE|SLAB_PANIC| 3205 + SLAB_ACCOUNT); 3171 3206 mmap_init(); 3172 3207 nsproxy_cache_init(); 3173 3208 }

+7 -26

tools/testing/vma/vma_internal.h

··· 275 275 /* 276 276 * Can only be written (using WRITE_ONCE()) while holding both: 277 277 * - mmap_lock (in write mode) 278 - * - vm_lock->lock (in write mode) 278 + * - vm_lock.lock (in write mode) 279 279 * Can be read reliably while holding one of: 280 280 * - mmap_lock (in read or write mode) 281 - * - vm_lock->lock (in read or write mode) 281 + * - vm_lock.lock (in read or write mode) 282 282 * Can be read unreliably (using READ_ONCE()) for pessimistic bailout 283 283 * while holding nothing (except RCU to keep the VMA struct allocated). 284 284 * ··· 287 287 * slowpath. 288 288 */ 289 289 unsigned int vm_lock_seq; 290 - struct vma_lock *vm_lock; 290 + struct vma_lock vm_lock; 291 291 #endif 292 292 293 293 /* ··· 464 464 return mas_find(&vmi->mas, ULONG_MAX); 465 465 } 466 466 467 - static inline bool vma_lock_alloc(struct vm_area_struct *vma) 467 + static inline void vma_lock_init(struct vm_area_struct *vma) 468 468 { 469 - vma->vm_lock = calloc(1, sizeof(struct vma_lock)); 470 - 471 - if (!vma->vm_lock) 472 - return false; 473 - 474 - init_rwsem(&vma->vm_lock->lock); 469 + init_rwsem(&vma->vm_lock.lock); 475 470 vma->vm_lock_seq = UINT_MAX; 476 - 477 - return true; 478 471 } 479 472 480 473 static inline void vma_assert_write_locked(struct vm_area_struct *); ··· 490 497 vma->vm_ops = &vma_dummy_vm_ops; 491 498 INIT_LIST_HEAD(&vma->anon_vma_chain); 492 499 vma_mark_detached(vma, false); 500 + vma_lock_init(vma); 493 501 } 494 502 495 503 static inline struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) ··· 501 507 return NULL; 502 508 503 509 vma_init(vma, mm); 504 - if (!vma_lock_alloc(vma)) { 505 - free(vma); 506 - return NULL; 507 - } 508 510 509 511 return vma; 510 512 } ··· 513 523 return NULL; 514 524 515 525 memcpy(new, orig, sizeof(*new)); 516 - if (!vma_lock_alloc(new)) { 517 - free(new); 518 - return NULL; 519 - } 526 + vma_lock_init(new); 520 527 INIT_LIST_HEAD(&new->anon_vma_chain); 521 528 522 529 return new; ··· 683 696 { 684 697 } 
685 698 686 - static inline void vma_lock_free(struct vm_area_struct *vma) 687 - { 688 - free(vma->vm_lock); 689 - } 690 - 691 699 static inline void __vm_area_free(struct vm_area_struct *vma) 692 700 { 693 - vma_lock_free(vma); 694 701 free(vma); 695 702 } 696 703

Configure Feed

Configure Feed