Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'x86-mm-2025-01-31' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 mm updates from Ingo Molnar:

- The biggest changes are the TLB flushing scalability optimizations,
which update the mm_cpumask lazily, along with related changes.

This feature has both a track record and a continued risk of
performance regressions, so it was already delayed by a cycle - but
it's all 100% perfect now™ (Rik van Riel)

- Also miscellaneous fixes and cleanups. (Gautam Somani, Kirill
Shutemov, Sebastian Andrzej Siewior)

* tag 'x86-mm-2025-01-31' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/mm: Remove unnecessary include of <linux/extable.h>
x86/mtrr: Rename mtrr_overwrite_state() to guest_force_mtrr_state()
x86/mm/selftests: Fix typo in lam.c
x86/mm/tlb: Only trim the mm_cpumask once a second
x86/mm/tlb: Also remove local CPU from mm_cpumask if stale
x86/mm/tlb: Add tracepoint for TLB flush IPI to stale CPU
x86/mm/tlb: Update mm_cpumask lazily

+57 -20
+2
arch/x86/include/asm/mmu.h
··· 37 37 */ 38 38 atomic64_t tlb_gen; 39 39 40 + unsigned long next_trim_cpumask; 41 + 40 42 #ifdef CONFIG_MODIFY_LDT_SYSCALL 41 43 struct rw_semaphore ldt_usr_sem; 42 44 struct ldt_struct *ldt;
+1
arch/x86/include/asm/mmu_context.h
··· 151 151 152 152 mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id); 153 153 atomic64_set(&mm->context.tlb_gen, 0); 154 + mm->context.next_trim_cpumask = jiffies + HZ; 154 155 155 156 #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS 156 157 if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
+1
arch/x86/include/asm/tlbflush.h
··· 222 222 unsigned int initiating_cpu; 223 223 u8 stride_shift; 224 224 u8 freed_tables; 225 + u8 trim_cpumask; 225 226 }; 226 227 227 228 void flush_tlb_local(void);
+7 -3
arch/x86/kernel/alternative.c
··· 1854 1854 return temp_state; 1855 1855 } 1856 1856 1857 + __ro_after_init struct mm_struct *poking_mm; 1858 + __ro_after_init unsigned long poking_addr; 1859 + 1857 1860 static inline void unuse_temporary_mm(temp_mm_state_t prev_state) 1858 1861 { 1859 1862 lockdep_assert_irqs_disabled(); 1863 + 1860 1864 switch_mm_irqs_off(NULL, prev_state.mm, current); 1865 + 1866 + /* Clear the cpumask, to indicate no TLB flushing is needed anywhere */ 1867 + cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(poking_mm)); 1861 1868 1862 1869 /* 1863 1870 * Restore the breakpoints if they were disabled before the temporary mm ··· 1873 1866 if (hw_breakpoint_active()) 1874 1867 hw_breakpoint_restore(); 1875 1868 } 1876 - 1877 - __ro_after_init struct mm_struct *poking_mm; 1878 - __ro_after_init unsigned long poking_addr; 1879 1869 1880 1870 static void text_poke_memcpy(void *dst, const void *src, size_t len) 1881 1871 {
-1
arch/x86/mm/fault.c
··· 7 7 #include <linux/sched.h> /* test_thread_flag(), ... */ 8 8 #include <linux/sched/task_stack.h> /* task_stack_*(), ... */ 9 9 #include <linux/kdebug.h> /* oops_begin/end, ... */ 10 - #include <linux/extable.h> /* search_exception_tables */ 11 10 #include <linux/memblock.h> /* max_low_pfn */ 12 11 #include <linux/kfence.h> /* kfence_handle_page_fault */ 13 12 #include <linux/kprobes.h> /* NOKPROBE_SYMBOL, ... */
+44 -15
arch/x86/mm/tlb.c
··· 607 607 cond_mitigation(tsk); 608 608 609 609 /* 610 - * Stop remote flushes for the previous mm. 611 - * Skip kernel threads; we never send init_mm TLB flushing IPIs, 612 - * but the bitmap manipulation can cause cache line contention. 610 + * Leave this CPU in prev's mm_cpumask. Atomic writes to 611 + * mm_cpumask can be expensive under contention. The CPU 612 + * will be removed lazily at TLB flush time. 613 613 */ 614 - if (prev != &init_mm) { 615 - VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, 616 - mm_cpumask(prev))); 617 - cpumask_clear_cpu(cpu, mm_cpumask(prev)); 618 - } 614 + VM_WARN_ON_ONCE(prev != &init_mm && !cpumask_test_cpu(cpu, 615 + mm_cpumask(prev))); 619 616 620 617 /* Start receiving IPIs and then read tlb_gen (and LAM below) */ 621 - if (next != &init_mm) 618 + if (next != &init_mm && !cpumask_test_cpu(cpu, mm_cpumask(next))) 622 619 cpumask_set_cpu(cpu, mm_cpumask(next)); 623 620 next_tlb_gen = atomic64_read(&next->context.tlb_gen); 624 621 ··· 757 760 if (!local) { 758 761 inc_irq_stat(irq_tlb_count); 759 762 count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); 763 + } 760 764 761 - /* Can only happen on remote CPUs */ 762 - if (f->mm && f->mm != loaded_mm) 763 - return; 765 + /* The CPU was left in the mm_cpumask of the target mm. Clear it. */ 766 + if (f->mm && f->mm != loaded_mm) { 767 + cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(f->mm)); 768 + trace_tlb_flush(TLB_REMOTE_WRONG_CPU, 0); 769 + return; 764 770 } 765 771 766 772 if (unlikely(loaded_mm == &init_mm)) ··· 893 893 nr_invalidate); 894 894 } 895 895 896 - static bool tlb_is_not_lazy(int cpu, void *data) 896 + static bool should_flush_tlb(int cpu, void *data) 897 897 { 898 - return !per_cpu(cpu_tlbstate_shared.is_lazy, cpu); 898 + struct flush_tlb_info *info = data; 899 + 900 + /* Lazy TLB will get flushed at the next context switch. */ 901 + if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu)) 902 + return false; 903 + 904 + /* No mm means kernel memory flush. 
*/ 905 + if (!info->mm) 906 + return true; 907 + 908 + /* The target mm is loaded, and the CPU is not lazy. */ 909 + if (per_cpu(cpu_tlbstate.loaded_mm, cpu) == info->mm) 910 + return true; 911 + 912 + /* In cpumask, but not the loaded mm? Periodically remove by flushing. */ 913 + if (info->trim_cpumask) 914 + return true; 915 + 916 + return false; 917 + } 918 + 919 + static bool should_trim_cpumask(struct mm_struct *mm) 920 + { 921 + if (time_after(jiffies, READ_ONCE(mm->context.next_trim_cpumask))) { 922 + WRITE_ONCE(mm->context.next_trim_cpumask, jiffies + HZ); 923 + return true; 924 + } 925 + return false; 899 926 } 900 927 901 928 DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state_shared, cpu_tlbstate_shared); ··· 956 929 if (info->freed_tables) 957 930 on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true); 958 931 else 959 - on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func, 932 + on_each_cpu_cond_mask(should_flush_tlb, flush_tlb_func, 960 933 (void *)info, 1, cpumask); 961 934 } 962 935 ··· 1007 980 info->freed_tables = freed_tables; 1008 981 info->new_tlb_gen = new_tlb_gen; 1009 982 info->initiating_cpu = smp_processor_id(); 983 + info->trim_cpumask = 0; 1010 984 1011 985 return info; 1012 986 } ··· 1050 1022 * flush_tlb_func_local() directly in this case. 1051 1023 */ 1052 1024 if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) { 1025 + info->trim_cpumask = should_trim_cpumask(mm); 1053 1026 flush_tlb_multi(mm_cpumask(mm), info); 1054 1027 } else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) { 1055 1028 lockdep_assert_irqs_enabled();
+1
include/linux/mm_types.h
··· 1406 1406 TLB_LOCAL_SHOOTDOWN, 1407 1407 TLB_LOCAL_MM_SHOOTDOWN, 1408 1408 TLB_REMOTE_SEND_IPI, 1409 + TLB_REMOTE_WRONG_CPU, 1409 1410 NR_TLB_FLUSH_REASONS, 1410 1411 }; 1411 1412
+1 -1
tools/testing/selftests/x86/lam.c
··· 237 237 * both pointers should point to the same address. 238 238 * 239 239 * @return: 240 - * 0: value on the pointer with metadate and value on original are same 240 + * 0: value on the pointer with metadata and value on original are same 241 241 * 1: not same. 242 242 */ 243 243 static int handle_lam_test(void *src, unsigned int lam)