Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'x86-urgent-2024-08-11' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 fixes from Thomas Gleixner:

- Fix 32-bit PTI for real.

pti_clone_entry_text() is called twice, once before initcalls so that
initcalls can use the user-mode helper and then again after text is
set read only. Setting read only on 32-bit might break up the PMD
mapping, which makes the second invocation of pti_clone_entry_text()
find the mappings out of sync and fail.

Allow the second call to split the existing PMDs in the user mapping
and synchronize with the kernel mapping.

- Don't make acpi_mp_wake_mailbox read-only after init as the mailbox
must be writable in the case that CPU hotplug operations happen after
boot. Otherwise the attempt to start a CPU crashes with a write to
read only memory.

- Add a missing sanity check in mtrr_save_state() to ensure that the
fixed MTRR MSRs are supported.

Otherwise mtrr_save_state() ends up in a #GP, which is fixed up, but
the WARN_ON() can bring systems down when panic on warn is set.

* tag 'x86-urgent-2024-08-11' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/mtrr: Check if fixed MTRRs exist before saving them
x86/paravirt: Fix incorrect virt spinlock setting on bare metal
x86/acpi: Remove __ro_after_init from acpi_mp_wake_mailbox
x86/mm: Fix PTI for i386 some more

+41 -27
+7 -5
arch/x86/include/asm/qspinlock.h
··· 66 66 67 67 #ifdef CONFIG_PARAVIRT 68 68 /* 69 - * virt_spin_lock_key - enables (by default) the virt_spin_lock() hijack. 69 + * virt_spin_lock_key - disables by default the virt_spin_lock() hijack. 70 70 * 71 - * Native (and PV wanting native due to vCPU pinning) should disable this key. 72 - * It is done in this backwards fashion to only have a single direction change, 73 - * which removes ordering between native_pv_spin_init() and HV setup. 71 + * Native (and PV wanting native due to vCPU pinning) should keep this key 72 + * disabled. Native does not touch the key. 73 + * 74 + * When in a guest then native_pv_lock_init() enables the key first and 75 + * KVM/XEN might conditionally disable it later in the boot process again. 74 76 */ 75 - DECLARE_STATIC_KEY_TRUE(virt_spin_lock_key); 77 + DECLARE_STATIC_KEY_FALSE(virt_spin_lock_key); 76 78 77 79 /* 78 80 * Shortcut for the queued_spin_lock_slowpath() function that allows
+1 -1
arch/x86/kernel/acpi/madt_wakeup.c
··· 19 19 static u64 acpi_mp_wake_mailbox_paddr __ro_after_init; 20 20 21 21 /* Virtual address of the Multiprocessor Wakeup Structure mailbox */ 22 - static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox __ro_after_init; 22 + static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox; 23 23 24 24 static u64 acpi_mp_pgd __ro_after_init; 25 25 static u64 acpi_mp_reset_vector_paddr __ro_after_init;
+1 -1
arch/x86/kernel/cpu/mtrr/mtrr.c
··· 609 609 { 610 610 int first_cpu; 611 611 612 - if (!mtrr_enabled()) 612 + if (!mtrr_enabled() || !mtrr_state.have_fixed) 613 613 return; 614 614 615 615 first_cpu = cpumask_first(cpu_online_mask);
+3 -4
arch/x86/kernel/paravirt.c
··· 51 51 DEFINE_ASM_FUNC(pv_native_read_cr2, "mov %cr2, %rax", .noinstr.text); 52 52 #endif 53 53 54 - DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key); 54 + DEFINE_STATIC_KEY_FALSE(virt_spin_lock_key); 55 55 56 56 void __init native_pv_lock_init(void) 57 57 { 58 - if (IS_ENABLED(CONFIG_PARAVIRT_SPINLOCKS) && 59 - !boot_cpu_has(X86_FEATURE_HYPERVISOR)) 60 - static_branch_disable(&virt_spin_lock_key); 58 + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) 59 + static_branch_enable(&virt_spin_lock_key); 61 60 } 62 61 63 62 static void native_tlb_remove_table(struct mmu_gather *tlb, void *table)
+29 -16
arch/x86/mm/pti.c
··· 241 241 * 242 242 * Returns a pointer to a PTE on success, or NULL on failure. 243 243 */ 244 - static pte_t *pti_user_pagetable_walk_pte(unsigned long address) 244 + static pte_t *pti_user_pagetable_walk_pte(unsigned long address, bool late_text) 245 245 { 246 246 gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); 247 247 pmd_t *pmd; ··· 251 251 if (!pmd) 252 252 return NULL; 253 253 254 - /* We can't do anything sensible if we hit a large mapping. */ 254 + /* Large PMD mapping found */ 255 255 if (pmd_leaf(*pmd)) { 256 - WARN_ON(1); 257 - return NULL; 256 + /* Clear the PMD if we hit a large mapping from the first round */ 257 + if (late_text) { 258 + set_pmd(pmd, __pmd(0)); 259 + } else { 260 + WARN_ON_ONCE(1); 261 + return NULL; 262 + } 258 263 } 259 264 260 265 if (pmd_none(*pmd)) { ··· 288 283 if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte)) 289 284 return; 290 285 291 - target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR); 286 + target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR, false); 292 287 if (WARN_ON(!target_pte)) 293 288 return; 294 289 ··· 306 301 307 302 static void 308 303 pti_clone_pgtable(unsigned long start, unsigned long end, 309 - enum pti_clone_level level) 304 + enum pti_clone_level level, bool late_text) 310 305 { 311 306 unsigned long addr; 312 307 ··· 395 390 return; 396 391 397 392 /* Allocate PTE in the user page-table */ 398 - target_pte = pti_user_pagetable_walk_pte(addr); 393 + target_pte = pti_user_pagetable_walk_pte(addr, late_text); 399 394 if (WARN_ON(!target_pte)) 400 395 return; 401 396 ··· 457 452 phys_addr_t pa = per_cpu_ptr_to_phys((void *)va); 458 453 pte_t *target_pte; 459 454 460 - target_pte = pti_user_pagetable_walk_pte(va); 455 + target_pte = pti_user_pagetable_walk_pte(va, false); 461 456 if (WARN_ON(!target_pte)) 462 457 return; 463 458 ··· 480 475 start = CPU_ENTRY_AREA_BASE; 481 476 end = start + (PAGE_SIZE * CPU_ENTRY_AREA_PAGES); 482 477 483 - pti_clone_pgtable(start, end, PTI_CLONE_PMD); 478 + pti_clone_pgtable(start, end, PTI_CLONE_PMD, false); 484 479 } 485 480 #endif /* CONFIG_X86_64 */ 486 481 ··· 497 492 /* 498 493 * Clone the populated PMDs of the entry text and force it RO. 499 494 */ 500 - static void pti_clone_entry_text(void) 495 + static void pti_clone_entry_text(bool late) 501 496 { 502 497 pti_clone_pgtable((unsigned long) __entry_text_start, 503 498 (unsigned long) __entry_text_end, 504 - PTI_LEVEL_KERNEL_IMAGE); 499 + PTI_LEVEL_KERNEL_IMAGE, late); 505 500 } 506 501 507 502 /* ··· 576 571 * pti_set_kernel_image_nonglobal() did to clear the 577 572 * global bit. 578 573 */ 579 - pti_clone_pgtable(start, end_clone, PTI_LEVEL_KERNEL_IMAGE); 574 + pti_clone_pgtable(start, end_clone, PTI_LEVEL_KERNEL_IMAGE, false); 580 575 581 576 /* 582 577 * pti_clone_pgtable() will set the global bit in any PMDs ··· 643 638 644 639 /* Undo all global bits from the init pagetables in head_64.S: */ 645 640 pti_set_kernel_image_nonglobal(); 641 + 646 642 /* Replace some of the global bits just for shared entry text: */ 647 - pti_clone_entry_text(); 643 + /* 644 + * This is very early in boot. Device and Late initcalls can do 645 + * modprobe before free_initmem() and mark_readonly(). This 646 + * pti_clone_entry_text() allows those user-mode-helpers to function, 647 + * but notably the text is still RW. 648 + */ 649 + pti_clone_entry_text(false); 648 650 pti_setup_espfix64(); 649 651 pti_setup_vsyscall(); 650 652 } ··· 668 656 if (!boot_cpu_has(X86_FEATURE_PTI)) 669 657 return; 670 658 /* 671 - * We need to clone everything (again) that maps parts of the 672 - * kernel image. 659 + * This is after free_initmem() (all initcalls are done) and we've done 660 + * mark_readonly(). Text is now NX which might've split some PMDs 661 + * relative to the early clone. 673 662 */ 674 - pti_clone_entry_text(); 663 + pti_clone_entry_text(true); 675 664 pti_clone_kernel_text(); 676 665 677 666 debug_checkwx_user();