Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'x86-urgent-2024-05-05' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull misc x86 fixes from Ingo Molnar:

- Remove the broken vsyscall emulation code from
the page fault code

- Fix kexec crash triggered by certain SEV RMP
table layouts

- Fix unchecked MSR access error when disabling
the x2APIC via iommu=off

* tag 'x86-urgent-2024-05-05' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/mm: Remove broken vsyscall emulation code from the page fault code
x86/apic: Don't access the APIC when disabling x2APIC
x86/sev: Add callback to apply RMP table fixups for kexec
x86/e820: Add a new e820 table update helper

+64 -67
+2 -26
arch/x86/entry/vsyscall/vsyscall_64.c
··· 98 98 99 99 static bool write_ok_or_segv(unsigned long ptr, size_t size) 100 100 { 101 - /* 102 - * XXX: if access_ok, get_user, and put_user handled 103 - * sig_on_uaccess_err, this could go away. 104 - */ 105 - 106 101 if (!access_ok((void __user *)ptr, size)) { 107 102 struct thread_struct *thread = &current->thread; 108 103 ··· 115 120 bool emulate_vsyscall(unsigned long error_code, 116 121 struct pt_regs *regs, unsigned long address) 117 122 { 118 - struct task_struct *tsk; 119 123 unsigned long caller; 120 124 int vsyscall_nr, syscall_nr, tmp; 121 - int prev_sig_on_uaccess_err; 122 125 long ret; 123 126 unsigned long orig_dx; 124 127 ··· 164 171 "vsyscall with bad stack (exploit attempt?)"); 165 172 goto sigsegv; 166 173 } 167 - 168 - tsk = current; 169 174 170 175 /* 171 176 * Check for access_ok violations and find the syscall nr. ··· 225 234 goto do_ret; /* skip requested */ 226 235 227 236 /* 228 - * With a real vsyscall, page faults cause SIGSEGV. We want to 229 - * preserve that behavior to make writing exploits harder. 237 + * With a real vsyscall, page faults cause SIGSEGV. 230 238 */ 231 - prev_sig_on_uaccess_err = current->thread.sig_on_uaccess_err; 232 - current->thread.sig_on_uaccess_err = 1; 233 - 234 239 ret = -EFAULT; 235 240 switch (vsyscall_nr) { 236 241 case 0: ··· 249 262 break; 250 263 } 251 264 252 - current->thread.sig_on_uaccess_err = prev_sig_on_uaccess_err; 253 - 254 265 check_fault: 255 266 if (ret == -EFAULT) { 256 267 /* Bad news -- userspace fed a bad pointer to a vsyscall. */ 257 268 warn_bad_vsyscall(KERN_INFO, regs, 258 269 "vsyscall fault (exploit attempt?)"); 259 - 260 - /* 261 - * If we failed to generate a signal for any reason, 262 - * generate one here. (This should be impossible.) 263 - */ 264 - if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) && 265 - !sigismember(&tsk->pending.signal, SIGSEGV))) 266 - goto sigsegv; 267 - 268 - return true; /* Don't emulate the ret. 
*/ 270 + goto sigsegv; 269 271 } 270 272 271 273 regs->ax = ret;
+1
arch/x86/include/asm/e820/api.h
··· 17 17 extern void e820__range_add (u64 start, u64 size, enum e820_type type); 18 18 extern u64 e820__range_update(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type); 19 19 extern u64 e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type); 20 + extern u64 e820__range_update_table(struct e820_table *t, u64 start, u64 size, enum e820_type old_type, enum e820_type new_type); 20 21 21 22 extern void e820__print_table(char *who); 22 23 extern int e820__update_table(struct e820_table *table);
-1
arch/x86/include/asm/processor.h
··· 472 472 unsigned long iopl_emul; 473 473 474 474 unsigned int iopl_warn:1; 475 - unsigned int sig_on_uaccess_err:1; 476 475 477 476 /* 478 477 * Protection Keys Register for Userspace. Loaded immediately on
+2
arch/x86/include/asm/sev.h
··· 269 269 int rmp_make_shared(u64 pfn, enum pg_level level); 270 270 void snp_leak_pages(u64 pfn, unsigned int npages); 271 271 void kdump_sev_callback(void); 272 + void snp_fixup_e820_tables(void); 272 273 #else 273 274 static inline bool snp_probe_rmptable_info(void) { return false; } 274 275 static inline int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level) { return -ENODEV; } ··· 283 282 static inline int rmp_make_shared(u64 pfn, enum pg_level level) { return -ENODEV; } 284 283 static inline void snp_leak_pages(u64 pfn, unsigned int npages) {} 285 284 static inline void kdump_sev_callback(void) { } 285 + static inline void snp_fixup_e820_tables(void) {} 286 286 #endif 287 287 288 288 #endif
+11 -5
arch/x86/kernel/apic/apic.c
··· 1771 1771 __x2apic_enable(); 1772 1772 } 1773 1773 1774 - static __init void apic_set_fixmap(void); 1774 + static __init void apic_set_fixmap(bool read_apic); 1775 1775 1776 1776 static __init void x2apic_disable(void) 1777 1777 { ··· 1793 1793 } 1794 1794 1795 1795 __x2apic_disable(); 1796 - apic_set_fixmap(); 1796 + /* 1797 + * Don't reread the APIC ID as it was already done from 1798 + * check_x2apic() and the APIC driver is still an x2APIC variant, 1799 + * which fails to do the read after x2APIC was disabled. 1800 + */ 1801 + apic_set_fixmap(false); 1797 1802 } 1798 1803 1799 1804 static __init void x2apic_enable(void) ··· 2062 2057 } 2063 2058 } 2064 2059 2065 - static __init void apic_set_fixmap(void) 2060 + static __init void apic_set_fixmap(bool read_apic) 2066 2061 { 2067 2062 set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); 2068 2063 apic_mmio_base = APIC_BASE; 2069 2064 apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", 2070 2065 apic_mmio_base, mp_lapic_addr); 2071 - apic_read_boot_cpu_id(false); 2066 + if (read_apic) 2067 + apic_read_boot_cpu_id(false); 2072 2068 } 2073 2069 2074 2070 void __init register_lapic_address(unsigned long address) ··· 2079 2073 mp_lapic_addr = address; 2080 2074 2081 2075 if (!x2apic_mode) 2082 - apic_set_fixmap(); 2076 + apic_set_fixmap(true); 2083 2077 } 2084 2078 2085 2079 /*
+4 -3
arch/x86/kernel/e820.c
··· 532 532 return __e820__range_update(e820_table, start, size, old_type, new_type); 533 533 } 534 534 535 - static u64 __init e820__range_update_kexec(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type) 535 + u64 __init e820__range_update_table(struct e820_table *t, u64 start, u64 size, 536 + enum e820_type old_type, enum e820_type new_type) 536 537 { 537 - return __e820__range_update(e820_table_kexec, start, size, old_type, new_type); 538 + return __e820__range_update(t, start, size, old_type, new_type); 538 539 } 539 540 540 541 /* Remove a range of memory from the E820 table: */ ··· 807 806 808 807 addr = memblock_phys_alloc(size, align); 809 808 if (addr) { 810 - e820__range_update_kexec(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED); 809 + e820__range_update_table(e820_table_kexec, addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED); 811 810 pr_info("update e820_table_kexec for e820__memblock_alloc_reserved()\n"); 812 811 e820__update_table_kexec(); 813 812 }
+1 -32
arch/x86/mm/fault.c
··· 723 723 WARN_ON_ONCE(user_mode(regs)); 724 724 725 725 /* Are we prepared to handle this kernel fault? */ 726 - if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) { 727 - /* 728 - * Any interrupt that takes a fault gets the fixup. This makes 729 - * the below recursive fault logic only apply to a faults from 730 - * task context. 731 - */ 732 - if (in_interrupt()) 733 - return; 734 - 735 - /* 736 - * Per the above we're !in_interrupt(), aka. task context. 737 - * 738 - * In this case we need to make sure we're not recursively 739 - * faulting through the emulate_vsyscall() logic. 740 - */ 741 - if (current->thread.sig_on_uaccess_err && signal) { 742 - sanitize_error_code(address, &error_code); 743 - 744 - set_signal_archinfo(address, error_code); 745 - 746 - if (si_code == SEGV_PKUERR) { 747 - force_sig_pkuerr((void __user *)address, pkey); 748 - } else { 749 - /* XXX: hwpoison faults will set the wrong code. */ 750 - force_sig_fault(signal, si_code, (void __user *)address); 751 - } 752 - } 753 - 754 - /* 755 - * Barring that, we can do the fixup and be happy. 756 - */ 726 + if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) 757 727 return; 758 - } 759 728 760 729 /* 761 730 * AMD erratum #91 manifests as a spurious page fault on a PREFETCH
+7
arch/x86/mm/mem_encrypt.c
··· 102 102 phys_addr_t total_mem = memblock_phys_mem_size(); 103 103 unsigned long size; 104 104 105 + /* 106 + * Do RMP table fixups after the e820 tables have been set up by 107 + * e820__memory_setup(). 108 + */ 109 + if (cc_platform_has(CC_ATTR_HOST_SEV_SNP)) 110 + snp_fixup_e820_tables(); 111 + 105 112 if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) 106 113 return; 107 114
+36
arch/x86/virt/svm/sev.c
··· 163 163 return true; 164 164 } 165 165 166 + static void __init __snp_fixup_e820_tables(u64 pa) 167 + { 168 + if (IS_ALIGNED(pa, PMD_SIZE)) 169 + return; 170 + 171 + /* 172 + * Handle cases where the RMP table placement by the BIOS is not 173 + * 2M aligned and the kexec kernel could try to allocate 174 + * from within that chunk which then causes a fatal RMP fault. 175 + * 176 + * The e820_table needs to be updated as it is converted to 177 + * kernel memory resources and used by KEXEC_FILE_LOAD syscall 178 + * to load kexec segments. 179 + * 180 + * The e820_table_firmware needs to be updated as it is exposed 181 + * to sysfs and used by the KEXEC_LOAD syscall to load kexec 182 + * segments. 183 + * 184 + * The e820_table_kexec needs to be updated as it passed to 185 + * the kexec-ed kernel. 186 + */ 187 + pa = ALIGN_DOWN(pa, PMD_SIZE); 188 + if (e820__mapped_any(pa, pa + PMD_SIZE, E820_TYPE_RAM)) { 189 + pr_info("Reserving start/end of RMP table on a 2MB boundary [0x%016llx]\n", pa); 190 + e820__range_update(pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED); 191 + e820__range_update_table(e820_table_kexec, pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED); 192 + e820__range_update_table(e820_table_firmware, pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED); 193 + } 194 + } 195 + 196 + void __init snp_fixup_e820_tables(void) 197 + { 198 + __snp_fixup_e820_tables(probed_rmp_base); 199 + __snp_fixup_e820_tables(probed_rmp_base + probed_rmp_size); 200 + } 201 + 166 202 /* 167 203 * Do the necessary preparations which are verified by the firmware as 168 204 * described in the SNP_INIT_EX firmware command description in the SNP