Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'x86_urgent_for_v6.13_rc2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 fixes from Borislav Petkov:

- Have the Automatic IBRS setting check on AMD not falsely fire in the
guest when it has already been set on the host

- Make sure memory for the cacheinfo structures is allocated, to address
a boot-time NULL pointer dereference on Intel Meteor Lake, which has
different numbers of subleaves in its CPUID(4) leaf

- Take care of restoring the GDT on the kexec return path too, as the
kernel expects

- Make sure SMP is not disabled when IO-APIC is disabled on the kernel
cmdline

- Add a PGD flag _PAGE_NOPTISHADOW to instruct the PTI machinery not to
propagate changes made to the kernel-mode page tables to the user
portion

- Mark Intel Lunar Lake as affected by an issue where MONITOR wakeups
can get lost, causing user-visible delays

- Make sure PKRU is properly restored with XRSTOR on AMD after a PKRU
write of 0 (WRPKRU), which marks PKRU as being in its init state and
would thus lose the actual value placed in the buffer

* tag 'x86_urgent_for_v6.13_rc2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/CPU/AMD: WARN when setting EFER.AUTOIBRS if and only if the WRMSR fails
x86/cacheinfo: Delete global num_cache_leaves
cacheinfo: Allocate memory during CPU hotplug if not done from the primary CPU
x86/kexec: Restore GDT on return from ::preserve_context kexec
x86/cpu/topology: Remove limit of CPUs due to disabled IO/APIC
x86/mm: Add _PAGE_NOPTISHADOW bit to avoid updating userspace page tables
x86/cpu: Add Lunar Lake to list of CPUs with a broken MONITOR implementation
x86/pkeys: Ensure updated PKRU value is XRSTOR'd
x86/pkeys: Change caller of update_pkru_in_sigframe()

+81 -58
+6 -2
arch/x86/include/asm/pgtable_types.h
···
#define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4

#ifdef CONFIG_X86_64
- #define _PAGE_BIT_SAVED_DIRTY _PAGE_BIT_SOFTW5 /* Saved Dirty bit */
+ #define _PAGE_BIT_SAVED_DIRTY _PAGE_BIT_SOFTW5 /* Saved Dirty bit (leaf) */
+ #define _PAGE_BIT_NOPTISHADOW _PAGE_BIT_SOFTW5 /* No PTI shadow (root PGD) */
#else
/* Shared with _PAGE_BIT_UFFD_WP which is not supported on 32 bit */
- #define _PAGE_BIT_SAVED_DIRTY _PAGE_BIT_SOFTW2 /* Saved Dirty bit */
+ #define _PAGE_BIT_SAVED_DIRTY _PAGE_BIT_SOFTW2 /* Saved Dirty bit (leaf) */
+ #define _PAGE_BIT_NOPTISHADOW _PAGE_BIT_SOFTW2 /* No PTI shadow (root PGD) */
#endif

/* If _PAGE_BIT_PRESENT is clear, we use these: */
···
#define _PAGE_DIRTY_BITS (_PAGE_DIRTY | _PAGE_SAVED_DIRTY)

#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
+
+ #define _PAGE_NOPTISHADOW (_AT(pteval_t, 1) << _PAGE_BIT_NOPTISHADOW)

/*
 * Set of bits not changed in pte_modify. The pte's
+1 -1
arch/x86/kernel/cpu/amd.c
···
         */
        if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) &&
            cpu_has(c, X86_FEATURE_AUTOIBRS))
-               WARN_ON_ONCE(msr_set_bit(MSR_EFER, _EFER_AUTOIBRS));
+               WARN_ON_ONCE(msr_set_bit(MSR_EFER, _EFER_AUTOIBRS) < 0);

        /* AMD CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */
        clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
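
A note on the WARN change above: msr_set_bit() distinguishes three outcomes, a negative errno when the WRMSR itself fails, 0 when the bit was already set (as in a guest whose host has already enabled Automatic IBRS), and 1 when the bit was newly set, so only the negative case deserves a warning. Below is a minimal userspace-style sketch of that three-way convention; set_msr_bit_emulated(), the local EFER image and the bit position are illustrative stand-ins, not kernel APIs.

#include <stdio.h>

/* Illustrative stand-in for msr_set_bit(): returns < 0 if the write fails,
 * 0 if the bit was already set, 1 if the bit was newly set. */
static int set_msr_bit_emulated(unsigned long long *msr, int bit)
{
        if (*msr & (1ULL << bit))
                return 0;               /* already set, e.g. by the host */

        *msr |= 1ULL << bit;
        return 1;                       /* newly set */
}

int main(void)
{
        int autoibrs_bit = 21;                           /* assumed bit position, for illustration */
        unsigned long long efer = 1ULL << autoibrs_bit;  /* host already enabled it */

        /* Warn only when the write itself fails, not when nothing changed. */
        if (set_msr_bit_emulated(&efer, autoibrs_bit) < 0)
                fprintf(stderr, "failed to set EFER.AUTOIBRS\n");

        printf("EFER = %#llx\n", efer);
        return 0;
}
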
+21 -22
arch/x86/kernel/cpu/cacheinfo.c
···
        struct amd_northbridge *nb;
};

- static unsigned short num_cache_leaves;
-
/* AMD doesn't have CPUID4. Emulate it here to report the same
   information to the user. This makes some assumptions about the machine:
   L2 not shared, no SMT etc. that is currently true on AMD CPUs.
···

void init_amd_cacheinfo(struct cpuinfo_x86 *c)
{
+       struct cpu_cacheinfo *ci = get_cpu_cacheinfo(c->cpu_index);

        if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
-               num_cache_leaves = find_num_cache_leaves(c);
+               ci->num_leaves = find_num_cache_leaves(c);
        } else if (c->extended_cpuid_level >= 0x80000006) {
                if (cpuid_edx(0x80000006) & 0xf000)
-                       num_cache_leaves = 4;
+                       ci->num_leaves = 4;
                else
-                       num_cache_leaves = 3;
+                       ci->num_leaves = 3;
        }
}

void init_hygon_cacheinfo(struct cpuinfo_x86 *c)
{
-       num_cache_leaves = find_num_cache_leaves(c);
+       struct cpu_cacheinfo *ci = get_cpu_cacheinfo(c->cpu_index);
+
+       ci->num_leaves = find_num_cache_leaves(c);
}

void init_intel_cacheinfo(struct cpuinfo_x86 *c)
···
        unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */
        unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */
        unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb;
+       struct cpu_cacheinfo *ci = get_cpu_cacheinfo(c->cpu_index);

        if (c->cpuid_level > 3) {
-               static int is_initialized;
-
-               if (is_initialized == 0) {
-                       /* Init num_cache_leaves from boot CPU */
-                       num_cache_leaves = find_num_cache_leaves(c);
-                       is_initialized++;
-               }
+               /*
+                * There should be at least one leaf. A non-zero value means
+                * that the number of leaves has been initialized.
+                */
+               if (!ci->num_leaves)
+                       ci->num_leaves = find_num_cache_leaves(c);

                /*
                 * Whenever possible use cpuid(4), deterministic cache
                 * parameters cpuid leaf to find the cache details
                 */
-               for (i = 0; i < num_cache_leaves; i++) {
+               for (i = 0; i < ci->num_leaves; i++) {
                        struct _cpuid4_info_regs this_leaf = {};
                        int retval;
···
         * Don't use cpuid2 if cpuid4 is supported. For P4, we use cpuid2 for
         * trace cache
         */
-       if ((num_cache_leaves == 0 || c->x86 == 15) && c->cpuid_level > 1) {
+       if ((!ci->num_leaves || c->x86 == 15) && c->cpuid_level > 1) {
                /* supports eax=2 call */
                int j, n;
                unsigned int regs[4];
                unsigned char *dp = (unsigned char *)regs;
                int only_trace = 0;

-               if (num_cache_leaves != 0 && c->x86 == 15)
+               if (ci->num_leaves && c->x86 == 15)
                        only_trace = 1;

                /* Number of times to iterate */
···

int init_cache_level(unsigned int cpu)
{
-       struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
+       struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu);

-       if (!num_cache_leaves)
+       /* There should be at least one leaf. */
+       if (!ci->num_leaves)
                return -ENOENT;
-       if (!this_cpu_ci)
-               return -EINVAL;
-       this_cpu_ci->num_levels = 3;
-       this_cpu_ci->num_leaves = num_cache_leaves;
+
        return 0;
}
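
The cacheinfo change replaces the single, boot-CPU-initialized num_cache_leaves with a per-CPU ci->num_leaves that each CPU fills in lazily, which is what matters on hybrid parts such as Meteor Lake where core types report different numbers of CPUID(4) subleaves. The following is a small userspace sketch of that lazy per-CPU pattern; struct cpu_cache_desc, find_num_leaves() and the leaf counts are illustrative stand-ins, not the kernel's data structures.

#include <stdio.h>

#define NR_CPUS 4

/* Hypothetical per-CPU bookkeeping, standing in for struct cpu_cacheinfo. */
struct cpu_cache_desc {
        unsigned int num_leaves;
};

static struct cpu_cache_desc cache_desc[NR_CPUS];

/* Stand-in for find_num_cache_leaves(): core types may report different counts. */
static unsigned int find_num_leaves(int cpu)
{
        return (cpu % 2) ? 4 : 5;       /* made-up values for illustration */
}

static void init_cacheinfo(int cpu)
{
        struct cpu_cache_desc *ci = &cache_desc[cpu];

        /* A non-zero value means this CPU's leaf count is already known. */
        if (!ci->num_leaves)
                ci->num_leaves = find_num_leaves(cpu);
}

int main(void)
{
        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                init_cacheinfo(cpu);
                printf("cpu%d: %u cache leaves\n", cpu, cache_desc[cpu].num_leaves);
        }
        return 0;
}
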
+3 -1
arch/x86/kernel/cpu/intel.c
···
                      c->x86_vfm == INTEL_WESTMERE_EX))
                set_cpu_bug(c, X86_BUG_CLFLUSH_MONITOR);

-       if (boot_cpu_has(X86_FEATURE_MWAIT) && c->x86_vfm == INTEL_ATOM_GOLDMONT)
+       if (boot_cpu_has(X86_FEATURE_MWAIT) &&
+           (c->x86_vfm == INTEL_ATOM_GOLDMONT ||
+            c->x86_vfm == INTEL_LUNARLAKE_M))
                set_cpu_bug(c, X86_BUG_MONITOR);

#ifdef CONFIG_X86_64
+3 -3
arch/x86/kernel/cpu/topology.c
···
{
        unsigned int possible = nr_cpu_ids;

-       /* 'maxcpus=0' 'nosmp' 'nolapic' 'disableapic' 'noapic' */
-       if (!setup_max_cpus || ioapic_is_disabled || apic_is_disabled)
+       /* 'maxcpus=0' 'nosmp' 'nolapic' 'disableapic' */
+       if (!setup_max_cpus || apic_is_disabled)
                possible = 1;

        /* 'possible_cpus=N' */
···

static __init bool restrict_to_up(void)
{
-       if (!smp_found_config || ioapic_is_disabled)
+       if (!smp_found_config)
                return true;
        /*
         * XEN PV is special as it does not advertise the local APIC
+2 -18
arch/x86/kernel/fpu/signal.c
···
}

/*
- * Update the value of PKRU register that was already pushed onto the signal frame.
- */
- static inline int update_pkru_in_sigframe(struct xregs_state __user *buf, u32 pkru)
- {
-       if (unlikely(!cpu_feature_enabled(X86_FEATURE_OSPKE)))
-               return 0;
-       return __put_user(pkru, (unsigned int __user *)get_xsave_addr_user(buf, XFEATURE_PKRU));
- }
-
- /*
 * Signal frame handlers.
 */
static inline bool save_fsave_header(struct task_struct *tsk, void __user *buf)
···

static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf, u32 pkru)
{
-       int err = 0;
-
-       if (use_xsave()) {
-               err = xsave_to_user_sigframe(buf);
-               if (!err)
-                       err = update_pkru_in_sigframe(buf, pkru);
-               return err;
-       }
+       if (use_xsave())
+               return xsave_to_user_sigframe(buf, pkru);

        if (use_fxsr())
                return fxsave_to_user_sigframe((struct fxregs_state __user *) buf);
+26 -1
arch/x86/kernel/fpu/xstate.h
···
        return fpu_kernel_cfg.independent_features;
}

+ /*
+  * Update the value of PKRU register that was already pushed onto the signal frame.
+  */
+ static inline int update_pkru_in_sigframe(struct xregs_state __user *buf, u64 mask, u32 pkru)
+ {
+       u64 xstate_bv;
+       int err;
+
+       if (unlikely(!cpu_feature_enabled(X86_FEATURE_OSPKE)))
+               return 0;
+
+       /* Mark PKRU as in-use so that it is restored correctly. */
+       xstate_bv = (mask & xfeatures_in_use()) | XFEATURE_MASK_PKRU;
+
+       err = __put_user(xstate_bv, &buf->header.xfeatures);
+       if (err)
+               return err;
+
+       /* Update PKRU value in the userspace xsave buffer. */
+       return __put_user(pkru, (unsigned int __user *)get_xsave_addr_user(buf, XFEATURE_PKRU));
+ }
+
/* XSAVE/XRSTOR wrapper functions */

#ifdef CONFIG_X86_64
···
 * The caller has to zero buf::header before calling this because XSAVE*
 * does not touch the reserved fields in the header.
 */
- static inline int xsave_to_user_sigframe(struct xregs_state __user *buf)
+ static inline int xsave_to_user_sigframe(struct xregs_state __user *buf, u32 pkru)
{
        /*
         * Include the features which are not xsaved/rstored by the kernel
···
        stac();
        XSTATE_OP(XSAVE, buf, lmask, hmask, err);
        clac();
+
+       if (!err)
+               err = update_pkru_in_sigframe(buf, mask, pkru);

        return err;
}
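
The important detail in update_pkru_in_sigframe() is the header write: XRSTOR restores a component from the buffer only when its bit is set in the xfeatures (xstate_bv) word, and puts it into its init state otherwise. Because, per the commit message, a PKRU write of 0 on AMD lets the hardware track PKRU as being in its init state, the saved frame could otherwise carry a cleared PKRU bit and the sigreturn XRSTOR would ignore the value the kernel placed in the buffer. Here is a small, self-contained sketch of that effect; the XRSTOR behavior is a toy model, not the real instruction, and the numeric values are illustrative.

#include <stdint.h>
#include <stdio.h>

/* Architectural bit position of the PKRU component (XFEATURE_PKRU == 9). */
#define XFEATURE_MASK_PKRU      (1ULL << 9)

/* Toy model of XRSTOR's choice for PKRU: buffer value if the bit is set,
 * init value (0 here) otherwise. */
static uint32_t xrstor_pkru(uint64_t xstate_bv, uint32_t buf_pkru)
{
        return (xstate_bv & XFEATURE_MASK_PKRU) ? buf_pkru : 0;
}

int main(void)
{
        uint64_t xstate_bv = 0x7;          /* illustrative: XSAVE saw PKRU as init */
        uint32_t buf_pkru  = 0x55555554;   /* value the kernel put into the frame */

        printf("without header fix-up: PKRU = %#x\n",
               (unsigned int)xrstor_pkru(xstate_bv, buf_pkru));

        /* The fix: mark PKRU as in-use in the header before sigreturn. */
        xstate_bv |= XFEATURE_MASK_PKRU;
        printf("with header fix-up:    PKRU = %#x\n",
               (unsigned int)xrstor_pkru(xstate_bv, buf_pkru));

        return 0;
}
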
+7
arch/x86/kernel/relocate_kernel_64.S
···
        movq    CR0(%r8), %r8
        movq    %rax, %cr3
        movq    %r8, %cr0
+
+ #ifdef CONFIG_KEXEC_JUMP
+       /* Saved in save_processor_state. */
+       movq    $saved_context, %rax
+       lgdt    saved_context_gdt_desc(%rax)
+ #endif
+
        movq    %rbp, %rax

        popf
+3 -3
arch/x86/mm/ident_map.c
···
                if (result)
                        return result;

-               set_p4d(p4d, __p4d(__pa(pud) | info->kernpg_flag));
+               set_p4d(p4d, __p4d(__pa(pud) | info->kernpg_flag | _PAGE_NOPTISHADOW));
        }

        return 0;
···
                if (result)
                        return result;
                if (pgtable_l5_enabled()) {
-                       set_pgd(pgd, __pgd(__pa(p4d) | info->kernpg_flag));
+                       set_pgd(pgd, __pgd(__pa(p4d) | info->kernpg_flag | _PAGE_NOPTISHADOW));
                } else {
                        /*
                         * With p4d folded, pgd is equal to p4d.
                         * The pgd entry has to point to the pud page table in this case.
                         */
                        pud_t *pud = pud_offset(p4d, 0);
-                       set_pgd(pgd, __pgd(__pa(pud) | info->kernpg_flag));
+                       set_pgd(pgd, __pgd(__pa(pud) | info->kernpg_flag | _PAGE_NOPTISHADOW));
                }
+1 -1
arch/x86/mm/pti.c
···
         * Top-level entries added to init_mm's usermode pgd after boot
         * will not be automatically propagated to other mms.
         */
-       if (!pgdp_maps_userspace(pgdp))
+       if (!pgdp_maps_userspace(pgdp) || (pgd.pgd & _PAGE_NOPTISHADOW))
                return pgd;

        /*
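
Taken together with the ident_map.c hunk, this is the whole mechanism: the identity-map builder tags the top-level entries it creates with _PAGE_NOPTISHADOW, and the PTI propagation path skips any PGD entry carrying that software bit, so those entries are not mirrored into the user half of the page tables even when they fall into the user address range. A compact sketch of that filter follows; the flag value, the toy pgd type and set_user_shadow() are illustrative, not the real x86 layout or kernel API.

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

/* Illustrative software bit meaning "do not mirror into the user PGD". */
#define PAGE_NOPTISHADOW        (1ULL << 11)

struct toy_pgd { uint64_t val; };

/* Stand-in for pgdp_maps_userspace(): the low half of the PGD covers
 * user addresses. */
static bool maps_userspace(unsigned int index)
{
        return index < 256;
}

/* Mirror a kernel PGD entry into the user shadow table unless it is a
 * kernel-address entry or explicitly marked as not to be shadowed. */
static void set_user_shadow(struct toy_pgd *user_pgd, unsigned int index,
                            struct toy_pgd entry)
{
        if (!maps_userspace(index) || (entry.val & PAGE_NOPTISHADOW))
                return;

        user_pgd[index] = entry;
}

int main(void)
{
        struct toy_pgd user_pgd[512] = { { 0 } };

        /* A normal user-range entry is shadowed... */
        set_user_shadow(user_pgd, 10, (struct toy_pgd){ .val = 0x1000 | 0x3 });
        /* ...while an identity-map entry tagged NOPTISHADOW is skipped. */
        set_user_shadow(user_pgd, 11,
                        (struct toy_pgd){ .val = 0x2000 | 0x3 | PAGE_NOPTISHADOW });

        printf("entry 10 = %#llx\n", (unsigned long long)user_pgd[10].val);
        printf("entry 11 = %#llx\n", (unsigned long long)user_pgd[11].val);
        return 0;
}
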
+8 -6
drivers/base/cacheinfo.c
···
{
        struct cacheinfo *llc;

-       if (!cache_leaves(cpu))
+       if (!cache_leaves(cpu) || !per_cpu_cacheinfo(cpu))
                return false;

        llc = per_cpu_cacheinfo_idx(cpu, cache_leaves(cpu) - 1);
···
        return -ENOENT;
}

- static inline
- int allocate_cache_info(int cpu)
+ static inline int allocate_cache_info(int cpu)
{
-       per_cpu_cacheinfo(cpu) = kcalloc(cache_leaves(cpu),
-                                        sizeof(struct cacheinfo), GFP_ATOMIC);
+       per_cpu_cacheinfo(cpu) = kcalloc(cache_leaves(cpu), sizeof(struct cacheinfo), GFP_ATOMIC);
        if (!per_cpu_cacheinfo(cpu)) {
                cache_leaves(cpu) = 0;
                return -ENOMEM;
···
         */
        ci_cacheinfo(cpu)->early_ci_levels = false;

-       if (cache_leaves(cpu) <= early_leaves)
+       /*
+        * Some architectures (e.g., x86) do not use early initialization.
+        * Allocate memory now in such case.
+        */
+       if (cache_leaves(cpu) <= early_leaves && per_cpu_cacheinfo(cpu))
                return 0;

        kfree(per_cpu_cacheinfo(cpu));