Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'x86-urgent-2024-03-24' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 fixes from Thomas Gleixner:

- Ensure that the encryption mask at boot is properly propagated on
5-level page tables; otherwise the PGD entry is incorrectly set to
non-encrypted, which causes system crashes during boot.

- Undo the deferred 5-level page table setup as it cannot work with
memory encryption enabled.

- Prevent inconsistent XFD state on CPU hotplug, where the MSR is reset
to the default value but the cached variable is not, so subsequent
comparisons might yield the wrong result and, as a consequence,
prevent the MSR from being updated.

- Register the local APIC address only once in the MPPARSE enumeration
to prevent triggering the related WARN_ONs() in the APIC and topology
code.

- Handle the case where no APIC is found gracefully by registering a
fake APIC in the topology code. That makes all related topology
functions work correctly and does not affect the actual APIC driver
code at all.

- Don't evaluate logical IDs during early boot as the local APIC IDs
are not yet enumerated and the invoked function returns an error
code. Nothing requires the logical IDs before the final CPUID
enumeration takes place, which happens after the APIC enumeration.

- Cure the fallout of the per-CPU rework on UP, which misplaced the
copying of boot_cpu_data to per-CPU data so that the final update to
boot_cpu_data was lost, causing inconsistent state and boot crashes.

- Use copy_from_kernel_nofault() in the kprobes setup as there is no
guarantee that the address can be safely accessed.

- Reorder struct members in struct saved_context to work around another
kmemleak false positive.

- Remove the buggy code which tries to update the E820 kexec table for
setup_data as that is never passed to the kexec kernel.

- Update the resource control documentation to use the proper units.

- Fix a Kconfig warning observed with tinyconfig.

* tag 'x86-urgent-2024-03-24' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/boot/64: Move 5-level paging global variable assignments back
x86/boot/64: Apply encryption mask to 5-level pagetable update
x86/cpu: Add model number for another Intel Arrow Lake mobile processor
x86/fpu: Keep xfd_state in sync with MSR_IA32_XFD
Documentation/x86: Document that resctrl bandwidth control units are MiB
x86/mpparse: Register APIC address only once
x86/topology: Handle the !APIC case gracefully
x86/topology: Don't evaluate logical IDs during early boot
x86/cpu: Ensure that CPU info updates are propagated on UP
kprobes/x86: Use copy_from_kernel_nofault() to read from unsafe address
x86/pm: Work around false positive kmemleak report in msr_build_context()
x86/kexec: Do not update E820 kexec table for setup_data
x86/config: Fix warning for 'make ARCH=x86_64 tinyconfig'

15 files changed: +80 -89
+4 -4
Documentation/arch/x86/resctrl.rst
@@ -45,7 +45,7 @@
         Enable code/data prioritization in L2 cache allocations.
 "mba_MBps":
         Enable the MBA Software Controller(mba_sc) to specify MBA
-        bandwidth in MBps
+        bandwidth in MiBps
 "debug":
         Make debug files accessible. Available debug files are annotated with
         "Available only with debug option".
@@ -526,7 +526,7 @@
 increase or vary although user specified bandwidth percentage is same.
 
 In order to mitigate this and make the interface more user friendly,
-resctrl added support for specifying the bandwidth in MBps as well. The
+resctrl added support for specifying the bandwidth in MiBps as well. The
 kernel underneath would use a software feedback mechanism or a "Software
 Controller(mba_sc)" which reads the actual bandwidth using MBM counters
 and adjust the memory bandwidth percentages to ensure::
@@ -573,13 +573,13 @@
 
         MB:<cache_id0>=bandwidth0;<cache_id1>=bandwidth1;...
 
-Memory bandwidth Allocation specified in MBps
+Memory bandwidth Allocation specified in MiBps
 ---------------------------------------------
 
 Memory bandwidth domain is L3 cache.
 ::
 
-        MB:<cache_id0>=bw_MBps0;<cache_id1>=bw_MBps1;...
+        MB:<cache_id0>=bw_MiBps0;<cache_id1>=bw_MiBps1;...
 
 Slow Memory Bandwidth Allocation (SMBA)
 ---------------------------------------
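The unit fix is not cosmetic: a value interpreted as MBps (10^6 bytes/s) versus MiBps (2^20 bytes/s) is off by almost 5%, which matters when calibrating the software controller against MBM counter readings. A self-contained C sketch of the arithmetic, purely illustrative and not part of the patch:

#include <stdio.h>

int main(void)
{
        const double mb  = 1000.0 * 1000.0;     /* megabyte (MB): 10^6 bytes  */
        const double mib = 1024.0 * 1024.0;     /* mebibyte (MiB): 2^20 bytes */

        /* 1048576 / 1000000 = 1.048576, i.e. roughly 4.9% larger */
        printf("1 MiB = %.0f bytes (%.4f%% larger than 1 MB)\n",
               mib, (mib / mb - 1.0) * 100.0);
        return 0;
}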
+1
arch/x86/configs/tiny.config
@@ -1,5 +1,6 @@
 CONFIG_NOHIGHMEM=y
 # CONFIG_HIGHMEM4G is not set
 # CONFIG_HIGHMEM64G is not set
+# CONFIG_UNWINDER_ORC is not set
 CONFIG_UNWINDER_GUESS=y
 # CONFIG_UNWINDER_FRAME_POINTER is not set
+1
arch/x86/include/asm/intel-family.h
@@ -127,5 +127,6 @@
 
 #define INTEL_FAM6_ARROWLAKE_H 0xC5
 #define INTEL_FAM6_ARROWLAKE 0xC6
+#define INTEL_FAM6_ARROWLAKE_U 0xB5
 
 #define INTEL_FAM6_LUNARLAKE_M 0xBD
+5 -5
arch/x86/include/asm/suspend_32.h
@@ -12,11 +12,6 @@
 
 /* image of the saved processor state */
 struct saved_context {
-        /*
-         * On x86_32, all segment registers except gs are saved at kernel
-         * entry in pt_regs.
-         */
-        u16 gs;
         unsigned long cr0, cr2, cr3, cr4;
         u64 misc_enable;
         struct saved_msrs saved_msrs;
@@ -27,5 +22,10 @@
         unsigned long tr;
         unsigned long safety;
         unsigned long return_address;
+        /*
+         * On x86_32, all segment registers except gs are saved at kernel
+         * entry in pt_regs.
+         */
+        u16 gs;
         bool misc_enable_saved;
 } __attribute__((packed));
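Why reordering helps: kmemleak scans memory word by word at naturally aligned addresses, so a pointer stored at an odd offset inside this packed struct is invisible to the scanner, and the allocation it references gets flagged as a leak. Moving the u16 to the tail keeps the pointer-bearing members aligned. A stand-alone sketch of the offset effect; the field names are hypothetical, only the packed-layout behavior is the point:

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

/* GCC/Clang packed structs: member order dictates offsets exactly. */
struct gs_first {
        uint16_t gs;            /* 2 bytes up front ...               */
        void *saved_ptr;        /* ... forces the pointer to offset 2 */
} __attribute__((packed));

struct gs_last {
        void *saved_ptr;        /* pointer at offset 0, aligned       */
        uint16_t gs;            /* small member moved behind it       */
} __attribute__((packed));

int main(void)
{
        printf("gs first: pointer at offset %zu (unaligned, hidden from scan)\n",
               offsetof(struct gs_first, saved_ptr));
        printf("gs last:  pointer at offset %zu (aligned, scanned normally)\n",
               offsetof(struct gs_last, saved_ptr));
        return 0;
}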
+9
arch/x86/kernel/cpu/common.c
@@ -2307,6 +2307,8 @@
 
 void __init arch_cpu_finalize_init(void)
 {
+        struct cpuinfo_x86 *c = this_cpu_ptr(&cpu_info);
+
         identify_boot_cpu();
 
         select_idle_routine();
@@ -2344,5 +2346,12 @@
          */
         fpu__init_system();
         fpu__init_cpu();
+
+        /*
+         * Ensure that access to the per CPU representation has the initial
+         * boot CPU configuration.
+         */
+        *c = boot_cpu_data;
+        c->initialized = true;
 
         alternative_instructions();
+11
arch/x86/kernel/cpu/topology.c
@@ -415,6 +415,17 @@
         unsigned int total = assigned + disabled;
         u32 apicid, firstid;
 
+        /*
+         * If there was no APIC registered, then fake one so that the
+         * topology bitmap is populated. That ensures that the code below
+         * is valid and the various query interfaces can be used
+         * unconditionally. This does not affect the actual APIC code in
+         * any way because either the local APIC address has not been
+         * registered or the local APIC was disabled on the command line.
+         */
+        if (topo_info.boot_cpu_apic_id == BAD_APICID)
+                topology_register_boot_apic(0);
+
         if (!restrict_to_up()) {
                 if (WARN_ON_ONCE(assigned > nr_cpu_ids)) {
                         disabled += assigned - nr_cpu_ids;
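The hunk is an instance of the fallback-object pattern: when enumeration registered nothing, install a placeholder so that every later query operates on valid data instead of sprinkling error handling across the query paths. A toy user-space model of that control flow; all names are hypothetical stand-ins, not kernel interfaces:

#include <stdio.h>
#include <stdint.h>

#define BAD_ID UINT32_MAX

static uint32_t boot_cpu_id = BAD_ID;   /* set during enumeration, possibly never */
static uint32_t assigned_cpus;          /* models the populated topology bitmap   */

static void register_boot_cpu(uint32_t id)
{
        boot_cpu_id = id;
        assigned_cpus = 1;              /* the boot CPU is now accounted for */
}

static void finalize_topology(void)
{
        /* The fix: fake an ID when nothing was registered, so the
         * accounting below and all later queries stay unconditional. */
        if (boot_cpu_id == BAD_ID)
                register_boot_cpu(0);

        printf("boot id %u, %u CPU(s) assigned\n",
               (unsigned)boot_cpu_id, (unsigned)assigned_cpus);
}

int main(void)
{
        finalize_topology();    /* no enumeration ran, output is still well-defined */
        return 0;
}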
+7 -5
arch/x86/kernel/cpu/topology_common.c
@@ -140,7 +140,7 @@
         }
 }
 
-static void topo_set_ids(struct topo_scan *tscan)
+static void topo_set_ids(struct topo_scan *tscan, bool early)
 {
         struct cpuinfo_x86 *c = tscan->c;
         u32 apicid = c->topo.apicid;
@@ -148,8 +148,10 @@
         c->topo.pkg_id = topo_shift_apicid(apicid, TOPO_PKG_DOMAIN);
         c->topo.die_id = topo_shift_apicid(apicid, TOPO_DIE_DOMAIN);
 
-        c->topo.logical_pkg_id = topology_get_logical_id(apicid, TOPO_PKG_DOMAIN);
-        c->topo.logical_die_id = topology_get_logical_id(apicid, TOPO_DIE_DOMAIN);
+        if (!early) {
+                c->topo.logical_pkg_id = topology_get_logical_id(apicid, TOPO_PKG_DOMAIN);
+                c->topo.logical_die_id = topology_get_logical_id(apicid, TOPO_DIE_DOMAIN);
+        }
 
         /* Package relative core ID */
         c->topo.core_id = (apicid & topo_domain_mask(TOPO_PKG_DOMAIN)) >>
@@ -187,7 +189,7 @@
                         tscan.dom_shifts[dom], x86_topo_system.dom_shifts[dom]);
         }
 
-        topo_set_ids(&tscan);
+        topo_set_ids(&tscan, false);
 }
 
 void __init cpu_init_topology(struct cpuinfo_x86 *c)
@@ -208,7 +210,7 @@
                 x86_topo_system.dom_size[dom] = 1U << sft;
         }
 
-        topo_set_ids(&tscan);
+        topo_set_ids(&tscan, true);
 
         /*
          * AMD systems have Nodes per package which cannot be mapped to
+1 -16
arch/x86/kernel/e820.c
@@ -1016,17 +1016,6 @@
 
         e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
 
-        /*
-         * SETUP_EFI, SETUP_IMA and SETUP_RNG_SEED are supplied by
-         * kexec and do not need to be reserved.
-         */
-        if (data->type != SETUP_EFI &&
-            data->type != SETUP_IMA &&
-            data->type != SETUP_RNG_SEED)
-                e820__range_update_kexec(pa_data,
-                                         sizeof(*data) + data->len,
-                                         E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
-
         if (data->type == SETUP_INDIRECT) {
                 len += data->len;
                 early_memunmap(data, sizeof(*data));
@@ -1038,11 +1027,8 @@
 
                 indirect = (struct setup_indirect *)data->data;
 
-                if (indirect->type != SETUP_INDIRECT) {
+                if (indirect->type != SETUP_INDIRECT)
                         e820__range_update(indirect->addr, indirect->len,
                                            E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
-                        e820__range_update_kexec(indirect->addr, indirect->len,
-                                                 E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
-                }
 
                 pa_data = pa_next;
@@ -1051,7 +1037,6 @@
         }
 
         e820__update_table(e820_table);
-        e820__update_table(e820_table_kexec);
 
         pr_info("extended physical RAM map:\n");
         e820__print_table("reserve setup_data");
+3 -2
arch/x86/kernel/fpu/xstate.c
@@ -178,10 +178,11 @@
          * Must happen after CR4 setup and before xsetbv() to allow KVM
          * lazy passthrough. Write independent of the dynamic state static
          * key as that does not work on the boot CPU. This also ensures
-         * that any stale state is wiped out from XFD.
+         * that any stale state is wiped out from XFD. Reset the per CPU
+         * xfd cache too.
          */
         if (cpu_feature_enabled(X86_FEATURE_XFD))
-                wrmsrl(MSR_IA32_XFD, init_fpstate.xfd);
+                xfd_set_state(init_fpstate.xfd);
 
         /*
          * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features
+10 -4
arch/x86/kernel/fpu/xstate.h
@@ -148,20 +148,26 @@
 #endif
 
 #ifdef CONFIG_X86_64
+static inline void xfd_set_state(u64 xfd)
+{
+        wrmsrl(MSR_IA32_XFD, xfd);
+        __this_cpu_write(xfd_state, xfd);
+}
+
 static inline void xfd_update_state(struct fpstate *fpstate)
 {
         if (fpu_state_size_dynamic()) {
                 u64 xfd = fpstate->xfd;
 
-                if (__this_cpu_read(xfd_state) != xfd) {
-                        wrmsrl(MSR_IA32_XFD, xfd);
-                        __this_cpu_write(xfd_state, xfd);
-                }
+                if (__this_cpu_read(xfd_state) != xfd)
+                        xfd_set_state(xfd);
         }
 }
 
 extern int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu);
 #else
+static inline void xfd_set_state(u64 xfd) { }
+
 static inline void xfd_update_state(struct fpstate *fpstate) { }
 
 static inline int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu) {
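This is the crux of the XFD fix: a per-CPU variable shadows MSR_IA32_XFD so redundant MSR writes can be elided, and that only stays correct if every write goes through one helper that updates both. The hotplug path reset the MSR without touching the cache, so a later xfd_update_state() could see a stale match and skip a needed write. A runnable user-space model of the pattern; the names are hypothetical and plain variables stand in for the MSR and the per-CPU cache:

#include <stdio.h>
#include <stdint.h>

static uint64_t msr_xfd;        /* stands in for MSR_IA32_XFD        */
static uint64_t xfd_cache;      /* stands in for the per-CPU shadow  */
static unsigned int msr_writes; /* counts the "expensive" operations */

/* The xfd_set_state() idea: write register and cache together. */
static void set_state(uint64_t xfd)
{
        msr_xfd = xfd;
        xfd_cache = xfd;
        msr_writes++;
}

/* The xfd_update_state() idea: elide the write on a cache hit. */
static void update_state(uint64_t xfd)
{
        if (xfd_cache != xfd)
                set_state(xfd);
}

int main(void)
{
        set_state(0);           /* boot/hotplug reset: cache stays in sync */
        update_state(5);        /* miss: performs the write                */
        update_state(5);        /* hit: write elided                       */
        printf("%u writes, msr=%llu cache=%llu\n", msr_writes,
               (unsigned long long)msr_xfd, (unsigned long long)xfd_cache);
        return 0;
}

Routing the boot-time reset in xstate.c through xfd_set_state(), as the hunk above does, removes the one path on which register and cache could diverge.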
+8 -10
arch/x86/kernel/head64.c
@@ -81,5 +81,12 @@
         if (!(native_read_cr4() & X86_CR4_LA57))
                 return false;
 
+        RIP_REL_REF(__pgtable_l5_enabled) = 1;
+        RIP_REL_REF(pgdir_shift) = 48;
+        RIP_REL_REF(ptrs_per_p4d) = 512;
+        RIP_REL_REF(page_offset_base) = __PAGE_OFFSET_BASE_L5;
+        RIP_REL_REF(vmalloc_base) = __VMALLOC_BASE_L5;
+        RIP_REL_REF(vmemmap_base) = __VMEMMAP_BASE_L5;
+
         return true;
 }
@@ -175,7 +182,7 @@
                 p4d = (p4dval_t *)&RIP_REL_REF(level4_kernel_pgt);
                 p4d[MAX_PTRS_PER_P4D - 1] += load_delta;
 
-                pgd[pgd_index(__START_KERNEL_map)] = (pgdval_t)p4d | _PAGE_TABLE_NOENC;
+                pgd[pgd_index(__START_KERNEL_map)] = (pgdval_t)p4d | _PAGE_TABLE;
         }
 
         RIP_REL_REF(level3_kernel_pgt)[PTRS_PER_PUD - 2].pud += load_delta;
@@ -430,14 +437,5 @@
         MAYBE_BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
                                 (__START_KERNEL & PGDIR_MASK)));
         BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
-
-        if (check_la57_support()) {
-                __pgtable_l5_enabled = 1;
-                pgdir_shift = 48;
-                ptrs_per_p4d = 512;
-                page_offset_base = __PAGE_OFFSET_BASE_L5;
-                vmalloc_base = __VMALLOC_BASE_L5;
-                vmemmap_base = __VMEMMAP_BASE_L5;
-        }
 
         cr4_init_shadow();
+10 -1
arch/x86/kernel/kprobes/core.c
@@ -373,7 +373,16 @@
 kprobe_opcode_t *arch_adjust_kprobe_addr(unsigned long addr, unsigned long offset,
                                          bool *on_func_entry)
 {
-        if (is_endbr(*(u32 *)addr)) {
+        u32 insn;
+
+        /*
+         * Since 'addr' is not guaranteed to be safe to access, use
+         * copy_from_kernel_nofault() to read the instruction:
+         */
+        if (copy_from_kernel_nofault(&insn, (void *)addr, sizeof(u32)))
+                return NULL;
+
+        if (is_endbr(insn)) {
                 *on_func_entry = !offset || offset == 4;
                 if (*on_func_entry)
                         offset = 4;
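copy_from_kernel_nofault() reads through a fault-tolerant path and returns an error instead of oopsing when the address is unmapped, which a user-supplied kprobe address may well be. For intuition, here is a user-space analogue, a sketch under the assumption that Linux's process_vm_readv() is available (glibc 2.15+); it fails with EFAULT on a bad address instead of taking SIGSEGV:

#define _GNU_SOURCE
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <sys/uio.h>

/* Probe 'len' bytes at 'addr' without risking a crash. */
static int read_nofault(const void *addr, void *dst, size_t len)
{
        struct iovec local  = { .iov_base = dst,          .iov_len = len };
        struct iovec remote = { .iov_base = (void *)addr, .iov_len = len };

        /* Reading our own address space: fails cleanly if addr is unmapped. */
        if (process_vm_readv(getpid(), &local, 1, &remote, 1, 0) != (ssize_t)len)
                return -1;
        return 0;
}

int main(void)
{
        uint32_t word = 0xfa1e0ff3;     /* the ENDBR64 byte pattern, as sample data */
        uint32_t insn;

        if (read_nofault(&word, &insn, sizeof(insn)) == 0)
                printf("valid address: read 0x%08x\n", insn);

        if (read_nofault((void *)1, &insn, sizeof(insn)) != 0)
                printf("bogus address rejected: %s\n", strerror(errno));
        return 0;
}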
+5 -5
arch/x86/kernel/mpparse.c
@@ -197,12 +197,12 @@
         if (!smp_check_mpc(mpc, oem, str))
                 return 0;
 
-        /* Initialize the lapic mapping */
-        if (!acpi_lapic)
-                register_lapic_address(mpc->lapic);
-
-        if (early)
+        if (early) {
+                /* Initialize the lapic mapping */
+                if (!acpi_lapic)
+                        register_lapic_address(mpc->lapic);
                 return 1;
+        }
 
         /* Now process the configuration blocks. */
         while (count < mpc->length) {
-10
arch/x86/kernel/setup.c
@@ -1206,16 +1206,6 @@
 
 #endif /* CONFIG_X86_32 */
 
-#ifndef CONFIG_SMP
-void __init smp_prepare_boot_cpu(void)
-{
-        struct cpuinfo_x86 *c = &cpu_data(0);
-
-        *c = boot_cpu_data;
-        c->initialized = true;
-}
-#endif
-
 static struct notifier_block kernel_offset_notifier = {
         .notifier_call = dump_kernel_offset
 };
+5 -27
arch/x86/kernel/smpboot.c
@@ -313,14 +313,6 @@
         cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
 }
 
-static void __init smp_store_boot_cpu_info(void)
-{
-        struct cpuinfo_x86 *c = &cpu_data(0);
-
-        *c = boot_cpu_data;
-        c->initialized = true;
-}
-
 /*
  * The bootstrap kernel entry code has set these up. Save them for
  * a given CPU
@@ -1039,29 +1031,15 @@
         cpumask_set_cpu(0, topology_die_cpumask(0));
 }
 
-static void __init smp_cpu_index_default(void)
-{
-        int i;
-        struct cpuinfo_x86 *c;
-
-        for_each_possible_cpu(i) {
-                c = &cpu_data(i);
-                /* mark all to hotplug */
-                c->cpu_index = nr_cpu_ids;
-        }
-}
-
 void __init smp_prepare_cpus_common(void)
 {
         unsigned int i;
 
-        smp_cpu_index_default();
-
-        /*
-         * Setup boot CPU information
-         */
-        smp_store_boot_cpu_info(); /* Final full version of the data */
-        mb();
+        /* Mark all except the boot CPU as hotpluggable */
+        for_each_possible_cpu(i) {
+                if (i)
+                        per_cpu(cpu_info.cpu_index, i) = nr_cpu_ids;
+        }
 
         for_each_possible_cpu(i) {
                 zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);