Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'kvm-x86-vmxon-7.1' of https://github.com/kvm-x86/linux into HEAD

KVM x86 VMXON and EFER.SVME extraction for 7.1

Move _only_ VMXON+VMXOFF and EFER.SVME toggling (versus all of VMX and SVM
enabling) out of KVM and into the core kernel so that non-KVM TDX enabling,
e.g. for trusted I/O (TIO), can make SEAMCALLs without needing to ensure KVM
is fully loaded.

TIO isn't a hypervisor, and isn't trying to be a hypervisor. Specifically, TIO
should _never_ have its own VMCSes (that are visible to the host; the
TDX-Module has its own VMCSes to do SEAMCALL/SEAMRET), and so there is simply
no reason to move that functionality out of KVM.

With that out of the way, dealing with VMXON/VMXOFF and EFER.SVME is a fairly
simple refcounting game.

+721 -661
+5 -31
Documentation/arch/x86/tdx.rst
··· 60 60 must be done on one cpu before any other SEAMCALLs can be made on that 61 61 cpu. 62 62 63 - The kernel provides two functions, tdx_enable() and tdx_cpu_enable() to 64 - allow the user of TDX to enable the TDX module and enable TDX on local 65 - cpu respectively. 66 - 67 - Making SEAMCALL requires VMXON has been done on that CPU. Currently only 68 - KVM implements VMXON. For now both tdx_enable() and tdx_cpu_enable() 69 - don't do VMXON internally (not trivial), but depends on the caller to 70 - guarantee that. 71 - 72 - To enable TDX, the caller of TDX should: 1) temporarily disable CPU 73 - hotplug; 2) do VMXON and tdx_enable_cpu() on all online cpus; 3) call 74 - tdx_enable(). For example:: 75 - 76 - cpus_read_lock(); 77 - on_each_cpu(vmxon_and_tdx_cpu_enable()); 78 - ret = tdx_enable(); 79 - cpus_read_unlock(); 80 - if (ret) 81 - goto no_tdx; 82 - // TDX is ready to use 83 - 84 - And the caller of TDX must guarantee the tdx_cpu_enable() has been 85 - successfully done on any cpu before it wants to run any other SEAMCALL. 86 - A typical usage is do both VMXON and tdx_cpu_enable() in CPU hotplug 87 - online callback, and refuse to online if tdx_cpu_enable() fails. 88 - 89 63 User can consult dmesg to see whether the TDX module has been initialized. 90 64 91 65 If the TDX module is initialized successfully, dmesg shows something 92 66 like below:: 93 67 94 68 [..] virt/tdx: 262668 KBs allocated for PAMT 95 - [..] virt/tdx: module initialized 69 + [..] virt/tdx: TDX-Module initialized 96 70 97 71 If the TDX module failed to initialize, dmesg also shows it failed to 98 72 initialize:: 99 73 100 - [..] virt/tdx: module initialization failed ... 74 + [..] virt/tdx: TDX-Module initialization failed ... 
101 75 102 76 TDX Interaction to Other Kernel Components 103 77 ------------------------------------------ ··· 103 129 ~~~~~~~~~~~ 104 130 105 131 TDX module requires the per-cpu initialization SEAMCALL must be done on 106 - one cpu before any other SEAMCALLs can be made on that cpu. The kernel 107 - provides tdx_cpu_enable() to let the user of TDX to do it when the user 108 - wants to use a new cpu for TDX task. 132 + one cpu before any other SEAMCALLs can be made on that cpu. The kernel, 133 + via the CPU hotplug framework, performs the necessary initialization when 134 + a CPU is first brought online. 109 135 110 136 TDX doesn't support physical (ACPI) CPU hotplug. During machine boot, 111 137 TDX verifies all boot-time present logical CPUs are TDX compatible before
-1
arch/x86/events/intel/pt.c
··· 1591 1591 1592 1592 local_irq_restore(flags); 1593 1593 } 1594 - EXPORT_SYMBOL_FOR_KVM(intel_pt_handle_vmx); 1595 1594 1596 1595 /* 1597 1596 * PMU callbacks
+2 -1
arch/x86/include/asm/kvm_host.h
··· 40 40 #include <asm/irq_remapping.h> 41 41 #include <asm/kvm_page_track.h> 42 42 #include <asm/kvm_vcpu_regs.h> 43 - #include <asm/reboot.h> 43 + #include <asm/virt.h> 44 + 44 45 #include <hyperv/hvhdk.h> 45 46 46 47 #define __KVM_HAVE_ARCH_VCPU_DEBUGFS
-11
arch/x86/include/asm/reboot.h
··· 25 25 #define MRR_BIOS 0 26 26 #define MRR_APM 1 27 27 28 - typedef void (cpu_emergency_virt_cb)(void); 29 - #if IS_ENABLED(CONFIG_KVM_X86) 30 - void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback); 31 - void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback); 32 - void cpu_emergency_disable_virtualization(void); 33 - #else 34 - static inline void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback) {} 35 - static inline void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback) {} 36 - static inline void cpu_emergency_disable_virtualization(void) {} 37 - #endif /* CONFIG_KVM_X86 */ 38 - 39 28 typedef void (*nmi_shootdown_cb)(int, struct pt_regs*); 40 29 void nmi_shootdown_cpus(nmi_shootdown_cb callback); 41 30 void run_crash_ipi_callback(struct pt_regs *regs);
-4
arch/x86/include/asm/tdx.h
··· 145 145 #define seamcall(_fn, _args) sc_retry(__seamcall, (_fn), (_args)) 146 146 #define seamcall_ret(_fn, _args) sc_retry(__seamcall_ret, (_fn), (_args)) 147 147 #define seamcall_saved_ret(_fn, _args) sc_retry(__seamcall_saved_ret, (_fn), (_args)) 148 - int tdx_cpu_enable(void); 149 - int tdx_enable(void); 150 148 const char *tdx_dump_mce_info(struct mce *m); 151 149 const struct tdx_sys_info *tdx_get_sysinfo(void); 152 150 ··· 221 223 u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page); 222 224 #else 223 225 static inline void tdx_init(void) { } 224 - static inline int tdx_cpu_enable(void) { return -ENODEV; } 225 - static inline int tdx_enable(void) { return -ENODEV; } 226 226 static inline u32 tdx_get_nr_guest_keyids(void) { return 0; } 227 227 static inline const char *tdx_dump_mce_info(struct mce *m) { return NULL; } 228 228 static inline const struct tdx_sys_info *tdx_get_sysinfo(void) { return NULL; }
+26
arch/x86/include/asm/virt.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-only */ 2 + #ifndef _ASM_X86_VIRT_H 3 + #define _ASM_X86_VIRT_H 4 + 5 + #include <asm/reboot.h> 6 + 7 + typedef void (cpu_emergency_virt_cb)(void); 8 + 9 + #if IS_ENABLED(CONFIG_KVM_X86) 10 + extern bool virt_rebooting; 11 + 12 + void __init x86_virt_init(void); 13 + 14 + int x86_virt_get_ref(int feat); 15 + void x86_virt_put_ref(int feat); 16 + 17 + int x86_virt_emergency_disable_virtualization_cpu(void); 18 + 19 + void x86_virt_register_emergency_callback(cpu_emergency_virt_cb *callback); 20 + void x86_virt_unregister_emergency_callback(cpu_emergency_virt_cb *callback); 21 + #else 22 + static __always_inline void x86_virt_init(void) {} 23 + static inline int x86_virt_emergency_disable_virtualization_cpu(void) { return -ENOENT; } 24 + #endif 25 + 26 + #endif /* _ASM_X86_VIRT_H */
+11
arch/x86/include/asm/vmx.h
··· 20 20 #include <asm/trapnr.h> 21 21 #include <asm/vmxfeatures.h> 22 22 23 + struct vmcs_hdr { 24 + u32 revision_id:31; 25 + u32 shadow_vmcs:1; 26 + }; 27 + 28 + struct vmcs { 29 + struct vmcs_hdr hdr; 30 + u32 abort; 31 + char data[]; 32 + }; 33 + 23 34 #define VMCS_CONTROL_BIT(x) BIT(VMX_FEATURE_##x & 0x1f) 24 35 25 36 /*
+2
arch/x86/kernel/cpu/common.c
··· 71 71 #include <asm/traps.h> 72 72 #include <asm/sev.h> 73 73 #include <asm/tdx.h> 74 + #include <asm/virt.h> 74 75 #include <asm/posted_intr.h> 75 76 #include <asm/runtime-const.h> 76 77 ··· 2162 2161 cpu_detect_tlb(&boot_cpu_data); 2163 2162 setup_cr_pinning(); 2164 2163 2164 + x86_virt_init(); 2165 2165 tsx_init(); 2166 2166 tdx_init(); 2167 2167 lkgs_init();
+2 -1
arch/x86/kernel/crash.c
··· 42 42 #include <asm/crash.h> 43 43 #include <asm/cmdline.h> 44 44 #include <asm/sev.h> 45 + #include <asm/virt.h> 45 46 46 47 /* Used while preparing memory map entries for second kernel */ 47 48 struct crash_memmap_data { ··· 112 111 113 112 crash_smp_send_stop(); 114 113 115 - cpu_emergency_disable_virtualization(); 114 + x86_virt_emergency_disable_virtualization_cpu(); 116 115 117 116 /* 118 117 * Disable Intel PT to stop its logging
+7 -56
arch/x86/kernel/reboot.c
··· 27 27 #include <asm/cpu.h> 28 28 #include <asm/nmi.h> 29 29 #include <asm/smp.h> 30 + #include <asm/virt.h> 30 31 31 32 #include <linux/ctype.h> 32 33 #include <linux/mc146818rtc.h> ··· 533 532 static inline void nmi_shootdown_cpus_on_restart(void); 534 533 535 534 #if IS_ENABLED(CONFIG_KVM_X86) 536 - /* RCU-protected callback to disable virtualization prior to reboot. */ 537 - static cpu_emergency_virt_cb __rcu *cpu_emergency_virt_callback; 538 - 539 - void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback) 540 - { 541 - if (WARN_ON_ONCE(rcu_access_pointer(cpu_emergency_virt_callback))) 542 - return; 543 - 544 - rcu_assign_pointer(cpu_emergency_virt_callback, callback); 545 - } 546 - EXPORT_SYMBOL_FOR_KVM(cpu_emergency_register_virt_callback); 547 - 548 - void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback) 549 - { 550 - if (WARN_ON_ONCE(rcu_access_pointer(cpu_emergency_virt_callback) != callback)) 551 - return; 552 - 553 - rcu_assign_pointer(cpu_emergency_virt_callback, NULL); 554 - synchronize_rcu(); 555 - } 556 - EXPORT_SYMBOL_FOR_KVM(cpu_emergency_unregister_virt_callback); 557 - 558 - /* 559 - * Disable virtualization, i.e. VMX or SVM, to ensure INIT is recognized during 560 - * reboot. VMX blocks INIT if the CPU is post-VMXON, and SVM blocks INIT if 561 - * GIF=0, i.e. if the crash occurred between CLGI and STGI. 562 - */ 563 - void cpu_emergency_disable_virtualization(void) 564 - { 565 - cpu_emergency_virt_cb *callback; 566 - 567 - /* 568 - * IRQs must be disabled as KVM enables virtualization in hardware via 569 - * function call IPIs, i.e. IRQs need to be disabled to guarantee 570 - * virtualization stays disabled. 
571 - */ 572 - lockdep_assert_irqs_disabled(); 573 - 574 - rcu_read_lock(); 575 - callback = rcu_dereference(cpu_emergency_virt_callback); 576 - if (callback) 577 - callback(); 578 - rcu_read_unlock(); 579 - } 580 - 581 535 static void emergency_reboot_disable_virtualization(void) 582 536 { 583 537 local_irq_disable(); ··· 544 588 * We can't take any locks and we may be on an inconsistent state, so 545 589 * use NMIs as IPIs to tell the other CPUs to disable VMX/SVM and halt. 546 590 * 547 - * Do the NMI shootdown even if virtualization is off on _this_ CPU, as 548 - * other CPUs may have virtualization enabled. 591 + * Safely force _this_ CPU out of VMX/SVM operation, and if necessary, 592 + * blast NMIs to force other CPUs out of VMX/SVM as well.k 549 593 */ 550 - if (rcu_access_pointer(cpu_emergency_virt_callback)) { 551 - /* Safely force _this_ CPU out of VMX/SVM operation. */ 552 - cpu_emergency_disable_virtualization(); 553 - 554 - /* Disable VMX/SVM and halt on other CPUs. */ 594 + if (!x86_virt_emergency_disable_virtualization_cpu()) 555 595 nmi_shootdown_cpus_on_restart(); 556 - } 557 596 } 558 597 #else 559 598 static void emergency_reboot_disable_virtualization(void) { } ··· 826 875 shootdown_callback(cpu, regs); 827 876 828 877 /* 829 - * Prepare the CPU for reboot _after_ invoking the callback so that the 830 - * callback can safely use virtualization instructions, e.g. VMCLEAR. 878 + * Disable virtualization, as both VMX and SVM can block INIT and thus 879 + * prevent AP bringup, e.g. in a kdump kernel or in firmware. 831 880 */ 832 - cpu_emergency_disable_virtualization(); 881 + x86_virt_emergency_disable_virtualization_cpu(); 833 882 834 883 atomic_dec(&waiting_for_crash_ipi); 835 884
+3 -2
arch/x86/kernel/smp.c
··· 35 35 #include <asm/trace/irq_vectors.h> 36 36 #include <asm/kexec.h> 37 37 #include <asm/reboot.h> 38 + #include <asm/virt.h> 38 39 39 40 /* 40 41 * Some notes on x86 processor bugs affecting SMP operation: ··· 125 124 if (raw_smp_processor_id() == atomic_read(&stopping_cpu)) 126 125 return NMI_HANDLED; 127 126 128 - cpu_emergency_disable_virtualization(); 127 + x86_virt_emergency_disable_virtualization_cpu(); 129 128 stop_this_cpu(NULL); 130 129 131 130 return NMI_HANDLED; ··· 137 136 DEFINE_IDTENTRY_SYSVEC(sysvec_reboot) 138 137 { 139 138 apic_eoi(); 140 - cpu_emergency_disable_virtualization(); 139 + x86_virt_emergency_disable_virtualization_cpu(); 141 140 stop_this_cpu(NULL); 142 141 } 143 142
+8 -27
arch/x86/kvm/svm/svm.c
··· 44 44 #include <asm/traps.h> 45 45 #include <asm/reboot.h> 46 46 #include <asm/fpu/api.h> 47 + #include <asm/virt.h> 47 48 48 49 #include <trace/events/ipi.h> 49 50 ··· 494 493 return &sd->save_area->host_sev_es_save; 495 494 } 496 495 497 - static inline void kvm_cpu_svm_disable(void) 498 - { 499 - uint64_t efer; 500 - 501 - wrmsrq(MSR_VM_HSAVE_PA, 0); 502 - rdmsrq(MSR_EFER, efer); 503 - if (efer & EFER_SVME) { 504 - /* 505 - * Force GIF=1 prior to disabling SVM, e.g. to ensure INIT and 506 - * NMI aren't blocked. 507 - */ 508 - stgi(); 509 - wrmsrq(MSR_EFER, efer & ~EFER_SVME); 510 - } 511 - } 512 - 513 496 static void svm_emergency_disable_virtualization_cpu(void) 514 497 { 515 - kvm_rebooting = true; 516 - 517 - kvm_cpu_svm_disable(); 498 + wrmsrq(MSR_VM_HSAVE_PA, 0); 518 499 } 519 500 520 501 static void svm_disable_virtualization_cpu(void) ··· 505 522 if (tsc_scaling) 506 523 __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT); 507 524 508 - kvm_cpu_svm_disable(); 525 + x86_virt_put_ref(X86_FEATURE_SVM); 526 + wrmsrq(MSR_VM_HSAVE_PA, 0); 509 527 510 528 amd_pmu_disable_virt(); 511 529 } ··· 515 531 { 516 532 517 533 struct svm_cpu_data *sd; 518 - uint64_t efer; 519 534 int me = raw_smp_processor_id(); 535 + int r; 520 536 521 - rdmsrq(MSR_EFER, efer); 522 - if (efer & EFER_SVME) 523 - return -EBUSY; 537 + r = x86_virt_get_ref(X86_FEATURE_SVM); 538 + if (r) 539 + return r; 524 540 525 541 sd = per_cpu_ptr(&svm_data, me); 526 542 sd->asid_generation = 1; 527 543 sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; 528 544 sd->next_asid = sd->max_asid + 1; 529 545 sd->min_asid = max_sev_asid + 1; 530 - 531 - wrmsrq(MSR_EFER, efer | EFER_SVME); 532 546 533 547 wrmsrq(MSR_VM_HSAVE_PA, sd->save_area_pa); 534 548 ··· 537 555 */ 538 556 __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT); 539 557 } 540 - 541 558 542 559 /* 543 560 * Get OSVW bits.
+5 -5
arch/x86/kvm/svm/vmenter.S
··· 298 298 RESTORE_GUEST_SPEC_CTRL_BODY 299 299 RESTORE_HOST_SPEC_CTRL_BODY (%_ASM_SP) 300 300 301 - 10: cmpb $0, _ASM_RIP(kvm_rebooting) 301 + 10: cmpb $0, _ASM_RIP(virt_rebooting) 302 302 jne 2b 303 303 ud2 304 - 30: cmpb $0, _ASM_RIP(kvm_rebooting) 304 + 30: cmpb $0, _ASM_RIP(virt_rebooting) 305 305 jne 4b 306 306 ud2 307 - 50: cmpb $0, _ASM_RIP(kvm_rebooting) 307 + 50: cmpb $0, _ASM_RIP(virt_rebooting) 308 308 jne 6b 309 309 ud2 310 - 70: cmpb $0, _ASM_RIP(kvm_rebooting) 310 + 70: cmpb $0, _ASM_RIP(virt_rebooting) 311 311 jne 8b 312 312 ud2 313 313 ··· 394 394 RESTORE_GUEST_SPEC_CTRL_BODY 395 395 RESTORE_HOST_SPEC_CTRL_BODY %sil 396 396 397 - 3: cmpb $0, kvm_rebooting(%rip) 397 + 3: cmpb $0, virt_rebooting(%rip) 398 398 jne 2b 399 399 ud2 400 400
+9 -12
arch/x86/kvm/vmx/main.c
··· 29 29 if (ret) 30 30 return ret; 31 31 32 - if (enable_tdx) 33 - tdx_hardware_setup(); 32 + return enable_tdx ? tdx_hardware_setup() : 0; 33 + } 34 34 35 - return 0; 35 + static void vt_hardware_unsetup(void) 36 + { 37 + if (enable_tdx) 38 + tdx_hardware_unsetup(); 39 + 40 + vmx_hardware_unsetup(); 36 41 } 37 42 38 43 static int vt_vm_init(struct kvm *kvm) ··· 874 869 875 870 .check_processor_compatibility = vmx_check_processor_compat, 876 871 877 - .hardware_unsetup = vmx_hardware_unsetup, 872 + .hardware_unsetup = vt_op(hardware_unsetup), 878 873 879 874 .enable_virtualization_cpu = vmx_enable_virtualization_cpu, 880 875 .disable_virtualization_cpu = vt_op(disable_virtualization_cpu), ··· 1034 1029 static void __exit vt_exit(void) 1035 1030 { 1036 1031 kvm_exit(); 1037 - tdx_cleanup(); 1038 1032 vmx_exit(); 1039 1033 } 1040 1034 module_exit(vt_exit); ··· 1046 1042 r = vmx_init(); 1047 1043 if (r) 1048 1044 return r; 1049 - 1050 - /* tdx_init() has been taken */ 1051 - r = tdx_bringup(); 1052 - if (r) 1053 - goto err_tdx_bringup; 1054 1045 1055 1046 /* 1056 1047 * TDX and VMX have different vCPU structures. Calculate the ··· 1073 1074 return 0; 1074 1075 1075 1076 err_kvm_init: 1076 - tdx_cleanup(); 1077 - err_tdx_bringup: 1078 1077 vmx_exit(); 1079 1078 return r; 1080 1079 }
+21 -189
arch/x86/kvm/vmx/tdx.c
··· 6 6 #include <linux/misc_cgroup.h> 7 7 #include <linux/mmu_context.h> 8 8 #include <asm/tdx.h> 9 + #include <asm/virt.h> 9 10 #include "capabilities.h" 10 11 #include "mmu.h" 11 12 #include "x86_ops.h" ··· 58 57 59 58 #define TDX_SHARED_BIT_PWL_5 gpa_to_gfn(BIT_ULL(51)) 60 59 #define TDX_SHARED_BIT_PWL_4 gpa_to_gfn(BIT_ULL(47)) 61 - 62 - static enum cpuhp_state tdx_cpuhp_state; 63 60 64 61 static const struct tdx_sys_info *tdx_sysinfo; 65 62 ··· 217 218 */ 218 219 static DEFINE_MUTEX(tdx_lock); 219 220 220 - static atomic_t nr_configured_hkid; 221 - 222 221 static bool tdx_operand_busy(u64 err) 223 222 { 224 223 return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY; ··· 264 267 { 265 268 tdx_guest_keyid_free(kvm_tdx->hkid); 266 269 kvm_tdx->hkid = -1; 267 - atomic_dec(&nr_configured_hkid); 268 270 misc_cg_uncharge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1); 269 271 put_misc_cg(kvm_tdx->misc_cg); 270 272 kvm_tdx->misc_cg = NULL; ··· 1984 1988 * TDX_SEAMCALL_VMFAILINVALID. 1985 1989 */ 1986 1990 if (unlikely((vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR)) { 1987 - KVM_BUG_ON(!kvm_rebooting, vcpu->kvm); 1991 + KVM_BUG_ON(!virt_rebooting, vcpu->kvm); 1988 1992 goto unhandled_exit; 1989 1993 } 1990 1994 ··· 2386 2390 goto free_hkid; 2387 2391 2388 2392 ret = -ENOMEM; 2389 - 2390 - atomic_inc(&nr_configured_hkid); 2391 2393 2392 2394 tdr_page = alloc_page(GFP_KERNEL); 2393 2395 if (!tdr_page) ··· 3278 3284 return PG_LEVEL_4K; 3279 3285 } 3280 3286 3281 - static int tdx_online_cpu(unsigned int cpu) 3287 + void tdx_hardware_unsetup(void) 3282 3288 { 3283 - unsigned long flags; 3284 - int r; 3285 - 3286 - /* Sanity check CPU is already in post-VMXON */ 3287 - WARN_ON_ONCE(!(cr4_read_shadow() & X86_CR4_VMXE)); 3288 - 3289 - local_irq_save(flags); 3290 - r = tdx_cpu_enable(); 3291 - local_irq_restore(flags); 3292 - 3293 - return r; 3289 + misc_cg_set_capacity(MISC_CG_RES_TDX, 0); 3294 3290 } 3295 3291 3296 - static int tdx_offline_cpu(unsigned int cpu) 3297 - { 3298 - 
int i; 3299 - 3300 - /* No TD is running. Allow any cpu to be offline. */ 3301 - if (!atomic_read(&nr_configured_hkid)) 3302 - return 0; 3303 - 3304 - /* 3305 - * In order to reclaim TDX HKID, (i.e. when deleting guest TD), need to 3306 - * call TDH.PHYMEM.PAGE.WBINVD on all packages to program all memory 3307 - * controller with pconfig. If we have active TDX HKID, refuse to 3308 - * offline the last online cpu. 3309 - */ 3310 - for_each_online_cpu(i) { 3311 - /* 3312 - * Found another online cpu on the same package. 3313 - * Allow to offline. 3314 - */ 3315 - if (i != cpu && topology_physical_package_id(i) == 3316 - topology_physical_package_id(cpu)) 3317 - return 0; 3318 - } 3319 - 3320 - /* 3321 - * This is the last cpu of this package. Don't offline it. 3322 - * 3323 - * Because it's hard for human operator to understand the 3324 - * reason, warn it. 3325 - */ 3326 - #define MSG_ALLPKG_ONLINE \ 3327 - "TDX requires all packages to have an online CPU. Delete all TDs in order to offline all CPUs of a package.\n" 3328 - pr_warn_ratelimited(MSG_ALLPKG_ONLINE); 3329 - return -EBUSY; 3330 - } 3331 - 3332 - static void __do_tdx_cleanup(void) 3333 - { 3334 - /* 3335 - * Once TDX module is initialized, it cannot be disabled and 3336 - * re-initialized again w/o runtime update (which isn't 3337 - * supported by kernel). Only need to remove the cpuhp here. 3338 - * The TDX host core code tracks TDX status and can handle 3339 - * 'multiple enabling' scenario. 
3340 - */ 3341 - WARN_ON_ONCE(!tdx_cpuhp_state); 3342 - cpuhp_remove_state_nocalls_cpuslocked(tdx_cpuhp_state); 3343 - tdx_cpuhp_state = 0; 3344 - } 3345 - 3346 - static void __tdx_cleanup(void) 3347 - { 3348 - cpus_read_lock(); 3349 - __do_tdx_cleanup(); 3350 - cpus_read_unlock(); 3351 - } 3352 - 3353 - static int __init __do_tdx_bringup(void) 3354 - { 3355 - int r; 3356 - 3357 - /* 3358 - * TDX-specific cpuhp callback to call tdx_cpu_enable() on all 3359 - * online CPUs before calling tdx_enable(), and on any new 3360 - * going-online CPU to make sure it is ready for TDX guest. 3361 - */ 3362 - r = cpuhp_setup_state_cpuslocked(CPUHP_AP_ONLINE_DYN, 3363 - "kvm/cpu/tdx:online", 3364 - tdx_online_cpu, tdx_offline_cpu); 3365 - if (r < 0) 3366 - return r; 3367 - 3368 - tdx_cpuhp_state = r; 3369 - 3370 - r = tdx_enable(); 3371 - if (r) 3372 - __do_tdx_cleanup(); 3373 - 3374 - return r; 3375 - } 3376 - 3377 - static int __init __tdx_bringup(void) 3292 + static int __init __tdx_hardware_setup(void) 3378 3293 { 3379 3294 const struct tdx_sys_info_td_conf *td_conf; 3380 - int r, i; 3295 + int i; 3381 3296 3382 3297 for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) { 3383 3298 /* ··· 3302 3399 } 3303 3400 } 3304 3401 3305 - /* 3306 - * Enabling TDX requires enabling hardware virtualization first, 3307 - * as making SEAMCALLs requires CPU being in post-VMXON state. 
3308 - */ 3309 - r = kvm_enable_virtualization(); 3310 - if (r) 3311 - return r; 3312 - 3313 - cpus_read_lock(); 3314 - r = __do_tdx_bringup(); 3315 - cpus_read_unlock(); 3316 - 3317 - if (r) 3318 - goto tdx_bringup_err; 3319 - 3320 - r = -EINVAL; 3321 3402 /* Get TDX global information for later use */ 3322 3403 tdx_sysinfo = tdx_get_sysinfo(); 3323 - if (WARN_ON_ONCE(!tdx_sysinfo)) 3324 - goto get_sysinfo_err; 3404 + if (!tdx_sysinfo) 3405 + return -ENODEV; 3325 3406 3326 3407 /* Check TDX module and KVM capabilities */ 3327 3408 if (!tdx_get_supported_attrs(&tdx_sysinfo->td_conf) || 3328 3409 !tdx_get_supported_xfam(&tdx_sysinfo->td_conf)) 3329 - goto get_sysinfo_err; 3410 + return -EINVAL; 3330 3411 3331 3412 if (!(tdx_sysinfo->features.tdx_features0 & MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM)) 3332 - goto get_sysinfo_err; 3413 + return -EINVAL; 3333 3414 3334 3415 /* 3335 3416 * TDX has its own limit of maximum vCPUs it can support for all ··· 3348 3461 if (td_conf->max_vcpus_per_td < num_present_cpus()) { 3349 3462 pr_err("Disable TDX: MAX_VCPU_PER_TD (%u) smaller than number of logical CPUs (%u).\n", 3350 3463 td_conf->max_vcpus_per_td, num_present_cpus()); 3351 - goto get_sysinfo_err; 3464 + return -EINVAL; 3352 3465 } 3353 3466 3354 3467 if (misc_cg_set_capacity(MISC_CG_RES_TDX, tdx_get_nr_guest_keyids())) 3355 - goto get_sysinfo_err; 3468 + return -EINVAL; 3356 3469 3357 - /* 3358 - * Leave hardware virtualization enabled after TDX is enabled 3359 - * successfully. TDX CPU hotplug depends on this. 
3360 - */ 3361 3470 return 0; 3362 - 3363 - get_sysinfo_err: 3364 - __tdx_cleanup(); 3365 - tdx_bringup_err: 3366 - kvm_disable_virtualization(); 3367 - return r; 3368 3471 } 3369 3472 3370 - void tdx_cleanup(void) 3371 - { 3372 - if (enable_tdx) { 3373 - misc_cg_set_capacity(MISC_CG_RES_TDX, 0); 3374 - __tdx_cleanup(); 3375 - kvm_disable_virtualization(); 3376 - } 3377 - } 3378 - 3379 - int __init tdx_bringup(void) 3473 + int __init tdx_hardware_setup(void) 3380 3474 { 3381 3475 int r, i; 3382 3476 ··· 3388 3520 goto success_disable_tdx; 3389 3521 } 3390 3522 3391 - if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) { 3392 - pr_err("tdx: MOVDIR64B is required for TDX\n"); 3393 - goto success_disable_tdx; 3394 - } 3395 - 3396 - if (!cpu_feature_enabled(X86_FEATURE_SELFSNOOP)) { 3397 - pr_err("Self-snoop is required for TDX\n"); 3398 - goto success_disable_tdx; 3399 - } 3400 - 3401 3523 if (!cpu_feature_enabled(X86_FEATURE_TDX_HOST_PLATFORM)) { 3402 - pr_err("tdx: no TDX private KeyIDs available\n"); 3524 + pr_err("TDX not supported by the host platform\n"); 3403 3525 goto success_disable_tdx; 3404 3526 } 3405 3527 3406 - if (!enable_virt_at_load) { 3407 - pr_err("tdx: tdx requires kvm.enable_virt_at_load=1\n"); 3408 - goto success_disable_tdx; 3409 - } 3410 - 3411 - /* 3412 - * Ideally KVM should probe whether TDX module has been loaded 3413 - * first and then try to bring it up. But TDX needs to use SEAMCALL 3414 - * to probe whether the module is loaded (there is no CPUID or MSR 3415 - * for that), and making SEAMCALL requires enabling virtualization 3416 - * first, just like the rest steps of bringing up TDX module. 3417 - * 3418 - * So, for simplicity do everything in __tdx_bringup(); the first 3419 - * SEAMCALL will return -ENODEV when the module is not loaded. The 3420 - * only complication is having to make sure that initialization 3421 - * SEAMCALLs don't return TDX_SEAMCALL_VMFAILINVALID in other 3422 - * cases. 
3423 - */ 3424 - r = __tdx_bringup(); 3528 + r = __tdx_hardware_setup(); 3425 3529 if (r) { 3426 3530 /* 3427 3531 * Disable TDX only but don't fail to load module if the TDX ··· 3408 3568 if (r == -ENODEV) 3409 3569 goto success_disable_tdx; 3410 3570 3411 - enable_tdx = 0; 3571 + return r; 3412 3572 } 3413 3573 3414 - return r; 3415 - 3416 - success_disable_tdx: 3417 - enable_tdx = 0; 3418 - return 0; 3419 - } 3420 - 3421 - void __init tdx_hardware_setup(void) 3422 - { 3423 3574 KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_tdx); 3424 3575 3425 - /* 3426 - * Note, if the TDX module can't be loaded, KVM TDX support will be 3427 - * disabled but KVM will continue loading (see tdx_bringup()). 3428 - */ 3429 3576 vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size, sizeof(struct kvm_tdx)); 3430 3577 3431 3578 vt_x86_ops.link_external_spt = tdx_sept_link_private_spt; ··· 3420 3593 vt_x86_ops.free_external_spt = tdx_sept_free_private_spt; 3421 3594 vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte; 3422 3595 vt_x86_ops.protected_apic_has_interrupt = tdx_protected_apic_has_interrupt; 3596 + return 0; 3597 + 3598 + success_disable_tdx: 3599 + enable_tdx = 0; 3600 + return 0; 3423 3601 }
+2 -6
arch/x86/kvm/vmx/tdx.h
··· 8 8 #ifdef CONFIG_KVM_INTEL_TDX 9 9 #include "common.h" 10 10 11 - void tdx_hardware_setup(void); 12 - int tdx_bringup(void); 13 - void tdx_cleanup(void); 11 + int tdx_hardware_setup(void); 12 + void tdx_hardware_unsetup(void); 14 13 15 14 extern bool enable_tdx; 16 15 ··· 186 187 TDX_BUILD_TDVPS_ACCESSORS(64, STATE_NON_ARCH, state_non_arch); 187 188 188 189 #else 189 - static inline int tdx_bringup(void) { return 0; } 190 - static inline void tdx_cleanup(void) {} 191 - 192 190 #define enable_tdx 0 193 191 194 192 struct kvm_tdx {
-11
arch/x86/kvm/vmx/vmcs.h
··· 22 22 #define VMCS12_IDX_TO_ENC(idx) ROL16(idx, 10) 23 23 #define ENC_TO_VMCS12_IDX(enc) ROL16(enc, 6) 24 24 25 - struct vmcs_hdr { 26 - u32 revision_id:31; 27 - u32 shadow_vmcs:1; 28 - }; 29 - 30 - struct vmcs { 31 - struct vmcs_hdr hdr; 32 - u32 abort; 33 - char data[]; 34 - }; 35 - 36 25 DECLARE_PER_CPU(struct vmcs *, current_vmcs); 37 26 38 27 /*
+1 -1
arch/x86/kvm/vmx/vmenter.S
··· 310 310 RET 311 311 312 312 .Lfixup: 313 - cmpb $0, _ASM_RIP(kvm_rebooting) 313 + cmpb $0, _ASM_RIP(virt_rebooting) 314 314 jne .Lvmfail 315 315 ud2 316 316 .Lvmfail:
+9 -129
arch/x86/kvm/vmx/vmx.c
··· 48 48 #include <asm/msr.h> 49 49 #include <asm/mwait.h> 50 50 #include <asm/spec-ctrl.h> 51 + #include <asm/virt.h> 51 52 #include <asm/vmx.h> 52 53 53 54 #include <trace/events/ipi.h> ··· 580 579 vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx\n", ext, eptp); 581 580 } 582 581 583 - static DEFINE_PER_CPU(struct vmcs *, vmxarea); 584 582 DEFINE_PER_CPU(struct vmcs *, current_vmcs); 585 583 /* 586 584 * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed ··· 786 786 return ret; 787 787 } 788 788 789 - /* 790 - * Disable VMX and clear CR4.VMXE (even if VMXOFF faults) 791 - * 792 - * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to 793 - * atomically track post-VMXON state, e.g. this may be called in NMI context. 794 - * Eat all faults as all other faults on VMXOFF faults are mode related, i.e. 795 - * faults are guaranteed to be due to the !post-VMXON check unless the CPU is 796 - * magically in RM, VM86, compat mode, or at CPL>0. 797 - */ 798 - static int kvm_cpu_vmxoff(void) 799 - { 800 - asm goto("1: vmxoff\n\t" 801 - _ASM_EXTABLE(1b, %l[fault]) 802 - ::: "cc", "memory" : fault); 803 - 804 - cr4_clear_bits(X86_CR4_VMXE); 805 - return 0; 806 - 807 - fault: 808 - cr4_clear_bits(X86_CR4_VMXE); 809 - return -EIO; 810 - } 811 - 812 789 void vmx_emergency_disable_virtualization_cpu(void) 813 790 { 814 791 int cpu = raw_smp_processor_id(); 815 792 struct loaded_vmcs *v; 816 - 817 - kvm_rebooting = true; 818 - 819 - /* 820 - * Note, CR4.VMXE can be _cleared_ in NMI context, but it can only be 821 - * set in task context. If this races with VMX is disabled by an NMI, 822 - * VMCLEAR and VMXOFF may #UD, but KVM will eat those faults due to 823 - * kvm_rebooting set. 
824 - */ 825 - if (!(__read_cr4() & X86_CR4_VMXE)) 826 - return; 827 793 828 794 list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), 829 795 loaded_vmcss_on_cpu_link) { ··· 797 831 if (v->shadow_vmcs) 798 832 vmcs_clear(v->shadow_vmcs); 799 833 } 800 - 801 - kvm_cpu_vmxoff(); 802 834 } 803 835 804 836 static void __loaded_vmcs_clear(void *arg) ··· 2891 2927 return false; 2892 2928 } 2893 2929 2894 - if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) || 2895 - !this_cpu_has(X86_FEATURE_VMX)) { 2930 + if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL)) { 2896 2931 pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL on CPU %d\n", cpu); 2932 + return false; 2933 + } 2934 + 2935 + if (!this_cpu_has(X86_FEATURE_VMX)) { 2936 + pr_err("VMX not fully enabled on CPU %d. Check kernel logs and/or BIOS\n", cpu); 2897 2937 return false; 2898 2938 } 2899 2939 ··· 2952 2984 return 0; 2953 2985 } 2954 2986 2955 - static int kvm_cpu_vmxon(u64 vmxon_pointer) 2956 - { 2957 - u64 msr; 2958 - 2959 - cr4_set_bits(X86_CR4_VMXE); 2960 - 2961 - asm goto("1: vmxon %[vmxon_pointer]\n\t" 2962 - _ASM_EXTABLE(1b, %l[fault]) 2963 - : : [vmxon_pointer] "m"(vmxon_pointer) 2964 - : : fault); 2965 - return 0; 2966 - 2967 - fault: 2968 - WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n", 2969 - rdmsrq_safe(MSR_IA32_FEAT_CTL, &msr) ? 
0xdeadbeef : msr); 2970 - cr4_clear_bits(X86_CR4_VMXE); 2971 - 2972 - return -EFAULT; 2973 - } 2974 - 2975 2987 int vmx_enable_virtualization_cpu(void) 2976 2988 { 2977 2989 int cpu = raw_smp_processor_id(); 2978 - u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); 2979 - int r; 2980 - 2981 - if (cr4_read_shadow() & X86_CR4_VMXE) 2982 - return -EBUSY; 2983 2990 2984 2991 /* 2985 2992 * This can happen if we hot-added a CPU but failed to allocate ··· 2963 3020 if (kvm_is_using_evmcs() && !hv_get_vp_assist_page(cpu)) 2964 3021 return -EFAULT; 2965 3022 2966 - intel_pt_handle_vmx(1); 2967 - 2968 - r = kvm_cpu_vmxon(phys_addr); 2969 - if (r) { 2970 - intel_pt_handle_vmx(0); 2971 - return r; 2972 - } 2973 - 2974 - return 0; 3023 + return x86_virt_get_ref(X86_FEATURE_VMX); 2975 3024 } 2976 3025 2977 3026 static void vmclear_local_loaded_vmcss(void) ··· 2980 3045 { 2981 3046 vmclear_local_loaded_vmcss(); 2982 3047 2983 - if (kvm_cpu_vmxoff()) 2984 - kvm_spurious_fault(); 3048 + x86_virt_put_ref(X86_FEATURE_VMX); 2985 3049 2986 3050 hv_reset_evmcs(); 2987 - 2988 - intel_pt_handle_vmx(0); 2989 3051 } 2990 3052 2991 3053 struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags) ··· 3058 3126 out_vmcs: 3059 3127 free_loaded_vmcs(loaded_vmcs); 3060 3128 return -ENOMEM; 3061 - } 3062 - 3063 - static void free_kvm_area(void) 3064 - { 3065 - int cpu; 3066 - 3067 - for_each_possible_cpu(cpu) { 3068 - free_vmcs(per_cpu(vmxarea, cpu)); 3069 - per_cpu(vmxarea, cpu) = NULL; 3070 - } 3071 - } 3072 - 3073 - static __init int alloc_kvm_area(void) 3074 - { 3075 - int cpu; 3076 - 3077 - for_each_possible_cpu(cpu) { 3078 - struct vmcs *vmcs; 3079 - 3080 - vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL); 3081 - if (!vmcs) { 3082 - free_kvm_area(); 3083 - return -ENOMEM; 3084 - } 3085 - 3086 - /* 3087 - * When eVMCS is enabled, alloc_vmcs_cpu() sets 3088 - * vmcs->revision_id to KVM_EVMCS_VERSION instead of 3089 - * revision_id reported by MSR_IA32_VMX_BASIC. 
3090 - * 3091 - * However, even though not explicitly documented by 3092 - * TLFS, VMXArea passed as VMXON argument should 3093 - * still be marked with revision_id reported by 3094 - * physical CPU. 3095 - */ 3096 - if (kvm_is_using_evmcs()) 3097 - vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic); 3098 - 3099 - per_cpu(vmxarea, cpu) = vmcs; 3100 - } 3101 - return 0; 3102 3129 } 3103 3130 3104 3131 static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, ··· 8460 8569 8461 8570 if (nested) 8462 8571 nested_vmx_hardware_unsetup(); 8463 - 8464 - free_kvm_area(); 8465 8572 } 8466 8573 8467 8574 void vmx_vm_destroy(struct kvm *kvm) ··· 8758 8869 return r; 8759 8870 } 8760 8871 8761 - r = alloc_kvm_area(); 8762 - if (r) 8763 - goto err_kvm_area; 8764 - 8765 8872 kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler); 8766 8873 8767 8874 /* ··· 8784 8899 kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; 8785 8900 8786 8901 return 0; 8787 - 8788 - err_kvm_area: 8789 - if (nested) 8790 - nested_vmx_hardware_unsetup(); 8791 - return r; 8792 8902 } 8793 8903 8794 8904 void vmx_exit(void)
+25 -4
arch/x86/kvm/x86.c
··· 83 83 #include <asm/intel_pt.h> 84 84 #include <asm/emulate_prefix.h> 85 85 #include <asm/sgx.h> 86 + #include <asm/virt.h> 87 + 86 88 #include <clocksource/hyperv_timer.h> 87 89 88 90 #define CREATE_TRACE_POINTS ··· 715 713 noinstr void kvm_spurious_fault(void) 716 714 { 717 715 /* Fault while not rebooting. We want the trace. */ 718 - BUG_ON(!kvm_rebooting); 716 + BUG_ON(!virt_rebooting); 719 717 } 720 718 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_spurious_fault); 721 719 ··· 13127 13125 13128 13126 void kvm_arch_enable_virtualization(void) 13129 13127 { 13130 - cpu_emergency_register_virt_callback(kvm_x86_ops.emergency_disable_virtualization_cpu); 13128 + x86_virt_register_emergency_callback(kvm_x86_ops.emergency_disable_virtualization_cpu); 13131 13129 } 13132 13130 13133 13131 void kvm_arch_disable_virtualization(void) 13134 13132 { 13135 - cpu_emergency_unregister_virt_callback(kvm_x86_ops.emergency_disable_virtualization_cpu); 13133 + x86_virt_unregister_emergency_callback(kvm_x86_ops.emergency_disable_virtualization_cpu); 13136 13134 } 13137 13135 13138 13136 int kvm_arch_enable_virtualization_cpu(void) ··· 13231 13229 return 0; 13232 13230 } 13233 13231 13232 + void kvm_arch_shutdown(void) 13233 + { 13234 + /* 13235 + * Set virt_rebooting to indicate that KVM has asynchronously disabled 13236 + * hardware virtualization, i.e. that errors and/or exceptions on SVM 13237 + * and VMX instructions are expected and should be ignored. 13238 + */ 13239 + virt_rebooting = true; 13240 + 13241 + /* 13242 + * Ensure virt_rebooting is visible before IPIs are sent to other CPUs 13243 + * to disable virtualization. Effectively pairs with the reception of 13244 + * the IPI (virt_rebooting is read in task/exception context, but only 13245 + * _needs_ to be read as %true after the IPI function callback disables 13246 + * virtualization). 
13247 + */ 13248 + smp_wmb(); 13249 + } 13250 + 13234 13251 void kvm_arch_disable_virtualization_cpu(void) 13235 13252 { 13236 13253 kvm_x86_call(disable_virtualization_cpu)(); ··· 13264 13243 * disable virtualization arrives. Handle the extreme edge case here 13265 13244 * instead of trying to account for it in the normal flows. 13266 13245 */ 13267 - if (in_task() || WARN_ON_ONCE(!kvm_rebooting)) 13246 + if (in_task() || WARN_ON_ONCE(!virt_rebooting)) 13268 13247 drop_user_return_notifiers(); 13269 13248 else 13270 13249 __module_get(THIS_MODULE);
+2
arch/x86/virt/Makefile
··· 1 1 # SPDX-License-Identifier: GPL-2.0-only 2 2 obj-y += svm/ vmx/ 3 + 4 + obj-$(subst m,y,$(CONFIG_KVM_X86)) += hw.o
+360
arch/x86/virt/hw.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + #include <linux/cpu.h> 3 + #include <linux/cpumask.h> 4 + #include <linux/errno.h> 5 + #include <linux/kvm_types.h> 6 + #include <linux/list.h> 7 + #include <linux/percpu.h> 8 + 9 + #include <asm/perf_event.h> 10 + #include <asm/processor.h> 11 + #include <asm/virt.h> 12 + #include <asm/vmx.h> 13 + 14 + struct x86_virt_ops { 15 + int feature; 16 + int (*enable_virtualization_cpu)(void); 17 + int (*disable_virtualization_cpu)(void); 18 + void (*emergency_disable_virtualization_cpu)(void); 19 + }; 20 + static struct x86_virt_ops virt_ops __ro_after_init; 21 + 22 + __visible bool virt_rebooting; 23 + EXPORT_SYMBOL_FOR_KVM(virt_rebooting); 24 + 25 + static DEFINE_PER_CPU(int, virtualization_nr_users); 26 + 27 + static cpu_emergency_virt_cb __rcu *kvm_emergency_callback; 28 + 29 + void x86_virt_register_emergency_callback(cpu_emergency_virt_cb *callback) 30 + { 31 + if (WARN_ON_ONCE(rcu_access_pointer(kvm_emergency_callback))) 32 + return; 33 + 34 + rcu_assign_pointer(kvm_emergency_callback, callback); 35 + } 36 + EXPORT_SYMBOL_FOR_KVM(x86_virt_register_emergency_callback); 37 + 38 + void x86_virt_unregister_emergency_callback(cpu_emergency_virt_cb *callback) 39 + { 40 + if (WARN_ON_ONCE(rcu_access_pointer(kvm_emergency_callback) != callback)) 41 + return; 42 + 43 + rcu_assign_pointer(kvm_emergency_callback, NULL); 44 + synchronize_rcu(); 45 + } 46 + EXPORT_SYMBOL_FOR_KVM(x86_virt_unregister_emergency_callback); 47 + 48 + static void x86_virt_invoke_kvm_emergency_callback(void) 49 + { 50 + cpu_emergency_virt_cb *kvm_callback; 51 + 52 + kvm_callback = rcu_dereference(kvm_emergency_callback); 53 + if (kvm_callback) 54 + kvm_callback(); 55 + } 56 + 57 + #if IS_ENABLED(CONFIG_KVM_INTEL) 58 + static DEFINE_PER_CPU(struct vmcs *, root_vmcs); 59 + 60 + static int x86_virt_cpu_vmxon(void) 61 + { 62 + u64 vmxon_pointer = __pa(per_cpu(root_vmcs, raw_smp_processor_id())); 63 + u64 msr; 64 + 65 + cr4_set_bits(X86_CR4_VMXE); 66 + 
67 + asm goto("1: vmxon %[vmxon_pointer]\n\t" 68 + _ASM_EXTABLE(1b, %l[fault]) 69 + : : [vmxon_pointer] "m"(vmxon_pointer) 70 + : : fault); 71 + return 0; 72 + 73 + fault: 74 + WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n", 75 + rdmsrq_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr); 76 + cr4_clear_bits(X86_CR4_VMXE); 77 + 78 + return -EFAULT; 79 + } 80 + 81 + static int x86_vmx_enable_virtualization_cpu(void) 82 + { 83 + int r; 84 + 85 + if (cr4_read_shadow() & X86_CR4_VMXE) 86 + return -EBUSY; 87 + 88 + intel_pt_handle_vmx(1); 89 + 90 + r = x86_virt_cpu_vmxon(); 91 + if (r) { 92 + intel_pt_handle_vmx(0); 93 + return r; 94 + } 95 + 96 + return 0; 97 + } 98 + 99 + /* 100 + * Disable VMX and clear CR4.VMXE (even if VMXOFF faults) 101 + * 102 + * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to 103 + * atomically track post-VMXON state, e.g. this may be called in NMI context. 104 + * Eat all faults as all other faults on VMXOFF faults are mode related, i.e. 105 + * faults are guaranteed to be due to the !post-VMXON check unless the CPU is 106 + * magically in RM, VM86, compat mode, or at CPL>0. 107 + */ 108 + static int x86_vmx_disable_virtualization_cpu(void) 109 + { 110 + int r = -EIO; 111 + 112 + asm goto("1: vmxoff\n\t" 113 + _ASM_EXTABLE(1b, %l[fault]) 114 + ::: "cc", "memory" : fault); 115 + r = 0; 116 + 117 + fault: 118 + cr4_clear_bits(X86_CR4_VMXE); 119 + intel_pt_handle_vmx(0); 120 + return r; 121 + } 122 + 123 + static void x86_vmx_emergency_disable_virtualization_cpu(void) 124 + { 125 + virt_rebooting = true; 126 + 127 + /* 128 + * Note, CR4.VMXE can be _cleared_ in NMI context, but it can only be 129 + * set in task context. If this races with _another_ emergency call 130 + * from NMI context, VMCLEAR (in KVM) and VMXOFF may #UD, but KVM and 131 + * the kernel will eat those faults due to virt_rebooting being set by 132 + * the interrupting NMI callback. 
133 + */ 134 + if (!(__read_cr4() & X86_CR4_VMXE)) 135 + return; 136 + 137 + x86_virt_invoke_kvm_emergency_callback(); 138 + 139 + x86_vmx_disable_virtualization_cpu(); 140 + } 141 + 142 + static __init void x86_vmx_exit(void) 143 + { 144 + int cpu; 145 + 146 + for_each_possible_cpu(cpu) { 147 + free_page((unsigned long)per_cpu(root_vmcs, cpu)); 148 + per_cpu(root_vmcs, cpu) = NULL; 149 + } 150 + } 151 + 152 + static __init int __x86_vmx_init(void) 153 + { 154 + const struct x86_virt_ops vmx_ops = { 155 + .feature = X86_FEATURE_VMX, 156 + .enable_virtualization_cpu = x86_vmx_enable_virtualization_cpu, 157 + .disable_virtualization_cpu = x86_vmx_disable_virtualization_cpu, 158 + .emergency_disable_virtualization_cpu = x86_vmx_emergency_disable_virtualization_cpu, 159 + }; 160 + 161 + u64 basic_msr; 162 + u32 rev_id; 163 + int cpu; 164 + 165 + if (!cpu_feature_enabled(X86_FEATURE_VMX)) 166 + return -EOPNOTSUPP; 167 + 168 + rdmsrq(MSR_IA32_VMX_BASIC, basic_msr); 169 + 170 + /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ 171 + if (WARN_ON_ONCE(vmx_basic_vmcs_size(basic_msr) > PAGE_SIZE)) 172 + return -EIO; 173 + 174 + /* 175 + * Even if eVMCS is enabled (or will be enabled?), and even though not 176 + * explicitly documented by TLFS, the root VMCS passed to VMXON should 177 + * still be marked with the revision_id reported by the physical CPU. 
178 + */ 179 + rev_id = vmx_basic_vmcs_revision_id(basic_msr); 180 + 181 + for_each_possible_cpu(cpu) { 182 + int node = cpu_to_node(cpu); 183 + struct page *page; 184 + struct vmcs *vmcs; 185 + 186 + page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); 187 + if (WARN_ON_ONCE(!page)) { 188 + x86_vmx_exit(); 189 + return -ENOMEM; 190 + } 191 + 192 + vmcs = page_address(page); 193 + vmcs->hdr.revision_id = rev_id; 194 + per_cpu(root_vmcs, cpu) = vmcs; 195 + } 196 + 197 + memcpy(&virt_ops, &vmx_ops, sizeof(virt_ops)); 198 + return 0; 199 + } 200 + 201 + static __init int x86_vmx_init(void) 202 + { 203 + int r; 204 + 205 + r = __x86_vmx_init(); 206 + if (r) 207 + setup_clear_cpu_cap(X86_FEATURE_VMX); 208 + return r; 209 + } 210 + #else 211 + static __init int x86_vmx_init(void) { return -EOPNOTSUPP; } 212 + static __init void x86_vmx_exit(void) { } 213 + #endif 214 + 215 + #if IS_ENABLED(CONFIG_KVM_AMD) 216 + static int x86_svm_enable_virtualization_cpu(void) 217 + { 218 + u64 efer; 219 + 220 + rdmsrq(MSR_EFER, efer); 221 + if (efer & EFER_SVME) 222 + return -EBUSY; 223 + 224 + wrmsrq(MSR_EFER, efer | EFER_SVME); 225 + return 0; 226 + } 227 + 228 + static int x86_svm_disable_virtualization_cpu(void) 229 + { 230 + int r = -EIO; 231 + u64 efer; 232 + 233 + /* 234 + * Force GIF=1 prior to disabling SVM, e.g. to ensure INIT and 235 + * NMI aren't blocked. 
236 + */ 237 + asm goto("1: stgi\n\t" 238 + _ASM_EXTABLE(1b, %l[fault]) 239 + ::: "memory" : fault); 240 + r = 0; 241 + 242 + fault: 243 + rdmsrq(MSR_EFER, efer); 244 + wrmsrq(MSR_EFER, efer & ~EFER_SVME); 245 + return r; 246 + } 247 + 248 + static void x86_svm_emergency_disable_virtualization_cpu(void) 249 + { 250 + u64 efer; 251 + 252 + virt_rebooting = true; 253 + 254 + rdmsrq(MSR_EFER, efer); 255 + if (!(efer & EFER_SVME)) 256 + return; 257 + 258 + x86_virt_invoke_kvm_emergency_callback(); 259 + 260 + x86_svm_disable_virtualization_cpu(); 261 + } 262 + 263 + static __init int x86_svm_init(void) 264 + { 265 + const struct x86_virt_ops svm_ops = { 266 + .feature = X86_FEATURE_SVM, 267 + .enable_virtualization_cpu = x86_svm_enable_virtualization_cpu, 268 + .disable_virtualization_cpu = x86_svm_disable_virtualization_cpu, 269 + .emergency_disable_virtualization_cpu = x86_svm_emergency_disable_virtualization_cpu, 270 + }; 271 + 272 + if (!cpu_feature_enabled(X86_FEATURE_SVM) || 273 + cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) 274 + return -EOPNOTSUPP; 275 + 276 + memcpy(&virt_ops, &svm_ops, sizeof(virt_ops)); 277 + return 0; 278 + } 279 + #else 280 + static __init int x86_svm_init(void) { return -EOPNOTSUPP; } 281 + #endif 282 + 283 + int x86_virt_get_ref(int feat) 284 + { 285 + int r; 286 + 287 + /* Ensure the !feature check can't get false positives. 
*/ 288 + BUILD_BUG_ON(!X86_FEATURE_SVM || !X86_FEATURE_VMX); 289 + 290 + if (!virt_ops.feature || virt_ops.feature != feat) 291 + return -EOPNOTSUPP; 292 + 293 + guard(preempt)(); 294 + 295 + if (this_cpu_inc_return(virtualization_nr_users) > 1) 296 + return 0; 297 + 298 + r = virt_ops.enable_virtualization_cpu(); 299 + if (r) 300 + WARN_ON_ONCE(this_cpu_dec_return(virtualization_nr_users)); 301 + 302 + return r; 303 + } 304 + EXPORT_SYMBOL_FOR_KVM(x86_virt_get_ref); 305 + 306 + void x86_virt_put_ref(int feat) 307 + { 308 + guard(preempt)(); 309 + 310 + if (WARN_ON_ONCE(!this_cpu_read(virtualization_nr_users)) || 311 + this_cpu_dec_return(virtualization_nr_users)) 312 + return; 313 + 314 + BUG_ON(virt_ops.disable_virtualization_cpu() && !virt_rebooting); 315 + } 316 + EXPORT_SYMBOL_FOR_KVM(x86_virt_put_ref); 317 + 318 + /* 319 + * Disable virtualization, i.e. VMX or SVM, to ensure INIT is recognized during 320 + * reboot. VMX blocks INIT if the CPU is post-VMXON, and SVM blocks INIT if 321 + * GIF=0, i.e. if the crash occurred between CLGI and STGI. 322 + */ 323 + int x86_virt_emergency_disable_virtualization_cpu(void) 324 + { 325 + if (!virt_ops.feature) 326 + return -EOPNOTSUPP; 327 + 328 + /* 329 + * IRQs must be disabled as virtualization is enabled in hardware via 330 + * function call IPIs, i.e. IRQs need to be disabled to guarantee 331 + * virtualization stays disabled. 332 + */ 333 + lockdep_assert_irqs_disabled(); 334 + 335 + /* 336 + * Do the NMI shootdown even if virtualization is off on _this_ CPU, as 337 + * other CPUs may have virtualization enabled. 338 + * 339 + * TODO: Track whether or not virtualization might be enabled on other 340 + * CPUs? May not be worth avoiding the NMI shootdown... 341 + */ 342 + virt_ops.emergency_disable_virtualization_cpu(); 343 + return 0; 344 + } 345 + 346 + void __init x86_virt_init(void) 347 + { 348 + /* 349 + * Attempt to initialize both SVM and VMX, and simply use whichever one 350 + * is present. 
Refuse to enable/use SVM or VMX if both are somehow 351 + * supported. No known CPU supports both SVM and VMX. 352 + */ 353 + bool has_vmx = !x86_vmx_init(); 354 + bool has_svm = !x86_svm_init(); 355 + 356 + if (WARN_ON_ONCE(has_vmx && has_svm)) { 357 + x86_vmx_exit(); 358 + memset(&virt_ops, 0, sizeof(virt_ops)); 359 + } 360 + }
+189 -137
arch/x86/virt/vmx/tdx/tdx.c
··· 28 28 #include <linux/log2.h> 29 29 #include <linux/acpi.h> 30 30 #include <linux/suspend.h> 31 + #include <linux/syscore_ops.h> 31 32 #include <linux/idr.h> 32 33 #include <linux/kvm_types.h> 33 34 #include <asm/page.h> ··· 40 39 #include <asm/cpu_device_id.h> 41 40 #include <asm/processor.h> 42 41 #include <asm/mce.h> 42 + #include <asm/virt.h> 43 43 #include "tdx.h" 44 44 45 45 static u32 tdx_global_keyid __ro_after_init; ··· 53 51 54 52 static struct tdmr_info_list tdx_tdmr_list; 55 53 56 - static enum tdx_module_status_t tdx_module_status; 57 - static DEFINE_MUTEX(tdx_module_lock); 58 - 59 54 /* All TDX-usable memory regions. Protected by mem_hotplug_lock. */ 60 55 static LIST_HEAD(tdx_memlist); 61 56 62 - static struct tdx_sys_info tdx_sysinfo; 57 + static struct tdx_sys_info tdx_sysinfo __ro_after_init; 58 + static bool tdx_module_initialized __ro_after_init; 63 59 64 60 typedef void (*sc_err_func_t)(u64 fn, u64 err, struct tdx_module_args *args); 65 61 ··· 106 106 107 107 /* 108 108 * Do the module global initialization once and return its result. 109 - * It can be done on any cpu. It's always called with interrupts 110 - * disabled. 109 + * It can be done on any cpu, and from task or IRQ context. 111 110 */ 112 111 static int try_init_module_global(void) 113 112 { ··· 114 115 static DEFINE_RAW_SPINLOCK(sysinit_lock); 115 116 static bool sysinit_done; 116 117 static int sysinit_ret; 117 - 118 - lockdep_assert_irqs_disabled(); 119 118 120 119 raw_spin_lock(&sysinit_lock); 121 120 ··· 139 142 } 140 143 141 144 /** 142 - * tdx_cpu_enable - Enable TDX on local cpu 143 - * 144 - * Do one-time TDX module per-cpu initialization SEAMCALL (and TDX module 145 - * global initialization SEAMCALL if not done) on local cpu to make this 146 - * cpu be ready to run any other SEAMCALLs. 147 - * 148 - * Always call this function via IPI function calls. 149 - * 150 - * Return 0 on success, otherwise errors. 
145 + * Enable VMXON and then do one-time TDX module per-cpu initialization SEAMCALL 146 + * (and TDX module global initialization SEAMCALL if not done) on local cpu to 147 + * make this cpu be ready to run any other SEAMCALLs. 151 148 */ 152 - int tdx_cpu_enable(void) 149 + static int tdx_cpu_enable(void) 153 150 { 154 151 struct tdx_module_args args = {}; 155 152 int ret; 156 - 157 - if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM)) 158 - return -ENODEV; 159 - 160 - lockdep_assert_irqs_disabled(); 161 153 162 154 if (__this_cpu_read(tdx_lp_initialized)) 163 155 return 0; ··· 168 182 169 183 return 0; 170 184 } 171 - EXPORT_SYMBOL_FOR_KVM(tdx_cpu_enable); 185 + 186 + static int tdx_online_cpu(unsigned int cpu) 187 + { 188 + int ret; 189 + 190 + ret = x86_virt_get_ref(X86_FEATURE_VMX); 191 + if (ret) 192 + return ret; 193 + 194 + ret = tdx_cpu_enable(); 195 + if (ret) 196 + x86_virt_put_ref(X86_FEATURE_VMX); 197 + 198 + return ret; 199 + } 200 + 201 + static int tdx_offline_cpu(unsigned int cpu) 202 + { 203 + int i; 204 + 205 + /* No TD is running. Allow any cpu to be offline. */ 206 + if (ida_is_empty(&tdx_guest_keyid_pool)) 207 + goto done; 208 + 209 + /* 210 + * In order to reclaim TDX HKID, (i.e. when deleting guest TD), need to 211 + * call TDH.PHYMEM.PAGE.WBINVD on all packages to program all memory 212 + * controller with pconfig. If we have active TDX HKID, refuse to 213 + * offline the last online cpu. 214 + */ 215 + for_each_online_cpu(i) { 216 + /* 217 + * Found another online cpu on the same package. 218 + * Allow to offline. 219 + */ 220 + if (i != cpu && topology_physical_package_id(i) == 221 + topology_physical_package_id(cpu)) 222 + goto done; 223 + } 224 + 225 + /* 226 + * This is the last cpu of this package. Don't offline it. 227 + * 228 + * Because it's hard for human operator to understand the 229 + * reason, warn it. 230 + */ 231 + #define MSG_ALLPKG_ONLINE \ 232 + "TDX requires all packages to have an online CPU. 
Delete all TDs in order to offline all CPUs of a package.\n" 233 + pr_warn_ratelimited(MSG_ALLPKG_ONLINE); 234 + return -EBUSY; 235 + 236 + done: 237 + x86_virt_put_ref(X86_FEATURE_VMX); 238 + return 0; 239 + } 240 + 241 + static void tdx_shutdown_cpu(void *ign) 242 + { 243 + x86_virt_put_ref(X86_FEATURE_VMX); 244 + } 245 + 246 + static void tdx_shutdown(void *ign) 247 + { 248 + on_each_cpu(tdx_shutdown_cpu, NULL, 1); 249 + } 250 + 251 + static int tdx_suspend(void *ign) 252 + { 253 + x86_virt_put_ref(X86_FEATURE_VMX); 254 + return 0; 255 + } 256 + 257 + static void tdx_resume(void *ign) 258 + { 259 + WARN_ON_ONCE(x86_virt_get_ref(X86_FEATURE_VMX)); 260 + } 261 + 262 + static const struct syscore_ops tdx_syscore_ops = { 263 + .suspend = tdx_suspend, 264 + .resume = tdx_resume, 265 + .shutdown = tdx_shutdown, 266 + }; 267 + 268 + static struct syscore tdx_syscore = { 269 + .ops = &tdx_syscore_ops, 270 + }; 172 271 173 272 /* 174 273 * Add a memory region as a TDX memory block. The caller must make sure 175 274 * all memory regions are added in address ascending order and don't 176 275 * overlap. 177 276 */ 178 - static int add_tdx_memblock(struct list_head *tmb_list, unsigned long start_pfn, 179 - unsigned long end_pfn, int nid) 277 + static __init int add_tdx_memblock(struct list_head *tmb_list, 278 + unsigned long start_pfn, 279 + unsigned long end_pfn, int nid) 180 280 { 181 281 struct tdx_memblock *tmb; 182 282 ··· 280 208 return 0; 281 209 } 282 210 283 - static void free_tdx_memlist(struct list_head *tmb_list) 211 + static __init void free_tdx_memlist(struct list_head *tmb_list) 284 212 { 285 213 /* @tmb_list is protected by mem_hotplug_lock */ 286 214 while (!list_empty(tmb_list)) { ··· 298 226 * ranges off in a secondary structure because memblock is modified 299 227 * in memory hotplug while TDX memory regions are fixed. 
300 228 */ 301 - static int build_tdx_memlist(struct list_head *tmb_list) 229 + static __init int build_tdx_memlist(struct list_head *tmb_list) 302 230 { 303 231 unsigned long start_pfn, end_pfn; 304 232 int i, nid, ret; ··· 330 258 return ret; 331 259 } 332 260 333 - static int read_sys_metadata_field(u64 field_id, u64 *data) 261 + static __init int read_sys_metadata_field(u64 field_id, u64 *data) 334 262 { 335 263 struct tdx_module_args args = {}; 336 264 int ret; ··· 352 280 353 281 #include "tdx_global_metadata.c" 354 282 355 - static int check_features(struct tdx_sys_info *sysinfo) 283 + static __init int check_features(struct tdx_sys_info *sysinfo) 356 284 { 357 285 u64 tdx_features0 = sysinfo->features.tdx_features0; 358 286 ··· 365 293 } 366 294 367 295 /* Calculate the actual TDMR size */ 368 - static int tdmr_size_single(u16 max_reserved_per_tdmr) 296 + static __init int tdmr_size_single(u16 max_reserved_per_tdmr) 369 297 { 370 298 int tdmr_sz; 371 299 ··· 379 307 return ALIGN(tdmr_sz, TDMR_INFO_ALIGNMENT); 380 308 } 381 309 382 - static int alloc_tdmr_list(struct tdmr_info_list *tdmr_list, 383 - struct tdx_sys_info_tdmr *sysinfo_tdmr) 310 + static __init int alloc_tdmr_list(struct tdmr_info_list *tdmr_list, 311 + struct tdx_sys_info_tdmr *sysinfo_tdmr) 384 312 { 385 313 size_t tdmr_sz, tdmr_array_sz; 386 314 void *tdmr_array; ··· 411 339 return 0; 412 340 } 413 341 414 - static void free_tdmr_list(struct tdmr_info_list *tdmr_list) 342 + static __init void free_tdmr_list(struct tdmr_info_list *tdmr_list) 415 343 { 416 344 free_pages_exact(tdmr_list->tdmrs, 417 345 tdmr_list->max_tdmrs * tdmr_list->tdmr_sz); ··· 440 368 * preallocated @tdmr_list, following all the special alignment 441 369 * and size rules for TDMR. 
442 370 */ 443 - static int fill_out_tdmrs(struct list_head *tmb_list, 444 - struct tdmr_info_list *tdmr_list) 371 + static __init int fill_out_tdmrs(struct list_head *tmb_list, 372 + struct tdmr_info_list *tdmr_list) 445 373 { 446 374 struct tdx_memblock *tmb; 447 375 int tdmr_idx = 0; ··· 517 445 * Calculate PAMT size given a TDMR and a page size. The returned 518 446 * PAMT size is always aligned up to 4K page boundary. 519 447 */ 520 - static unsigned long tdmr_get_pamt_sz(struct tdmr_info *tdmr, int pgsz, 521 - u16 pamt_entry_size) 448 + static __init unsigned long tdmr_get_pamt_sz(struct tdmr_info *tdmr, int pgsz, 449 + u16 pamt_entry_size) 522 450 { 523 451 unsigned long pamt_sz, nr_pamt_entries; 524 452 ··· 549 477 * PAMT. This node will have some memory covered by the TDMR. The 550 478 * relative amount of memory covered is not considered. 551 479 */ 552 - static int tdmr_get_nid(struct tdmr_info *tdmr, struct list_head *tmb_list) 480 + static __init int tdmr_get_nid(struct tdmr_info *tdmr, struct list_head *tmb_list) 553 481 { 554 482 struct tdx_memblock *tmb; 555 483 ··· 578 506 * Allocate PAMTs from the local NUMA node of some memory in @tmb_list 579 507 * within @tdmr, and set up PAMTs for @tdmr. 
580 508 */ 581 - static int tdmr_set_up_pamt(struct tdmr_info *tdmr, 582 - struct list_head *tmb_list, 583 - u16 pamt_entry_size[]) 509 + static __init int tdmr_set_up_pamt(struct tdmr_info *tdmr, 510 + struct list_head *tmb_list, 511 + u16 pamt_entry_size[]) 584 512 { 585 513 unsigned long pamt_base[TDX_PS_NR]; 586 514 unsigned long pamt_size[TDX_PS_NR]; ··· 650 578 *pamt_size = pamt_sz; 651 579 } 652 580 653 - static void tdmr_do_pamt_func(struct tdmr_info *tdmr, 581 + static __init void tdmr_do_pamt_func(struct tdmr_info *tdmr, 654 582 void (*pamt_func)(unsigned long base, unsigned long size)) 655 583 { 656 584 unsigned long pamt_base, pamt_size; ··· 667 595 pamt_func(pamt_base, pamt_size); 668 596 } 669 597 670 - static void free_pamt(unsigned long pamt_base, unsigned long pamt_size) 598 + static __init void free_pamt(unsigned long pamt_base, unsigned long pamt_size) 671 599 { 672 600 free_contig_range(pamt_base >> PAGE_SHIFT, pamt_size >> PAGE_SHIFT); 673 601 } 674 602 675 - static void tdmr_free_pamt(struct tdmr_info *tdmr) 603 + static __init void tdmr_free_pamt(struct tdmr_info *tdmr) 676 604 { 677 605 tdmr_do_pamt_func(tdmr, free_pamt); 678 606 } 679 607 680 - static void tdmrs_free_pamt_all(struct tdmr_info_list *tdmr_list) 608 + static __init void tdmrs_free_pamt_all(struct tdmr_info_list *tdmr_list) 681 609 { 682 610 int i; 683 611 ··· 686 614 } 687 615 688 616 /* Allocate and set up PAMTs for all TDMRs */ 689 - static int tdmrs_set_up_pamt_all(struct tdmr_info_list *tdmr_list, 690 - struct list_head *tmb_list, 691 - u16 pamt_entry_size[]) 617 + static __init int tdmrs_set_up_pamt_all(struct tdmr_info_list *tdmr_list, 618 + struct list_head *tmb_list, 619 + u16 pamt_entry_size[]) 692 620 { 693 621 int i, ret = 0; 694 622 ··· 737 665 } 738 666 EXPORT_SYMBOL_FOR_KVM(tdx_quirk_reset_page); 739 667 740 - static void tdmr_quirk_reset_pamt(struct tdmr_info *tdmr) 668 + static __init void tdmr_quirk_reset_pamt(struct tdmr_info *tdmr) 669 + 741 670 { 742 671 
tdmr_do_pamt_func(tdmr, tdx_quirk_reset_paddr); 743 672 } 744 673 745 - static void tdmrs_quirk_reset_pamt_all(struct tdmr_info_list *tdmr_list) 674 + static __init void tdmrs_quirk_reset_pamt_all(struct tdmr_info_list *tdmr_list) 746 675 { 747 676 int i; 748 677 ··· 751 678 tdmr_quirk_reset_pamt(tdmr_entry(tdmr_list, i)); 752 679 } 753 680 754 - static unsigned long tdmrs_count_pamt_kb(struct tdmr_info_list *tdmr_list) 681 + static __init unsigned long tdmrs_count_pamt_kb(struct tdmr_info_list *tdmr_list) 755 682 { 756 683 unsigned long pamt_size = 0; 757 684 int i; ··· 766 693 return pamt_size / 1024; 767 694 } 768 695 769 - static int tdmr_add_rsvd_area(struct tdmr_info *tdmr, int *p_idx, u64 addr, 770 - u64 size, u16 max_reserved_per_tdmr) 696 + static __init int tdmr_add_rsvd_area(struct tdmr_info *tdmr, int *p_idx, 697 + u64 addr, u64 size, u16 max_reserved_per_tdmr) 771 698 { 772 699 struct tdmr_reserved_area *rsvd_areas = tdmr->reserved_areas; 773 700 int idx = *p_idx; ··· 800 727 * those holes fall within @tdmr, set up a TDMR reserved area to cover 801 728 * the hole. 802 729 */ 803 - static int tdmr_populate_rsvd_holes(struct list_head *tmb_list, 804 - struct tdmr_info *tdmr, 805 - int *rsvd_idx, 806 - u16 max_reserved_per_tdmr) 730 + static __init int tdmr_populate_rsvd_holes(struct list_head *tmb_list, 731 + struct tdmr_info *tdmr, 732 + int *rsvd_idx, 733 + u16 max_reserved_per_tdmr) 807 734 { 808 735 struct tdx_memblock *tmb; 809 736 u64 prev_end; ··· 864 791 * overlaps with @tdmr, set up a TDMR reserved area to cover the 865 792 * overlapping part. 
866 793 */ 867 - static int tdmr_populate_rsvd_pamts(struct tdmr_info_list *tdmr_list, 868 - struct tdmr_info *tdmr, 869 - int *rsvd_idx, 870 - u16 max_reserved_per_tdmr) 794 + static __init int tdmr_populate_rsvd_pamts(struct tdmr_info_list *tdmr_list, 795 + struct tdmr_info *tdmr, 796 + int *rsvd_idx, 797 + u16 max_reserved_per_tdmr) 871 798 { 872 799 int i, ret; 873 800 ··· 902 829 } 903 830 904 831 /* Compare function called by sort() for TDMR reserved areas */ 905 - static int rsvd_area_cmp_func(const void *a, const void *b) 832 + static __init int rsvd_area_cmp_func(const void *a, const void *b) 906 833 { 907 834 struct tdmr_reserved_area *r1 = (struct tdmr_reserved_area *)a; 908 835 struct tdmr_reserved_area *r2 = (struct tdmr_reserved_area *)b; ··· 921 848 * Populate reserved areas for the given @tdmr, including memory holes 922 849 * (via @tmb_list) and PAMTs (via @tdmr_list). 923 850 */ 924 - static int tdmr_populate_rsvd_areas(struct tdmr_info *tdmr, 925 - struct list_head *tmb_list, 926 - struct tdmr_info_list *tdmr_list, 927 - u16 max_reserved_per_tdmr) 851 + static __init int tdmr_populate_rsvd_areas(struct tdmr_info *tdmr, 852 + struct list_head *tmb_list, 853 + struct tdmr_info_list *tdmr_list, 854 + u16 max_reserved_per_tdmr) 928 855 { 929 856 int ret, rsvd_idx = 0; 930 857 ··· 949 876 * Populate reserved areas for all TDMRs in @tdmr_list, including memory 950 877 * holes (via @tmb_list) and PAMTs. 951 878 */ 952 - static int tdmrs_populate_rsvd_areas_all(struct tdmr_info_list *tdmr_list, 953 - struct list_head *tmb_list, 954 - u16 max_reserved_per_tdmr) 879 + static __init int tdmrs_populate_rsvd_areas_all(struct tdmr_info_list *tdmr_list, 880 + struct list_head *tmb_list, 881 + u16 max_reserved_per_tdmr) 955 882 { 956 883 int i; 957 884 ··· 972 899 * to cover all TDX memory regions in @tmb_list based on the TDX module 973 900 * TDMR global information in @sysinfo_tdmr. 
974 901 */ 975 - static int construct_tdmrs(struct list_head *tmb_list, 976 - struct tdmr_info_list *tdmr_list, 977 - struct tdx_sys_info_tdmr *sysinfo_tdmr) 902 + static __init int construct_tdmrs(struct list_head *tmb_list, 903 + struct tdmr_info_list *tdmr_list, 904 + struct tdx_sys_info_tdmr *sysinfo_tdmr) 978 905 { 979 906 u16 pamt_entry_size[TDX_PS_NR] = { 980 907 sysinfo_tdmr->pamt_4k_entry_size, ··· 1006 933 return ret; 1007 934 } 1008 935 1009 - static int config_tdx_module(struct tdmr_info_list *tdmr_list, u64 global_keyid) 936 + static __init int config_tdx_module(struct tdmr_info_list *tdmr_list, 937 + u64 global_keyid) 1010 938 { 1011 939 struct tdx_module_args args = {}; 1012 940 u64 *tdmr_pa_array; ··· 1042 968 return ret; 1043 969 } 1044 970 1045 - static int do_global_key_config(void *unused) 971 + static __init int do_global_key_config(void *unused) 1046 972 { 1047 973 struct tdx_module_args args = {}; 1048 974 ··· 1060 986 * KVM) can ensure success by ensuring sufficient CPUs are online and 1061 987 * can run SEAMCALLs. 
1062 988 */ 1063 - static int config_global_keyid(void) 989 + static __init int config_global_keyid(void) 1064 990 { 1065 991 cpumask_var_t packages; 1066 992 int cpu, ret = -EINVAL; ··· 1100 1026 return ret; 1101 1027 } 1102 1028 1103 - static int init_tdmr(struct tdmr_info *tdmr) 1029 + static __init int init_tdmr(struct tdmr_info *tdmr) 1104 1030 { 1105 1031 u64 next; 1106 1032 ··· 1131 1057 return 0; 1132 1058 } 1133 1059 1134 - static int init_tdmrs(struct tdmr_info_list *tdmr_list) 1060 + static __init int init_tdmrs(struct tdmr_info_list *tdmr_list) 1135 1061 { 1136 1062 int i; 1137 1063 ··· 1150 1076 return 0; 1151 1077 } 1152 1078 1153 - static int init_tdx_module(void) 1079 + static __init int init_tdx_module(void) 1154 1080 { 1155 1081 int ret; 1156 1082 ··· 1231 1157 goto out_put_tdxmem; 1232 1158 } 1233 1159 1234 - static int __tdx_enable(void) 1160 + static __init int tdx_enable(void) 1235 1161 { 1162 + enum cpuhp_state state; 1236 1163 int ret; 1164 + 1165 + if (!cpu_feature_enabled(X86_FEATURE_TDX_HOST_PLATFORM)) { 1166 + pr_err("TDX not supported by the host platform\n"); 1167 + return -ENODEV; 1168 + } 1169 + 1170 + if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) { 1171 + pr_err("XSAVE is required for TDX\n"); 1172 + return -EINVAL; 1173 + } 1174 + 1175 + if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) { 1176 + pr_err("MOVDIR64B is required for TDX\n"); 1177 + return -EINVAL; 1178 + } 1179 + 1180 + if (!cpu_feature_enabled(X86_FEATURE_SELFSNOOP)) { 1181 + pr_err("Self-snoop is required for TDX\n"); 1182 + return -ENODEV; 1183 + } 1184 + 1185 + state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "virt/tdx:online", 1186 + tdx_online_cpu, tdx_offline_cpu); 1187 + if (state < 0) 1188 + return state; 1237 1189 1238 1190 ret = init_tdx_module(); 1239 1191 if (ret) { 1240 - pr_err("module initialization failed (%d)\n", ret); 1241 - tdx_module_status = TDX_MODULE_ERROR; 1192 + pr_err("TDX-Module initialization failed (%d)\n", ret); 1193 + 
cpuhp_remove_state(state); 1242 1194 return ret; 1243 1195 } 1244 1196 1245 - pr_info("module initialized\n"); 1246 - tdx_module_status = TDX_MODULE_INITIALIZED; 1197 + register_syscore(&tdx_syscore); 1247 1198 1199 + tdx_module_initialized = true; 1200 + pr_info("TDX-Module initialized\n"); 1248 1201 return 0; 1249 1202 } 1250 - 1251 - /** 1252 - * tdx_enable - Enable TDX module to make it ready to run TDX guests 1253 - * 1254 - * This function assumes the caller has: 1) held read lock of CPU hotplug 1255 - * lock to prevent any new cpu from becoming online; 2) done both VMXON 1256 - * and tdx_cpu_enable() on all online cpus. 1257 - * 1258 - * This function requires there's at least one online cpu for each CPU 1259 - * package to succeed. 1260 - * 1261 - * This function can be called in parallel by multiple callers. 1262 - * 1263 - * Return 0 if TDX is enabled successfully, otherwise error. 1264 - */ 1265 - int tdx_enable(void) 1266 - { 1267 - int ret; 1268 - 1269 - if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM)) 1270 - return -ENODEV; 1271 - 1272 - lockdep_assert_cpus_held(); 1273 - 1274 - mutex_lock(&tdx_module_lock); 1275 - 1276 - switch (tdx_module_status) { 1277 - case TDX_MODULE_UNINITIALIZED: 1278 - ret = __tdx_enable(); 1279 - break; 1280 - case TDX_MODULE_INITIALIZED: 1281 - /* Already initialized, great, tell the caller. 
*/ 1282 - ret = 0; 1283 - break; 1284 - default: 1285 - /* Failed to initialize in the previous attempts */ 1286 - ret = -EINVAL; 1287 - break; 1288 - } 1289 - 1290 - mutex_unlock(&tdx_module_lock); 1291 - 1292 - return ret; 1293 - } 1294 - EXPORT_SYMBOL_FOR_KVM(tdx_enable); 1203 + subsys_initcall(tdx_enable); 1295 1204 1296 1205 static bool is_pamt_page(unsigned long phys) 1297 1206 { ··· 1525 1468 1526 1469 const struct tdx_sys_info *tdx_get_sysinfo(void) 1527 1470 { 1528 - const struct tdx_sys_info *p = NULL; 1471 + if (!tdx_module_initialized) 1472 + return NULL; 1529 1473 1530 - /* Make sure all fields in @tdx_sysinfo have been populated */ 1531 - mutex_lock(&tdx_module_lock); 1532 - if (tdx_module_status == TDX_MODULE_INITIALIZED) 1533 - p = (const struct tdx_sys_info *)&tdx_sysinfo; 1534 - mutex_unlock(&tdx_module_lock); 1535 - 1536 - return p; 1474 + return (const struct tdx_sys_info *)&tdx_sysinfo; 1537 1475 } 1538 1476 EXPORT_SYMBOL_FOR_KVM(tdx_get_sysinfo); 1539 1477
-8
arch/x86/virt/vmx/tdx/tdx.h
··· 91 91 * Do not put any hardware-defined TDX structure representations below 92 92 * this comment! 93 93 */ 94 - 95 - /* Kernel defined TDX module status during module initialization. */ 96 - enum tdx_module_status_t { 97 - TDX_MODULE_UNINITIALIZED, 98 - TDX_MODULE_INITIALIZED, 99 - TDX_MODULE_ERROR 100 - }; 101 - 102 94 struct tdx_memblock { 103 95 struct list_head list; 104 96 unsigned long start_pfn;
+5 -5
arch/x86/virt/vmx/tdx/tdx_global_metadata.c
··· 7 7 * Include this file to other C file instead. 8 8 */ 9 9 10 - static int get_tdx_sys_info_features(struct tdx_sys_info_features *sysinfo_features) 10 + static __init int get_tdx_sys_info_features(struct tdx_sys_info_features *sysinfo_features) 11 11 { 12 12 int ret = 0; 13 13 u64 val; ··· 18 18 return ret; 19 19 } 20 20 21 - static int get_tdx_sys_info_tdmr(struct tdx_sys_info_tdmr *sysinfo_tdmr) 21 + static __init int get_tdx_sys_info_tdmr(struct tdx_sys_info_tdmr *sysinfo_tdmr) 22 22 { 23 23 int ret = 0; 24 24 u64 val; ··· 37 37 return ret; 38 38 } 39 39 40 - static int get_tdx_sys_info_td_ctrl(struct tdx_sys_info_td_ctrl *sysinfo_td_ctrl) 40 + static __init int get_tdx_sys_info_td_ctrl(struct tdx_sys_info_td_ctrl *sysinfo_td_ctrl) 41 41 { 42 42 int ret = 0; 43 43 u64 val; ··· 52 52 return ret; 53 53 } 54 54 55 - static int get_tdx_sys_info_td_conf(struct tdx_sys_info_td_conf *sysinfo_td_conf) 55 + static __init int get_tdx_sys_info_td_conf(struct tdx_sys_info_td_conf *sysinfo_td_conf) 56 56 { 57 57 int ret = 0; 58 58 u64 val; ··· 85 85 return ret; 86 86 } 87 87 88 - static int get_tdx_sys_info(struct tdx_sys_info *sysinfo) 88 + static __init int get_tdx_sys_info(struct tdx_sys_info *sysinfo) 89 89 { 90 90 int ret = 0; 91 91
+7 -9
include/linux/kvm_host.h
··· 1629 1629 1630 1630 #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING 1631 1631 /* 1632 + * kvm_arch_shutdown() is invoked immediately prior to forcefully disabling 1633 + * hardware virtualization on all CPUs via IPI function calls (in preparation 1634 + * for shutdown or reboot), e.g. to allow arch code to prepare for disabling 1635 + * virtualization while KVM may be actively running vCPUs. 1636 + */ 1637 + void kvm_arch_shutdown(void); 1638 + /* 1632 1639 * kvm_arch_{enable,disable}_virtualization() are called on one CPU, under 1633 1640 * kvm_usage_lock, immediately after/before 0=>1 and 1=>0 transitions of 1634 1641 * kvm_usage_count, i.e. at the beginning of the generic hardware enabling ··· 2308 2301 2309 2302 #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING 2310 2303 extern bool enable_virt_at_load; 2311 - extern bool kvm_rebooting; 2312 2304 #endif 2313 2305 2314 2306 extern unsigned int halt_poll_ns; ··· 2600 2594 #ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY 2601 2595 long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, 2602 2596 struct kvm_pre_fault_memory *range); 2603 - #endif 2604 - 2605 - #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING 2606 - int kvm_enable_virtualization(void); 2607 - void kvm_disable_virtualization(void); 2608 - #else 2609 - static inline int kvm_enable_virtualization(void) { return 0; } 2610 - static inline void kvm_disable_virtualization(void) { } 2611 2597 #endif 2612 2598 2613 2599 #endif
+20 -11
virt/kvm/kvm_main.c
··· 1102 1102 !refcount_read(&kvm->users_count)); 1103 1103 } 1104 1104 1105 + static int kvm_enable_virtualization(void); 1106 + static void kvm_disable_virtualization(void); 1107 + 1105 1108 static struct kvm *kvm_create_vm(unsigned long type, const char *fdname) 1106 1109 { 1107 1110 struct kvm *kvm = kvm_arch_alloc_vm(); ··· 5581 5578 module_param(enable_virt_at_load, bool, 0444); 5582 5579 EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_virt_at_load); 5583 5580 5584 - __visible bool kvm_rebooting; 5585 - EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_rebooting); 5586 - 5587 5581 static DEFINE_PER_CPU(bool, virtualization_enabled); 5588 5582 static DEFINE_MUTEX(kvm_usage_lock); 5589 5583 static int kvm_usage_count; 5584 + 5585 + __weak void kvm_arch_shutdown(void) 5586 + { 5587 + 5588 + } 5590 5589 5591 5590 __weak void kvm_arch_enable_virtualization(void) 5592 5591 { ··· 5643 5638 5644 5639 static void kvm_shutdown(void *data) 5645 5640 { 5641 + kvm_arch_shutdown(); 5642 + 5646 5643 /* 5647 - * Disable hardware virtualization and set kvm_rebooting to indicate 5648 - * that KVM has asynchronously disabled hardware virtualization, i.e. 5649 - * that relevant errors and exceptions aren't entirely unexpected. 5650 5644 * Some flavors of hardware virtualization need to be disabled before 5651 5645 * transferring control to firmware (to perform shutdown/reboot), e.g. 5652 5646 * on x86, virtualization can block INIT interrupts, which are used by ··· 5654 5650 * 100% comprehensive. 
5655 5651 */ 5656 5652 pr_info("kvm: exiting hardware virtualization\n"); 5657 - kvm_rebooting = true; 5658 5653 on_each_cpu(kvm_disable_virtualization_cpu, NULL, 1); 5659 5654 } 5660 5655 ··· 5692 5689 .ops = &kvm_syscore_ops, 5693 5690 }; 5694 5691 5695 - int kvm_enable_virtualization(void) 5692 + static int kvm_enable_virtualization(void) 5696 5693 { 5697 5694 int r; 5698 5695 ··· 5737 5734 --kvm_usage_count; 5738 5735 return r; 5739 5736 } 5740 - EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_enable_virtualization); 5741 5737 5742 - void kvm_disable_virtualization(void) 5738 + static void kvm_disable_virtualization(void) 5743 5739 { 5744 5740 guard(mutex)(&kvm_usage_lock); 5745 5741 ··· 5749 5747 cpuhp_remove_state(CPUHP_AP_KVM_ONLINE); 5750 5748 kvm_arch_disable_virtualization(); 5751 5749 } 5752 - EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_disable_virtualization); 5753 5750 5754 5751 static int kvm_init_virtualization(void) 5755 5752 { ··· 5764 5763 kvm_disable_virtualization(); 5765 5764 } 5766 5765 #else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */ 5766 + static int kvm_enable_virtualization(void) 5767 + { 5768 + return 0; 5769 + } 5770 + static void kvm_disable_virtualization(void) 5771 + { 5772 + 5773 + } 5767 5774 static int kvm_init_virtualization(void) 5768 5775 { 5769 5776 return 0;