Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'x86_tdx_for_6.18-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 TDX updates from Dave Hansen:
"The biggest change here is making TDX and kexec play nicely together.

Before this, the memory encryption hardware (which doesn't respect
cache coherency) could write back old cachelines on top of data in the
new kernel, so kexec and TDX were made mutually exclusive. This
removes the limitation.

There is also some work to tighten up a hardware bug workaround and
some MAINTAINERS updates.

- Make TDX and kexec work together

- Skip TDX bug workaround when the bug is not present

- Update maintainers entries"

* tag 'x86_tdx_for_6.18-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/virt/tdx: Use precalculated TDVPR page physical address
KVM/TDX: Explicitly do WBINVD when no more TDX SEAMCALLs
x86/virt/tdx: Update the kexec section in the TDX documentation
x86/virt/tdx: Remove the !KEXEC_CORE dependency
x86/kexec: Disable kexec/kdump on platforms with TDX partial write erratum
x86/virt/tdx: Mark memory cache state incoherent when making SEAMCALL
x86/sme: Use percpu boolean to control WBINVD during kexec
x86/kexec: Consolidate relocate_kernel() function parameters
x86/tdx: Skip clearing reclaimed pages unless X86_BUG_TDX_PW_MCE is present
x86/tdx: Tidy reset_pamt functions
x86/tdx: Eliminate duplicate code in tdx_clear_page()
MAINTAINERS: Add KVM mail list to the TDX entry
MAINTAINERS: Add Rick Edgecombe as a TDX reviewer
MAINTAINERS: Update the file list in the TDX entry.

+214 -106
+7 -7
Documentation/arch/x86/tdx.rst
··· 142 142 Note TDX works with CPU logical online/offline, thus the kernel still 143 143 allows to offline logical CPU and online it again. 144 144 145 - Kexec() 146 - ~~~~~~~ 147 - 148 - TDX host support currently lacks the ability to handle kexec. For 149 - simplicity only one of them can be enabled in the Kconfig. This will be 150 - fixed in the future. 151 - 152 145 Erratum 153 146 ~~~~~~~ 154 147 ··· 163 170 If the platform has such erratum, the kernel prints additional message in 164 171 machine check handler to tell user the machine check may be caused by 165 172 kernel bug on TDX private memory. 173 + 174 + Kexec 175 + ~~~~~ 176 + 177 + Currently, kexec doesn't work on TDX platforms with the aforementioned 178 + erratum: loading the kexec kernel image fails. On platforms without 179 + the erratum, kexec works normally. 166 180 167 181 Interaction vs S3 and deeper states 168 182 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+4 -7
MAINTAINERS
··· 27723 27723 X86 TRUST DOMAIN EXTENSIONS (TDX) 27724 27724 M: Kirill A. Shutemov <kas@kernel.org> 27725 27725 R: Dave Hansen <dave.hansen@linux.intel.com> 27726 + R: Rick Edgecombe <rick.p.edgecombe@intel.com> 27726 27727 L: x86@kernel.org 27727 27728 L: linux-coco@lists.linux.dev 27729 + L: kvm@vger.kernel.org 27728 27730 S: Supported 27729 27731 T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/tdx 27730 - F: Documentation/ABI/testing/sysfs-devices-virtual-misc-tdx_guest 27731 - F: arch/x86/boot/compressed/tdx* 27732 - F: arch/x86/coco/tdx/ 27733 - F: arch/x86/include/asm/shared/tdx.h 27734 - F: arch/x86/include/asm/tdx.h 27735 - F: arch/x86/virt/vmx/tdx/ 27736 - F: drivers/virt/coco/tdx-guest 27732 + N: tdx 27733 + K: \b(tdx) 27737 27734 27738 27735 X86 VDSO 27739 27736 M: Andy Lutomirski <luto@kernel.org>
-1
arch/x86/Kconfig
··· 1902 1902 depends on X86_X2APIC 1903 1903 select ARCH_KEEP_MEMBLOCK 1904 1904 depends on CONTIG_ALLOC 1905 - depends on !KEXEC_CORE 1906 1905 depends on X86_MCE 1907 1906 help 1908 1907 Intel Trust Domain Extensions (TDX) protects guest VMs from malicious
+10 -2
arch/x86/include/asm/kexec.h
··· 13 13 # define KEXEC_DEBUG_EXC_HANDLER_SIZE 6 /* PUSHI, PUSHI, 2-byte JMP */ 14 14 #endif 15 15 16 + #ifdef CONFIG_X86_64 17 + 18 + #include <linux/bits.h> 19 + 20 + #define RELOC_KERNEL_PRESERVE_CONTEXT BIT(0) 21 + #define RELOC_KERNEL_CACHE_INCOHERENT BIT(1) 22 + 23 + #endif 24 + 16 25 # define KEXEC_CONTROL_PAGE_SIZE 4096 17 26 # define KEXEC_CONTROL_CODE_MAX_SIZE 2048 18 27 ··· 130 121 relocate_kernel_fn(unsigned long indirection_page, 131 122 unsigned long pa_control_page, 132 123 unsigned long start_address, 133 - unsigned int preserve_context, 134 - unsigned int host_mem_enc_active); 124 + unsigned int flags); 135 125 #endif 136 126 extern relocate_kernel_fn relocate_kernel; 137 127 #define ARCH_HAS_KIMAGE_ARCH
+2
arch/x86/include/asm/processor.h
··· 731 731 void microcode_check(struct cpuinfo_x86 *prev_info); 732 732 void store_cpu_caps(struct cpuinfo_x86 *info); 733 733 734 + DECLARE_PER_CPU(bool, cache_state_incoherent); 735 + 734 736 enum l1tf_mitigations { 735 737 L1TF_MITIGATION_OFF, 736 738 L1TF_MITIGATION_AUTO,
+34 -1
arch/x86/include/asm/tdx.h
··· 102 102 u64 __seamcall_saved_ret(u64 fn, struct tdx_module_args *args); 103 103 void tdx_init(void); 104 104 105 + #include <linux/preempt.h> 105 106 #include <asm/archrandom.h> 107 + #include <asm/processor.h> 106 108 107 109 typedef u64 (*sc_func_t)(u64 fn, struct tdx_module_args *args); 110 + 111 + static __always_inline u64 __seamcall_dirty_cache(sc_func_t func, u64 fn, 112 + struct tdx_module_args *args) 113 + { 114 + lockdep_assert_preemption_disabled(); 115 + 116 + /* 117 + * SEAMCALLs are made to the TDX module and can generate dirty 118 + * cachelines of TDX private memory. Mark cache state incoherent 119 + * so that the cache can be flushed during kexec. 120 + * 121 + * This needs to be done before actually making the SEAMCALL, 122 + * because kexec-ing CPU could send NMI to stop remote CPUs, 123 + * in which case even disabling IRQ won't help here. 124 + */ 125 + this_cpu_write(cache_state_incoherent, true); 126 + 127 + return func(fn, args); 128 + } 108 129 109 130 static __always_inline u64 sc_retry(sc_func_t func, u64 fn, 110 131 struct tdx_module_args *args) ··· 134 113 u64 ret; 135 114 136 115 do { 137 - ret = func(fn, args); 116 + preempt_disable(); 117 + ret = __seamcall_dirty_cache(func, fn, args); 118 + preempt_enable(); 138 119 } while (ret == TDX_RND_NO_ENTROPY && --retry); 139 120 140 121 return ret; ··· 154 131 u32 tdx_get_nr_guest_keyids(void); 155 132 void tdx_guest_keyid_free(unsigned int keyid); 156 133 134 + void tdx_quirk_reset_page(struct page *page); 135 + 157 136 struct tdx_td { 158 137 /* TD root structure: */ 159 138 struct page *tdr_page; ··· 171 146 struct tdx_vp { 172 147 /* TDVP root page */ 173 148 struct page *tdvpr_page; 149 + /* precalculated page_to_phys(tdvpr_page) for use in noinstr code */ 150 + phys_addr_t tdvpr_pa; 174 151 175 152 /* TD vCPU control structure: */ 176 153 struct page **tdcx_pages; ··· 229 202 static inline const char *tdx_dump_mce_info(struct mce *m) { return NULL; } 230 203 static inline const 
struct tdx_sys_info *tdx_get_sysinfo(void) { return NULL; } 231 204 #endif /* CONFIG_INTEL_TDX_HOST */ 205 + 206 + #ifdef CONFIG_KEXEC_CORE 207 + void tdx_cpu_flush_cache_for_kexec(void); 208 + #else 209 + static inline void tdx_cpu_flush_cache_for_kexec(void) { } 210 + #endif 232 211 233 212 #endif /* !__ASSEMBLER__ */ 234 213 #endif /* _ASM_X86_TDX_H */
+17
arch/x86/kernel/cpu/amd.c
··· 546 546 u64 msr; 547 547 548 548 /* 549 + * Mark that WBINVD is needed during kexec on processors that 550 + * support SME. This provides support for performing a successful 551 + * kexec when going from SME inactive to SME active (or vice-versa). 552 + * 553 + * The cache must be cleared so that if there are entries with the 554 + * same physical address, both with and without the encryption bit, 555 + * they don't race each other when flushed and potentially end up 556 + * with the wrong entry being committed to memory. 557 + * 558 + * Test the CPUID bit directly because with mem_encrypt=off the 559 + * BSP will clear the X86_FEATURE_SME bit and the APs will not 560 + * see it set after that. 561 + */ 562 + if (c->extended_cpuid_level >= 0x8000001f && (cpuid_eax(0x8000001f) & BIT(0))) 563 + __this_cpu_write(cache_state_incoherent, true); 564 + 565 + /* 549 566 * BIOS support is required for SME and SEV. 550 567 * For SME: If BIOS has enabled SME then adjust x86_phys_bits by 551 568 * the SME physical address space reduction value.
+35 -9
arch/x86/kernel/machine_kexec_64.c
··· 29 29 #include <asm/set_memory.h> 30 30 #include <asm/cpu.h> 31 31 #include <asm/efi.h> 32 + #include <asm/processor.h> 32 33 33 34 #ifdef CONFIG_ACPI 34 35 /* ··· 347 346 unsigned long reloc_end = (unsigned long)__relocate_kernel_end; 348 347 int result; 349 348 349 + /* 350 + * Some early TDX-capable platforms have an erratum. A kernel 351 + * partial write (a write transaction of less than cacheline 352 + * lands at memory controller) to TDX private memory poisons that 353 + * memory, and a subsequent read triggers a machine check. 354 + * 355 + * On those platforms the old kernel must reset TDX private 356 + * memory before jumping to the new kernel otherwise the new 357 + * kernel may see unexpected machine check. For simplicity 358 + * just fail kexec/kdump on those platforms. 359 + */ 360 + if (boot_cpu_has_bug(X86_BUG_TDX_PW_MCE)) { 361 + pr_info_once("Not allowed on platform with tdx_pw_mce bug\n"); 362 + return -EOPNOTSUPP; 363 + } 364 + 350 365 /* Setup the identity mapped 64bit page table */ 351 366 result = init_pgtable(image, __pa(control_page)); 352 367 if (result) ··· 401 384 { 402 385 unsigned long reloc_start = (unsigned long)__relocate_kernel_start; 403 386 relocate_kernel_fn *relocate_kernel_ptr; 404 - unsigned int host_mem_enc_active; 387 + unsigned int relocate_kernel_flags; 405 388 int save_ftrace_enabled; 406 389 void *control_page; 407 - 408 - /* 409 - * This must be done before load_segments() since if call depth tracking 410 - * is used then GS must be valid to make any function calls. 
411 - */ 412 - host_mem_enc_active = cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT); 413 390 414 391 #ifdef CONFIG_KEXEC_JUMP 415 392 if (image->preserve_context) ··· 438 427 */ 439 428 relocate_kernel_ptr = control_page + (unsigned long)relocate_kernel - reloc_start; 440 429 430 + relocate_kernel_flags = 0; 431 + if (image->preserve_context) 432 + relocate_kernel_flags |= RELOC_KERNEL_PRESERVE_CONTEXT; 433 + 434 + /* 435 + * This must be done before load_segments() since it resets 436 + * GS to 0 and percpu data needs the correct GS to work. 437 + */ 438 + if (this_cpu_read(cache_state_incoherent)) 439 + relocate_kernel_flags |= RELOC_KERNEL_CACHE_INCOHERENT; 440 + 441 441 /* 442 442 * The segment registers are funny things, they have both a 443 443 * visible and an invisible part. Whenever the visible part is ··· 458 436 * 459 437 * Take advantage of this here by force loading the segments, 460 438 * before the GDT is zapped with an invalid value. 439 + * 440 + * load_segments() resets GS to 0. Don't make any function call 441 + * after here since call depth tracking uses percpu variables to 442 + * operate (relocate_kernel() is explicitly ignored by call depth 443 + * tracking). 461 444 */ 462 445 load_segments(); 463 446 ··· 470 443 image->start = relocate_kernel_ptr((unsigned long)image->head, 471 444 virt_to_phys(control_page), 472 445 image->start, 473 - image->preserve_context, 474 - host_mem_enc_active); 446 + relocate_kernel_flags); 475 447 476 448 #ifdef CONFIG_KEXEC_JUMP 477 449 if (image->preserve_context)
+11 -13
arch/x86/kernel/process.c
··· 89 89 EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid); 90 90 91 91 /* 92 + * The cache may be in an incoherent state and needs flushing during kexec. 93 + * E.g., on SME/TDX platforms, dirty cacheline aliases with and without 94 + * encryption bit(s) can coexist and the cache needs to be flushed before 95 + * booting to the new kernel to avoid the silent memory corruption due to 96 + * dirty cachelines with different encryption property being written back 97 + * to the memory. 98 + */ 99 + DEFINE_PER_CPU(bool, cache_state_incoherent); 100 + 101 + /* 92 102 * this gets called so that we can store lazy state into memory and copy the 93 103 * current task into the new thread. 94 104 */ ··· 837 827 disable_local_APIC(); 838 828 mcheck_cpu_clear(c); 839 829 840 - /* 841 - * Use wbinvd on processors that support SME. This provides support 842 - * for performing a successful kexec when going from SME inactive 843 - * to SME active (or vice-versa). The cache must be cleared so that 844 - * if there are entries with the same physical address, both with and 845 - * without the encryption bit, they don't race each other when flushed 846 - * and potentially end up with the wrong entry being committed to 847 - * memory. 848 - * 849 - * Test the CPUID bit directly because the machine might've cleared 850 - * X86_FEATURE_SME due to cmdline options. 851 - */ 852 - if (c->extended_cpuid_level >= 0x8000001f && (cpuid_eax(0x8000001f) & BIT(0))) 830 + if (this_cpu_read(cache_state_incoherent)) 853 831 wbinvd(); 854 832 855 833 /*
+24 -12
arch/x86/kernel/relocate_kernel_64.S
··· 66 66 * %rdi indirection_page 67 67 * %rsi pa_control_page 68 68 * %rdx start address 69 - * %rcx preserve_context 70 - * %r8 host_mem_enc_active 69 + * %rcx flags: RELOC_KERNEL_* 71 70 */ 72 71 73 72 /* Save the CPU context, used for jumping back */ ··· 110 111 /* save indirection list for jumping back */ 111 112 movq %rdi, pa_backup_pages_map(%rip) 112 113 113 - /* Save the preserve_context to %r11 as swap_pages clobbers %rcx. */ 114 + /* Save the flags to %r11 as swap_pages clobbers %rcx. */ 114 115 movq %rcx, %r11 115 116 116 117 /* setup a new stack at the end of the physical control page */ ··· 128 129 /* 129 130 * %rdi indirection page 130 131 * %rdx start address 131 - * %r8 host_mem_enc_active 132 132 * %r9 page table page 133 - * %r11 preserve_context 133 + * %r11 flags: RELOC_KERNEL_* 134 134 * %r13 original CR4 when relocate_kernel() was invoked 135 135 */ 136 136 ··· 198 200 movq %r9, %cr3 199 201 200 202 /* 203 + * If the memory cache is in incoherent state, e.g., due to 204 + * memory encryption, do WBINVD to flush cache. 205 + * 201 206 * If SME is active, there could be old encrypted cache line 202 207 * entries that will conflict with the now unencrypted memory 203 208 * used by kexec. Flush the caches before copying the kernel. 209 + * 210 + * Note SME sets this flag to true when the platform supports 211 + * SME, so the WBINVD is performed even SME is not activated 212 + * by the kernel. But this has no harm. 
204 213 */ 205 - testq %r8, %r8 206 - jz .Lsme_off 214 + testb $RELOC_KERNEL_CACHE_INCOHERENT, %r11b 215 + jz .Lnowbinvd 207 216 wbinvd 208 - .Lsme_off: 217 + .Lnowbinvd: 209 218 210 219 call swap_pages 211 220 ··· 225 220 movq %cr3, %rax 226 221 movq %rax, %cr3 227 222 228 - testq %r11, %r11 /* preserve_context */ 223 + testb $RELOC_KERNEL_PRESERVE_CONTEXT, %r11b 229 224 jnz .Lrelocate 230 225 231 226 /* ··· 278 273 ANNOTATE_NOENDBR 279 274 andq $PAGE_MASK, %r8 280 275 lea PAGE_SIZE(%r8), %rsp 281 - movl $1, %r11d /* Ensure preserve_context flag is set */ 276 + /* 277 + * Ensure RELOC_KERNEL_PRESERVE_CONTEXT flag is set so that 278 + * swap_pages() can swap pages correctly. Note all other 279 + * RELOC_KERNEL_* flags passed to relocate_kernel() are not 280 + * restored. 281 + */ 282 + movl $RELOC_KERNEL_PRESERVE_CONTEXT, %r11d 282 283 call swap_pages 283 284 movq kexec_va_control_page(%rip), %rax 284 285 0: addq $virtual_mapped - 0b, %rax ··· 332 321 UNWIND_HINT_END_OF_STACK 333 322 /* 334 323 * %rdi indirection page 335 - * %r11 preserve_context 324 + * %r11 flags: RELOC_KERNEL_* 336 325 */ 337 326 movq %rdi, %rcx /* Put the indirection_page in %rcx */ 338 327 xorl %edi, %edi ··· 368 357 movq %rdi, %rdx /* Save destination page to %rdx */ 369 358 movq %rsi, %rax /* Save source page to %rax */ 370 359 371 - testq %r11, %r11 /* Only actually swap for ::preserve_context */ 360 + /* Only actually swap for ::preserve_context */ 361 + testb $RELOC_KERNEL_PRESERVE_CONTEXT, %r11b 372 362 jz .Lnoswap 373 363 374 364 /* copy source page to swap page */
+22 -22
arch/x86/kvm/vmx/tdx.c
··· 281 281 vcpu->cpu = -1; 282 282 } 283 283 284 - static void tdx_clear_page(struct page *page) 285 - { 286 - const void *zero_page = (const void *) page_to_virt(ZERO_PAGE(0)); 287 - void *dest = page_to_virt(page); 288 - unsigned long i; 289 - 290 - /* 291 - * The page could have been poisoned. MOVDIR64B also clears 292 - * the poison bit so the kernel can safely use the page again. 293 - */ 294 - for (i = 0; i < PAGE_SIZE; i += 64) 295 - movdir64b(dest + i, zero_page); 296 - /* 297 - * MOVDIR64B store uses WC buffer. Prevent following memory reads 298 - * from seeing potentially poisoned cache. 299 - */ 300 - __mb(); 301 - } 302 - 303 284 static void tdx_no_vcpus_enter_start(struct kvm *kvm) 304 285 { 305 286 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); ··· 326 345 327 346 r = __tdx_reclaim_page(page); 328 347 if (!r) 329 - tdx_clear_page(page); 348 + tdx_quirk_reset_page(page); 330 349 return r; 331 350 } 332 351 ··· 423 442 tdx_flush_vp(&arg); 424 443 } 425 444 local_irq_restore(flags); 445 + 446 + /* 447 + * Flush cache now if kexec is possible: this is necessary to avoid 448 + * having dirty private memory cachelines when the new kernel boots, 449 + * but WBINVD is a relatively expensive operation and doing it during 450 + * kexec can exacerbate races in native_stop_other_cpus(). Do it 451 + * now, since this is a safe moment and there is going to be no more 452 + * TDX activity on this CPU from this point on. 
453 + */ 454 + tdx_cpu_flush_cache_for_kexec(); 426 455 } 427 456 428 457 #define TDX_SEAMCALL_RETRIES 10000 ··· 584 593 pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err); 585 594 return; 586 595 } 587 - tdx_clear_page(kvm_tdx->td.tdr_page); 596 + tdx_quirk_reset_page(kvm_tdx->td.tdr_page); 588 597 589 598 __free_page(kvm_tdx->td.tdr_page); 590 599 kvm_tdx->td.tdr_page = NULL; ··· 852 861 if (tdx->vp.tdvpr_page) { 853 862 tdx_reclaim_control_page(tdx->vp.tdvpr_page); 854 863 tdx->vp.tdvpr_page = 0; 864 + tdx->vp.tdvpr_pa = 0; 855 865 } 856 866 857 867 tdx->state = VCPU_TD_STATE_UNINITIALIZED; ··· 1706 1714 pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err); 1707 1715 return -EIO; 1708 1716 } 1709 - tdx_clear_page(page); 1717 + tdx_quirk_reset_page(page); 1710 1718 tdx_unpin(kvm, page); 1711 1719 return 0; 1712 1720 } ··· 2932 2940 return -ENOMEM; 2933 2941 tdx->vp.tdvpr_page = page; 2934 2942 2943 + /* 2944 + * page_to_phys() does not work in 'noinstr' code, like guest 2945 + * entry via tdh_vp_enter(). Precalculate and store it instead 2946 + * of doing it at runtime later. 2947 + */ 2948 + tdx->vp.tdvpr_pa = page_to_phys(tdx->vp.tdvpr_page); 2949 + 2935 2950 tdx->vp.tdcx_pages = kcalloc(kvm_tdx->td.tdcx_nr_pages, sizeof(*tdx->vp.tdcx_pages), 2936 2951 GFP_KERNEL); 2937 2952 if (!tdx->vp.tdcx_pages) { ··· 3001 3002 if (tdx->vp.tdvpr_page) 3002 3003 __free_page(tdx->vp.tdvpr_page); 3003 3004 tdx->vp.tdvpr_page = 0; 3005 + tdx->vp.tdvpr_pa = 0; 3004 3006 3005 3007 return ret; 3006 3008 }
+48 -32
arch/x86/virt/vmx/tdx/tdx.c
··· 633 633 } 634 634 635 635 /* 636 - * Convert TDX private pages back to normal by using MOVDIR64B to 637 - * clear these pages. Note this function doesn't flush cache of 638 - * these TDX private pages. The caller should make sure of that. 636 + * Convert TDX private pages back to normal by using MOVDIR64B to clear these 637 + * pages. Typically, any write to the page will convert it from TDX private back 638 + * to normal kernel memory. Systems with the X86_BUG_TDX_PW_MCE erratum need to 639 + * do the conversion explicitly via MOVDIR64B. 639 640 */ 640 - static void reset_tdx_pages(unsigned long base, unsigned long size) 641 + static void tdx_quirk_reset_paddr(unsigned long base, unsigned long size) 641 642 { 642 643 const void *zero_page = (const void *)page_address(ZERO_PAGE(0)); 643 644 unsigned long phys, end; 645 + 646 + if (!boot_cpu_has_bug(X86_BUG_TDX_PW_MCE)) 647 + return; 644 648 645 649 end = base + size; 646 650 for (phys = base; phys < end; phys += 64) ··· 658 654 mb(); 659 655 } 660 656 661 - static void tdmr_reset_pamt(struct tdmr_info *tdmr) 657 + void tdx_quirk_reset_page(struct page *page) 662 658 { 663 - tdmr_do_pamt_func(tdmr, reset_tdx_pages); 659 + tdx_quirk_reset_paddr(page_to_phys(page), PAGE_SIZE); 660 + } 661 + EXPORT_SYMBOL_GPL(tdx_quirk_reset_page); 662 + 663 + static void tdmr_quirk_reset_pamt(struct tdmr_info *tdmr) 664 + { 665 + tdmr_do_pamt_func(tdmr, tdx_quirk_reset_paddr); 664 666 } 665 667 666 - static void tdmrs_reset_pamt_all(struct tdmr_info_list *tdmr_list) 668 + static void tdmrs_quirk_reset_pamt_all(struct tdmr_info_list *tdmr_list) 667 669 { 668 670 int i; 669 671 670 672 for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) 671 - tdmr_reset_pamt(tdmr_entry(tdmr_list, i)); 673 + tdmr_quirk_reset_pamt(tdmr_entry(tdmr_list, i)); 672 674 } 673 675 674 676 static unsigned long tdmrs_count_pamt_kb(struct tdmr_info_list *tdmr_list) ··· 1146 1136 * to the kernel. 
1147 1137 */ 1148 1138 wbinvd_on_all_cpus(); 1149 - /* 1150 - * According to the TDX hardware spec, if the platform 1151 - * doesn't have the "partial write machine check" 1152 - * erratum, any kernel read/write will never cause #MC 1153 - * in kernel space, thus it's OK to not convert PAMTs 1154 - * back to normal. But do the conversion anyway here 1155 - * as suggested by the TDX spec. 1156 - */ 1157 - tdmrs_reset_pamt_all(&tdx_tdmr_list); 1139 + tdmrs_quirk_reset_pamt_all(&tdx_tdmr_list); 1158 1140 err_free_pamts: 1159 1141 tdmrs_free_pamt_all(&tdx_tdmr_list); 1160 1142 err_free_tdmrs: ··· 1268 1266 return false; 1269 1267 1270 1268 /* Get page type from the TDX module */ 1271 - sret = __seamcall_ret(TDH_PHYMEM_PAGE_RDMD, &args); 1269 + sret = __seamcall_dirty_cache(__seamcall_ret, TDH_PHYMEM_PAGE_RDMD, &args); 1272 1270 1273 1271 /* 1274 1272 * The SEAMCALL will not return success unless there is a ··· 1504 1502 return page_to_phys(td->tdr_page); 1505 1503 } 1506 1504 1507 - static inline u64 tdx_tdvpr_pa(struct tdx_vp *td) 1508 - { 1509 - return page_to_phys(td->tdvpr_page); 1510 - } 1511 - 1512 1505 /* 1513 1506 * The TDX module exposes a CLFLUSH_BEFORE_ALLOC bit to specify whether 1514 1507 * a CLFLUSH of pages is required before handing them to the TDX module. 
··· 1515 1518 clflush_cache_range(page_to_virt(page), PAGE_SIZE); 1516 1519 } 1517 1520 1518 - noinstr __flatten u64 tdh_vp_enter(struct tdx_vp *td, struct tdx_module_args *args) 1521 + noinstr u64 tdh_vp_enter(struct tdx_vp *td, struct tdx_module_args *args) 1519 1522 { 1520 - args->rcx = tdx_tdvpr_pa(td); 1523 + args->rcx = td->tdvpr_pa; 1521 1524 1522 - return __seamcall_saved_ret(TDH_VP_ENTER, args); 1525 + return __seamcall_dirty_cache(__seamcall_saved_ret, TDH_VP_ENTER, args); 1523 1526 } 1524 1527 EXPORT_SYMBOL_GPL(tdh_vp_enter); 1525 1528 ··· 1578 1581 { 1579 1582 struct tdx_module_args args = { 1580 1583 .rcx = page_to_phys(tdcx_page), 1581 - .rdx = tdx_tdvpr_pa(vp), 1584 + .rdx = vp->tdvpr_pa, 1582 1585 }; 1583 1586 1584 1587 tdx_clflush_page(tdcx_page); ··· 1647 1650 u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp) 1648 1651 { 1649 1652 struct tdx_module_args args = { 1650 - .rcx = tdx_tdvpr_pa(vp), 1653 + .rcx = vp->tdvpr_pa, 1651 1654 .rdx = tdx_tdr_pa(td), 1652 1655 }; 1653 1656 ··· 1703 1706 u64 tdh_vp_flush(struct tdx_vp *vp) 1704 1707 { 1705 1708 struct tdx_module_args args = { 1706 - .rcx = tdx_tdvpr_pa(vp), 1709 + .rcx = vp->tdvpr_pa, 1707 1710 }; 1708 1711 1709 1712 return seamcall(TDH_VP_FLUSH, &args); ··· 1749 1752 u64 tdh_vp_rd(struct tdx_vp *vp, u64 field, u64 *data) 1750 1753 { 1751 1754 struct tdx_module_args args = { 1752 - .rcx = tdx_tdvpr_pa(vp), 1755 + .rcx = vp->tdvpr_pa, 1753 1756 .rdx = field, 1754 1757 }; 1755 1758 u64 ret; ··· 1766 1769 u64 tdh_vp_wr(struct tdx_vp *vp, u64 field, u64 data, u64 mask) 1767 1770 { 1768 1771 struct tdx_module_args args = { 1769 - .rcx = tdx_tdvpr_pa(vp), 1772 + .rcx = vp->tdvpr_pa, 1770 1773 .rdx = field, 1771 1774 .r8 = data, 1772 1775 .r9 = mask, ··· 1779 1782 u64 tdh_vp_init(struct tdx_vp *vp, u64 initial_rcx, u32 x2apicid) 1780 1783 { 1781 1784 struct tdx_module_args args = { 1782 - .rcx = tdx_tdvpr_pa(vp), 1785 + .rcx = vp->tdvpr_pa, 1783 1786 .rdx = initial_rcx, 1784 1787 .r8 = x2apicid, 
1785 1788 }; ··· 1867 1870 return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args); 1868 1871 } 1869 1872 EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_hkid); 1873 + 1874 + #ifdef CONFIG_KEXEC_CORE 1875 + void tdx_cpu_flush_cache_for_kexec(void) 1876 + { 1877 + lockdep_assert_preemption_disabled(); 1878 + 1879 + if (!this_cpu_read(cache_state_incoherent)) 1880 + return; 1881 + 1882 + /* 1883 + * Private memory cachelines need to be clean at the time of 1884 + * kexec. Write them back now, as the caller promises that 1885 + * there should be no more SEAMCALLs on this CPU. 1886 + */ 1887 + wbinvd(); 1888 + this_cpu_write(cache_state_incoherent, false); 1889 + } 1890 + EXPORT_SYMBOL_GPL(tdx_cpu_flush_cache_for_kexec); 1891 + #endif