Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'hyperv-next-signed-20230902' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux

Pull hyperv updates from Wei Liu:

- Support for SEV-SNP guests on Hyper-V (Tianyu Lan)

- Support for TDX guests on Hyper-V (Dexuan Cui)

- Use SBRM API in Hyper-V balloon driver (Mitchell Levy)

- Avoid dereferencing ACPI root object handle in VMBus driver (Maciej
Szmigiero)

- A few miscellaneous fixes (Jiapeng Chong, Nathan Chancellor, Saurabh
Sengar)

* tag 'hyperv-next-signed-20230902' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux: (24 commits)
x86/hyperv: Remove duplicate include
x86/hyperv: Move the code in ivm.c around to avoid unnecessary ifdef's
x86/hyperv: Remove hv_isolation_type_en_snp
x86/hyperv: Use TDX GHCI to access some MSRs in a TDX VM with the paravisor
Drivers: hv: vmbus: Bring the post_msg_page back for TDX VMs with the paravisor
x86/hyperv: Introduce a global variable hyperv_paravisor_present
Drivers: hv: vmbus: Support >64 VPs for a fully enlightened TDX/SNP VM
x86/hyperv: Fix serial console interrupts for fully enlightened TDX guests
Drivers: hv: vmbus: Support fully enlightened TDX guests
x86/hyperv: Support hypercalls for fully enlightened TDX guests
x86/hyperv: Add hv_isolation_type_tdx() to detect TDX guests
x86/hyperv: Fix undefined reference to isolation_type_en_snp without CONFIG_HYPERV
x86/hyperv: Add missing 'inline' to hv_snp_boot_ap() stub
hv: hyperv.h: Replace one-element array with flexible-array member
Drivers: hv: vmbus: Don't dereference ACPI root object handle
x86/hyperv: Add hyperv-specific handling for VMMCALL under SEV-ES
x86/hyperv: Add smp support for SEV-SNP guest
clocksource: hyper-v: Mark hyperv tsc page unencrypted in sev-snp enlightened guest
x86/hyperv: Use vmmcall to implement Hyper-V hypercall in sev-snp enlightened guest
drivers: hv: Mark percpu hvcall input arg page unencrypted in SEV-SNP enlightened guest
...

+759 -113
+12 -3
arch/x86/hyperv/hv_apic.c
··· 175 175 (exclude_self && weight == 1 && cpumask_test_cpu(this_cpu, mask))) 176 176 return true; 177 177 178 - if (!hv_hypercall_pg) 179 - return false; 178 + /* A fully enlightened TDX VM uses GHCI rather than hv_hypercall_pg. */ 179 + if (!hv_hypercall_pg) { 180 + if (ms_hyperv.paravisor_present || !hv_isolation_type_tdx()) 181 + return false; 182 + } 180 183 181 184 if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR)) 182 185 return false; ··· 232 229 233 230 trace_hyperv_send_ipi_one(cpu, vector); 234 231 235 - if (!hv_hypercall_pg || (vp == VP_INVAL)) 232 + if (vp == VP_INVAL) 236 233 return false; 234 + 235 + /* A fully enlightened TDX VM uses GHCI rather than hv_hypercall_pg. */ 236 + if (!hv_hypercall_pg) { 237 + if (ms_hyperv.paravisor_present || !hv_isolation_type_tdx()) 238 + return false; 239 + } 237 240 238 241 if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR)) 239 242 return false;
+94 -11
arch/x86/hyperv/hv_init.c
··· 19 19 #include <asm/hyperv-tlfs.h> 20 20 #include <asm/mshyperv.h> 21 21 #include <asm/idtentry.h> 22 + #include <asm/set_memory.h> 22 23 #include <linux/kexec.h> 23 24 #include <linux/version.h> 24 25 #include <linux/vmalloc.h> ··· 53 52 void *ghcb_va; 54 53 void **ghcb_base; 55 54 56 - if (!hv_isolation_type_snp()) 55 + if (!ms_hyperv.paravisor_present || !hv_isolation_type_snp()) 57 56 return 0; 58 57 59 58 if (!hv_ghcb_pg) ··· 81 80 static int hv_cpu_init(unsigned int cpu) 82 81 { 83 82 union hv_vp_assist_msr_contents msr = { 0 }; 84 - struct hv_vp_assist_page **hvp = &hv_vp_assist_page[cpu]; 83 + struct hv_vp_assist_page **hvp; 85 84 int ret; 86 85 87 86 ret = hv_common_cpu_init(cpu); ··· 91 90 if (!hv_vp_assist_page) 92 91 return 0; 93 92 93 + hvp = &hv_vp_assist_page[cpu]; 94 94 if (hv_root_partition) { 95 95 /* 96 96 * For root partition we get the hypervisor provided VP assist ··· 109 107 * in hv_cpu_die(), otherwise a CPU may not be stopped in the 110 108 * case of CPU offlining and the VM will hang. 111 109 */ 112 - if (!*hvp) 110 + if (!*hvp) { 113 111 *hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO); 112 + 113 + /* 114 + * Hyper-V should never specify a VM that is a Confidential 115 + * VM and also running in the root partition. Root partition 116 + * is blocked to run in Confidential VM. So only decrypt assist 117 + * page in non-root partition here. 
118 + */ 119 + if (*hvp && !ms_hyperv.paravisor_present && hv_isolation_type_snp()) { 120 + WARN_ON_ONCE(set_memory_decrypted((unsigned long)(*hvp), 1)); 121 + memset(*hvp, 0, PAGE_SIZE); 122 + } 123 + } 124 + 114 125 if (*hvp) 115 126 msr.pfn = vmalloc_to_pfn(*hvp); 116 127 ··· 394 379 local_irq_restore(flags); 395 380 } 396 381 382 + static u8 __init get_vtl(void) 383 + { 384 + u64 control = HV_HYPERCALL_REP_COMP_1 | HVCALL_GET_VP_REGISTERS; 385 + struct hv_get_vp_registers_input *input; 386 + struct hv_get_vp_registers_output *output; 387 + unsigned long flags; 388 + u64 ret; 389 + 390 + local_irq_save(flags); 391 + input = *this_cpu_ptr(hyperv_pcpu_input_arg); 392 + output = (struct hv_get_vp_registers_output *)input; 393 + 394 + memset(input, 0, struct_size(input, element, 1)); 395 + input->header.partitionid = HV_PARTITION_ID_SELF; 396 + input->header.vpindex = HV_VP_INDEX_SELF; 397 + input->header.inputvtl = 0; 398 + input->element[0].name0 = HV_X64_REGISTER_VSM_VP_STATUS; 399 + 400 + ret = hv_do_hypercall(control, input, output); 401 + if (hv_result_success(ret)) { 402 + ret = output->as64.low & HV_X64_VTL_MASK; 403 + } else { 404 + pr_err("Failed to get VTL(%lld) and set VTL to zero by default.\n", ret); 405 + ret = 0; 406 + } 407 + 408 + local_irq_restore(flags); 409 + return ret; 410 + } 411 + 397 412 /* 398 413 * This function is to be invoked early in the boot sequence after the 399 414 * hypervisor has been detected. ··· 444 399 if (hv_common_init()) 445 400 return; 446 401 447 - hv_vp_assist_page = kcalloc(num_possible_cpus(), 448 - sizeof(*hv_vp_assist_page), GFP_KERNEL); 402 + /* 403 + * The VP assist page is useless to a TDX guest: the only use we 404 + * would have for it is lazy EOI, which can not be used with TDX. 
405 + */ 406 + if (hv_isolation_type_tdx()) 407 + hv_vp_assist_page = NULL; 408 + else 409 + hv_vp_assist_page = kcalloc(num_possible_cpus(), 410 + sizeof(*hv_vp_assist_page), 411 + GFP_KERNEL); 449 412 if (!hv_vp_assist_page) { 450 413 ms_hyperv.hints &= ~HV_X64_ENLIGHTENED_VMCS_RECOMMENDED; 451 - goto common_free; 414 + 415 + if (!hv_isolation_type_tdx()) 416 + goto common_free; 452 417 } 453 418 454 - if (hv_isolation_type_snp()) { 419 + if (ms_hyperv.paravisor_present && hv_isolation_type_snp()) { 455 420 /* Negotiate GHCB Version. */ 456 421 if (!hv_ghcb_negotiate_protocol()) 457 422 hv_ghcb_terminate(SEV_TERM_SET_GEN, ··· 481 426 * Setup the hypercall page and enable hypercalls. 482 427 * 1. Register the guest ID 483 428 * 2. Enable the hypercall and register the hypercall page 429 + * 430 + * A TDX VM with no paravisor only uses TDX GHCI rather than hv_hypercall_pg: 431 + * when the hypercall input is a page, such a VM must pass a decrypted 432 + * page to Hyper-V, e.g. hv_post_message() uses the per-CPU page 433 + * hyperv_pcpu_input_arg, which is decrypted if no paravisor is present. 434 + * 435 + * A TDX VM with the paravisor uses hv_hypercall_pg for most hypercalls, 436 + * which are handled by the paravisor and the VM must use an encrypted 437 + * input page: in such a VM, the hyperv_pcpu_input_arg is encrypted and 438 + * used in the hypercalls, e.g. see hv_mark_gpa_visibility() and 439 + * hv_arch_irq_unmask(). Such a VM uses TDX GHCI for two hypercalls: 440 + * 1. HVCALL_SIGNAL_EVENT: see vmbus_set_event() and _hv_do_fast_hypercall8(). 441 + * 2. HVCALL_POST_MESSAGE: the input page must be a decrypted page, i.e. 442 + * hv_post_message() in such a VM can't use the encrypted hyperv_pcpu_input_arg; 443 + * instead, hv_post_message() uses the post_msg_page, which is decrypted 444 + * in such a VM and is only used in such a VM. 
484 445 */ 485 446 guest_id = hv_generate_guest_id(LINUX_VERSION_CODE); 486 447 wrmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id); 487 448 488 - /* Hyper-V requires to write guest os id via ghcb in SNP IVM. */ 489 - hv_ghcb_msr_write(HV_X64_MSR_GUEST_OS_ID, guest_id); 449 + /* With the paravisor, the VM must also write the ID via GHCB/GHCI */ 450 + hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, guest_id); 451 + 452 + /* A TDX VM with no paravisor only uses TDX GHCI rather than hv_hypercall_pg */ 453 + if (hv_isolation_type_tdx() && !ms_hyperv.paravisor_present) 454 + goto skip_hypercall_pg_init; 490 455 491 456 hv_hypercall_pg = __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START, 492 457 VMALLOC_END, GFP_KERNEL, PAGE_KERNEL_ROX, ··· 547 472 wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); 548 473 } 549 474 475 + skip_hypercall_pg_init: 550 476 /* 551 477 * Some versions of Hyper-V that provide IBT in guest VMs have a bug 552 478 * in that there's no ENDBR64 instruction at the entry to the ··· 603 527 /* Query the VMs extended capability once, so that it can be cached. 
*/ 604 528 hv_query_ext_cap(0); 605 529 530 + /* Find the VTL */ 531 + if (!ms_hyperv.paravisor_present && hv_isolation_type_snp()) 532 + ms_hyperv.vtl = get_vtl(); 533 + 606 534 return; 607 535 608 536 clean_guest_os_id: 609 537 wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); 610 - hv_ghcb_msr_write(HV_X64_MSR_GUEST_OS_ID, 0); 538 + hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, 0); 611 539 cpuhp_remove_state(cpuhp); 612 540 free_ghcb_page: 613 541 free_percpu(hv_ghcb_pg); ··· 632 552 633 553 /* Reset our OS id */ 634 554 wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); 635 - hv_ghcb_msr_write(HV_X64_MSR_GUEST_OS_ID, 0); 555 + hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, 0); 636 556 637 557 /* 638 558 * Reset hypercall page reference before reset the page, ··· 695 615 if (x86_hyper_type != X86_HYPER_MS_HYPERV) 696 616 return false; 697 617 618 + /* A TDX VM with no paravisor uses TDX GHCI call rather than hv_hypercall_pg */ 619 + if (hv_isolation_type_tdx() && !ms_hyperv.paravisor_present) 620 + return true; 698 621 /* 699 622 * Verify that earlier initialization succeeded by checking 700 623 * that the hypercall page is setup
+255 -8
arch/x86/hyperv/ivm.c
··· 18 18 #include <asm/mshyperv.h> 19 19 #include <asm/hypervisor.h> 20 20 #include <asm/mtrr.h> 21 + #include <asm/io_apic.h> 22 + #include <asm/realmode.h> 23 + #include <asm/e820/api.h> 24 + #include <asm/desc.h> 25 + #include <uapi/asm/vmx.h> 21 26 22 27 #ifdef CONFIG_AMD_MEM_ENCRYPT 23 28 ··· 61 56 } hypercall; 62 57 } __packed __aligned(HV_HYP_PAGE_SIZE); 63 58 59 + /* Only used in an SNP VM with the paravisor */ 64 60 static u16 hv_ghcb_version __ro_after_init; 65 61 62 + /* Functions only used in an SNP VM with the paravisor go here. */ 66 63 u64 hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size) 67 64 { 68 65 union hv_ghcb *hv_ghcb; ··· 182 175 return true; 183 176 } 184 177 185 - void hv_ghcb_msr_write(u64 msr, u64 value) 178 + static void hv_ghcb_msr_write(u64 msr, u64 value) 186 179 { 187 180 union hv_ghcb *hv_ghcb; 188 181 void **ghcb_base; ··· 210 203 211 204 local_irq_restore(flags); 212 205 } 213 - EXPORT_SYMBOL_GPL(hv_ghcb_msr_write); 214 206 215 - void hv_ghcb_msr_read(u64 msr, u64 *value) 207 + static void hv_ghcb_msr_read(u64 msr, u64 *value) 216 208 { 217 209 union hv_ghcb *hv_ghcb; 218 210 void **ghcb_base; ··· 241 235 | ((u64)lower_32_bits(hv_ghcb->ghcb.save.rdx) << 32); 242 236 local_irq_restore(flags); 243 237 } 244 - EXPORT_SYMBOL_GPL(hv_ghcb_msr_read); 238 + 239 + /* Only used in a fully enlightened SNP VM, i.e. without the paravisor */ 240 + static u8 ap_start_input_arg[PAGE_SIZE] __bss_decrypted __aligned(PAGE_SIZE); 241 + static u8 ap_start_stack[PAGE_SIZE] __aligned(PAGE_SIZE); 242 + static DEFINE_PER_CPU(struct sev_es_save_area *, hv_sev_vmsa); 243 + 244 + /* Functions only used in an SNP VM without the paravisor go here. 
*/ 245 + 246 + #define hv_populate_vmcb_seg(seg, gdtr_base) \ 247 + do { \ 248 + if (seg.selector) { \ 249 + seg.base = 0; \ 250 + seg.limit = HV_AP_SEGMENT_LIMIT; \ 251 + seg.attrib = *(u16 *)(gdtr_base + seg.selector + 5); \ 252 + seg.attrib = (seg.attrib & 0xFF) | ((seg.attrib >> 4) & 0xF00); \ 253 + } \ 254 + } while (0) \ 255 + 256 + static int snp_set_vmsa(void *va, bool vmsa) 257 + { 258 + u64 attrs; 259 + 260 + /* 261 + * Running at VMPL0 allows the kernel to change the VMSA bit for a page 262 + * using the RMPADJUST instruction. However, for the instruction to 263 + * succeed it must target the permissions of a lesser privileged 264 + * (higher numbered) VMPL level, so use VMPL1 (refer to the RMPADJUST 265 + * instruction in the AMD64 APM Volume 3). 266 + */ 267 + attrs = 1; 268 + if (vmsa) 269 + attrs |= RMPADJUST_VMSA_PAGE_BIT; 270 + 271 + return rmpadjust((unsigned long)va, RMP_PG_SIZE_4K, attrs); 272 + } 273 + 274 + static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa) 275 + { 276 + int err; 277 + 278 + err = snp_set_vmsa(vmsa, false); 279 + if (err) 280 + pr_err("clear VMSA page failed (%u), leaking page\n", err); 281 + else 282 + free_page((unsigned long)vmsa); 283 + } 284 + 285 + int hv_snp_boot_ap(int cpu, unsigned long start_ip) 286 + { 287 + struct sev_es_save_area *vmsa = (struct sev_es_save_area *) 288 + __get_free_page(GFP_KERNEL | __GFP_ZERO); 289 + struct sev_es_save_area *cur_vmsa; 290 + struct desc_ptr gdtr; 291 + u64 ret, retry = 5; 292 + struct hv_enable_vp_vtl *start_vp_input; 293 + unsigned long flags; 294 + 295 + if (!vmsa) 296 + return -ENOMEM; 297 + 298 + native_store_gdt(&gdtr); 299 + 300 + vmsa->gdtr.base = gdtr.address; 301 + vmsa->gdtr.limit = gdtr.size; 302 + 303 + asm volatile("movl %%es, %%eax;" : "=a" (vmsa->es.selector)); 304 + hv_populate_vmcb_seg(vmsa->es, vmsa->gdtr.base); 305 + 306 + asm volatile("movl %%cs, %%eax;" : "=a" (vmsa->cs.selector)); 307 + hv_populate_vmcb_seg(vmsa->cs, vmsa->gdtr.base); 308 + 309 + asm 
volatile("movl %%ss, %%eax;" : "=a" (vmsa->ss.selector)); 310 + hv_populate_vmcb_seg(vmsa->ss, vmsa->gdtr.base); 311 + 312 + asm volatile("movl %%ds, %%eax;" : "=a" (vmsa->ds.selector)); 313 + hv_populate_vmcb_seg(vmsa->ds, vmsa->gdtr.base); 314 + 315 + vmsa->efer = native_read_msr(MSR_EFER); 316 + 317 + asm volatile("movq %%cr4, %%rax;" : "=a" (vmsa->cr4)); 318 + asm volatile("movq %%cr3, %%rax;" : "=a" (vmsa->cr3)); 319 + asm volatile("movq %%cr0, %%rax;" : "=a" (vmsa->cr0)); 320 + 321 + vmsa->xcr0 = 1; 322 + vmsa->g_pat = HV_AP_INIT_GPAT_DEFAULT; 323 + vmsa->rip = (u64)secondary_startup_64_no_verify; 324 + vmsa->rsp = (u64)&ap_start_stack[PAGE_SIZE]; 325 + 326 + /* 327 + * Set the SNP-specific fields for this VMSA: 328 + * VMPL level 329 + * SEV_FEATURES (matches the SEV STATUS MSR right shifted 2 bits) 330 + */ 331 + vmsa->vmpl = 0; 332 + vmsa->sev_features = sev_status >> 2; 333 + 334 + ret = snp_set_vmsa(vmsa, true); 335 + if (!ret) { 336 + pr_err("RMPADJUST(%llx) failed: %llx\n", (u64)vmsa, ret); 337 + free_page((u64)vmsa); 338 + return ret; 339 + } 340 + 341 + local_irq_save(flags); 342 + start_vp_input = (struct hv_enable_vp_vtl *)ap_start_input_arg; 343 + memset(start_vp_input, 0, sizeof(*start_vp_input)); 344 + start_vp_input->partition_id = -1; 345 + start_vp_input->vp_index = cpu; 346 + start_vp_input->target_vtl.target_vtl = ms_hyperv.vtl; 347 + *(u64 *)&start_vp_input->vp_context = __pa(vmsa) | 1; 348 + 349 + do { 350 + ret = hv_do_hypercall(HVCALL_START_VP, 351 + start_vp_input, NULL); 352 + } while (hv_result(ret) == HV_STATUS_TIME_OUT && retry--); 353 + 354 + local_irq_restore(flags); 355 + 356 + if (!hv_result_success(ret)) { 357 + pr_err("HvCallStartVirtualProcessor failed: %llx\n", ret); 358 + snp_cleanup_vmsa(vmsa); 359 + vmsa = NULL; 360 + } 361 + 362 + cur_vmsa = per_cpu(hv_sev_vmsa, cpu); 363 + /* Free up any previous VMSA page */ 364 + if (cur_vmsa) 365 + snp_cleanup_vmsa(cur_vmsa); 366 + 367 + /* Record the current VMSA page */ 368 + 
per_cpu(hv_sev_vmsa, cpu) = vmsa; 369 + 370 + return ret; 371 + } 372 + 373 + #else 374 + static inline void hv_ghcb_msr_write(u64 msr, u64 value) {} 375 + static inline void hv_ghcb_msr_read(u64 msr, u64 *value) {} 376 + #endif /* CONFIG_AMD_MEM_ENCRYPT */ 377 + 378 + #ifdef CONFIG_INTEL_TDX_GUEST 379 + static void hv_tdx_msr_write(u64 msr, u64 val) 380 + { 381 + struct tdx_hypercall_args args = { 382 + .r10 = TDX_HYPERCALL_STANDARD, 383 + .r11 = EXIT_REASON_MSR_WRITE, 384 + .r12 = msr, 385 + .r13 = val, 386 + }; 387 + 388 + u64 ret = __tdx_hypercall(&args); 389 + 390 + WARN_ONCE(ret, "Failed to emulate MSR write: %lld\n", ret); 391 + } 392 + 393 + static void hv_tdx_msr_read(u64 msr, u64 *val) 394 + { 395 + struct tdx_hypercall_args args = { 396 + .r10 = TDX_HYPERCALL_STANDARD, 397 + .r11 = EXIT_REASON_MSR_READ, 398 + .r12 = msr, 399 + }; 400 + 401 + u64 ret = __tdx_hypercall_ret(&args); 402 + 403 + if (WARN_ONCE(ret, "Failed to emulate MSR read: %lld\n", ret)) 404 + *val = 0; 405 + else 406 + *val = args.r11; 407 + } 408 + 409 + u64 hv_tdx_hypercall(u64 control, u64 param1, u64 param2) 410 + { 411 + struct tdx_hypercall_args args = { }; 412 + 413 + args.r10 = control; 414 + args.rdx = param1; 415 + args.r8 = param2; 416 + 417 + (void)__tdx_hypercall_ret(&args); 418 + 419 + return args.r11; 420 + } 421 + 422 + #else 423 + static inline void hv_tdx_msr_write(u64 msr, u64 value) {} 424 + static inline void hv_tdx_msr_read(u64 msr, u64 *value) {} 425 + #endif /* CONFIG_INTEL_TDX_GUEST */ 426 + 427 + #if defined(CONFIG_AMD_MEM_ENCRYPT) || defined(CONFIG_INTEL_TDX_GUEST) 428 + void hv_ivm_msr_write(u64 msr, u64 value) 429 + { 430 + if (!ms_hyperv.paravisor_present) 431 + return; 432 + 433 + if (hv_isolation_type_tdx()) 434 + hv_tdx_msr_write(msr, value); 435 + else if (hv_isolation_type_snp()) 436 + hv_ghcb_msr_write(msr, value); 437 + } 438 + 439 + void hv_ivm_msr_read(u64 msr, u64 *value) 440 + { 441 + if (!ms_hyperv.paravisor_present) 442 + return; 443 + 444 + if 
(hv_isolation_type_tdx()) 445 + hv_tdx_msr_read(msr, value); 446 + else if (hv_isolation_type_snp()) 447 + hv_ghcb_msr_read(msr, value); 448 + } 245 449 246 450 /* 247 451 * hv_mark_gpa_visibility - Set pages visible to host via hvcall. ··· 574 358 575 359 void __init hv_vtom_init(void) 576 360 { 361 + enum hv_isolation_type type = hv_get_isolation_type(); 362 + 363 + switch (type) { 364 + case HV_ISOLATION_TYPE_VBS: 365 + fallthrough; 577 366 /* 578 367 * By design, a VM using vTOM doesn't see the SEV setting, 579 368 * so SEV initialization is bypassed and sev_status isn't set. 580 369 * Set it here to indicate a vTOM VM. 370 + * 371 + * Note: if CONFIG_AMD_MEM_ENCRYPT is not set, sev_status is 372 + * defined as 0ULL, to which we can't assigned a value. 581 373 */ 582 - sev_status = MSR_AMD64_SNP_VTOM; 583 - cc_vendor = CC_VENDOR_AMD; 374 + #ifdef CONFIG_AMD_MEM_ENCRYPT 375 + case HV_ISOLATION_TYPE_SNP: 376 + sev_status = MSR_AMD64_SNP_VTOM; 377 + cc_vendor = CC_VENDOR_AMD; 378 + break; 379 + #endif 380 + 381 + case HV_ISOLATION_TYPE_TDX: 382 + cc_vendor = CC_VENDOR_INTEL; 383 + break; 384 + 385 + default: 386 + panic("hv_vtom_init: unsupported isolation type %d\n", type); 387 + } 388 + 584 389 cc_set_mask(ms_hyperv.shared_gpa_boundary); 585 390 physical_mask &= ms_hyperv.shared_gpa_boundary - 1; 586 391 ··· 614 377 mtrr_overwrite_state(NULL, 0, MTRR_TYPE_WRBACK); 615 378 } 616 379 617 - #endif /* CONFIG_AMD_MEM_ENCRYPT */ 380 + #endif /* defined(CONFIG_AMD_MEM_ENCRYPT) || defined(CONFIG_INTEL_TDX_GUEST) */ 618 381 619 382 enum hv_isolation_type hv_get_isolation_type(void) 620 383 { ··· 642 405 DEFINE_STATIC_KEY_FALSE(isolation_type_snp); 643 406 644 407 /* 645 - * hv_isolation_type_snp - Check system runs in the AMD SEV-SNP based 408 + * hv_isolation_type_snp - Check if the system runs in an AMD SEV-SNP based 646 409 * isolation VM. 
647 410 */ 648 411 bool hv_isolation_type_snp(void) 649 412 { 650 413 return static_branch_unlikely(&isolation_type_snp); 414 + } 415 + 416 + DEFINE_STATIC_KEY_FALSE(isolation_type_tdx); 417 + /* 418 + * hv_isolation_type_tdx - Check if the system runs in an Intel TDX based 419 + * isolated VM. 420 + */ 421 + bool hv_isolation_type_tdx(void) 422 + { 423 + return static_branch_unlikely(&isolation_type_tdx); 651 424 }
+9 -1
arch/x86/include/asm/hyperv-tlfs.h
··· 169 169 enum hv_isolation_type { 170 170 HV_ISOLATION_TYPE_NONE = 0, 171 171 HV_ISOLATION_TYPE_VBS = 1, 172 - HV_ISOLATION_TYPE_SNP = 2 172 + HV_ISOLATION_TYPE_SNP = 2, 173 + HV_ISOLATION_TYPE_TDX = 3 173 174 }; 174 175 175 176 /* Hyper-V specific model specific registers (MSRs) */ ··· 301 300 #define HV_X64_MSR_CRASH_CTL HV_REGISTER_CRASH_CTL 302 301 #define HV_X64_MSR_TIME_REF_COUNT HV_REGISTER_TIME_REF_COUNT 303 302 #define HV_X64_MSR_REFERENCE_TSC HV_REGISTER_REFERENCE_TSC 303 + 304 + /* 305 + * Registers are only accessible via HVCALL_GET_VP_REGISTERS hvcall and 306 + * there is not associated MSR address. 307 + */ 308 + #define HV_X64_REGISTER_VSM_VP_STATUS 0x000D0003 309 + #define HV_X64_VTL_MASK GENMASK(3, 0) 304 310 305 311 /* Hyper-V memory host visibility */ 306 312 enum hv_mem_host_visibility {
+62 -9
arch/x86/include/asm/mshyperv.h
··· 26 26 union hv_ghcb; 27 27 28 28 DECLARE_STATIC_KEY_FALSE(isolation_type_snp); 29 + DECLARE_STATIC_KEY_FALSE(isolation_type_tdx); 29 30 30 31 typedef int (*hyperv_fill_flush_list_func)( 31 32 struct hv_guest_mapping_flush_list *flush, ··· 41 40 42 41 #if IS_ENABLED(CONFIG_HYPERV) 43 42 extern int hyperv_init_cpuhp; 43 + extern bool hyperv_paravisor_present; 44 44 45 45 extern void *hv_hypercall_pg; 46 46 ··· 49 47 50 48 extern union hv_ghcb * __percpu *hv_ghcb_pg; 51 49 50 + bool hv_isolation_type_snp(void); 51 + bool hv_isolation_type_tdx(void); 52 + u64 hv_tdx_hypercall(u64 control, u64 param1, u64 param2); 53 + 54 + /* 55 + * DEFAULT INIT GPAT and SEGMENT LIMIT value in struct VMSA 56 + * to start AP in enlightened SEV guest. 57 + */ 58 + #define HV_AP_INIT_GPAT_DEFAULT 0x0007040600070406ULL 59 + #define HV_AP_SEGMENT_LIMIT 0xffffffff 60 + 52 61 int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages); 53 62 int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id); 54 63 int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags); 55 64 65 + /* 66 + * If the hypercall involves no input or output parameters, the hypervisor 67 + * ignores the corresponding GPA pointer. 68 + */ 56 69 static inline u64 hv_do_hypercall(u64 control, void *input, void *output) 57 70 { 58 71 u64 input_address = input ? 
virt_to_phys(input) : 0; ··· 75 58 u64 hv_status; 76 59 77 60 #ifdef CONFIG_X86_64 61 + if (hv_isolation_type_tdx() && !hyperv_paravisor_present) 62 + return hv_tdx_hypercall(control, input_address, output_address); 63 + 64 + if (hv_isolation_type_snp() && !hyperv_paravisor_present) { 65 + __asm__ __volatile__("mov %4, %%r8\n" 66 + "vmmcall" 67 + : "=a" (hv_status), ASM_CALL_CONSTRAINT, 68 + "+c" (control), "+d" (input_address) 69 + : "r" (output_address) 70 + : "cc", "memory", "r8", "r9", "r10", "r11"); 71 + return hv_status; 72 + } 73 + 78 74 if (!hv_hypercall_pg) 79 75 return U64_MAX; 80 76 ··· 131 101 u64 hv_status; 132 102 133 103 #ifdef CONFIG_X86_64 134 - { 104 + if (hv_isolation_type_tdx() && !hyperv_paravisor_present) 105 + return hv_tdx_hypercall(control, input1, 0); 106 + 107 + if (hv_isolation_type_snp() && !hyperv_paravisor_present) { 108 + __asm__ __volatile__( 109 + "vmmcall" 110 + : "=a" (hv_status), ASM_CALL_CONSTRAINT, 111 + "+c" (control), "+d" (input1) 112 + :: "cc", "r8", "r9", "r10", "r11"); 113 + } else { 135 114 __asm__ __volatile__(CALL_NOSPEC 136 115 : "=a" (hv_status), ASM_CALL_CONSTRAINT, 137 116 "+c" (control), "+d" (input1) ··· 185 146 u64 hv_status; 186 147 187 148 #ifdef CONFIG_X86_64 188 - { 149 + if (hv_isolation_type_tdx() && !hyperv_paravisor_present) 150 + return hv_tdx_hypercall(control, input1, input2); 151 + 152 + if (hv_isolation_type_snp() && !hyperv_paravisor_present) { 153 + __asm__ __volatile__("mov %4, %%r8\n" 154 + "vmmcall" 155 + : "=a" (hv_status), ASM_CALL_CONSTRAINT, 156 + "+c" (control), "+d" (input1) 157 + : "r" (input2) 158 + : "cc", "r8", "r9", "r10", "r11"); 159 + } else { 189 160 __asm__ __volatile__("mov %4, %%r8\n" 190 161 CALL_NOSPEC 191 162 : "=a" (hv_status), ASM_CALL_CONSTRAINT, ··· 274 225 int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry); 275 226 276 227 #ifdef CONFIG_AMD_MEM_ENCRYPT 277 - void hv_ghcb_msr_write(u64 msr, u64 value); 278 - void hv_ghcb_msr_read(u64 msr, u64 
*value); 279 228 bool hv_ghcb_negotiate_protocol(void); 280 229 void __noreturn hv_ghcb_terminate(unsigned int set, unsigned int reason); 281 - void hv_vtom_init(void); 230 + int hv_snp_boot_ap(int cpu, unsigned long start_ip); 282 231 #else 283 - static inline void hv_ghcb_msr_write(u64 msr, u64 value) {} 284 - static inline void hv_ghcb_msr_read(u64 msr, u64 *value) {} 285 232 static inline bool hv_ghcb_negotiate_protocol(void) { return false; } 286 233 static inline void hv_ghcb_terminate(unsigned int set, unsigned int reason) {} 287 - static inline void hv_vtom_init(void) {} 234 + static inline int hv_snp_boot_ap(int cpu, unsigned long start_ip) { return 0; } 288 235 #endif 289 236 290 - extern bool hv_isolation_type_snp(void); 237 + #if defined(CONFIG_AMD_MEM_ENCRYPT) || defined(CONFIG_INTEL_TDX_GUEST) 238 + void hv_vtom_init(void); 239 + void hv_ivm_msr_write(u64 msr, u64 value); 240 + void hv_ivm_msr_read(u64 msr, u64 *value); 241 + #else 242 + static inline void hv_vtom_init(void) {} 243 + static inline void hv_ivm_msr_write(u64 msr, u64 value) {} 244 + static inline void hv_ivm_msr_read(u64 msr, u64 *value) {} 245 + #endif 291 246 292 247 static inline bool hv_is_synic_reg(unsigned int reg) 293 248 {
+84 -7
arch/x86/kernel/cpu/mshyperv.c
··· 32 32 #include <asm/nmi.h> 33 33 #include <clocksource/hyperv_timer.h> 34 34 #include <asm/numa.h> 35 + #include <asm/svm.h> 35 36 36 37 /* Is Linux running as the root partition? */ 37 38 bool hv_root_partition; 38 39 /* Is Linux running on nested Microsoft Hypervisor */ 39 40 bool hv_nested; 40 41 struct ms_hyperv_info ms_hyperv; 42 + 43 + /* Used in modules via hv_do_hypercall(): see arch/x86/include/asm/mshyperv.h */ 44 + bool hyperv_paravisor_present __ro_after_init; 45 + EXPORT_SYMBOL_GPL(hyperv_paravisor_present); 41 46 42 47 #if IS_ENABLED(CONFIG_HYPERV) 43 48 static inline unsigned int hv_get_nested_reg(unsigned int reg) ··· 70 65 { 71 66 u64 value; 72 67 73 - if (hv_is_synic_reg(reg) && hv_isolation_type_snp()) 74 - hv_ghcb_msr_read(reg, &value); 68 + if (hv_is_synic_reg(reg) && ms_hyperv.paravisor_present) 69 + hv_ivm_msr_read(reg, &value); 75 70 else 76 71 rdmsrl(reg, value); 77 72 return value; ··· 80 75 81 76 void hv_set_non_nested_register(unsigned int reg, u64 value) 82 77 { 83 - if (hv_is_synic_reg(reg) && hv_isolation_type_snp()) { 84 - hv_ghcb_msr_write(reg, value); 78 + if (hv_is_synic_reg(reg) && ms_hyperv.paravisor_present) { 79 + hv_ivm_msr_write(reg, value); 85 80 86 81 /* Write proxy bit via wrmsl instruction */ 87 82 if (hv_is_sint_reg(reg)) ··· 300 295 301 296 native_smp_prepare_cpus(max_cpus); 302 297 298 + /* 299 + * Override wakeup_secondary_cpu_64 callback for SEV-SNP 300 + * enlightened guest. 301 + */ 302 + if (!ms_hyperv.paravisor_present && hv_isolation_type_snp()) { 303 + apic->wakeup_secondary_cpu_64 = hv_snp_boot_ap; 304 + return; 305 + } 306 + 303 307 #ifdef CONFIG_X86_64 304 308 for_each_present_cpu(i) { 305 309 if (i == 0) ··· 326 312 #endif 327 313 } 328 314 #endif 315 + 316 + /* 317 + * When a fully enlightened TDX VM runs on Hyper-V, the firmware sets the 318 + * HW_REDUCED flag: refer to acpi_tb_create_local_fadt(). Consequently ttyS0 319 + * interrupts can't work because request_irq() -> ... 
-> irq_to_desc() returns 320 + * NULL for ttyS0. This happens because mp_config_acpi_legacy_irqs() sees a 321 + * nr_legacy_irqs() of 0, so it doesn't initialize the array 'mp_irqs[]', and 322 + * later setup_IO_APIC_irqs() -> find_irq_entry() fails to find the legacy irqs 323 + * from the array and hence doesn't create the necessary irq description info. 324 + * 325 + * Clone arch/x86/kernel/acpi/boot.c: acpi_generic_reduced_hw_init() here, 326 + * except don't change 'legacy_pic', which keeps its default value 327 + * 'default_legacy_pic'. This way, mp_config_acpi_legacy_irqs() sees a non-zero 328 + * nr_legacy_irqs() and eventually serial console interrupts works properly. 329 + */ 330 + static void __init reduced_hw_init(void) 331 + { 332 + x86_init.timers.timer_init = x86_init_noop; 333 + x86_init.irqs.pre_vector_init = x86_init_noop; 334 + } 329 335 330 336 static void __init ms_hyperv_init_platform(void) 331 337 { ··· 433 399 ms_hyperv.shared_gpa_boundary = 434 400 BIT_ULL(ms_hyperv.shared_gpa_boundary_bits); 435 401 402 + hyperv_paravisor_present = !!ms_hyperv.paravisor_present; 403 + 436 404 pr_info("Hyper-V: Isolation Config: Group A 0x%x, Group B 0x%x\n", 437 405 ms_hyperv.isolation_config_a, ms_hyperv.isolation_config_b); 438 406 439 - if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP) 407 + 408 + if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP) { 440 409 static_branch_enable(&isolation_type_snp); 410 + } else if (hv_get_isolation_type() == HV_ISOLATION_TYPE_TDX) { 411 + static_branch_enable(&isolation_type_tdx); 412 + 413 + /* A TDX VM must use x2APIC and doesn't use lazy EOI. */ 414 + ms_hyperv.hints &= ~HV_X64_APIC_ACCESS_RECOMMENDED; 415 + 416 + if (!ms_hyperv.paravisor_present) { 417 + /* To be supported: more work is required. */ 418 + ms_hyperv.features &= ~HV_MSR_REFERENCE_TSC_AVAILABLE; 419 + 420 + /* HV_REGISTER_CRASH_CTL is unsupported. 
*/ 421 + ms_hyperv.misc_features &= ~HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE; 422 + 423 + /* Don't trust Hyper-V's TLB-flushing hypercalls. */ 424 + ms_hyperv.hints &= ~HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED; 425 + 426 + x86_init.acpi.reduced_hw_early_init = reduced_hw_init; 427 + } 428 + } 441 429 } 442 430 443 431 if (hv_max_functions_eax >= HYPERV_CPUID_NESTED_FEATURES) { ··· 529 473 530 474 #if IS_ENABLED(CONFIG_HYPERV) 531 475 if ((hv_get_isolation_type() == HV_ISOLATION_TYPE_VBS) || 532 - (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP)) 476 + ms_hyperv.paravisor_present) 533 477 hv_vtom_init(); 534 478 /* 535 479 * Setup the hook to get control post apic initialization. ··· 553 497 554 498 # ifdef CONFIG_SMP 555 499 smp_ops.smp_prepare_boot_cpu = hv_smp_prepare_boot_cpu; 556 - if (hv_root_partition) 500 + if (hv_root_partition || 501 + (!ms_hyperv.paravisor_present && hv_isolation_type_snp())) 557 502 smp_ops.smp_prepare_cpus = hv_smp_prepare_cpus; 558 503 # endif 559 504 ··· 617 560 return eax & HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE; 618 561 } 619 562 563 + #ifdef CONFIG_AMD_MEM_ENCRYPT 564 + static void hv_sev_es_hcall_prepare(struct ghcb *ghcb, struct pt_regs *regs) 565 + { 566 + /* RAX and CPL are already in the GHCB */ 567 + ghcb_set_rcx(ghcb, regs->cx); 568 + ghcb_set_rdx(ghcb, regs->dx); 569 + ghcb_set_r8(ghcb, regs->r8); 570 + } 571 + 572 + static bool hv_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs) 573 + { 574 + /* No checking of the return state needed */ 575 + return true; 576 + } 577 + #endif 578 + 620 579 const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = { 621 580 .name = "Microsoft Hyper-V", 622 581 .detect = ms_hyperv_platform, ··· 640 567 .init.x2apic_available = ms_hyperv_x2apic_available, 641 568 .init.msi_ext_dest_id = ms_hyperv_msi_ext_dest_id, 642 569 .init.init_platform = ms_hyperv_init_platform, 570 + #ifdef CONFIG_AMD_MEM_ENCRYPT 571 + .runtime.sev_es_hcall_prepare = hv_sev_es_hcall_prepare, 572 + 
.runtime.sev_es_hcall_finish = hv_sev_es_hcall_finish, 573 + #endif 643 574 };
+1 -1
drivers/clocksource/hyperv_timer.c
··· 390 390 static union { 391 391 struct ms_hyperv_tsc_page page; 392 392 u8 reserved[PAGE_SIZE]; 393 - } tsc_pg __aligned(PAGE_SIZE); 393 + } tsc_pg __bss_decrypted __aligned(PAGE_SIZE); 394 394 395 395 static struct ms_hyperv_tsc_page *tsc_page = &tsc_pg.page; 396 396 static unsigned long tsc_pfn;
+12 -4
drivers/hv/connection.c
··· 98 98 */ 99 99 if (version >= VERSION_WIN10_V5) { 100 100 msg->msg_sint = VMBUS_MESSAGE_SINT; 101 + msg->msg_vtl = ms_hyperv.vtl; 101 102 vmbus_connection.msg_conn_id = VMBUS_MESSAGE_CONNECTION_ID_4; 102 103 } else { 103 104 msg->interrupt_page = virt_to_phys(vmbus_connection.int_page); ··· 483 482 484 483 ++channel->sig_events; 485 484 486 - if (hv_isolation_type_snp()) 487 - hv_ghcb_hypercall(HVCALL_SIGNAL_EVENT, &channel->sig_event, 488 - NULL, sizeof(channel->sig_event)); 489 - else 485 + if (ms_hyperv.paravisor_present) { 486 + if (hv_isolation_type_snp()) 487 + hv_ghcb_hypercall(HVCALL_SIGNAL_EVENT, &channel->sig_event, 488 + NULL, sizeof(channel->sig_event)); 489 + else if (hv_isolation_type_tdx()) 490 + hv_tdx_hypercall(HVCALL_SIGNAL_EVENT | HV_HYPERCALL_FAST_BIT, 491 + channel->sig_event, 0); 492 + else 493 + WARN_ON_ONCE(1); 494 + } else { 490 495 hv_do_fast_hypercall8(HVCALL_SIGNAL_EVENT, channel->sig_event); 496 + } 491 497 } 492 498 EXPORT_SYMBOL_GPL(vmbus_set_event);
+117 -14
drivers/hv/hv.c
··· 20 20 #include <linux/interrupt.h> 21 21 #include <clocksource/hyperv_timer.h> 22 22 #include <asm/mshyperv.h> 23 + #include <linux/set_memory.h> 23 24 #include "hyperv_vmbus.h" 24 25 25 26 /* The one and only */ ··· 57 56 58 57 local_irq_save(flags); 59 58 60 - aligned_msg = *this_cpu_ptr(hyperv_pcpu_input_arg); 59 + /* 60 + * A TDX VM with the paravisor must use the decrypted post_msg_page: see 61 + * the comment in struct hv_per_cpu_context. A SNP VM with the paravisor 62 + * can use the encrypted hyperv_pcpu_input_arg because it copies the 63 + * input into the GHCB page, which has been decrypted by the paravisor. 64 + */ 65 + if (hv_isolation_type_tdx() && ms_hyperv.paravisor_present) 66 + aligned_msg = this_cpu_ptr(hv_context.cpu_context)->post_msg_page; 67 + else 68 + aligned_msg = *this_cpu_ptr(hyperv_pcpu_input_arg); 69 + 61 70 aligned_msg->connectionid = connection_id; 62 71 aligned_msg->reserved = 0; 63 72 aligned_msg->message_type = message_type; 64 73 aligned_msg->payload_size = payload_size; 65 74 memcpy((void *)aligned_msg->payload, payload, payload_size); 66 75 67 - if (hv_isolation_type_snp()) 68 - status = hv_ghcb_hypercall(HVCALL_POST_MESSAGE, 69 - (void *)aligned_msg, NULL, 70 - sizeof(*aligned_msg)); 71 - else 76 + if (ms_hyperv.paravisor_present) { 77 + if (hv_isolation_type_tdx()) 78 + status = hv_tdx_hypercall(HVCALL_POST_MESSAGE, 79 + virt_to_phys(aligned_msg), 0); 80 + else if (hv_isolation_type_snp()) 81 + status = hv_ghcb_hypercall(HVCALL_POST_MESSAGE, 82 + aligned_msg, NULL, 83 + sizeof(*aligned_msg)); 84 + else 85 + status = HV_STATUS_INVALID_PARAMETER; 86 + } else { 72 87 status = hv_do_hypercall(HVCALL_POST_MESSAGE, 73 88 aligned_msg, NULL); 89 + } 74 90 75 91 local_irq_restore(flags); 76 92 ··· 96 78 97 79 int hv_synic_alloc(void) 98 80 { 99 - int cpu; 81 + int cpu, ret = -ENOMEM; 100 82 struct hv_per_cpu_context *hv_cpu; 101 83 102 84 /* ··· 122 104 tasklet_init(&hv_cpu->msg_dpc, 123 105 vmbus_on_msg_dpc, (unsigned long) 
hv_cpu); 124 106 107 + if (ms_hyperv.paravisor_present && hv_isolation_type_tdx()) { 108 + hv_cpu->post_msg_page = (void *)get_zeroed_page(GFP_ATOMIC); 109 + if (hv_cpu->post_msg_page == NULL) { 110 + pr_err("Unable to allocate post msg page\n"); 111 + goto err; 112 + } 113 + 114 + ret = set_memory_decrypted((unsigned long)hv_cpu->post_msg_page, 1); 115 + if (ret) { 116 + pr_err("Failed to decrypt post msg page: %d\n", ret); 117 + /* Just leak the page, as it's unsafe to free the page. */ 118 + hv_cpu->post_msg_page = NULL; 119 + goto err; 120 + } 121 + 122 + memset(hv_cpu->post_msg_page, 0, PAGE_SIZE); 123 + } 124 + 125 125 /* 126 126 * Synic message and event pages are allocated by paravisor. 127 127 * Skip these pages allocation here. 128 128 */ 129 - if (!hv_isolation_type_snp() && !hv_root_partition) { 129 + if (!ms_hyperv.paravisor_present && !hv_root_partition) { 130 130 hv_cpu->synic_message_page = 131 131 (void *)get_zeroed_page(GFP_ATOMIC); 132 132 if (hv_cpu->synic_message_page == NULL) { ··· 156 120 (void *)get_zeroed_page(GFP_ATOMIC); 157 121 if (hv_cpu->synic_event_page == NULL) { 158 122 pr_err("Unable to allocate SYNIC event page\n"); 123 + 124 + free_page((unsigned long)hv_cpu->synic_message_page); 125 + hv_cpu->synic_message_page = NULL; 159 126 goto err; 160 127 } 128 + } 129 + 130 + if (!ms_hyperv.paravisor_present && 131 + (hv_isolation_type_snp() || hv_isolation_type_tdx())) { 132 + ret = set_memory_decrypted((unsigned long) 133 + hv_cpu->synic_message_page, 1); 134 + if (ret) { 135 + pr_err("Failed to decrypt SYNIC msg page: %d\n", ret); 136 + hv_cpu->synic_message_page = NULL; 137 + 138 + /* 139 + * Free the event page here so that hv_synic_free() 140 + * won't later try to re-encrypt it. 
141 + */ 142 + free_page((unsigned long)hv_cpu->synic_event_page); 143 + hv_cpu->synic_event_page = NULL; 144 + goto err; 145 + } 146 + 147 + ret = set_memory_decrypted((unsigned long) 148 + hv_cpu->synic_event_page, 1); 149 + if (ret) { 150 + pr_err("Failed to decrypt SYNIC event page: %d\n", ret); 151 + hv_cpu->synic_event_page = NULL; 152 + goto err; 153 + } 154 + 155 + memset(hv_cpu->synic_message_page, 0, PAGE_SIZE); 156 + memset(hv_cpu->synic_event_page, 0, PAGE_SIZE); 161 157 } 162 158 } 163 159 164 160 return 0; 161 + 165 162 err: 166 163 /* 167 164 * Any memory allocations that succeeded will be freed when 168 165 * the caller cleans up by calling hv_synic_free() 169 166 */ 170 - return -ENOMEM; 167 + return ret; 171 168 } 172 169 173 170 174 171 void hv_synic_free(void) 175 172 { 176 - int cpu; 173 + int cpu, ret; 177 174 178 175 for_each_present_cpu(cpu) { 179 176 struct hv_per_cpu_context *hv_cpu 180 177 = per_cpu_ptr(hv_context.cpu_context, cpu); 181 178 179 + /* It's better to leak the page if the encryption fails. 
*/ 180 + if (ms_hyperv.paravisor_present && hv_isolation_type_tdx()) { 181 + if (hv_cpu->post_msg_page) { 182 + ret = set_memory_encrypted((unsigned long) 183 + hv_cpu->post_msg_page, 1); 184 + if (ret) { 185 + pr_err("Failed to encrypt post msg page: %d\n", ret); 186 + hv_cpu->post_msg_page = NULL; 187 + } 188 + } 189 + } 190 + 191 + if (!ms_hyperv.paravisor_present && 192 + (hv_isolation_type_snp() || hv_isolation_type_tdx())) { 193 + if (hv_cpu->synic_message_page) { 194 + ret = set_memory_encrypted((unsigned long) 195 + hv_cpu->synic_message_page, 1); 196 + if (ret) { 197 + pr_err("Failed to encrypt SYNIC msg page: %d\n", ret); 198 + hv_cpu->synic_message_page = NULL; 199 + } 200 + } 201 + 202 + if (hv_cpu->synic_event_page) { 203 + ret = set_memory_encrypted((unsigned long) 204 + hv_cpu->synic_event_page, 1); 205 + if (ret) { 206 + pr_err("Failed to encrypt SYNIC event page: %d\n", ret); 207 + hv_cpu->synic_event_page = NULL; 208 + } 209 + } 210 + } 211 + 212 + free_page((unsigned long)hv_cpu->post_msg_page); 182 213 free_page((unsigned long)hv_cpu->synic_event_page); 183 214 free_page((unsigned long)hv_cpu->synic_message_page); 184 215 } ··· 273 170 simp.as_uint64 = hv_get_register(HV_REGISTER_SIMP); 274 171 simp.simp_enabled = 1; 275 172 276 - if (hv_isolation_type_snp() || hv_root_partition) { 173 + if (ms_hyperv.paravisor_present || hv_root_partition) { 277 174 /* Mask out vTOM bit. ioremap_cache() maps decrypted */ 278 175 u64 base = (simp.base_simp_gpa << HV_HYP_PAGE_SHIFT) & 279 176 ~ms_hyperv.shared_gpa_boundary; ··· 292 189 siefp.as_uint64 = hv_get_register(HV_REGISTER_SIEFP); 293 190 siefp.siefp_enabled = 1; 294 191 295 - if (hv_isolation_type_snp() || hv_root_partition) { 192 + if (ms_hyperv.paravisor_present || hv_root_partition) { 296 193 /* Mask out vTOM bit. ioremap_cache() maps decrypted */ 297 194 u64 base = (siefp.base_siefp_gpa << HV_HYP_PAGE_SHIFT) & 298 195 ~ms_hyperv.shared_gpa_boundary; ··· 375 272 * addresses. 
376 273 */ 377 274 simp.simp_enabled = 0; 378 - if (hv_isolation_type_snp() || hv_root_partition) { 275 + if (ms_hyperv.paravisor_present || hv_root_partition) { 379 276 iounmap(hv_cpu->synic_message_page); 380 277 hv_cpu->synic_message_page = NULL; 381 278 } else { ··· 387 284 siefp.as_uint64 = hv_get_register(HV_REGISTER_SIEFP); 388 285 siefp.siefp_enabled = 0; 389 286 390 - if (hv_isolation_type_snp() || hv_root_partition) { 287 + if (ms_hyperv.paravisor_present || hv_root_partition) { 391 288 iounmap(hv_cpu->synic_event_page); 392 289 hv_cpu->synic_event_page = NULL; 393 290 } else {
+38 -44
drivers/hv/hv_balloon.c
··· 8 8 9 9 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 10 10 11 + #include <linux/cleanup.h> 11 12 #include <linux/kernel.h> 12 13 #include <linux/jiffies.h> 13 14 #include <linux/mman.h> ··· 647 646 void *v) 648 647 { 649 648 struct memory_notify *mem = (struct memory_notify *)v; 650 - unsigned long flags, pfn_count; 649 + unsigned long pfn_count; 651 650 652 651 switch (val) { 653 652 case MEM_ONLINE: ··· 656 655 break; 657 656 658 657 case MEM_OFFLINE: 659 - spin_lock_irqsave(&dm_device.ha_lock, flags); 660 - pfn_count = hv_page_offline_check(mem->start_pfn, 661 - mem->nr_pages); 662 - if (pfn_count <= dm_device.num_pages_onlined) { 663 - dm_device.num_pages_onlined -= pfn_count; 664 - } else { 665 - /* 666 - * We're offlining more pages than we managed to online. 667 - * This is unexpected. In any case don't let 668 - * num_pages_onlined wrap around zero. 669 - */ 670 - WARN_ON_ONCE(1); 671 - dm_device.num_pages_onlined = 0; 658 + scoped_guard(spinlock_irqsave, &dm_device.ha_lock) { 659 + pfn_count = hv_page_offline_check(mem->start_pfn, 660 + mem->nr_pages); 661 + if (pfn_count <= dm_device.num_pages_onlined) { 662 + dm_device.num_pages_onlined -= pfn_count; 663 + } else { 664 + /* 665 + * We're offlining more pages than we 666 + * managed to online. This is 667 + * unexpected. In any case don't let 668 + * num_pages_onlined wrap around zero. 
669 + */ 670 + WARN_ON_ONCE(1); 671 + dm_device.num_pages_onlined = 0; 672 + } 672 673 } 673 - spin_unlock_irqrestore(&dm_device.ha_lock, flags); 674 674 break; 675 675 case MEM_GOING_ONLINE: 676 676 case MEM_GOING_OFFLINE: ··· 723 721 unsigned long start_pfn; 724 722 unsigned long processed_pfn; 725 723 unsigned long total_pfn = pfn_count; 726 - unsigned long flags; 727 724 728 725 for (i = 0; i < (size/HA_CHUNK); i++) { 729 726 start_pfn = start + (i * HA_CHUNK); 730 727 731 - spin_lock_irqsave(&dm_device.ha_lock, flags); 732 - has->ha_end_pfn += HA_CHUNK; 728 + scoped_guard(spinlock_irqsave, &dm_device.ha_lock) { 729 + has->ha_end_pfn += HA_CHUNK; 733 730 734 - if (total_pfn > HA_CHUNK) { 735 - processed_pfn = HA_CHUNK; 736 - total_pfn -= HA_CHUNK; 737 - } else { 738 - processed_pfn = total_pfn; 739 - total_pfn = 0; 731 + if (total_pfn > HA_CHUNK) { 732 + processed_pfn = HA_CHUNK; 733 + total_pfn -= HA_CHUNK; 734 + } else { 735 + processed_pfn = total_pfn; 736 + total_pfn = 0; 737 + } 738 + 739 + has->covered_end_pfn += processed_pfn; 740 740 } 741 - 742 - has->covered_end_pfn += processed_pfn; 743 - spin_unlock_irqrestore(&dm_device.ha_lock, flags); 744 741 745 742 reinit_completion(&dm_device.ol_waitevent); 746 743 ··· 759 758 */ 760 759 do_hot_add = false; 761 760 } 762 - spin_lock_irqsave(&dm_device.ha_lock, flags); 763 - has->ha_end_pfn -= HA_CHUNK; 764 - has->covered_end_pfn -= processed_pfn; 765 - spin_unlock_irqrestore(&dm_device.ha_lock, flags); 761 + scoped_guard(spinlock_irqsave, &dm_device.ha_lock) { 762 + has->ha_end_pfn -= HA_CHUNK; 763 + has->covered_end_pfn -= processed_pfn; 764 + } 766 765 break; 767 766 } 768 767 ··· 782 781 static void hv_online_page(struct page *pg, unsigned int order) 783 782 { 784 783 struct hv_hotadd_state *has; 785 - unsigned long flags; 786 784 unsigned long pfn = page_to_pfn(pg); 787 785 788 - spin_lock_irqsave(&dm_device.ha_lock, flags); 786 + guard(spinlock_irqsave)(&dm_device.ha_lock); 789 787 
list_for_each_entry(has, &dm_device.ha_region_list, list) { 790 788 /* The page belongs to a different HAS. */ 791 789 if ((pfn < has->start_pfn) || ··· 794 794 hv_bring_pgs_online(has, pfn, 1UL << order); 795 795 break; 796 796 } 797 - spin_unlock_irqrestore(&dm_device.ha_lock, flags); 798 797 } 799 798 800 799 static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt) ··· 802 803 struct hv_hotadd_gap *gap; 803 804 unsigned long residual, new_inc; 804 805 int ret = 0; 805 - unsigned long flags; 806 806 807 - spin_lock_irqsave(&dm_device.ha_lock, flags); 807 + guard(spinlock_irqsave)(&dm_device.ha_lock); 808 808 list_for_each_entry(has, &dm_device.ha_region_list, list) { 809 809 /* 810 810 * If the pfn range we are dealing with is not in the current ··· 850 852 ret = 1; 851 853 break; 852 854 } 853 - spin_unlock_irqrestore(&dm_device.ha_lock, flags); 854 855 855 856 return ret; 856 857 } ··· 944 947 { 945 948 struct hv_hotadd_state *ha_region = NULL; 946 949 int covered; 947 - unsigned long flags; 948 950 949 951 if (pfn_cnt == 0) 950 952 return 0; ··· 975 979 ha_region->covered_end_pfn = pg_start; 976 980 ha_region->end_pfn = rg_start + rg_size; 977 981 978 - spin_lock_irqsave(&dm_device.ha_lock, flags); 979 - list_add_tail(&ha_region->list, &dm_device.ha_region_list); 980 - spin_unlock_irqrestore(&dm_device.ha_lock, flags); 982 + scoped_guard(spinlock_irqsave, &dm_device.ha_lock) { 983 + list_add_tail(&ha_region->list, &dm_device.ha_region_list); 984 + } 981 985 } 982 986 983 987 do_pg_range: ··· 2043 2047 struct hv_dynmem_device *dm = hv_get_drvdata(dev); 2044 2048 struct hv_hotadd_state *has, *tmp; 2045 2049 struct hv_hotadd_gap *gap, *tmp_gap; 2046 - unsigned long flags; 2047 2050 2048 2051 if (dm->num_pages_ballooned != 0) 2049 2052 pr_warn("Ballooned pages: %d\n", dm->num_pages_ballooned); ··· 2068 2073 #endif 2069 2074 } 2070 2075 2071 - spin_lock_irqsave(&dm_device.ha_lock, flags); 2076 + guard(spinlock_irqsave)(&dm_device.ha_lock); 2072 2077 
list_for_each_entry_safe(has, tmp, &dm->ha_region_list, list) { 2073 2078 list_for_each_entry_safe(gap, tmp_gap, &has->gap_list, list) { 2074 2079 list_del(&gap->list); ··· 2077 2082 list_del(&has->list); 2078 2083 kfree(has); 2079 2084 } 2080 - spin_unlock_irqrestore(&dm_device.ha_lock, flags); 2081 2085 } 2082 2086 2083 2087 static int balloon_suspend(struct hv_device *hv_dev)
+45 -3
drivers/hv/hv_common.c
··· 24 24 #include <linux/kmsg_dump.h> 25 25 #include <linux/slab.h> 26 26 #include <linux/dma-map-ops.h> 27 + #include <linux/set_memory.h> 27 28 #include <asm/hyperv-tlfs.h> 28 29 #include <asm/mshyperv.h> 29 30 ··· 360 359 u64 msr_vp_index; 361 360 gfp_t flags; 362 361 int pgcount = hv_root_partition ? 2 : 1; 362 + void *mem; 363 + int ret; 363 364 364 365 /* hv_cpu_init() can be called with IRQs disabled from hv_resume() */ 365 366 flags = irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL; ··· 373 370 * allocated if this CPU was previously online and then taken offline 374 371 */ 375 372 if (!*inputarg) { 376 - *inputarg = kmalloc(pgcount * HV_HYP_PAGE_SIZE, flags); 377 - if (!(*inputarg)) 373 + mem = kmalloc(pgcount * HV_HYP_PAGE_SIZE, flags); 374 + if (!mem) 378 375 return -ENOMEM; 379 376 380 377 if (hv_root_partition) { 381 378 outputarg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg); 382 - *outputarg = (char *)(*inputarg) + HV_HYP_PAGE_SIZE; 379 + *outputarg = (char *)mem + HV_HYP_PAGE_SIZE; 383 380 } 381 + 382 + if (!ms_hyperv.paravisor_present && 383 + (hv_isolation_type_snp() || hv_isolation_type_tdx())) { 384 + ret = set_memory_decrypted((unsigned long)mem, pgcount); 385 + if (ret) { 386 + /* It may be unsafe to free 'mem' */ 387 + return ret; 388 + } 389 + 390 + memset(mem, 0x00, pgcount * HV_HYP_PAGE_SIZE); 391 + } 392 + 393 + /* 394 + * In a fully enlightened TDX/SNP VM with more than 64 VPs, if 395 + * hyperv_pcpu_input_arg is not NULL, set_memory_decrypted() -> 396 + * ... -> cpa_flush()-> ... -> __send_ipi_mask_ex() tries to 397 + * use hyperv_pcpu_input_arg as the hypercall input page, which 398 + * must be a decrypted page in such a VM, but the page is still 399 + * encrypted before set_memory_decrypted() returns. 
Fix this by 400 + * setting *inputarg after the above set_memory_decrypted(): if 401 + * hyperv_pcpu_input_arg is NULL, __send_ipi_mask_ex() returns 402 + * HV_STATUS_INVALID_PARAMETER immediately, and the function 403 + * hv_send_ipi_mask() falls back to orig_apic.send_IPI_mask(), 404 + * which may be slightly slower than the hypercall, but still 405 + * works correctly in such a VM. 406 + */ 407 + *inputarg = mem; 384 408 } 385 409 386 410 msr_vp_index = hv_get_register(HV_REGISTER_VP_INDEX); ··· 532 502 } 533 503 EXPORT_SYMBOL_GPL(hv_isolation_type_snp); 534 504 505 + bool __weak hv_isolation_type_tdx(void) 506 + { 507 + return false; 508 + } 509 + EXPORT_SYMBOL_GPL(hv_isolation_type_tdx); 510 + 535 511 void __weak hv_setup_vmbus_handler(void (*handler)(void)) 536 512 { 537 513 } ··· 578 542 return HV_STATUS_INVALID_PARAMETER; 579 543 } 580 544 EXPORT_SYMBOL_GPL(hv_ghcb_hypercall); 545 + 546 + u64 __weak hv_tdx_hypercall(u64 control, u64 param1, u64 param2) 547 + { 548 + return HV_STATUS_INVALID_PARAMETER; 549 + } 550 + EXPORT_SYMBOL_GPL(hv_tdx_hypercall);
+11
drivers/hv/hyperv_vmbus.h
··· 124 124 void *synic_event_page; 125 125 126 126 /* 127 + * The page is only used in hv_post_message() for a TDX VM (with the 128 + * paravisor) to post a message to Hyper-V: when such a VM calls 129 + * HVCALL_POST_MESSAGE, it can't use the hyperv_pcpu_input_arg (which 130 + * is encrypted in such a VM) as the hypercall input page, because 131 + * the input page for HVCALL_POST_MESSAGE must be decrypted in such a 132 + * VM, so post_msg_page (which is decrypted in hv_synic_alloc()) is 133 + * introduced for this purpose. See hyperv_init() for more comments. 134 + */ 135 + void *post_msg_page; 136 + 137 + /* 127 138 * Starting with win8, we can take channel interrupts on any CPU; 128 139 * we will manage the tasklet that handles events messages on a per CPU 129 140 * basis.
+2 -1
drivers/hv/vmbus_drv.c
··· 2287 2287 * Some ancestor of the vmbus acpi device (Gen1 or Gen2 2288 2288 * firmware) is the VMOD that has the mmio ranges. Get that. 2289 2289 */ 2290 - for (ancestor = acpi_dev_parent(device); ancestor; 2290 + for (ancestor = acpi_dev_parent(device); 2291 + ancestor && ancestor->handle != ACPI_ROOT_OBJECT; 2291 2292 ancestor = acpi_dev_parent(ancestor)) { 2292 2293 result = acpi_walk_resources(ancestor->handle, METHOD_NAME__CRS, 2293 2294 vmbus_walk_resources, NULL);
+1
include/asm-generic/hyperv-tlfs.h
··· 223 223 #define HV_STATUS_INVALID_PORT_ID 17 224 224 #define HV_STATUS_INVALID_CONNECTION_ID 18 225 225 #define HV_STATUS_INSUFFICIENT_BUFFERS 19 226 + #define HV_STATUS_TIME_OUT 120 226 227 #define HV_STATUS_VTL_ALREADY_ENABLED 134 227 228 228 229 /*
+13 -4
include/asm-generic/mshyperv.h
··· 36 36 u32 nested_features; 37 37 u32 max_vp_index; 38 38 u32 max_lp_index; 39 - u32 isolation_config_a; 39 + union { 40 + u32 isolation_config_a; 41 + struct { 42 + u32 paravisor_present : 1; 43 + u32 reserved_a1 : 31; 44 + }; 45 + }; 40 46 union { 41 47 u32 isolation_config_b; 42 48 struct { 43 49 u32 cvm_type : 4; 44 - u32 reserved1 : 1; 50 + u32 reserved_b1 : 1; 45 51 u32 shared_gpa_boundary_active : 1; 46 52 u32 shared_gpa_boundary_bits : 6; 47 - u32 reserved2 : 20; 53 + u32 reserved_b2 : 20; 48 54 }; 49 55 }; 50 56 u64 shared_gpa_boundary; 57 + u8 vtl; 51 58 }; 52 59 extern struct ms_hyperv_info ms_hyperv; 53 60 extern bool hv_nested; ··· 64 57 65 58 extern u64 hv_do_hypercall(u64 control, void *inputaddr, void *outputaddr); 66 59 extern u64 hv_do_fast_hypercall8(u16 control, u64 input8); 67 - extern bool hv_isolation_type_snp(void); 60 + bool hv_isolation_type_snp(void); 61 + bool hv_isolation_type_tdx(void); 68 62 69 63 /* Helper functions that provide a consistent pattern for checking Hyper-V hypercall status. */ 70 64 static inline int hv_result(u64 status) ··· 282 274 bool hv_is_isolation_supported(void); 283 275 bool hv_isolation_type_snp(void); 284 276 u64 hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size); 277 + u64 hv_tdx_hypercall(u64 control, u64 param1, u64 param2); 285 278 void hyperv_cleanup(void); 286 279 bool hv_query_ext_cap(u64 cap_query); 287 280 void hv_setup_dma_ops(struct device *dev, bool coherent);
+3 -3
include/linux/hyperv.h
··· 348 348 u8 sender_owns_set; 349 349 u8 reserved; 350 350 u32 range_cnt; 351 - struct vmtransfer_page_range ranges[1]; 351 + struct vmtransfer_page_range ranges[]; 352 352 } __packed; 353 353 354 354 struct vmgpadl_packet_header { ··· 665 665 u64 interrupt_page; 666 666 struct { 667 667 u8 msg_sint; 668 - u8 padding1[3]; 669 - u32 padding2; 668 + u8 msg_vtl; 669 + u8 reserved[6]; 670 670 }; 671 671 }; 672 672 u64 monitor_page1;