Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'kvm-x86-svm-7.1' of https://github.com/kvm-x86/linux into HEAD

KVM SVM changes for 7.1

- Fix and optimize IRQ window inhibit handling for AVIC (the tracking needs to
be per-vCPU, e.g. so that KVM doesn't prematurely re-enable AVIC if multiple
vCPUs have to-be-injected IRQs).

- Fix an undefined behavior warning where a crafty userspace can read the
"avic" module param before it's fully initialized.

- Fix a (likely benign) bug in the "OS-visible workarounds" handling, where
KVM could clobber state when enabling virtualization on multiple CPUs in
parallel, and clean up and optimize the code.

- Drop a WARN in KVM_MEMORY_ENCRYPT_REG_REGION where KVM complains about a
"too large" size based purely on user input, and clean up and harden the
related pinning code.

- Disallow synchronizing a VMSA of an already-launched/encrypted vCPU, as
doing so for an SNP guest will trigger an RMP violation #PF and crash the
host.

- Protect all of sev_mem_enc_register_region() with kvm->lock to ensure
sev_guest() is stable for the entirety of the function.

- Lock all vCPUs when synchronizing VMSAs for SNP guests to ensure the VMSA
page isn't actively being used.

- Overhaul KVM's APIs for detecting SEV+ guests so that VM-scoped queries are
required to hold kvm->lock (KVM has had multiple bugs due to "is SEV?" checks
becoming stale), enforced by lockdep. Add and use vCPU-scoped APIs when
possible/appropriate, as all checks that originate from a vCPU are
guaranteed to be stable.

- Convert a pile of kvm->lock SEV code to guard().
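
The last item above refers to the kernel's scope-based cleanup helpers: guard(mutex)(&kvm->lock) from <linux/cleanup.h> releases the lock automatically when the enclosing scope ends, which is what lets the error paths in the sev.c hunks below simply return. A minimal userspace sketch of the same idea, built on the compiler cleanup attribute the kernel helpers also use; the macro and names here are made up for illustration:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static void unlock_cleanup(pthread_mutex_t **m)
{
	pthread_mutex_unlock(*m);
}

/* Lock now, unlock automatically when the guard goes out of scope. */
#define scoped_guard_mutex(m) \
	pthread_mutex_t *__guard __attribute__((cleanup(unlock_cleanup), unused)) = \
		(pthread_mutex_lock(m), (m))

static int do_work(int fail)
{
	scoped_guard_mutex(&lock);

	if (fail)
		return -1;	/* lock is released here, too */

	return 0;		/* ... and here */
}

int main(void)
{
	printf("%d %d\n", do_work(0), do_work(1));
	return 0;
}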

+466 -272
+28 -1
arch/x86/include/asm/kvm_host.h
··· 1449 1449 struct kvm_pit *vpit; 1450 1450 #endif 1451 1451 atomic_t vapics_in_nmi_mode; 1452 + 1452 1453 struct mutex apic_map_lock; 1453 1454 struct kvm_apic_map __rcu *apic_map; 1454 1455 atomic_t apic_map_dirty; ··· 1457 1456 bool apic_access_memslot_enabled; 1458 1457 bool apic_access_memslot_inhibited; 1459 1458 1460 - /* Protects apicv_inhibit_reasons */ 1459 + /* 1460 + * Force apicv_update_lock and apicv_nr_irq_window_req to reside in a 1461 + * dedicated cacheline. They are write-mostly, whereas most everything 1462 + * else in kvm_arch is read-mostly. Note that apicv_inhibit_reasons is 1463 + * read-mostly: toggling VM-wide inhibits is rare; _checking_ for 1464 + * inhibits is common. 1465 + */ 1466 + ____cacheline_aligned 1467 + /* 1468 + * Protects apicv_inhibit_reasons and apicv_nr_irq_window_req (with an 1469 + * asterisk, see kvm_inc_or_dec_irq_window_inhibit() for details). 1470 + */ 1461 1471 struct rw_semaphore apicv_update_lock; 1472 + atomic_t apicv_nr_irq_window_req; 1473 + ____cacheline_aligned 1474 + 1462 1475 unsigned long apicv_inhibit_reasons; 1463 1476 1464 1477 gpa_t wall_clock; ··· 2342 2327 enum kvm_apicv_inhibit reason) 2343 2328 { 2344 2329 kvm_set_or_clear_apicv_inhibit(kvm, reason, false); 2330 + } 2331 + 2332 + void kvm_inc_or_dec_irq_window_inhibit(struct kvm *kvm, bool inc); 2333 + 2334 + static inline void kvm_inc_apicv_irq_window_req(struct kvm *kvm) 2335 + { 2336 + kvm_inc_or_dec_irq_window_inhibit(kvm, true); 2337 + } 2338 + 2339 + static inline void kvm_dec_apicv_irq_window_req(struct kvm *kvm) 2340 + { 2341 + kvm_inc_or_dec_irq_window_inhibit(kvm, false); 2345 2342 } 2346 2343 2347 2344 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
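
The comment added in the hunk above explains the layout change: apicv_update_lock and the new apicv_nr_irq_window_req are write-mostly, so they are pushed onto their own cache line to avoid bouncing the line that holds read-mostly fields such as apicv_inhibit_reasons. A small sketch of that trick, assuming 64-byte cache lines (the kernel's ____cacheline_aligned uses the architecture's real value); the struct and field names are illustrative only:

#include <stdio.h>
#include <stddef.h>

#define CACHELINE 64	/* assumed; not the kernel's definition */

struct vm_state {
	/* read-mostly: checked on every interrupt injection */
	unsigned long inhibit_reasons;
	unsigned long other_read_mostly[4];

	/* write-mostly: only touched when inhibits are toggled */
	struct {
		unsigned long update_lock;	/* placeholder for the rw_semaphore */
		int nr_irq_window_req;
	} __attribute__((aligned(CACHELINE))) writers;
};

int main(void)
{
	printf("writers at offset %zu, i.e. cache line %zu\n",
	       offsetof(struct vm_state, writers),
	       offsetof(struct vm_state, writers) / CACHELINE);
	return 0;
}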
+13 -2
arch/x86/kvm/svm/avic.c
··· 19 19 #include <linux/amd-iommu.h> 20 20 #include <linux/kvm_host.h> 21 21 #include <linux/kvm_irqfd.h> 22 + #include <linux/sysfs.h> 22 23 23 24 #include <asm/irq_remapping.h> 24 25 #include <asm/msr.h> ··· 77 76 return param_set_bint(val, kp); 78 77 } 79 78 79 + static int avic_param_get(char *buffer, const struct kernel_param *kp) 80 + { 81 + int val = *(int *)kp->arg; 82 + 83 + if (val == AVIC_AUTO_MODE) 84 + return sysfs_emit(buffer, "N\n"); 85 + 86 + return param_get_bool(buffer, kp); 87 + } 88 + 80 89 static const struct kernel_param_ops avic_ops = { 81 90 .flags = KERNEL_PARAM_OPS_FL_NOARG, 82 91 .set = avic_param_set, 83 - .get = param_get_bool, 92 + .get = avic_param_get, 84 93 }; 85 94 86 95 /* ··· 237 226 vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK); 238 227 vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK; 239 228 240 - if (!sev_es_guest(svm->vcpu.kvm)) 229 + if (!is_sev_es_guest(&svm->vcpu)) 241 230 svm_set_intercept(svm, INTERCEPT_CR8_WRITE); 242 231 243 232 /*
+211 -155
arch/x86/kvm/svm/sev.c
··· 107 107 static unsigned long *sev_asid_bitmap; 108 108 static unsigned long *sev_reclaim_asid_bitmap; 109 109 110 + static __always_inline void kvm_lockdep_assert_sev_lock_held(struct kvm *kvm) 111 + { 112 + #ifdef CONFIG_PROVE_LOCKING 113 + /* 114 + * Querying SEV+ support is safe if there are no other references, i.e. 115 + * if concurrent initialization of SEV+ is impossible. 116 + */ 117 + if (!refcount_read(&kvm->users_count)) 118 + return; 119 + 120 + /* 121 + * Querying SEV+ support from vCPU context is always safe, as vCPUs can 122 + * only be created after SEV+ is initialized (and KVM disallows all SEV 123 + * sub-ioctls while vCPU creation is in-progress). 124 + */ 125 + if (kvm_get_running_vcpu()) 126 + return; 127 + 128 + lockdep_assert_held(&kvm->lock); 129 + #endif 130 + } 131 + 132 + static bool sev_guest(struct kvm *kvm) 133 + { 134 + kvm_lockdep_assert_sev_lock_held(kvm); 135 + return ____sev_guest(kvm); 136 + } 137 + static bool sev_es_guest(struct kvm *kvm) 138 + { 139 + kvm_lockdep_assert_sev_lock_held(kvm); 140 + return ____sev_es_guest(kvm); 141 + } 142 + 143 + static bool sev_snp_guest(struct kvm *kvm) 144 + { 145 + kvm_lockdep_assert_sev_lock_held(kvm); 146 + return ____sev_snp_guest(kvm); 147 + } 148 + 110 149 static int snp_decommission_context(struct kvm *kvm); 111 150 112 151 struct enc_region { ··· 237 198 misc_cg_uncharge(type, sev->misc_cg, 1); 238 199 } 239 200 201 + static unsigned int sev_alloc_asid(unsigned int min_asid, unsigned int max_asid) 202 + { 203 + unsigned int asid; 204 + bool retry = true; 205 + 206 + guard(mutex)(&sev_bitmap_lock); 207 + 208 + again: 209 + asid = find_next_zero_bit(sev_asid_bitmap, max_asid + 1, min_asid); 210 + if (asid > max_asid) { 211 + if (retry && __sev_recycle_asids(min_asid, max_asid)) { 212 + retry = false; 213 + goto again; 214 + } 215 + 216 + return asid; 217 + } 218 + 219 + __set_bit(asid, sev_asid_bitmap); 220 + return asid; 221 + } 222 + 240 223 static int sev_asid_new(struct kvm_sev_info *sev, unsigned long vm_type) 241 224 { 242 225 /* ··· 266 205 * SEV-ES-enabled guest can use from 1 to min_sev_asid - 1. 
267 206 */ 268 207 unsigned int min_asid, max_asid, asid; 269 - bool retry = true; 270 208 int ret; 271 209 272 210 if (vm_type == KVM_X86_SNP_VM) { ··· 289 229 if (min_asid > max_asid) 290 230 return -ENOTTY; 291 231 292 - WARN_ON(sev->misc_cg); 232 + WARN_ON_ONCE(sev->misc_cg); 293 233 sev->misc_cg = get_current_misc_cg(); 294 234 ret = sev_misc_cg_try_charge(sev); 295 - if (ret) { 296 - put_misc_cg(sev->misc_cg); 297 - sev->misc_cg = NULL; 298 - return ret; 299 - } 235 + if (ret) 236 + goto e_put_cg; 300 237 301 - mutex_lock(&sev_bitmap_lock); 302 - 303 - again: 304 - asid = find_next_zero_bit(sev_asid_bitmap, max_asid + 1, min_asid); 238 + asid = sev_alloc_asid(min_asid, max_asid); 305 239 if (asid > max_asid) { 306 - if (retry && __sev_recycle_asids(min_asid, max_asid)) { 307 - retry = false; 308 - goto again; 309 - } 310 - mutex_unlock(&sev_bitmap_lock); 311 240 ret = -EBUSY; 312 241 goto e_uncharge; 313 242 } 314 243 315 - __set_bit(asid, sev_asid_bitmap); 316 - 317 - mutex_unlock(&sev_bitmap_lock); 318 - 319 244 sev->asid = asid; 320 245 return 0; 246 + 321 247 e_uncharge: 322 248 sev_misc_cg_uncharge(sev); 249 + e_put_cg: 323 250 put_misc_cg(sev->misc_cg); 324 251 sev->misc_cg = NULL; 325 252 return ret; ··· 725 678 unsigned int flags) 726 679 { 727 680 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 728 - unsigned long npages, size; 729 - int npinned; 730 - unsigned long locked, lock_limit; 681 + unsigned long npages, total_npages, lock_limit; 731 682 struct page **pages; 732 - unsigned long first, last; 733 - int ret; 683 + int npinned, ret; 734 684 735 685 lockdep_assert_held(&kvm->lock); 736 686 737 687 if (ulen == 0 || uaddr + ulen < uaddr) 738 688 return ERR_PTR(-EINVAL); 739 689 740 - /* Calculate number of pages. */ 741 - first = (uaddr & PAGE_MASK) >> PAGE_SHIFT; 742 - last = ((uaddr + ulen - 1) & PAGE_MASK) >> PAGE_SHIFT; 743 - npages = (last - first + 1); 690 + /* 691 + * Calculate the number of pages that need to be pinned to cover the 692 + * entire range. Note! This isn't simply PFN_DOWN(ulen), as KVM 693 + * doesn't require the incoming address+size to be page aligned! 694 + */ 695 + npages = PFN_DOWN(uaddr + ulen - 1) - PFN_DOWN(uaddr) + 1; 696 + if (npages > INT_MAX) 697 + return ERR_PTR(-EINVAL); 744 698 745 - locked = sev->pages_locked + npages; 699 + total_npages = sev->pages_locked + npages; 700 + if (total_npages > totalram_pages()) 701 + return ERR_PTR(-EINVAL); 702 + 746 703 lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; 747 - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) { 748 - pr_err("SEV: %lu locked pages exceed the lock limit of %lu.\n", locked, lock_limit); 704 + if (total_npages > lock_limit && !capable(CAP_IPC_LOCK)) { 705 + pr_err("SEV: %lu total pages would exceed the lock limit of %lu.\n", 706 + total_npages, lock_limit); 749 707 return ERR_PTR(-ENOMEM); 750 708 } 751 709 752 - if (WARN_ON_ONCE(npages > INT_MAX)) 753 - return ERR_PTR(-EINVAL); 754 - 755 - /* Avoid using vmalloc for smaller buffers. */ 756 - size = npages * sizeof(struct page *); 757 - if (size > PAGE_SIZE) 758 - pages = __vmalloc(size, GFP_KERNEL_ACCOUNT); 759 - else 760 - pages = kmalloc(size, GFP_KERNEL_ACCOUNT); 761 - 710 + /* 711 + * Don't WARN if the kernel (rightly) thinks the total size is absurd, 712 + * i.e. rely on the kernel to reject outrageous range sizes. The above 713 + * check on the number of pages is purely to avoid truncation as 714 + * pin_user_pages_fast() takes the number of pages as a 32-bit int. 
715 + */ 716 + pages = kvzalloc_objs(*pages, npages, GFP_KERNEL_ACCOUNT | __GFP_NOWARN); 762 717 if (!pages) 763 718 return ERR_PTR(-ENOMEM); 764 719 ··· 773 724 } 774 725 775 726 *n = npages; 776 - sev->pages_locked = locked; 727 + sev->pages_locked = total_npages; 777 728 778 729 return pages; 779 730 ··· 931 882 u8 *d; 932 883 int i; 933 884 885 + lockdep_assert_held(&vcpu->mutex); 886 + 887 + if (vcpu->arch.guest_state_protected) 888 + return -EINVAL; 889 + 934 890 /* Check some debug related fields before encrypting the VMSA */ 935 891 if (svm->vcpu.guest_debug || (svm->vmcb->save.dr7 & ~DR7_FIXED_1)) 936 892 return -EINVAL; ··· 1081 1027 if (!sev_es_guest(kvm)) 1082 1028 return -ENOTTY; 1083 1029 1030 + if (kvm_is_vcpu_creation_in_progress(kvm)) 1031 + return -EBUSY; 1032 + 1033 + ret = kvm_lock_all_vcpus(kvm); 1034 + if (ret) 1035 + return ret; 1036 + 1084 1037 kvm_for_each_vcpu(i, vcpu, kvm) { 1085 - ret = mutex_lock_killable(&vcpu->mutex); 1086 - if (ret) 1087 - return ret; 1088 - 1089 1038 ret = __sev_launch_update_vmsa(kvm, vcpu, &argp->error); 1090 - 1091 - mutex_unlock(&vcpu->mutex); 1092 1039 if (ret) 1093 - return ret; 1040 + break; 1094 1041 } 1095 1042 1096 - return 0; 1043 + kvm_unlock_all_vcpus(kvm); 1044 + return ret; 1097 1045 } 1098 1046 1099 1047 static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp) ··· 2103 2047 struct kvm_vcpu *src_vcpu; 2104 2048 unsigned long i; 2105 2049 2106 - if (src->created_vcpus != atomic_read(&src->online_vcpus) || 2107 - dst->created_vcpus != atomic_read(&dst->online_vcpus)) 2050 + if (kvm_is_vcpu_creation_in_progress(src) || 2051 + kvm_is_vcpu_creation_in_progress(dst)) 2108 2052 return -EBUSY; 2109 2053 2110 2054 if (!sev_es_guest(src)) ··· 2415 2359 struct kvm_memory_slot *memslot; 2416 2360 long npages, count; 2417 2361 void __user *src; 2418 - int ret = 0; 2419 2362 2420 2363 if (!sev_snp_guest(kvm) || !sev->snp_context) 2421 2364 return -EINVAL; ··· 2459 2404 * initial expected state and better guard against unexpected 2460 2405 * situations. 
2461 2406 */ 2462 - mutex_lock(&kvm->slots_lock); 2407 + guard(mutex)(&kvm->slots_lock); 2463 2408 2464 2409 memslot = gfn_to_memslot(kvm, params.gfn_start); 2465 - if (!kvm_slot_has_gmem(memslot)) { 2466 - ret = -EINVAL; 2467 - goto out; 2468 - } 2410 + if (!kvm_slot_has_gmem(memslot)) 2411 + return -EINVAL; 2469 2412 2470 2413 sev_populate_args.sev_fd = argp->sev_fd; 2471 2414 sev_populate_args.type = params.type; ··· 2474 2421 argp->error = sev_populate_args.fw_error; 2475 2422 pr_debug("%s: kvm_gmem_populate failed, ret %ld (fw_error %d)\n", 2476 2423 __func__, count, argp->error); 2477 - ret = -EIO; 2478 - } else { 2479 - params.gfn_start += count; 2480 - params.len -= count * PAGE_SIZE; 2481 - if (params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO) 2482 - params.uaddr += count * PAGE_SIZE; 2483 - 2484 - ret = 0; 2485 - if (copy_to_user(u64_to_user_ptr(argp->data), &params, sizeof(params))) 2486 - ret = -EFAULT; 2424 + return -EIO; 2487 2425 } 2488 2426 2489 - out: 2490 - mutex_unlock(&kvm->slots_lock); 2427 + params.gfn_start += count; 2428 + params.len -= count * PAGE_SIZE; 2429 + if (params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO) 2430 + params.uaddr += count * PAGE_SIZE; 2491 2431 2492 - return ret; 2432 + if (copy_to_user(u64_to_user_ptr(argp->data), &params, sizeof(params))) 2433 + return -EFAULT; 2434 + 2435 + return 0; 2493 2436 } 2494 2437 2495 2438 static int snp_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) ··· 2496 2447 unsigned long i; 2497 2448 int ret; 2498 2449 2450 + if (kvm_is_vcpu_creation_in_progress(kvm)) 2451 + return -EBUSY; 2452 + 2453 + ret = kvm_lock_all_vcpus(kvm); 2454 + if (ret) 2455 + return ret; 2456 + 2499 2457 data.gctx_paddr = __psp_pa(sev->snp_context); 2500 2458 data.page_type = SNP_PAGE_TYPE_VMSA; 2501 2459 ··· 2512 2456 2513 2457 ret = sev_es_sync_vmsa(svm); 2514 2458 if (ret) 2515 - return ret; 2459 + goto out; 2516 2460 2517 2461 /* Transition the VMSA page to a firmware state. */ 2518 2462 ret = rmp_make_private(pfn, INITIAL_VMSA_GPA, PG_LEVEL_4K, sev->asid, true); 2519 2463 if (ret) 2520 - return ret; 2464 + goto out; 2521 2465 2522 2466 /* Issue the SNP command to encrypt the VMSA */ 2523 2467 data.address = __sme_pa(svm->sev_es.vmsa); ··· 2526 2470 if (ret) { 2527 2471 snp_page_reclaim(kvm, pfn); 2528 2472 2529 - return ret; 2473 + goto out; 2530 2474 } 2531 2475 2532 2476 svm->vcpu.arch.guest_state_protected = true; ··· 2540 2484 svm_enable_lbrv(vcpu); 2541 2485 } 2542 2486 2543 - return 0; 2487 + out: 2488 + kvm_unlock_all_vcpus(kvm); 2489 + return ret; 2544 2490 } 2545 2491 2546 2492 static int snp_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) ··· 2645 2587 if (copy_from_user(&sev_cmd, argp, sizeof(struct kvm_sev_cmd))) 2646 2588 return -EFAULT; 2647 2589 2648 - mutex_lock(&kvm->lock); 2590 + guard(mutex)(&kvm->lock); 2649 2591 2650 2592 /* Only the enc_context_owner handles some memory enc operations. */ 2651 2593 if (is_mirroring_enc_context(kvm) && 2652 - !is_cmd_allowed_from_mirror(sev_cmd.id)) { 2653 - r = -EINVAL; 2654 - goto out; 2655 - } 2594 + !is_cmd_allowed_from_mirror(sev_cmd.id)) 2595 + return -EINVAL; 2656 2596 2657 2597 /* 2658 2598 * Once KVM_SEV_INIT2 initializes a KVM instance as an SNP guest, only 2659 2599 * allow the use of SNP-specific commands. 
2660 2600 */ 2661 - if (sev_snp_guest(kvm) && sev_cmd.id < KVM_SEV_SNP_LAUNCH_START) { 2662 - r = -EPERM; 2663 - goto out; 2664 - } 2601 + if (sev_snp_guest(kvm) && sev_cmd.id < KVM_SEV_SNP_LAUNCH_START) 2602 + return -EPERM; 2665 2603 2666 2604 switch (sev_cmd.id) { 2667 2605 case KVM_SEV_ES_INIT: 2668 - if (!sev_es_enabled) { 2669 - r = -ENOTTY; 2670 - goto out; 2671 - } 2606 + if (!sev_es_enabled) 2607 + return -ENOTTY; 2672 2608 fallthrough; 2673 2609 case KVM_SEV_INIT: 2674 2610 r = sev_guest_init(kvm, &sev_cmd); ··· 2734 2682 r = snp_enable_certs(kvm); 2735 2683 break; 2736 2684 default: 2737 - r = -EINVAL; 2738 - goto out; 2685 + return -EINVAL; 2739 2686 } 2740 2687 2741 2688 if (copy_to_user(argp, &sev_cmd, sizeof(struct kvm_sev_cmd))) 2742 2689 r = -EFAULT; 2743 2690 2744 - out: 2745 - mutex_unlock(&kvm->lock); 2746 2691 return r; 2747 2692 } 2748 2693 ··· 2750 2701 struct enc_region *region; 2751 2702 int ret = 0; 2752 2703 2704 + guard(mutex)(&kvm->lock); 2705 + 2753 2706 if (!sev_guest(kvm)) 2754 2707 return -ENOTTY; 2755 2708 ··· 2759 2708 if (is_mirroring_enc_context(kvm)) 2760 2709 return -EINVAL; 2761 2710 2762 - if (range->addr > ULONG_MAX || range->size > ULONG_MAX) 2763 - return -EINVAL; 2764 - 2765 2711 region = kzalloc_obj(*region, GFP_KERNEL_ACCOUNT); 2766 2712 if (!region) 2767 2713 return -ENOMEM; 2768 2714 2769 - mutex_lock(&kvm->lock); 2770 2715 region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages, 2771 2716 FOLL_WRITE | FOLL_LONGTERM); 2772 2717 if (IS_ERR(region->pages)) { 2773 2718 ret = PTR_ERR(region->pages); 2774 - mutex_unlock(&kvm->lock); 2775 2719 goto e_free; 2776 2720 } 2777 2721 ··· 2784 2738 region->size = range->size; 2785 2739 2786 2740 list_add_tail(&region->list, &sev->regions_list); 2787 - mutex_unlock(&kvm->lock); 2788 - 2789 2741 return ret; 2790 2742 2791 2743 e_free: ··· 2819 2775 struct kvm_enc_region *range) 2820 2776 { 2821 2777 struct enc_region *region; 2822 - int ret; 2823 2778 2824 2779 /* If kvm is mirroring encryption context it isn't responsible for it */ 2825 2780 if (is_mirroring_enc_context(kvm)) 2826 2781 return -EINVAL; 2827 2782 2828 - mutex_lock(&kvm->lock); 2783 + guard(mutex)(&kvm->lock); 2829 2784 2830 - if (!sev_guest(kvm)) { 2831 - ret = -ENOTTY; 2832 - goto failed; 2833 - } 2785 + if (!sev_guest(kvm)) 2786 + return -ENOTTY; 2834 2787 2835 2788 region = find_enc_region(kvm, range); 2836 - if (!region) { 2837 - ret = -EINVAL; 2838 - goto failed; 2839 - } 2789 + if (!region) 2790 + return -EINVAL; 2840 2791 2841 2792 sev_writeback_caches(kvm); 2842 2793 2843 2794 __unregister_enc_region_locked(kvm, region); 2844 2795 2845 - mutex_unlock(&kvm->lock); 2846 2796 return 0; 2847 - 2848 - failed: 2849 - mutex_unlock(&kvm->lock); 2850 - return ret; 2851 2797 } 2852 2798 2853 2799 int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd) ··· 2930 2896 sev->snp_context = NULL; 2931 2897 2932 2898 return 0; 2899 + } 2900 + 2901 + void sev_vm_init(struct kvm *kvm) 2902 + { 2903 + switch (kvm->arch.vm_type) { 2904 + case KVM_X86_DEFAULT_VM: 2905 + case KVM_X86_SW_PROTECTED_VM: 2906 + break; 2907 + case KVM_X86_SNP_VM: 2908 + kvm->arch.has_private_mem = true; 2909 + fallthrough; 2910 + case KVM_X86_SEV_ES_VM: 2911 + kvm->arch.has_protected_state = true; 2912 + fallthrough; 2913 + case KVM_X86_SEV_VM: 2914 + kvm->arch.pre_fault_allowed = !kvm->arch.has_private_mem; 2915 + to_kvm_sev_info(kvm)->need_init = true; 2916 + break; 2917 + default: 2918 + WARN_ONCE(1, "Unsupported VM type %u", 
kvm->arch.vm_type); 2919 + break; 2920 + } 2933 2921 } 2934 2922 2935 2923 void sev_vm_destroy(struct kvm *kvm) ··· 3300 3244 * With SNP+gmem, private/encrypted memory is unreachable via the 3301 3245 * hva-based mmu notifiers, i.e. these events are explicitly scoped to 3302 3246 * shared pages, where there's no need to flush caches. 3247 + * 3248 + * Checking for SEV+ outside of kvm->lock is safe as __sev_guest_init() 3249 + * can only be done before vCPUs are created, caches can be incoherent 3250 + * if and only if a vCPU was run, and either this task will see the VM 3251 + * as being SEV+ or the vCPU won't be to access the memory (because of 3252 + * the in-progress invalidation). 3303 3253 */ 3304 - if (!sev_guest(kvm) || sev_snp_guest(kvm)) 3254 + if (!____sev_guest(kvm) || ____sev_snp_guest(kvm)) 3305 3255 return; 3306 3256 3307 3257 sev_writeback_caches(kvm); ··· 3317 3255 { 3318 3256 struct vcpu_svm *svm; 3319 3257 3320 - if (!sev_es_guest(vcpu->kvm)) 3258 + if (!is_sev_es_guest(vcpu)) 3321 3259 return; 3322 3260 3323 3261 svm = to_svm(vcpu); ··· 3327 3265 * a guest-owned page. Transition the page to hypervisor state before 3328 3266 * releasing it back to the system. 3329 3267 */ 3330 - if (sev_snp_guest(vcpu->kvm)) { 3268 + if (is_sev_snp_guest(vcpu)) { 3331 3269 u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT; 3332 3270 3333 3271 if (kvm_rmp_make_shared(vcpu->kvm, pfn, PG_LEVEL_4K)) ··· 3528 3466 goto vmgexit_err; 3529 3467 break; 3530 3468 case SVM_VMGEXIT_AP_CREATION: 3531 - if (!sev_snp_guest(vcpu->kvm)) 3469 + if (!is_sev_snp_guest(vcpu)) 3532 3470 goto vmgexit_err; 3533 3471 if (lower_32_bits(control->exit_info_1) != SVM_VMGEXIT_AP_DESTROY) 3534 3472 if (!kvm_ghcb_rax_is_valid(svm)) ··· 3542 3480 case SVM_VMGEXIT_TERM_REQUEST: 3543 3481 break; 3544 3482 case SVM_VMGEXIT_PSC: 3545 - if (!sev_snp_guest(vcpu->kvm) || !kvm_ghcb_sw_scratch_is_valid(svm)) 3483 + if (!is_sev_snp_guest(vcpu) || !kvm_ghcb_sw_scratch_is_valid(svm)) 3546 3484 goto vmgexit_err; 3547 3485 break; 3548 3486 case SVM_VMGEXIT_GUEST_REQUEST: 3549 3487 case SVM_VMGEXIT_EXT_GUEST_REQUEST: 3550 - if (!sev_snp_guest(vcpu->kvm) || 3488 + if (!is_sev_snp_guest(vcpu) || 3551 3489 !PAGE_ALIGNED(control->exit_info_1) || 3552 3490 !PAGE_ALIGNED(control->exit_info_2) || 3553 3491 control->exit_info_1 == control->exit_info_2) ··· 3621 3559 int pre_sev_run(struct vcpu_svm *svm, int cpu) 3622 3560 { 3623 3561 struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); 3624 - struct kvm *kvm = svm->vcpu.kvm; 3562 + struct kvm_vcpu *vcpu = &svm->vcpu; 3563 + struct kvm *kvm = vcpu->kvm; 3625 3564 unsigned int asid = sev_get_asid(kvm); 3626 3565 3627 3566 /* ··· 3630 3567 * VMSA, e.g. if userspace forces the vCPU to be RUNNABLE after an SNP 3631 3568 * AP Destroy event. 
3632 3569 */ 3633 - if (sev_es_guest(kvm) && !VALID_PAGE(svm->vmcb->control.vmsa_pa)) 3570 + if (is_sev_es_guest(vcpu) && !VALID_PAGE(svm->vmcb->control.vmsa_pa)) 3634 3571 return -EINVAL; 3635 3572 3636 3573 /* ··· 4176 4113 sev_ret_code fw_err = 0; 4177 4114 int ret; 4178 4115 4179 - if (!sev_snp_guest(kvm)) 4116 + if (!is_sev_snp_guest(&svm->vcpu)) 4180 4117 return -EINVAL; 4181 4118 4182 - mutex_lock(&sev->guest_req_mutex); 4119 + guard(mutex)(&sev->guest_req_mutex); 4183 4120 4184 - if (kvm_read_guest(kvm, req_gpa, sev->guest_req_buf, PAGE_SIZE)) { 4185 - ret = -EIO; 4186 - goto out_unlock; 4187 - } 4121 + if (kvm_read_guest(kvm, req_gpa, sev->guest_req_buf, PAGE_SIZE)) 4122 + return -EIO; 4188 4123 4189 4124 data.gctx_paddr = __psp_pa(sev->snp_context); 4190 4125 data.req_paddr = __psp_pa(sev->guest_req_buf); ··· 4195 4134 */ 4196 4135 ret = sev_issue_cmd(kvm, SEV_CMD_SNP_GUEST_REQUEST, &data, &fw_err); 4197 4136 if (ret && !fw_err) 4198 - goto out_unlock; 4137 + return ret; 4199 4138 4200 - if (kvm_write_guest(kvm, resp_gpa, sev->guest_resp_buf, PAGE_SIZE)) { 4201 - ret = -EIO; 4202 - goto out_unlock; 4203 - } 4139 + if (kvm_write_guest(kvm, resp_gpa, sev->guest_resp_buf, PAGE_SIZE)) 4140 + return -EIO; 4204 4141 4205 4142 /* No action is requested *from KVM* if there was a firmware error. */ 4206 4143 svm_vmgexit_no_action(svm, SNP_GUEST_ERR(0, fw_err)); 4207 4144 4208 - ret = 1; /* resume guest */ 4209 - 4210 - out_unlock: 4211 - mutex_unlock(&sev->guest_req_mutex); 4212 - return ret; 4145 + /* resume guest */ 4146 + return 1; 4213 4147 } 4214 4148 4215 4149 static int snp_req_certs_err(struct vcpu_svm *svm, u32 vmm_error) ··· 4239 4183 4240 4184 static int snp_handle_ext_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa) 4241 4185 { 4242 - struct kvm *kvm = svm->vcpu.kvm; 4186 + struct kvm_vcpu *vcpu = &svm->vcpu; 4187 + struct kvm *kvm = vcpu->kvm; 4188 + 4243 4189 u8 msg_type; 4244 4190 4245 - if (!sev_snp_guest(kvm)) 4191 + if (!is_sev_snp_guest(vcpu)) 4246 4192 return -EINVAL; 4247 4193 4248 4194 if (kvm_read_guest(kvm, req_gpa + offsetof(struct snp_guest_msg_hdr, msg_type), ··· 4263 4205 */ 4264 4206 if (msg_type == SNP_MSG_REPORT_REQ) { 4265 4207 struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; 4266 - struct kvm_vcpu *vcpu = &svm->vcpu; 4267 4208 u64 data_npages; 4268 4209 gpa_t data_gpa; 4269 4210 ··· 4379 4322 GHCB_MSR_INFO_MASK, GHCB_MSR_INFO_POS); 4380 4323 break; 4381 4324 case GHCB_MSR_PREF_GPA_REQ: 4382 - if (!sev_snp_guest(vcpu->kvm)) 4325 + if (!is_sev_snp_guest(vcpu)) 4383 4326 goto out_terminate; 4384 4327 4385 4328 set_ghcb_msr_bits(svm, GHCB_MSR_PREF_GPA_NONE, GHCB_MSR_GPA_VALUE_MASK, ··· 4390 4333 case GHCB_MSR_REG_GPA_REQ: { 4391 4334 u64 gfn; 4392 4335 4393 - if (!sev_snp_guest(vcpu->kvm)) 4336 + if (!is_sev_snp_guest(vcpu)) 4394 4337 goto out_terminate; 4395 4338 4396 4339 gfn = get_ghcb_msr_bits(svm, GHCB_MSR_GPA_VALUE_MASK, ··· 4405 4348 break; 4406 4349 } 4407 4350 case GHCB_MSR_PSC_REQ: 4408 - if (!sev_snp_guest(vcpu->kvm)) 4351 + if (!is_sev_snp_guest(vcpu)) 4409 4352 goto out_terminate; 4410 4353 4411 4354 ret = snp_begin_psc_msr(svm, control->ghcb_gpa); ··· 4478 4421 sev_es_sync_from_ghcb(svm); 4479 4422 4480 4423 /* SEV-SNP guest requires that the GHCB GPA must be registered */ 4481 - if (sev_snp_guest(svm->vcpu.kvm) && !ghcb_gpa_is_registered(svm, ghcb_gpa)) { 4424 + if (is_sev_snp_guest(vcpu) && !ghcb_gpa_is_registered(svm, ghcb_gpa)) { 4482 4425 vcpu_unimpl(&svm->vcpu, "vmgexit: GHCB GPA [%#llx] is not registered.\n", ghcb_gpa); 
4483 4426 return -EINVAL; 4484 4427 } ··· 4728 4671 */ 4729 4672 clr_exception_intercept(svm, GP_VECTOR); 4730 4673 4731 - if (init_event && sev_snp_guest(vcpu->kvm)) 4674 + if (init_event && is_sev_snp_guest(vcpu)) 4732 4675 sev_snp_init_protected_guest_state(vcpu); 4733 4676 4734 - if (sev_es_guest(vcpu->kvm)) 4677 + if (is_sev_es_guest(vcpu)) 4735 4678 sev_es_init_vmcb(svm, init_event); 4736 4679 } 4737 4680 ··· 4742 4685 4743 4686 mutex_init(&svm->sev_es.snp_vmsa_mutex); 4744 4687 4745 - if (!sev_es_guest(vcpu->kvm)) 4688 + if (!is_sev_es_guest(vcpu)) 4746 4689 return 0; 4747 4690 4748 4691 /* ··· 4762 4705 4763 4706 void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa) 4764 4707 { 4765 - struct kvm *kvm = svm->vcpu.kvm; 4766 - 4767 4708 /* 4768 4709 * All host state for SEV-ES guests is categorized into three swap types 4769 4710 * based on how it is handled by hardware during a world switch: ··· 4800 4745 * loaded with the correct values *if* the CPU writes the MSRs. 4801 4746 */ 4802 4747 if (sev_vcpu_has_debug_swap(svm) || 4803 - (sev_snp_guest(kvm) && cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP))) { 4748 + (cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP) && 4749 + is_sev_snp_guest(&svm->vcpu))) { 4804 4750 hostsa->dr0_addr_mask = amd_get_dr_addr_mask(0); 4805 4751 hostsa->dr1_addr_mask = amd_get_dr_addr_mask(1); 4806 4752 hostsa->dr2_addr_mask = amd_get_dr_addr_mask(2); ··· 5165 5109 int error = 0; 5166 5110 int ret; 5167 5111 5168 - if (!sev_es_guest(vcpu->kvm)) 5112 + if (!is_sev_es_guest(vcpu)) 5169 5113 return NULL; 5170 5114 5171 5115 /* ··· 5178 5122 sev = to_kvm_sev_info(vcpu->kvm); 5179 5123 5180 5124 /* Check if the SEV policy allows debugging */ 5181 - if (sev_snp_guest(vcpu->kvm)) { 5125 + if (is_sev_snp_guest(vcpu)) { 5182 5126 if (!(sev->policy & SNP_POLICY_MASK_DEBUG)) 5183 5127 return NULL; 5184 5128 } else { ··· 5186 5130 return NULL; 5187 5131 } 5188 5132 5189 - if (sev_snp_guest(vcpu->kvm)) { 5133 + if (is_sev_snp_guest(vcpu)) { 5190 5134 struct sev_data_snp_dbg dbg = {0}; 5191 5135 5192 5136 vmsa = snp_alloc_firmware_page(__GFP_ZERO);
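
One detail worth calling out from the sev.c changes above is the reworked page-count math in sev_pin_memory(): because neither the user address nor the length has to be page aligned, the number of pages is derived from the first and last PFN rather than from ulen alone, and the result is then sanity-checked against INT_MAX and totalram_pages() instead of WARNing. A standalone sketch of just the counting step, with PAGE_SIZE and PFN_DOWN redefined locally and the example values picked purely for illustration:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PFN_DOWN(x)	((x) >> PAGE_SHIFT)

/* Pages spanned by [uaddr, uaddr + ulen), even when unaligned. */
static unsigned long range_npages(uint64_t uaddr, uint64_t ulen)
{
	return PFN_DOWN(uaddr + ulen - 1) - PFN_DOWN(uaddr) + 1;
}

int main(void)
{
	/* 8 KiB starting 0x100 bytes into a page touches three pages. */
	printf("%lu\n", range_npages(0x1100, 2 * PAGE_SIZE));

	/* The aligned case degenerates to ulen / PAGE_SIZE. */
	printf("%lu\n", range_npages(0x2000, 2 * PAGE_SIZE));
	return 0;
}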
+134 -103
arch/x86/kvm/svm/svm.c
··· 79 79 * are published and we know what the new status bits are 80 80 */ 81 81 static uint64_t osvw_len = 4, osvw_status; 82 + static DEFINE_SPINLOCK(osvw_lock); 82 83 83 84 static DEFINE_PER_CPU(u64, current_tsc_ratio); 84 85 ··· 257 256 * Never intercept #GP for SEV guests, KVM can't 258 257 * decrypt guest memory to workaround the erratum. 259 258 */ 260 - if (svm_gp_erratum_intercept && !sev_guest(vcpu->kvm)) 259 + if (svm_gp_erratum_intercept && !is_sev_guest(vcpu)) 261 260 set_exception_intercept(svm, GP_VECTOR); 262 261 } 263 262 ··· 301 300 * SEV-ES does not expose the next RIP. The RIP update is controlled by 302 301 * the type of exit and the #VC handler in the guest. 303 302 */ 304 - if (sev_es_guest(vcpu->kvm)) 303 + if (is_sev_es_guest(vcpu)) 305 304 goto done; 306 305 307 306 if (nrips && svm->vmcb->control.next_rip != 0) { ··· 438 437 vcpu->arch.osvw.status |= 1; 439 438 } 440 439 440 + static void svm_init_os_visible_workarounds(void) 441 + { 442 + u64 len, status; 443 + 444 + /* 445 + * Get OS-Visible Workarounds (OSVW) bits. 446 + * 447 + * Note that it is possible to have a system with mixed processor 448 + * revisions and therefore different OSVW bits. If bits are not the same 449 + * on different processors then choose the worst case (i.e. if erratum 450 + * is present on one processor and not on another then assume that the 451 + * erratum is present everywhere). 452 + * 453 + * Note #2! The OSVW MSRs are used to communciate that an erratum is 454 + * NOT present! Software must assume erratum as present if its bit is 455 + * set in OSVW_STATUS *or* the bit number exceeds OSVW_ID_LENGTH. If 456 + * either RDMSR fails, simply zero out the length to treat all errata 457 + * as being present. Similarly, use the *minimum* length across all 458 + * CPUs, not the maximum length. 459 + * 460 + * If the length is zero, then is KVM already treating all errata as 461 + * being present and there's nothing left to do. 462 + */ 463 + if (!osvw_len) 464 + return; 465 + 466 + if (!this_cpu_has(X86_FEATURE_OSVW) || 467 + native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &len) || 468 + native_read_msr_safe(MSR_AMD64_OSVW_STATUS, &status)) 469 + len = status = 0; 470 + 471 + if (status == READ_ONCE(osvw_status) && len >= READ_ONCE(osvw_len)) 472 + return; 473 + 474 + guard(spinlock)(&osvw_lock); 475 + 476 + if (len < osvw_len) 477 + osvw_len = len; 478 + osvw_status |= status; 479 + osvw_status &= (1ULL << osvw_len) - 1; 480 + } 481 + 441 482 static bool __kvm_is_svm_supported(void) 442 483 { 443 484 int cpu = smp_processor_id(); ··· 581 538 __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT); 582 539 } 583 540 584 - /* 585 - * Get OSVW bits. 586 - * 587 - * Note that it is possible to have a system with mixed processor 588 - * revisions and therefore different OSVW bits. If bits are not the same 589 - * on different processors then choose the worst case (i.e. if erratum 590 - * is present on one processor and not on another then assume that the 591 - * erratum is present everywhere). 
592 - */ 593 - if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) { 594 - u64 len, status = 0; 595 - int err; 596 - 597 - err = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &len); 598 - if (!err) 599 - err = native_read_msr_safe(MSR_AMD64_OSVW_STATUS, &status); 600 - 601 - if (err) 602 - osvw_status = osvw_len = 0; 603 - else { 604 - if (len < osvw_len) 605 - osvw_len = len; 606 - osvw_status |= status; 607 - osvw_status &= (1ULL << osvw_len) - 1; 608 - } 609 - } else 610 - osvw_status = osvw_len = 0; 541 + svm_init_os_visible_workarounds(); 611 542 612 543 svm_init_erratum_383(); 613 544 ··· 734 717 svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTINTFROMIP, MSR_TYPE_RW, intercept); 735 718 svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTINTTOIP, MSR_TYPE_RW, intercept); 736 719 737 - if (sev_es_guest(vcpu->kvm)) 720 + if (is_sev_es_guest(vcpu)) 738 721 svm_set_intercept_for_msr(vcpu, MSR_IA32_DEBUGCTLMSR, MSR_TYPE_RW, intercept); 739 722 740 723 svm->lbr_msrs_intercepted = intercept; ··· 844 827 svm_set_intercept_for_msr(vcpu, MSR_IA32_PL3_SSP, MSR_TYPE_RW, !shstk_enabled); 845 828 } 846 829 847 - if (sev_es_guest(vcpu->kvm)) 830 + if (is_sev_es_guest(vcpu)) 848 831 sev_es_recalc_msr_intercepts(vcpu); 849 832 850 833 svm_recalc_pmu_msr_intercepts(vcpu); ··· 868 851 869 852 static void __svm_disable_lbrv(struct kvm_vcpu *vcpu) 870 853 { 871 - KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm); 854 + KVM_BUG_ON(is_sev_es_guest(vcpu), vcpu->kvm); 872 855 to_svm(vcpu)->vmcb->control.misc_ctl2 &= ~SVM_MISC2_ENABLE_V_LBR; 873 856 } 874 857 ··· 1242 1225 if (vcpu->kvm->arch.bus_lock_detection_enabled) 1243 1226 svm_set_intercept(svm, INTERCEPT_BUSLOCK); 1244 1227 1245 - if (sev_guest(vcpu->kvm)) 1228 + if (is_sev_guest(vcpu)) 1246 1229 sev_init_vmcb(svm, init_event); 1247 1230 1248 1231 svm_hv_init_vmcb(vmcb); ··· 1416 1399 struct vcpu_svm *svm = to_svm(vcpu); 1417 1400 struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu); 1418 1401 1419 - if (sev_es_guest(vcpu->kvm)) 1402 + if (is_sev_es_guest(vcpu)) 1420 1403 sev_es_unmap_ghcb(svm); 1421 1404 1422 1405 if (svm->guest_state_loaded) ··· 1427 1410 * or subsequent vmload of host save area. 1428 1411 */ 1429 1412 vmsave(sd->save_area_pa); 1430 - if (sev_es_guest(vcpu->kvm)) 1413 + if (is_sev_es_guest(vcpu)) 1431 1414 sev_es_prepare_switch_to_guest(svm, sev_es_host_save_area(sd)); 1432 1415 1433 1416 if (tsc_scaling) ··· 1440 1423 * all CPUs support TSC_AUX virtualization). 1441 1424 */ 1442 1425 if (likely(tsc_aux_uret_slot >= 0) && 1443 - (!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm))) 1426 + (!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !is_sev_es_guest(vcpu))) 1444 1427 kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull); 1445 1428 1446 1429 if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE) && ··· 1507 1490 { 1508 1491 struct vmcb *vmcb = to_svm(vcpu)->vmcb; 1509 1492 1510 - return sev_es_guest(vcpu->kvm) 1493 + return is_sev_es_guest(vcpu) 1511 1494 ? vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK 1512 1495 : kvm_get_rflags(vcpu) & X86_EFLAGS_IF; 1513 1496 } ··· 1741 1724 * contents of the VMSA, and future VMCB save area updates won't be 1742 1725 * seen. 1743 1726 */ 1744 - if (sev_es_guest(vcpu->kvm)) { 1727 + if (is_sev_es_guest(vcpu)) { 1745 1728 svm->vmcb->save.cr3 = cr3; 1746 1729 vmcb_mark_dirty(svm->vmcb, VMCB_CR); 1747 1730 } ··· 1796 1779 * SEV-ES guests must always keep the CR intercepts cleared. CR 1797 1780 * tracking is done using the CR write traps. 
1798 1781 */ 1799 - if (sev_es_guest(vcpu->kvm)) 1782 + if (is_sev_es_guest(vcpu)) 1800 1783 return; 1801 1784 1802 1785 if (hcr0 == cr0) { ··· 1907 1890 { 1908 1891 struct vcpu_svm *svm = to_svm(vcpu); 1909 1892 1910 - if (WARN_ON_ONCE(sev_es_guest(vcpu->kvm))) 1893 + if (WARN_ON_ONCE(is_sev_es_guest(vcpu))) 1911 1894 return; 1912 1895 1913 1896 get_debugreg(vcpu->arch.db[0], 0); ··· 1986 1969 } 1987 1970 } 1988 1971 1989 - if (sev_snp_guest(vcpu->kvm) && (error_code & PFERR_GUEST_ENC_MASK)) 1972 + if (is_sev_snp_guest(vcpu) && (error_code & PFERR_GUEST_ENC_MASK)) 1990 1973 error_code |= PFERR_PRIVATE_ACCESS; 1991 1974 1992 1975 trace_kvm_page_fault(vcpu, gpa, error_code); ··· 2131 2114 * The VM save area for SEV-ES guests has already been encrypted so it 2132 2115 * cannot be reinitialized, i.e. synthesizing INIT is futile. 2133 2116 */ 2134 - if (!sev_es_guest(vcpu->kvm)) { 2117 + if (!is_sev_es_guest(vcpu)) { 2135 2118 clear_page(svm->vmcb); 2136 2119 #ifdef CONFIG_KVM_SMM 2137 2120 if (is_smm(vcpu)) ··· 2158 2141 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; 2159 2142 2160 2143 if (string) { 2161 - if (sev_es_guest(vcpu->kvm)) 2144 + if (is_sev_es_guest(vcpu)) 2162 2145 return sev_es_string_io(svm, size, port, in); 2163 2146 else 2164 2147 return kvm_emulate_instruction(vcpu, 0); ··· 2472 2455 2473 2456 static void svm_clr_iret_intercept(struct vcpu_svm *svm) 2474 2457 { 2475 - if (!sev_es_guest(svm->vcpu.kvm)) 2458 + if (!is_sev_es_guest(&svm->vcpu)) 2476 2459 svm_clr_intercept(svm, INTERCEPT_IRET); 2477 2460 } 2478 2461 2479 2462 static void svm_set_iret_intercept(struct vcpu_svm *svm) 2480 2463 { 2481 - if (!sev_es_guest(svm->vcpu.kvm)) 2464 + if (!is_sev_es_guest(&svm->vcpu)) 2482 2465 svm_set_intercept(svm, INTERCEPT_IRET); 2483 2466 } 2484 2467 ··· 2486 2469 { 2487 2470 struct vcpu_svm *svm = to_svm(vcpu); 2488 2471 2489 - WARN_ON_ONCE(sev_es_guest(vcpu->kvm)); 2472 + WARN_ON_ONCE(is_sev_es_guest(vcpu)); 2490 2473 2491 2474 ++vcpu->stat.nmi_window_exits; 2492 2475 svm->awaiting_iret_completion = true; ··· 2660 2643 * SEV-ES intercepts DR7 only to disable guest debugging and the guest issues a VMGEXIT 2661 2644 * for DR7 write only. KVM cannot change DR7 (always swapped as type 'A') so return early. 2662 2645 */ 2663 - if (sev_es_guest(vcpu->kvm)) 2646 + if (is_sev_es_guest(vcpu)) 2664 2647 return 1; 2665 2648 2666 2649 if (vcpu->guest_debug == 0) { ··· 2762 2745 static bool sev_es_prevent_msr_access(struct kvm_vcpu *vcpu, 2763 2746 struct msr_data *msr_info) 2764 2747 { 2765 - return sev_es_guest(vcpu->kvm) && vcpu->arch.guest_state_protected && 2748 + return is_sev_es_guest(vcpu) && vcpu->arch.guest_state_protected && 2766 2749 msr_info->index != MSR_IA32_XSS && 2767 2750 !msr_write_intercepted(vcpu, msr_info->index); 2768 2751 } ··· 2892 2875 static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) 2893 2876 { 2894 2877 struct vcpu_svm *svm = to_svm(vcpu); 2895 - if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb)) 2878 + if (!err || !is_sev_es_guest(vcpu) || WARN_ON_ONCE(!svm->sev_es.ghcb)) 2896 2879 return kvm_complete_insn_gp(vcpu, err); 2897 2880 2898 2881 svm_vmgexit_inject_exception(svm, X86_TRAP_GP); ··· 3073 3056 * required in this case because TSC_AUX is restored on #VMEXIT 3074 3057 * from the host save area. 
3075 3058 */ 3076 - if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) && sev_es_guest(vcpu->kvm)) 3059 + if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) && is_sev_es_guest(vcpu)) 3077 3060 break; 3078 3061 3079 3062 /* ··· 3172 3155 kvm_make_request(KVM_REQ_EVENT, vcpu); 3173 3156 svm_clear_vintr(to_svm(vcpu)); 3174 3157 3175 - /* 3176 - * If not running nested, for AVIC, the only reason to end up here is ExtINTs. 3177 - * In this case AVIC was temporarily disabled for 3178 - * requesting the IRQ window and we have to re-enable it. 3179 - * 3180 - * If running nested, still remove the VM wide AVIC inhibit to 3181 - * support case in which the interrupt window was requested when the 3182 - * vCPU was not running nested. 3183 - 3184 - * All vCPUs which run still run nested, will remain to have their 3185 - * AVIC still inhibited due to per-cpu AVIC inhibition. 3186 - */ 3187 - kvm_clear_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN); 3188 - 3189 3158 ++vcpu->stat.irq_window_exits; 3190 3159 return 1; 3191 3160 } ··· 3184 3181 * vcpu->arch.preempted_in_kernel can never be true. Just 3185 3182 * set in_kernel to false as well. 3186 3183 */ 3187 - in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0; 3184 + in_kernel = !is_sev_es_guest(vcpu) && svm_get_cpl(vcpu) == 0; 3188 3185 3189 3186 grow_ple_window(vcpu); 3190 3187 ··· 3365 3362 3366 3363 guard(mutex)(&vmcb_dump_mutex); 3367 3364 3368 - vm_type = sev_snp_guest(vcpu->kvm) ? "SEV-SNP" : 3369 - sev_es_guest(vcpu->kvm) ? "SEV-ES" : 3370 - sev_guest(vcpu->kvm) ? "SEV" : "SVM"; 3365 + vm_type = is_sev_snp_guest(vcpu) ? "SEV-SNP" : 3366 + is_sev_es_guest(vcpu) ? "SEV-ES" : 3367 + is_sev_guest(vcpu) ? "SEV" : "SVM"; 3371 3368 3372 3369 pr_err("%s vCPU%u VMCB %p, last attempted VMRUN on CPU %d\n", 3373 3370 vm_type, vcpu->vcpu_id, svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu); ··· 3412 3409 pr_err("%-20s%016llx\n", "allowed_sev_features:", control->allowed_sev_features); 3413 3410 pr_err("%-20s%016llx\n", "guest_sev_features:", control->guest_sev_features); 3414 3411 3415 - if (sev_es_guest(vcpu->kvm)) { 3412 + if (is_sev_es_guest(vcpu)) { 3416 3413 save = sev_decrypt_vmsa(vcpu); 3417 3414 if (!save) 3418 3415 goto no_vmsa; ··· 3495 3492 "excp_from:", save->last_excp_from, 3496 3493 "excp_to:", save->last_excp_to); 3497 3494 3498 - if (sev_es_guest(vcpu->kvm)) { 3495 + if (is_sev_es_guest(vcpu)) { 3499 3496 struct sev_es_save_area *vmsa = (struct sev_es_save_area *)save; 3500 3497 3501 3498 pr_err("%-15s %016llx\n", ··· 3556 3553 } 3557 3554 3558 3555 no_vmsa: 3559 - if (sev_es_guest(vcpu->kvm)) 3556 + if (is_sev_es_guest(vcpu)) 3560 3557 sev_free_decrypted_vmsa(vcpu, save); 3561 3558 } 3562 3559 ··· 3645 3642 struct kvm_run *kvm_run = vcpu->run; 3646 3643 3647 3644 /* SEV-ES guests must use the CR write traps to track CR registers. 
*/ 3648 - if (!sev_es_guest(vcpu->kvm)) { 3645 + if (!is_sev_es_guest(vcpu)) { 3649 3646 if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE)) 3650 3647 vcpu->arch.cr0 = svm->vmcb->save.cr0; 3651 3648 if (npt_enabled) ··· 3707 3704 svm->current_vmcb->cpu = vcpu->cpu; 3708 3705 } 3709 3706 3710 - if (sev_guest(vcpu->kvm)) 3707 + if (is_sev_guest(vcpu)) 3711 3708 return pre_sev_run(svm, vcpu->cpu); 3712 3709 3713 3710 /* FIXME: handle wraparound of asid_generation */ ··· 3784 3781 type = SVM_EVTINJ_TYPE_SOFT; 3785 3782 } else { 3786 3783 type = SVM_EVTINJ_TYPE_INTR; 3784 + } 3785 + 3786 + /* 3787 + * If AVIC was inhibited in order to detect an IRQ window, and there's 3788 + * no other injectable interrupts pending or L2 is active (see below), 3789 + * then drop the inhibit as the window has served its purpose. 3790 + * 3791 + * If L2 is active, this path is reachable if L1 is not intercepting 3792 + * IRQs, i.e. if KVM is injecting L1 IRQs into L2. AVIC is locally 3793 + * inhibited while L2 is active; drop the VM-wide inhibit to optimize 3794 + * the case in which the interrupt window was requested while L1 was 3795 + * active (the vCPU was not running nested). 3796 + */ 3797 + if (svm->avic_irq_window && 3798 + (!kvm_cpu_has_injectable_intr(vcpu) || is_guest_mode(vcpu))) { 3799 + svm->avic_irq_window = false; 3800 + kvm_dec_apicv_irq_window_req(svm->vcpu.kvm); 3787 3801 } 3788 3802 3789 3803 trace_kvm_inj_virq(intr->nr, intr->soft, reinjected); ··· 3897 3877 * SEV-ES guests must always keep the CR intercepts cleared. CR 3898 3878 * tracking is done using the CR write traps. 3899 3879 */ 3900 - if (sev_es_guest(vcpu->kvm)) 3880 + if (is_sev_es_guest(vcpu)) 3901 3881 return; 3902 3882 3903 3883 if (nested_svm_virtualize_tpr(vcpu)) ··· 4033 4013 */ 4034 4014 if (vgif || gif_set(svm)) { 4035 4015 /* 4036 - * IRQ window is not needed when AVIC is enabled, 4037 - * unless we have pending ExtINT since it cannot be injected 4038 - * via AVIC. In such case, KVM needs to temporarily disable AVIC, 4039 - * and fallback to injecting IRQ via V_IRQ. 4016 + * KVM only enables IRQ windows when AVIC is enabled if there's 4017 + * pending ExtINT since it cannot be injected via AVIC (ExtINT 4018 + * bypasses the local APIC). V_IRQ is ignored by hardware when 4019 + * AVIC is enabled, and so KVM needs to temporarily disable 4020 + * AVIC in order to detect when it's ok to inject the ExtINT. 4040 4021 * 4041 - * If running nested, AVIC is already locally inhibited 4042 - * on this vCPU, therefore there is no need to request 4043 - * the VM wide AVIC inhibition. 4022 + * If running nested, AVIC is already locally inhibited on this 4023 + * vCPU (L2 vCPUs use a different MMU that never maps the AVIC 4024 + * backing page), therefore there is no need to increment the 4025 + * VM-wide AVIC inhibit. KVM will re-evaluate events when the 4026 + * vCPU exits to L1 and enable an IRQ window if the ExtINT is 4027 + * still pending. 4028 + * 4029 + * Note, the IRQ window inhibit needs to be updated even if 4030 + * AVIC is inhibited for a different reason, as KVM needs to 4031 + * keep AVIC inhibited if the other reason is cleared and there 4032 + * is still an injectable interrupt pending. 
4044 4033 */ 4045 - if (!is_guest_mode(vcpu)) 4046 - kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN); 4034 + if (enable_apicv && !svm->avic_irq_window && !is_guest_mode(vcpu)) { 4035 + svm->avic_irq_window = true; 4036 + kvm_inc_apicv_irq_window_req(vcpu->kvm); 4037 + } 4047 4038 4048 4039 svm_set_vintr(svm); 4049 4040 } ··· 4097 4066 * ignores SEV-ES guest writes to EFER.SVME *and* CLGI/STGI are not 4098 4067 * supported NAEs in the GHCB protocol. 4099 4068 */ 4100 - if (sev_es_guest(vcpu->kvm)) 4069 + if (is_sev_es_guest(vcpu)) 4101 4070 return; 4102 4071 4103 4072 if (!gif_set(svm)) { ··· 4339 4308 4340 4309 static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu) 4341 4310 { 4311 + #ifdef CONFIG_KVM_AMD_SEV 4342 4312 if (to_kvm_sev_info(vcpu->kvm)->need_init) 4343 4313 return -EINVAL; 4314 + #endif 4344 4315 4345 4316 return 1; 4346 4317 } ··· 4399 4366 4400 4367 amd_clear_divider(); 4401 4368 4402 - if (sev_es_guest(vcpu->kvm)) 4369 + if (is_sev_es_guest(vcpu)) 4403 4370 __svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted, 4404 4371 sev_es_host_save_area(sd)); 4405 4372 else ··· 4502 4469 if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL)) 4503 4470 x86_spec_ctrl_restore_host(svm->virt_spec_ctrl); 4504 4471 4505 - if (!sev_es_guest(vcpu->kvm)) { 4472 + if (!is_sev_es_guest(vcpu)) { 4506 4473 vcpu->arch.cr2 = svm->vmcb->save.cr2; 4507 4474 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; 4508 4475 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; ··· 4625 4592 case MSR_IA32_SMBASE: 4626 4593 if (!IS_ENABLED(CONFIG_KVM_SMM)) 4627 4594 return false; 4628 - /* SEV-ES guests do not support SMM, so report false */ 4629 - if (kvm && sev_es_guest(kvm)) 4595 + 4596 + #ifdef CONFIG_KVM_AMD_SEV 4597 + /* 4598 + * KVM can't access register state to emulate SMM for SEV-ES 4599 + * guests. Conusming stale data here is "fine", as KVM only 4600 + * checks for MSR_IA32_SMBASE support without a vCPU when 4601 + * userspace is querying KVM_CAP_X86_SMM. 4602 + */ 4603 + if (kvm && ____sev_es_guest(kvm)) 4630 4604 return false; 4605 + #endif 4631 4606 break; 4632 4607 default: 4633 4608 break; ··· 4670 4629 if (guest_cpuid_is_intel_compatible(vcpu)) 4671 4630 guest_cpu_cap_clear(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD); 4672 4631 4673 - if (sev_guest(vcpu->kvm)) 4632 + if (is_sev_guest(vcpu)) 4674 4633 sev_vcpu_after_set_cpuid(svm); 4675 4634 } 4676 4635 ··· 5066 5025 return X86EMUL_UNHANDLEABLE_VECTORING; 5067 5026 5068 5027 /* Emulation is always possible when KVM has access to all guest state. */ 5069 - if (!sev_guest(vcpu->kvm)) 5028 + if (!is_sev_guest(vcpu)) 5070 5029 return X86EMUL_CONTINUE; 5071 5030 5072 5031 /* #UD and #GP should never be intercepted for SEV guests. */ ··· 5078 5037 * Emulation is impossible for SEV-ES guests as KVM doesn't have access 5079 5038 * to guest register state. 
5080 5039 */ 5081 - if (sev_es_guest(vcpu->kvm)) 5040 + if (is_sev_es_guest(vcpu)) 5082 5041 return X86EMUL_RETRY_INSTR; 5083 5042 5084 5043 /* ··· 5215 5174 5216 5175 static void svm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) 5217 5176 { 5218 - if (!sev_es_guest(vcpu->kvm)) 5177 + if (!is_sev_es_guest(vcpu)) 5219 5178 return kvm_vcpu_deliver_sipi_vector(vcpu, vector); 5220 5179 5221 5180 sev_vcpu_deliver_sipi_vector(vcpu, vector); ··· 5231 5190 5232 5191 static int svm_vm_init(struct kvm *kvm) 5233 5192 { 5234 - int type = kvm->arch.vm_type; 5235 - 5236 - if (type != KVM_X86_DEFAULT_VM && 5237 - type != KVM_X86_SW_PROTECTED_VM) { 5238 - kvm->arch.has_protected_state = 5239 - (type == KVM_X86_SEV_ES_VM || type == KVM_X86_SNP_VM); 5240 - to_kvm_sev_info(kvm)->need_init = true; 5241 - 5242 - kvm->arch.has_private_mem = (type == KVM_X86_SNP_VM); 5243 - kvm->arch.pre_fault_allowed = !kvm->arch.has_private_mem; 5244 - } 5193 + sev_vm_init(kvm); 5245 5194 5246 5195 if (!pause_filter_count || !pause_filter_thresh) 5247 5196 kvm_disable_exits(kvm, KVM_X86_DISABLE_EXITS_PAUSE);
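
The svm.c hunk above moves the OS-visible workarounds (OSVW) probing into svm_init_os_visible_workarounds() and serializes it with a spinlock, so CPUs brought online in parallel can no longer clobber each other's updates; the merge rule is to take the minimum advertised length and OR the status bits so an erratum reported anywhere is treated as present everywhere. A small sketch of that merge rule with made-up per-CPU values (the locking itself is omitted):

#include <stdio.h>
#include <stdint.h>

struct osvw {
	uint64_t len;
	uint64_t status;
};

/* Fold one CPU's OSVW data into the global view: min length, OR status. */
static void osvw_merge(struct osvw *global, uint64_t len, uint64_t status)
{
	if (len < global->len)
		global->len = len;
	global->status |= status;
	global->status &= (1ULL << global->len) - 1;
}

int main(void)
{
	struct osvw global = { .len = 4, .status = 0 };

	osvw_merge(&global, 4, 0x2);	/* CPU0: erratum 1 present */
	osvw_merge(&global, 3, 0x4);	/* CPU1: shorter table     */

	printf("len=%llu status=%#llx\n",
	       (unsigned long long)global.len,
	       (unsigned long long)global.status);
	return 0;
}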
+29 -8
arch/x86/kvm/svm/svm.h
··· 92 92 /* TPR and CR2 are always written before VMRUN */ 93 93 #define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2)) 94 94 95 + #ifdef CONFIG_KVM_AMD_SEV 95 96 struct kvm_sev_info { 96 97 bool active; /* SEV enabled guest */ 97 98 bool es_active; /* SEV-ES enabled guest */ ··· 118 117 cpumask_var_t have_run_cpus; /* CPUs that have done VMRUN for this VM. */ 119 118 bool snp_certs_enabled; /* SNP certificate-fetching support. */ 120 119 }; 120 + #endif 121 121 122 122 struct kvm_svm { 123 123 struct kvm kvm; ··· 129 127 u64 *avic_physical_id_table; 130 128 struct hlist_node hnode; 131 129 130 + #ifdef CONFIG_KVM_AMD_SEV 132 131 struct kvm_sev_info sev_info; 132 + #endif 133 133 }; 134 134 135 135 struct kvm_vcpu; ··· 353 349 354 350 bool guest_state_loaded; 355 351 352 + bool avic_irq_window; 356 353 bool x2avic_msrs_intercepted; 357 354 bool lbr_msrs_intercepted; 358 355 ··· 383 378 return container_of(kvm, struct kvm_svm, kvm); 384 379 } 385 380 381 + #ifdef CONFIG_KVM_AMD_SEV 386 382 static __always_inline struct kvm_sev_info *to_kvm_sev_info(struct kvm *kvm) 387 383 { 388 384 return &to_kvm_svm(kvm)->sev_info; 389 385 } 390 386 391 - #ifdef CONFIG_KVM_AMD_SEV 392 - static __always_inline bool sev_guest(struct kvm *kvm) 387 + static __always_inline bool ____sev_guest(struct kvm *kvm) 393 388 { 394 389 return to_kvm_sev_info(kvm)->active; 395 390 } 396 - static __always_inline bool sev_es_guest(struct kvm *kvm) 391 + static __always_inline bool ____sev_es_guest(struct kvm *kvm) 397 392 { 398 393 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 399 394 400 395 return sev->es_active && !WARN_ON_ONCE(!sev->active); 401 396 } 402 397 403 - static __always_inline bool sev_snp_guest(struct kvm *kvm) 398 + static __always_inline bool ____sev_snp_guest(struct kvm *kvm) 404 399 { 405 400 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 406 401 407 402 return (sev->vmsa_features & SVM_SEV_FEAT_SNP_ACTIVE) && 408 - !WARN_ON_ONCE(!sev_es_guest(kvm)); 403 + !WARN_ON_ONCE(!____sev_es_guest(kvm)); 404 + } 405 + 406 + static __always_inline bool is_sev_guest(struct kvm_vcpu *vcpu) 407 + { 408 + return ____sev_guest(vcpu->kvm); 409 + } 410 + static __always_inline bool is_sev_es_guest(struct kvm_vcpu *vcpu) 411 + { 412 + return ____sev_es_guest(vcpu->kvm); 413 + } 414 + 415 + static __always_inline bool is_sev_snp_guest(struct kvm_vcpu *vcpu) 416 + { 417 + return ____sev_snp_guest(vcpu->kvm); 409 418 } 410 419 #else 411 - #define sev_guest(kvm) false 412 - #define sev_es_guest(kvm) false 413 - #define sev_snp_guest(kvm) false 420 + #define is_sev_guest(vcpu) false 421 + #define is_sev_es_guest(vcpu) false 422 + #define is_sev_snp_guest(vcpu) false 414 423 #endif 415 424 416 425 static inline bool ghcb_gpa_is_registered(struct vcpu_svm *svm, u64 val) ··· 942 923 943 924 int sev_vcpu_create(struct kvm_vcpu *vcpu); 944 925 void sev_free_vcpu(struct kvm_vcpu *vcpu); 926 + void sev_vm_init(struct kvm *kvm); 945 927 void sev_vm_destroy(struct kvm *kvm); 946 928 void __init sev_set_cpu_caps(void); 947 929 void __init sev_hardware_setup(void); ··· 969 949 970 950 static inline int sev_vcpu_create(struct kvm_vcpu *vcpu) { return 0; } 971 951 static inline void sev_free_vcpu(struct kvm_vcpu *vcpu) {} 952 + static inline void sev_vm_init(struct kvm *kvm) {} 972 953 static inline void sev_vm_destroy(struct kvm *kvm) {} 973 954 static inline void __init sev_set_cpu_caps(void) {} 974 955 static inline void __init sev_hardware_setup(void) {}
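
The svm.h hunk above renames the raw VM-scoped checks to ____sev_guest() and friends and adds vCPU-scoped is_sev_*_guest() wrappers, while sev.c wraps the VM-scoped versions with a lockdep assertion. A toy sketch of that layering, with the lockdep check stood in for by a plain assert() and all structures invented for the example:

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

struct vm {
	bool lock_held;		/* stand-in for lockdep_assert_held(&kvm->lock) */
	bool sev_active;
};

struct vcpu {
	struct vm *vm;
};

static bool ____sev_guest(struct vm *vm)
{
	return vm->sev_active;
}

/* VM-scoped query: only valid while the VM lock is held. */
static bool sev_guest(struct vm *vm)
{
	assert(vm->lock_held);
	return ____sev_guest(vm);
}

/* vCPU-scoped query: stable without the lock once the vCPU exists. */
static bool is_sev_guest(struct vcpu *vcpu)
{
	return ____sev_guest(vcpu->vm);
}

int main(void)
{
	struct vm vm = { .lock_held = true, .sev_active = true };
	struct vcpu vcpu = { .vm = &vm };

	printf("%d %d\n", sev_guest(&vm), is_sev_guest(&vcpu));
	return 0;
}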
+44 -1
arch/x86/kvm/x86.c
··· 11018 11018 11019 11019 old = new = kvm->arch.apicv_inhibit_reasons; 11020 11020 11021 - set_or_clear_apicv_inhibit(&new, reason, set); 11021 + if (reason != APICV_INHIBIT_REASON_IRQWIN) 11022 + set_or_clear_apicv_inhibit(&new, reason, set); 11023 + 11024 + set_or_clear_apicv_inhibit(&new, APICV_INHIBIT_REASON_IRQWIN, 11025 + atomic_read(&kvm->arch.apicv_nr_irq_window_req)); 11022 11026 11023 11027 if (!!old != !!new) { 11024 11028 /* ··· 11062 11058 up_write(&kvm->arch.apicv_update_lock); 11063 11059 } 11064 11060 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_or_clear_apicv_inhibit); 11061 + 11062 + void kvm_inc_or_dec_irq_window_inhibit(struct kvm *kvm, bool inc) 11063 + { 11064 + int add = inc ? 1 : -1; 11065 + 11066 + if (!enable_apicv) 11067 + return; 11068 + 11069 + /* 11070 + * IRQ windows are requested either because of ExtINT injections, or 11071 + * because APICv is already disabled/inhibited for another reason. 11072 + * While ExtINT injections are rare and should not happen while the 11073 + * vCPU is running its actual workload, it's worth avoiding thrashing 11074 + * if the IRQ window is being requested because APICv is already 11075 + * inhibited. So, toggle the actual inhibit (which requires taking 11076 + * the lock for write) if and only if there's no other inhibit. 11077 + * kvm_set_or_clear_apicv_inhibit() always evaluates the IRQ window 11078 + * count; thus the IRQ window inhibit call _will_ be lazily updated on 11079 + * the next call, if it ever happens. 11080 + */ 11081 + if (READ_ONCE(kvm->arch.apicv_inhibit_reasons) & ~BIT(APICV_INHIBIT_REASON_IRQWIN)) { 11082 + guard(rwsem_read)(&kvm->arch.apicv_update_lock); 11083 + if (READ_ONCE(kvm->arch.apicv_inhibit_reasons) & ~BIT(APICV_INHIBIT_REASON_IRQWIN)) { 11084 + atomic_add(add, &kvm->arch.apicv_nr_irq_window_req); 11085 + return; 11086 + } 11087 + } 11088 + 11089 + /* 11090 + * Strictly speaking, the lock is only needed if going 0->1 or 1->0, 11091 + * a la atomic_dec_and_mutex_lock. However, ExtINTs are rare and 11092 + * only target a single CPU, so that is the common case; do not 11093 + * bother eliding the down_write()/up_write() pair. 11094 + */ 11095 + guard(rwsem_write)(&kvm->arch.apicv_update_lock); 11096 + if (atomic_add_return(add, &kvm->arch.apicv_nr_irq_window_req) == inc) 11097 + __kvm_set_or_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_IRQWIN, inc); 11098 + } 11099 + EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_inc_or_dec_irq_window_inhibit); 11065 11100 11066 11101 static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) 11067 11102 {
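
kvm_inc_or_dec_irq_window_inhibit() above implements the counted IRQ-window inhibit described in the first bullet of the merge description: requests are tracked in an atomic counter, the counter is folded into the IRQWIN inhibit bit lazily, and the write side of apicv_update_lock is only taken for 0<->1 transitions when no other inhibit is already active. A simplified userspace sketch of the pattern; the bit values, lock type, and single read-locked check are illustrative simplifications, not the kernel's code:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define OTHER_INHIBITS	0x6	/* any inhibit bit other than IRQWIN (made up) */
#define IRQWIN_INHIBIT	0x1

static pthread_rwlock_t update_lock = PTHREAD_RWLOCK_INITIALIZER;
static atomic_int nr_irq_window_req;
static unsigned long inhibit_reasons;	/* written only under the write lock */

static void set_irqwin_inhibit(bool set)
{
	if (set)
		inhibit_reasons |= IRQWIN_INHIBIT;
	else
		inhibit_reasons &= ~IRQWIN_INHIBIT;
}

static void inc_or_dec_irq_window_inhibit(bool inc)
{
	int add = inc ? 1 : -1;

	/*
	 * If something else already inhibits APICv, just adjust the counter;
	 * the IRQWIN bit gets re-evaluated lazily on the next inhibit update.
	 */
	pthread_rwlock_rdlock(&update_lock);
	if (inhibit_reasons & OTHER_INHIBITS) {
		atomic_fetch_add(&nr_irq_window_req, add);
		pthread_rwlock_unlock(&update_lock);
		return;
	}
	pthread_rwlock_unlock(&update_lock);

	/* 0 <-> 1 transitions toggle the real inhibit under the write lock. */
	pthread_rwlock_wrlock(&update_lock);
	if (atomic_fetch_add(&nr_irq_window_req, add) + add == (inc ? 1 : 0))
		set_irqwin_inhibit(inc);
	pthread_rwlock_unlock(&update_lock);
}

int main(void)
{
	inc_or_dec_irq_window_inhibit(true);
	inc_or_dec_irq_window_inhibit(true);
	inc_or_dec_irq_window_inhibit(false);
	inc_or_dec_irq_window_inhibit(false);
	printf("reasons=%#lx req=%d\n", inhibit_reasons,
	       atomic_load(&nr_irq_window_req));
	return 0;
}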
+7
include/linux/kvm_host.h
··· 1030 1030 return NULL; 1031 1031 } 1032 1032 1033 + static inline bool kvm_is_vcpu_creation_in_progress(struct kvm *kvm) 1034 + { 1035 + lockdep_assert_held(&kvm->lock); 1036 + 1037 + return kvm->created_vcpus != atomic_read(&kvm->online_vcpus); 1038 + } 1039 + 1033 1040 void kvm_destroy_vcpus(struct kvm *kvm); 1034 1041 1035 1042 int kvm_trylock_all_vcpus(struct kvm *kvm);
-2
tools/testing/selftests/kvm/x86/sev_migrate_tests.c
··· 36 36 37 37 sev_vm_launch(vm, es ? SEV_POLICY_ES : 0); 38 38 39 - if (es) 40 - vm_sev_ioctl(vm, KVM_SEV_LAUNCH_UPDATE_VMSA, NULL); 41 39 return vm; 42 40 } 43 41