Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

+24 -9

Documentation/virt/kvm/api.txt

··· 586 586 Architectures: x86 587 587 Type: vcpu ioctl 588 588 Parameters: struct kvm_msrs (in) 589 - Returns: 0 on success, -1 on error 589 + Returns: number of msrs successfully set (see below), -1 on error 590 590 591 591 Writes model-specific registers to the vcpu. See KVM_GET_MSRS for the 592 592 data structures. ··· 594 594 Application code should set the 'nmsrs' member (which indicates the 595 595 size of the entries array), and the 'index' and 'data' members of each 596 596 array entry. 597 + 598 + It tries to set the MSRs in array entries[] one by one. If setting an MSR 599 + fails, e.g., due to setting reserved bits, the MSR isn't supported/emulated 600 + by KVM, etc..., it stops processing the MSR list and returns the number of 601 + MSRs that have been set successfully. 597 602 598 603 599 604 4.20 KVM_SET_CPUID ··· 758 753 use PPIs designated for specific cpus. The irq field is interpreted 759 754 like this: 760 755 761 - bits: | 31 ... 24 | 23 ... 16 | 15 ... 0 | 762 - field: | irq_type | vcpu_index | irq_id | 756 + bits: | 31 ... 28 | 27 ... 24 | 23 ... 16 | 15 ... 0 | 757 + field: | vcpu2_index | irq_type | vcpu_index | irq_id | 763 758 764 759 The irq_type field has the following values: 765 760 - irq_type[0]: out-of-kernel GIC: irq_id 0 is IRQ, irq_id 1 is FIQ ··· 770 765 (The irq_id field thus corresponds nicely to the IRQ ID in the ARM GIC specs) 771 766 772 767 In both cases, level is used to assert/deassert the line. 768 + 769 + When KVM_CAP_ARM_IRQ_LINE_LAYOUT_2 is supported, the target vcpu is 770 + identified as (256 * vcpu2_index + vcpu_index). Otherwise, vcpu2_index 771 + must be zero. 772 + 773 + Note that on arm/arm64, the KVM_CAP_IRQCHIP capability only conditions 774 + injection of interrupts for the in-kernel irqchip. KVM_IRQ_LINE can always 775 + be used for a userspace interrupt controller. 773 776 774 777 struct kvm_irq_level { 775 778 union { ··· 3092 3079 flag KVM_S390_MEMOP_F_INJECT_EXCEPTION is set in the "flags" field. 3093 3080 3094 3081 The start address of the memory region has to be specified in the "gaddr" 3095 - field, and the length of the region in the "size" field. "buf" is the buffer 3096 - supplied by the userspace application where the read data should be written 3097 - to for KVM_S390_MEMOP_LOGICAL_READ, or where the data that should be written 3098 - is stored for a KVM_S390_MEMOP_LOGICAL_WRITE. "buf" is unused and can be NULL 3099 - when KVM_S390_MEMOP_F_CHECK_ONLY is specified. "ar" designates the access 3100 - register number to be used. 3082 + field, and the length of the region in the "size" field (which must not 3083 + be 0). The maximum value for "size" can be obtained by checking the 3084 + KVM_CAP_S390_MEM_OP capability. "buf" is the buffer supplied by the 3085 + userspace application where the read data should be written to for 3086 + KVM_S390_MEMOP_LOGICAL_READ, or where the data that should be written is 3087 + stored for a KVM_S390_MEMOP_LOGICAL_WRITE. When KVM_S390_MEMOP_F_CHECK_ONLY 3088 + is specified, "buf" is unused and can be NULL. "ar" designates the access 3089 + register number to be used; the valid range is 0..15. 3101 3090 3102 3091 The "reserved" field is meant for future extensions. It is not used by 3103 3092 KVM with the currently defined set of flags.

+2 -2

Documentation/virt/kvm/mmu.txt

··· 294 294 - walk shadow page table 295 295 - check for valid generation number in the spte (see "Fast invalidation of 296 296 MMIO sptes" below) 297 - - cache the information to vcpu->arch.mmio_gva, vcpu->arch.access and 297 + - cache the information to vcpu->arch.mmio_gva, vcpu->arch.mmio_access and 298 298 vcpu->arch.mmio_gfn, and call the emulator 299 299 - If both P bit and R/W bit of error code are set, this could possibly 300 300 be handled as a "fast page fault" (fixed without taking the MMU lock). See ··· 304 304 - if permissions are insufficient, reflect the fault back to the guest 305 305 - determine the host page 306 306 - if this is an mmio request, there is no host page; cache the info to 307 - vcpu->arch.mmio_gva, vcpu->arch.access and vcpu->arch.mmio_gfn 307 + vcpu->arch.mmio_gva, vcpu->arch.mmio_access and vcpu->arch.mmio_gfn 308 308 - walk the shadow page table to find the spte for the translation, 309 309 instantiating missing intermediate page tables as necessary 310 310 - If this is an mmio request, cache the mmio info to the spte and set some

+3 -1

arch/arm/include/uapi/asm/kvm.h

··· 266 266 #define KVM_DEV_ARM_ITS_CTRL_RESET 4 267 267 268 268 /* KVM_IRQ_LINE irq field index values */ 269 + #define KVM_ARM_IRQ_VCPU2_SHIFT 28 270 + #define KVM_ARM_IRQ_VCPU2_MASK 0xf 269 271 #define KVM_ARM_IRQ_TYPE_SHIFT 24 270 - #define KVM_ARM_IRQ_TYPE_MASK 0xff 272 + #define KVM_ARM_IRQ_TYPE_MASK 0xf 271 273 #define KVM_ARM_IRQ_VCPU_SHIFT 16 272 274 #define KVM_ARM_IRQ_VCPU_MASK 0xff 273 275 #define KVM_ARM_IRQ_NUM_SHIFT 0

+1 -1

arch/arm64/include/asm/pgtable-prot.h

··· 77 77 }) 78 78 79 79 #define PAGE_S2 __pgprot(_PROT_DEFAULT | PAGE_S2_MEMATTR(NORMAL) | PTE_S2_RDONLY | PAGE_S2_XN) 80 - #define PAGE_S2_DEVICE __pgprot(_PROT_DEFAULT | PAGE_S2_MEMATTR(DEVICE_nGnRE) | PTE_S2_RDONLY | PAGE_S2_XN) 80 + #define PAGE_S2_DEVICE __pgprot(_PROT_DEFAULT | PAGE_S2_MEMATTR(DEVICE_nGnRE) | PTE_S2_RDONLY | PTE_S2_XN) 81 81 82 82 #define PAGE_NONE __pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PROT_NONE | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN) 83 83 #define PAGE_SHARED __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE)

+3 -1

arch/arm64/include/uapi/asm/kvm.h

··· 325 325 #define KVM_ARM_VCPU_TIMER_IRQ_PTIMER 1 326 326 327 327 /* KVM_IRQ_LINE irq field index values */ 328 + #define KVM_ARM_IRQ_VCPU2_SHIFT 28 329 + #define KVM_ARM_IRQ_VCPU2_MASK 0xf 328 330 #define KVM_ARM_IRQ_TYPE_SHIFT 24 329 - #define KVM_ARM_IRQ_TYPE_MASK 0xff 331 + #define KVM_ARM_IRQ_TYPE_MASK 0xf 330 332 #define KVM_ARM_IRQ_VCPU_SHIFT 16 331 333 #define KVM_ARM_IRQ_VCPU_MASK 0xff 332 334 #define KVM_ARM_IRQ_NUM_SHIFT 0

+13 -1

arch/arm64/kvm/hyp/tlb.c

··· 193 193 { 194 194 dsb(ishst); 195 195 __tlbi(alle1is); 196 - asm volatile("ic ialluis" : : ); 196 + 197 + /* 198 + * VIPT and PIPT caches are not affected by VMID, so no maintenance 199 + * is necessary across a VMID rollover. 200 + * 201 + * VPIPT caches constrain lookup and maintenance to the active VMID, 202 + * so we need to invalidate lines with a stale VMID to avoid an ABA 203 + * race after multiple rollovers. 204 + * 205 + */ 206 + if (icache_is_vpipt()) 207 + asm volatile("ic ialluis"); 208 + 197 209 dsb(ish); 198 210 }

+18 -4

arch/powerpc/include/asm/kvm_host.h

··· 232 232 }; 233 233 234 234 /* 235 - * We use the top bit of each memslot->arch.rmap entry as a lock bit, 236 - * and bit 32 as a present flag. The bottom 32 bits are the 237 - * index in the guest HPT of a HPTE that points to the page. 235 + * The rmap array of size number of guest pages is allocated for each memslot. 236 + * This array is used to store usage specific information about the guest page. 237 + * Below are the encodings of the various possible usage types. 238 238 */ 239 - #define KVMPPC_RMAP_LOCK_BIT 63 239 + /* Free bits which can be used to define a new usage */ 240 + #define KVMPPC_RMAP_TYPE_MASK 0xff00000000000000 241 + #define KVMPPC_RMAP_NESTED 0xc000000000000000 /* Nested rmap array */ 242 + #define KVMPPC_RMAP_HPT 0x0100000000000000 /* HPT guest */ 243 + 244 + /* 245 + * rmap usage definition for a hash page table (hpt) guest: 246 + * 0x0000080000000000 Lock bit 247 + * 0x0000018000000000 RC bits 248 + * 0x0000000100000000 Present bit 249 + * 0x00000000ffffffff HPT index bits 250 + * The bottom 32 bits are the index in the guest HPT of a HPTE that points to 251 + * the page. 252 + */ 253 + #define KVMPPC_RMAP_LOCK_BIT 43 240 254 #define KVMPPC_RMAP_RC_SHIFT 32 241 255 #define KVMPPC_RMAP_REFERENCED (HPTE_R_R << KVMPPC_RMAP_RC_SHIFT) 242 256 #define KVMPPC_RMAP_PRESENT 0x100000000ul

+1

arch/powerpc/include/asm/kvm_ppc.h

··· 598 598 union kvmppc_one_reg *val); 599 599 extern int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu, 600 600 union kvmppc_one_reg *val); 601 + extern bool kvmppc_xive_native_supported(void); 601 602 602 603 #else 603 604 static inline int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server,

+9

arch/powerpc/include/asm/xive.h

··· 46 46 47 47 /* Setup/used by frontend */ 48 48 int target; 49 + /* 50 + * saved_p means that there is a queue entry for this interrupt 51 + * in some CPU's queue (not including guest vcpu queues), even 52 + * if P is not set in the source ESB. 53 + * stale_p means that there is no queue entry for this interrupt 54 + * in some CPU's queue, even if P is set in the source ESB. 55 + */ 49 56 bool saved_p; 57 + bool stale_p; 50 58 }; 51 59 #define XIVE_IRQ_FLAG_STORE_EOI 0x01 52 60 #define XIVE_IRQ_FLAG_LSI 0x02 ··· 135 127 extern int xive_native_set_queue_state(u32 vp_id, uint32_t prio, u32 qtoggle, 136 128 u32 qindex); 137 129 extern int xive_native_get_vp_state(u32 vp_id, u64 *out_state); 130 + extern bool xive_native_has_queue_state_support(void); 138 131 139 132 #else 140 133

+5 -3

arch/powerpc/kvm/book3s.c

··· 1083 1083 if (xics_on_xive()) { 1084 1084 kvmppc_xive_init_module(); 1085 1085 kvm_register_device_ops(&kvm_xive_ops, KVM_DEV_TYPE_XICS); 1086 - kvmppc_xive_native_init_module(); 1087 - kvm_register_device_ops(&kvm_xive_native_ops, 1088 - KVM_DEV_TYPE_XIVE); 1086 + if (kvmppc_xive_native_supported()) { 1087 + kvmppc_xive_native_init_module(); 1088 + kvm_register_device_ops(&kvm_xive_native_ops, 1089 + KVM_DEV_TYPE_XIVE); 1090 + } 1089 1091 } else 1090 1092 #endif 1091 1093 kvm_register_device_ops(&kvm_xics_ops, KVM_DEV_TYPE_XICS);

+18 -6

arch/powerpc/kvm/book3s_hv.c

··· 1678 1678 *val = get_reg_val(id, vcpu->arch.pspb); 1679 1679 break; 1680 1680 case KVM_REG_PPC_DPDES: 1681 - *val = get_reg_val(id, vcpu->arch.vcore->dpdes); 1681 + /* 1682 + * On POWER9, where we are emulating msgsndp etc., 1683 + * we return 1 bit for each vcpu, which can come from 1684 + * either vcore->dpdes or doorbell_request. 1685 + * On POWER8, doorbell_request is 0. 1686 + */ 1687 + *val = get_reg_val(id, vcpu->arch.vcore->dpdes | 1688 + vcpu->arch.doorbell_request); 1682 1689 break; 1683 1690 case KVM_REG_PPC_VTB: 1684 1691 *val = get_reg_val(id, vcpu->arch.vcore->vtb); ··· 2867 2860 if (!spin_trylock(&pvc->lock)) 2868 2861 continue; 2869 2862 prepare_threads(pvc); 2870 - if (!pvc->n_runnable) { 2863 + if (!pvc->n_runnable || !pvc->kvm->arch.mmu_ready) { 2871 2864 list_del_init(&pvc->preempt_list); 2872 2865 if (pvc->runner == NULL) { 2873 2866 pvc->vcore_state = VCORE_INACTIVE; ··· 2888 2881 spin_unlock(&lp->lock); 2889 2882 } 2890 2883 2891 - static bool recheck_signals(struct core_info *cip) 2884 + static bool recheck_signals_and_mmu(struct core_info *cip) 2892 2885 { 2893 2886 int sub, i; 2894 2887 struct kvm_vcpu *vcpu; 2888 + struct kvmppc_vcore *vc; 2895 2889 2896 - for (sub = 0; sub < cip->n_subcores; ++sub) 2897 - for_each_runnable_thread(i, vcpu, cip->vc[sub]) 2890 + for (sub = 0; sub < cip->n_subcores; ++sub) { 2891 + vc = cip->vc[sub]; 2892 + if (!vc->kvm->arch.mmu_ready) 2893 + return true; 2894 + for_each_runnable_thread(i, vcpu, vc) 2898 2895 if (signal_pending(vcpu->arch.run_task)) 2899 2896 return true; 2897 + } 2900 2898 return false; 2901 2899 } 2902 2900 ··· 3131 3119 local_irq_disable(); 3132 3120 hard_irq_disable(); 3133 3121 if (lazy_irq_pending() || need_resched() || 3134 - recheck_signals(&core_info) || !vc->kvm->arch.mmu_ready) { 3122 + recheck_signals_and_mmu(&core_info)) { 3135 3123 local_irq_enable(); 3136 3124 vc->vcore_state = VCORE_INACTIVE; 3137 3125 /* Unlock all except the primary vcore */

+1 -1

arch/powerpc/kvm/book3s_hv_rm_mmu.c

··· 99 99 } else { 100 100 rev->forw = rev->back = pte_index; 101 101 *rmap = (*rmap & ~KVMPPC_RMAP_INDEX) | 102 - pte_index | KVMPPC_RMAP_PRESENT; 102 + pte_index | KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_HPT; 103 103 } 104 104 unlock_rmap(rmap); 105 105 }

+29 -17

arch/powerpc/kvm/book3s_hv_rmhandlers.S

··· 942 942 ld r11, VCPU_XIVE_SAVED_STATE(r4) 943 943 li r9, TM_QW1_OS 944 944 lwz r8, VCPU_XIVE_CAM_WORD(r4) 945 + cmpwi r8, 0 946 + beq no_xive 945 947 li r7, TM_QW1_OS + TM_WORD2 946 948 mfmsr r0 947 949 andi. r0, r0, MSR_DR /* in real mode? */ ··· 2833 2831 kvm_cede_exit: 2834 2832 ld r9, HSTATE_KVM_VCPU(r13) 2835 2833 #ifdef CONFIG_KVM_XICS 2836 - /* Abort if we still have a pending escalation */ 2837 - lbz r5, VCPU_XIVE_ESC_ON(r9) 2838 - cmpwi r5, 0 2839 - beq 1f 2840 - li r0, 0 2841 - stb r0, VCPU_CEDED(r9) 2842 - 1: /* Enable XIVE escalation */ 2843 - li r5, XIVE_ESB_SET_PQ_00 2844 - mfmsr r0 2845 - andi. r0, r0, MSR_DR /* in real mode? */ 2846 - beq 1f 2834 + /* are we using XIVE with single escalation? */ 2847 2835 ld r10, VCPU_XIVE_ESC_VADDR(r9) 2848 2836 cmpdi r10, 0 2849 2837 beq 3f 2850 - ldx r0, r10, r5 2838 + li r6, XIVE_ESB_SET_PQ_00 2839 + /* 2840 + * If we still have a pending escalation, abort the cede, 2841 + * and we must set PQ to 10 rather than 00 so that we don't 2842 + * potentially end up with two entries for the escalation 2843 + * interrupt in the XIVE interrupt queue. In that case 2844 + * we also don't want to set xive_esc_on to 1 here in 2845 + * case we race with xive_esc_irq(). 2846 + */ 2847 + lbz r5, VCPU_XIVE_ESC_ON(r9) 2848 + cmpwi r5, 0 2849 + beq 4f 2850 + li r0, 0 2851 + stb r0, VCPU_CEDED(r9) 2852 + li r6, XIVE_ESB_SET_PQ_10 2853 + b 5f 2854 + 4: li r0, 1 2855 + stb r0, VCPU_XIVE_ESC_ON(r9) 2856 + /* make sure store to xive_esc_on is seen before xive_esc_irq runs */ 2857 + sync 2858 + 5: /* Enable XIVE escalation */ 2859 + mfmsr r0 2860 + andi. r0, r0, MSR_DR /* in real mode? */ 2861 + beq 1f 2862 + ldx r0, r10, r6 2851 2863 b 2f 2852 2864 1: ld r10, VCPU_XIVE_ESC_RADDR(r9) 2853 - cmpdi r10, 0 2854 - beq 3f 2855 - ldcix r0, r10, r5 2865 + ldcix r0, r10, r6 2856 2866 2: sync 2857 - li r0, 1 2858 - stb r0, VCPU_XIVE_ESC_ON(r9) 2859 2867 #endif /* CONFIG_KVM_XICS */ 2860 2868 3: b guest_exit_cont 2861 2869

+51 -9

arch/powerpc/kvm/book3s_xive.c

··· 67 67 void __iomem *tima = local_paca->kvm_hstate.xive_tima_virt; 68 68 u64 pq; 69 69 70 - if (!tima) 70 + /* 71 + * Nothing to do if the platform doesn't have a XIVE 72 + * or this vCPU doesn't have its own XIVE context 73 + * (e.g. because it's not using an in-kernel interrupt controller). 74 + */ 75 + if (!tima || !vcpu->arch.xive_cam_word) 71 76 return; 77 + 72 78 eieio(); 73 79 __raw_writeq(vcpu->arch.xive_saved_state.w01, tima + TM_QW1_OS); 74 80 __raw_writel(vcpu->arch.xive_cam_word, tima + TM_QW1_OS + TM_WORD2); ··· 165 159 * that knowledge today but might (see comment in book3s_hv_rmhandler.S) 166 160 */ 167 161 vcpu->arch.xive_esc_on = false; 162 + 163 + /* This orders xive_esc_on = false vs. subsequent stale_p = true */ 164 + smp_wmb(); /* goes with smp_mb() in cleanup_single_escalation */ 168 165 169 166 return IRQ_HANDLED; 170 167 } ··· 1122 1113 vcpu->arch.xive_esc_raddr = 0; 1123 1114 } 1124 1115 1116 + /* 1117 + * In single escalation mode, the escalation interrupt is marked so 1118 + * that EOI doesn't re-enable it, but just sets the stale_p flag to 1119 + * indicate that the P bit has already been dealt with. However, the 1120 + * assembly code that enters the guest sets PQ to 00 without clearing 1121 + * stale_p (because it has no easy way to address it). Hence we have 1122 + * to adjust stale_p before shutting down the interrupt. 1123 + */ 1124 + void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu, 1125 + struct kvmppc_xive_vcpu *xc, int irq) 1126 + { 1127 + struct irq_data *d = irq_get_irq_data(irq); 1128 + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); 1129 + 1130 + /* 1131 + * This slightly odd sequence gives the right result 1132 + * (i.e. stale_p set if xive_esc_on is false) even if 1133 + * we race with xive_esc_irq() and xive_irq_eoi(). 1134 + */ 1135 + xd->stale_p = false; 1136 + smp_mb(); /* paired with smb_wmb in xive_esc_irq */ 1137 + if (!vcpu->arch.xive_esc_on) 1138 + xd->stale_p = true; 1139 + } 1140 + 1125 1141 void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) 1126 1142 { 1127 1143 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; ··· 1168 1134 /* Mask the VP IPI */ 1169 1135 xive_vm_esb_load(&xc->vp_ipi_data, XIVE_ESB_SET_PQ_01); 1170 1136 1171 - /* Disable the VP */ 1172 - xive_native_disable_vp(xc->vp_id); 1173 - 1174 - /* Free the queues & associated interrupts */ 1137 + /* Free escalations */ 1175 1138 for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { 1176 - struct xive_q *q = &xc->queues[i]; 1177 - 1178 - /* Free the escalation irq */ 1179 1139 if (xc->esc_virq[i]) { 1140 + if (xc->xive->single_escalation) 1141 + xive_cleanup_single_escalation(vcpu, xc, 1142 + xc->esc_virq[i]); 1180 1143 free_irq(xc->esc_virq[i], vcpu); 1181 1144 irq_dispose_mapping(xc->esc_virq[i]); 1182 1145 kfree(xc->esc_virq_names[i]); 1183 1146 } 1184 - /* Free the queue */ 1147 + } 1148 + 1149 + /* Disable the VP */ 1150 + xive_native_disable_vp(xc->vp_id); 1151 + 1152 + /* Clear the cam word so guest entry won't try to push context */ 1153 + vcpu->arch.xive_cam_word = 0; 1154 + 1155 + /* Free the queues */ 1156 + for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { 1157 + struct xive_q *q = &xc->queues[i]; 1158 + 1185 1159 xive_native_disable_queue(xc->vp_id, q, i); 1186 1160 if (q->qpage) { 1187 1161 free_pages((unsigned long)q->qpage,

+2

arch/powerpc/kvm/book3s_xive.h

··· 282 282 int kvmppc_xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio, 283 283 bool single_escalation); 284 284 struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type); 285 + void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu, 286 + struct kvmppc_xive_vcpu *xc, int irq); 285 287 286 288 #endif /* CONFIG_KVM_XICS */ 287 289 #endif /* _KVM_PPC_BOOK3S_XICS_H */

+18 -5

arch/powerpc/kvm/book3s_xive_native.c

··· 67 67 xc->valid = false; 68 68 kvmppc_xive_disable_vcpu_interrupts(vcpu); 69 69 70 - /* Disable the VP */ 71 - xive_native_disable_vp(xc->vp_id); 72 - 73 - /* Free the queues & associated interrupts */ 70 + /* Free escalations */ 74 71 for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { 75 72 /* Free the escalation irq */ 76 73 if (xc->esc_virq[i]) { 74 + if (xc->xive->single_escalation) 75 + xive_cleanup_single_escalation(vcpu, xc, 76 + xc->esc_virq[i]); 77 77 free_irq(xc->esc_virq[i], vcpu); 78 78 irq_dispose_mapping(xc->esc_virq[i]); 79 79 kfree(xc->esc_virq_names[i]); 80 80 xc->esc_virq[i] = 0; 81 81 } 82 + } 82 83 83 - /* Free the queue */ 84 + /* Disable the VP */ 85 + xive_native_disable_vp(xc->vp_id); 86 + 87 + /* Clear the cam word so guest entry won't try to push context */ 88 + vcpu->arch.xive_cam_word = 0; 89 + 90 + /* Free the queues */ 91 + for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { 84 92 kvmppc_xive_native_cleanup_queue(vcpu, i); 85 93 } 86 94 ··· 1177 1169 * state when captured. 1178 1170 */ 1179 1171 return 0; 1172 + } 1173 + 1174 + bool kvmppc_xive_native_supported(void) 1175 + { 1176 + return xive_native_has_queue_state_support(); 1180 1177 } 1181 1178 1182 1179 static int xive_native_debug_show(struct seq_file *m, void *private)

+3

arch/powerpc/kvm/e500.c

··· 440 440 struct kvm_vcpu *vcpu; 441 441 int err; 442 442 443 + BUILD_BUG_ON_MSG(offsetof(struct kvmppc_vcpu_e500, vcpu) != 0, 444 + "struct kvm_vcpu must be at offset 0 for arch usercopy region"); 445 + 443 446 vcpu_e500 = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); 444 447 if (!vcpu_e500) { 445 448 err = -ENOMEM;

+1

arch/powerpc/kvm/emulate.c

··· 271 271 */ 272 272 if (inst == KVMPPC_INST_SW_BREAKPOINT) { 273 273 run->exit_reason = KVM_EXIT_DEBUG; 274 + run->debug.arch.status = 0; 274 275 run->debug.arch.address = kvmppc_get_pc(vcpu); 275 276 emulated = EMULATE_EXIT_USER; 276 277 advance = 0;

-6

arch/powerpc/kvm/emulate_loadstore.c

··· 89 89 rs = get_rs(inst); 90 90 rt = get_rt(inst); 91 91 92 - /* 93 - * if mmio_vsx_tx_sx_enabled == 0, copy data between 94 - * VSR[0..31] and memory 95 - * if mmio_vsx_tx_sx_enabled == 1, copy data between 96 - * VSR[32..63] and memory 97 - */ 98 92 vcpu->arch.mmio_vsx_copy_nums = 0; 99 93 vcpu->arch.mmio_vsx_offset = 0; 100 94 vcpu->arch.mmio_copy_type = KVMPPC_VSX_COPY_NONE;

+2 -1

arch/powerpc/kvm/powerpc.c

··· 561 561 * a POWER9 processor) and the PowerNV platform, as 562 562 * nested is not yet supported. 563 563 */ 564 - r = xive_enabled() && !!cpu_has_feature(CPU_FTR_HVMODE); 564 + r = xive_enabled() && !!cpu_has_feature(CPU_FTR_HVMODE) && 565 + kvmppc_xive_native_supported(); 565 566 break; 566 567 #endif 567 568

+64 -23

arch/powerpc/sysdev/xive/common.c

··· 135 135 static u32 xive_scan_interrupts(struct xive_cpu *xc, bool just_peek) 136 136 { 137 137 u32 irq = 0; 138 - u8 prio; 138 + u8 prio = 0; 139 139 140 140 /* Find highest pending priority */ 141 141 while (xc->pending_prio != 0) { ··· 148 148 irq = xive_read_eq(&xc->queue[prio], just_peek); 149 149 150 150 /* Found something ? That's it */ 151 - if (irq) 152 - break; 151 + if (irq) { 152 + if (just_peek || irq_to_desc(irq)) 153 + break; 154 + /* 155 + * We should never get here; if we do then we must 156 + * have failed to synchronize the interrupt properly 157 + * when shutting it down. 158 + */ 159 + pr_crit("xive: got interrupt %d without descriptor, dropping\n", 160 + irq); 161 + WARN_ON(1); 162 + continue; 163 + } 153 164 154 165 /* Clear pending bits */ 155 166 xc->pending_prio &= ~(1 << prio); ··· 318 307 */ 319 308 static void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd) 320 309 { 310 + xd->stale_p = false; 321 311 /* If the XIVE supports the new "store EOI facility, use it */ 322 312 if (xd->flags & XIVE_IRQ_FLAG_STORE_EOI) 323 313 xive_esb_write(xd, XIVE_ESB_STORE_EOI, 0); ··· 362 350 } 363 351 } 364 352 365 - /* irq_chip eoi callback */ 353 + /* irq_chip eoi callback, called with irq descriptor lock held */ 366 354 static void xive_irq_eoi(struct irq_data *d) 367 355 { 368 356 struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); ··· 378 366 if (!irqd_irq_disabled(d) && !irqd_is_forwarded_to_vcpu(d) && 379 367 !(xd->flags & XIVE_IRQ_NO_EOI)) 380 368 xive_do_source_eoi(irqd_to_hwirq(d), xd); 369 + else 370 + xd->stale_p = true; 381 371 382 372 /* 383 373 * Clear saved_p to indicate that it's no longer occupying ··· 411 397 */ 412 398 if (mask) { 413 399 val = xive_esb_read(xd, XIVE_ESB_SET_PQ_01); 414 - xd->saved_p = !!(val & XIVE_ESB_VAL_P); 415 - } else if (xd->saved_p) 400 + if (!xd->stale_p && !!(val & XIVE_ESB_VAL_P)) 401 + xd->saved_p = true; 402 + xd->stale_p = false; 403 + } else if (xd->saved_p) { 416 404 xive_esb_read(xd, XIVE_ESB_SET_PQ_10); 417 - else 405 + xd->saved_p = false; 406 + } else { 418 407 xive_esb_read(xd, XIVE_ESB_SET_PQ_00); 408 + xd->stale_p = false; 409 + } 419 410 } 420 411 421 412 /* ··· 560 541 unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); 561 542 int target, rc; 562 543 544 + xd->saved_p = false; 545 + xd->stale_p = false; 563 546 pr_devel("xive_irq_startup: irq %d [0x%x] data @%p\n", 564 547 d->irq, hw_irq, d); 565 548 ··· 608 587 return 0; 609 588 } 610 589 590 + /* called with irq descriptor lock held */ 611 591 static void xive_irq_shutdown(struct irq_data *d) 612 592 { 613 593 struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); ··· 622 600 623 601 /* Mask the interrupt at the source */ 624 602 xive_do_source_set_mask(xd, true); 625 - 626 - /* 627 - * The above may have set saved_p. We clear it otherwise it 628 - * will prevent re-enabling later on. It is ok to forget the 629 - * fact that the interrupt might be in a queue because we are 630 - * accounting that already in xive_dec_target_count() and will 631 - * be re-routing it to a new queue with proper accounting when 632 - * it's started up again 633 - */ 634 - xd->saved_p = false; 635 603 636 604 /* 637 605 * Mask the interrupt in HW in the IVT/EAS and set the number ··· 809 797 return 1; 810 798 } 811 799 800 + /* 801 + * Caller holds the irq descriptor lock, so this won't be called 802 + * concurrently with xive_get_irqchip_state on the same interrupt. 803 + */ 812 804 static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state) 813 805 { 814 806 struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); ··· 836 820 837 821 /* Set it to PQ=10 state to prevent further sends */ 838 822 pq = xive_esb_read(xd, XIVE_ESB_SET_PQ_10); 823 + if (!xd->stale_p) { 824 + xd->saved_p = !!(pq & XIVE_ESB_VAL_P); 825 + xd->stale_p = !xd->saved_p; 826 + } 839 827 840 828 /* No target ? nothing to do */ 841 829 if (xd->target == XIVE_INVALID_TARGET) { ··· 847 827 * An untargetted interrupt should have been 848 828 * also masked at the source 849 829 */ 850 - WARN_ON(pq & 2); 830 + WARN_ON(xd->saved_p); 851 831 852 832 return 0; 853 833 } ··· 867 847 * This saved_p is cleared by the host EOI, when we know 868 848 * for sure the queue slot is no longer in use. 869 849 */ 870 - if (pq & 2) { 871 - pq = xive_esb_read(xd, XIVE_ESB_SET_PQ_11); 872 - xd->saved_p = true; 850 + if (xd->saved_p) { 851 + xive_esb_read(xd, XIVE_ESB_SET_PQ_11); 873 852 874 853 /* 875 854 * Sync the XIVE source HW to ensure the interrupt ··· 881 862 */ 882 863 if (xive_ops->sync_source) 883 864 xive_ops->sync_source(hw_irq); 884 - } else 885 - xd->saved_p = false; 865 + } 886 866 } else { 887 867 irqd_clr_forwarded_to_vcpu(d); 888 868 ··· 932 914 return 0; 933 915 } 934 916 917 + /* Called with irq descriptor lock held. */ 918 + static int xive_get_irqchip_state(struct irq_data *data, 919 + enum irqchip_irq_state which, bool *state) 920 + { 921 + struct xive_irq_data *xd = irq_data_get_irq_handler_data(data); 922 + 923 + switch (which) { 924 + case IRQCHIP_STATE_ACTIVE: 925 + *state = !xd->stale_p && 926 + (xd->saved_p || 927 + !!(xive_esb_read(xd, XIVE_ESB_GET) & XIVE_ESB_VAL_P)); 928 + return 0; 929 + default: 930 + return -EINVAL; 931 + } 932 + } 933 + 935 934 static struct irq_chip xive_irq_chip = { 936 935 .name = "XIVE-IRQ", 937 936 .irq_startup = xive_irq_startup, ··· 960 925 .irq_set_type = xive_irq_set_type, 961 926 .irq_retrigger = xive_irq_retrigger, 962 927 .irq_set_vcpu_affinity = xive_irq_set_vcpu_affinity, 928 + .irq_get_irqchip_state = xive_get_irqchip_state, 963 929 }; 964 930 965 931 bool is_xive_irq(struct irq_chip *chip) ··· 1372 1336 #endif 1373 1337 raw_spin_lock(&desc->lock); 1374 1338 xd = irq_desc_get_handler_data(desc); 1339 + 1340 + /* 1341 + * Clear saved_p to indicate that it's no longer pending 1342 + */ 1343 + xd->saved_p = false; 1375 1344 1376 1345 /* 1377 1346 * For LSIs, we EOI, this will cause a resend if it's

+7

arch/powerpc/sysdev/xive/native.c

··· 800 800 } 801 801 EXPORT_SYMBOL_GPL(xive_native_set_queue_state); 802 802 803 + bool xive_native_has_queue_state_support(void) 804 + { 805 + return opal_check_token(OPAL_XIVE_GET_QUEUE_STATE) && 806 + opal_check_token(OPAL_XIVE_SET_QUEUE_STATE); 807 + } 808 + EXPORT_SYMBOL_GPL(xive_native_has_queue_state_support); 809 + 803 810 int xive_native_get_vp_state(u32 vp_id, u64 *out_state) 804 811 { 805 812 __be64 state;

+6

arch/s390/include/uapi/asm/kvm.h

··· 231 231 #define KVM_SYNC_GSCB (1UL << 9) 232 232 #define KVM_SYNC_BPBC (1UL << 10) 233 233 #define KVM_SYNC_ETOKEN (1UL << 11) 234 + 235 + #define KVM_SYNC_S390_VALID_FIELDS \ 236 + (KVM_SYNC_PREFIX | KVM_SYNC_GPRS | KVM_SYNC_ACRS | KVM_SYNC_CRS | \ 237 + KVM_SYNC_ARCH0 | KVM_SYNC_PFAULT | KVM_SYNC_VRS | KVM_SYNC_RICCB | \ 238 + KVM_SYNC_FPRS | KVM_SYNC_GSCB | KVM_SYNC_BPBC | KVM_SYNC_ETOKEN) 239 + 234 240 /* length and alignment of the sdnx as a power of two */ 235 241 #define SDNXC 8 236 242 #define SDNXL (1UL << SDNXC)

+5 -1

arch/s390/kvm/kvm-s390.c

··· 4000 4000 if (kvm_run->immediate_exit) 4001 4001 return -EINTR; 4002 4002 4003 + if (kvm_run->kvm_valid_regs & ~KVM_SYNC_S390_VALID_FIELDS || 4004 + kvm_run->kvm_dirty_regs & ~KVM_SYNC_S390_VALID_FIELDS) 4005 + return -EINVAL; 4006 + 4003 4007 vcpu_load(vcpu); 4004 4008 4005 4009 if (guestdbg_exit_pending(vcpu)) { ··· 4261 4257 const u64 supported_flags = KVM_S390_MEMOP_F_INJECT_EXCEPTION 4262 4258 | KVM_S390_MEMOP_F_CHECK_ONLY; 4263 4259 4264 - if (mop->flags & ~supported_flags) 4260 + if (mop->flags & ~supported_flags || mop->ar >= NUM_ACRS || !mop->size) 4265 4261 return -EINVAL; 4266 4262 4267 4263 if (mop->size > MEM_OP_MAX_SIZE)

+2 -1

arch/x86/include/asm/kvm_emulate.h

··· 229 229 int (*pre_leave_smm)(struct x86_emulate_ctxt *ctxt, 230 230 const char *smstate); 231 231 void (*post_leave_smm)(struct x86_emulate_ctxt *ctxt); 232 - 232 + int (*set_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr); 233 233 }; 234 234 235 235 typedef u32 __attribute__((vector_size(16))) sse128_t; ··· 429 429 x86_intercept_ins, 430 430 x86_intercept_out, 431 431 x86_intercept_outs, 432 + x86_intercept_xsetbv, 432 433 433 434 nr_x86_intercepts 434 435 };

+15 -4

arch/x86/include/asm/kvm_host.h

··· 718 718 719 719 /* Cache MMIO info */ 720 720 u64 mmio_gva; 721 - unsigned access; 721 + unsigned mmio_access; 722 722 gfn_t mmio_gfn; 723 723 u64 mmio_gen; 724 724 ··· 1072 1072 1073 1073 void (*run)(struct kvm_vcpu *vcpu); 1074 1074 int (*handle_exit)(struct kvm_vcpu *vcpu); 1075 - void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); 1075 + int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); 1076 1076 void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); 1077 1077 u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu); 1078 1078 void (*patch_hypercall)(struct kvm_vcpu *vcpu, ··· 1211 1211 uint16_t (*nested_get_evmcs_version)(struct kvm_vcpu *vcpu); 1212 1212 1213 1213 bool (*need_emulation_on_page_fault)(struct kvm_vcpu *vcpu); 1214 + 1215 + bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu); 1214 1216 }; 1215 1217 1216 1218 struct kvm_arch_async_pf { ··· 1330 1328 1331 1329 void kvm_enable_efer_bits(u64); 1332 1330 bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer); 1333 - int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); 1334 - int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); 1331 + int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data); 1332 + int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data); 1333 + int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu); 1334 + int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu); 1335 1335 1336 1336 struct x86_emulate_ctxt; 1337 1337 ··· 1586 1582 1587 1583 void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, 1588 1584 struct kvm_lapic_irq *irq); 1585 + 1586 + static inline bool kvm_irq_is_postable(struct kvm_lapic_irq *irq) 1587 + { 1588 + /* We can only post Fixed and LowPrio IRQs */ 1589 + return (irq->delivery_mode == dest_Fixed || 1590 + irq->delivery_mode == dest_LowestPrio); 1591 + } 1589 1592 1590 1593 static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) 1591 1594 {

+14

arch/x86/include/asm/vmx.h

··· 562 562 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID = 28, 563 563 }; 564 564 565 + /* 566 + * VM-instruction errors that can be encountered on VM-Enter, used to trace 567 + * nested VM-Enter failures reported by hardware. Errors unique to VM-Enter 568 + * from a SMI Transfer Monitor are not included as things have gone seriously 569 + * sideways if we get one of those... 570 + */ 571 + #define VMX_VMENTER_INSTRUCTION_ERRORS \ 572 + { VMXERR_VMLAUNCH_NONCLEAR_VMCS, "VMLAUNCH_NONCLEAR_VMCS" }, \ 573 + { VMXERR_VMRESUME_NONLAUNCHED_VMCS, "VMRESUME_NONLAUNCHED_VMCS" }, \ 574 + { VMXERR_VMRESUME_AFTER_VMXOFF, "VMRESUME_AFTER_VMXOFF" }, \ 575 + { VMXERR_ENTRY_INVALID_CONTROL_FIELD, "VMENTRY_INVALID_CONTROL_FIELD" }, \ 576 + { VMXERR_ENTRY_INVALID_HOST_STATE_FIELD, "VMENTRY_INVALID_HOST_STATE_FIELD" }, \ 577 + { VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS, "VMENTRY_EVENTS_BLOCKED_BY_MOV_SS" } 578 + 565 579 enum vmx_l1d_flush_state { 566 580 VMENTER_L1D_FLUSH_AUTO, 567 581 VMENTER_L1D_FLUSH_NEVER,

+2

arch/x86/include/uapi/asm/vmx.h

··· 31 31 #define EXIT_REASON_EXCEPTION_NMI 0 32 32 #define EXIT_REASON_EXTERNAL_INTERRUPT 1 33 33 #define EXIT_REASON_TRIPLE_FAULT 2 34 + #define EXIT_REASON_INIT_SIGNAL 3 34 35 35 36 #define EXIT_REASON_PENDING_INTERRUPT 7 36 37 #define EXIT_REASON_NMI_WINDOW 8 ··· 91 90 { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ 92 91 { EXIT_REASON_EXTERNAL_INTERRUPT, "EXTERNAL_INTERRUPT" }, \ 93 92 { EXIT_REASON_TRIPLE_FAULT, "TRIPLE_FAULT" }, \ 93 + { EXIT_REASON_INIT_SIGNAL, "INIT_SIGNAL" }, \ 94 94 { EXIT_REASON_PENDING_INTERRUPT, "PENDING_INTERRUPT" }, \ 95 95 { EXIT_REASON_NMI_WINDOW, "NMI_WINDOW" }, \ 96 96 { EXIT_REASON_TASK_SWITCH, "TASK_SWITCH" }, \

-12

arch/x86/kernel/kvm.c

··· 502 502 __send_ipi_mask(local_mask, vector); 503 503 } 504 504 505 - static void kvm_send_ipi_allbutself(int vector) 506 - { 507 - kvm_send_ipi_mask_allbutself(cpu_online_mask, vector); 508 - } 509 - 510 - static void kvm_send_ipi_all(int vector) 511 - { 512 - __send_ipi_mask(cpu_online_mask, vector); 513 - } 514 - 515 505 /* 516 506 * Set the IPI entry points 517 507 */ ··· 509 519 { 510 520 apic->send_IPI_mask = kvm_send_ipi_mask; 511 521 apic->send_IPI_mask_allbutself = kvm_send_ipi_mask_allbutself; 512 - apic->send_IPI_allbutself = kvm_send_ipi_allbutself; 513 - apic->send_IPI_all = kvm_send_ipi_all; 514 522 pr_info("KVM setup pv IPIs\n"); 515 523 } 516 524

+21 -10

arch/x86/kvm/cpuid.c

··· 392 392 393 393 entry->edx &= kvm_cpuid_7_0_edx_x86_features; 394 394 cpuid_mask(&entry->edx, CPUID_7_EDX); 395 + if (boot_cpu_has(X86_FEATURE_IBPB) && boot_cpu_has(X86_FEATURE_IBRS)) 396 + entry->edx |= F(SPEC_CTRL); 397 + if (boot_cpu_has(X86_FEATURE_STIBP)) 398 + entry->edx |= F(INTEL_STIBP); 399 + if (boot_cpu_has(X86_FEATURE_SSBD)) 400 + entry->edx |= F(SPEC_CTRL_SSBD); 395 401 /* 396 402 * We emulate ARCH_CAPABILITIES in software even 397 403 * if the host doesn't support it. ··· 735 729 g_phys_as = phys_as; 736 730 entry->eax = g_phys_as | (virt_as << 8); 737 731 entry->edx = 0; 738 - /* 739 - * IBRS, IBPB and VIRT_SSBD aren't necessarily present in 740 - * hardware cpuid 741 - */ 742 - if (boot_cpu_has(X86_FEATURE_AMD_IBPB)) 743 - entry->ebx |= F(AMD_IBPB); 744 - if (boot_cpu_has(X86_FEATURE_AMD_IBRS)) 745 - entry->ebx |= F(AMD_IBRS); 746 - if (boot_cpu_has(X86_FEATURE_VIRT_SSBD)) 747 - entry->ebx |= F(VIRT_SSBD); 748 732 entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features; 749 733 cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX); 734 + /* 735 + * AMD has separate bits for each SPEC_CTRL bit. 736 + * arch/x86/kernel/cpu/bugs.c is kind enough to 737 + * record that in cpufeatures so use them. 738 + */ 739 + if (boot_cpu_has(X86_FEATURE_IBPB)) 740 + entry->ebx |= F(AMD_IBPB); 741 + if (boot_cpu_has(X86_FEATURE_IBRS)) 742 + entry->ebx |= F(AMD_IBRS); 743 + if (boot_cpu_has(X86_FEATURE_STIBP)) 744 + entry->ebx |= F(AMD_STIBP); 745 + if (boot_cpu_has(X86_FEATURE_SSBD)) 746 + entry->ebx |= F(AMD_SSBD); 747 + if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) 748 + entry->ebx |= F(AMD_SSB_NO); 750 749 /* 751 750 * The preference is to use SPEC CTRL MSR instead of the 752 751 * VIRT_SPEC MSR.

+25 -2

arch/x86/kvm/emulate.c

··· 4156 4156 return rc; 4157 4157 } 4158 4158 4159 + static int em_xsetbv(struct x86_emulate_ctxt *ctxt) 4160 + { 4161 + u32 eax, ecx, edx; 4162 + 4163 + eax = reg_read(ctxt, VCPU_REGS_RAX); 4164 + edx = reg_read(ctxt, VCPU_REGS_RDX); 4165 + ecx = reg_read(ctxt, VCPU_REGS_RCX); 4166 + 4167 + if (ctxt->ops->set_xcr(ctxt, ecx, ((u64)edx << 32) | eax)) 4168 + return emulate_gp(ctxt, 0); 4169 + 4170 + return X86EMUL_CONTINUE; 4171 + } 4172 + 4159 4173 static bool valid_cr(int nr) 4160 4174 { 4161 4175 switch (nr) { ··· 4423 4409 N, N, N, N, N, N, 4424 4410 }; 4425 4411 4412 + static const struct opcode group7_rm2[] = { 4413 + N, 4414 + II(ImplicitOps | Priv, em_xsetbv, xsetbv), 4415 + N, N, N, N, N, N, 4416 + }; 4417 + 4426 4418 static const struct opcode group7_rm3[] = { 4427 4419 DIP(SrcNone | Prot | Priv, vmrun, check_svme_pa), 4428 4420 II(SrcNone | Prot | EmulateOnUD, em_hypercall, vmmcall), ··· 4518 4498 }, { 4519 4499 EXT(0, group7_rm0), 4520 4500 EXT(0, group7_rm1), 4521 - N, EXT(0, group7_rm3), 4501 + EXT(0, group7_rm2), 4502 + EXT(0, group7_rm3), 4522 4503 II(SrcNone | DstMem | Mov, em_smsw, smsw), N, 4523 4504 II(SrcMem16 | Mov | Priv, em_lmsw, lmsw), 4524 4505 EXT(0, group7_rm7), ··· 5165 5144 else { 5166 5145 rc = __do_insn_fetch_bytes(ctxt, 1); 5167 5146 if (rc != X86EMUL_CONTINUE) 5168 - return rc; 5147 + goto done; 5169 5148 } 5170 5149 5171 5150 switch (mode) { ··· 5416 5395 ctxt->memopp->addr.mem.ea + ctxt->_eip); 5417 5396 5418 5397 done: 5398 + if (rc == X86EMUL_PROPAGATE_FAULT) 5399 + ctxt->have_exception = true; 5419 5400 return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK; 5420 5401 } 5421 5402

+11 -9

arch/x86/kvm/lapic.c

··· 1198 1198 } 1199 1199 EXPORT_SYMBOL_GPL(kvm_apic_set_eoi_accelerated); 1200 1200 1201 - static void apic_send_ipi(struct kvm_lapic *apic) 1201 + static void apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high) 1202 1202 { 1203 - u32 icr_low = kvm_lapic_get_reg(apic, APIC_ICR); 1204 - u32 icr_high = kvm_lapic_get_reg(apic, APIC_ICR2); 1205 1203 struct kvm_lapic_irq irq; 1206 1204 1207 1205 irq.vector = icr_low & APIC_VECTOR_MASK; ··· 1912 1914 } 1913 1915 case APIC_ICR: 1914 1916 /* No delay here, so we always clear the pending bit */ 1915 - kvm_lapic_set_reg(apic, APIC_ICR, val & ~(1 << 12)); 1916 - apic_send_ipi(apic); 1917 + val &= ~(1 << 12); 1918 + apic_send_ipi(apic, val, kvm_lapic_get_reg(apic, APIC_ICR2)); 1919 + kvm_lapic_set_reg(apic, APIC_ICR, val); 1917 1920 break; 1918 1921 1919 1922 case APIC_ICR2: ··· 2706 2707 return; 2707 2708 2708 2709 /* 2709 - * INITs are latched while in SMM. Because an SMM CPU cannot 2710 - * be in KVM_MP_STATE_INIT_RECEIVED state, just eat SIPIs 2711 - * and delay processing of INIT until the next RSM. 2710 + * INITs are latched while CPU is in specific states 2711 + * (SMM, VMX non-root mode, SVM with GIF=0). 2712 + * Because a CPU cannot be in these states immediately 2713 + * after it has processed an INIT signal (and thus in 2714 + * KVM_MP_STATE_INIT_RECEIVED state), just eat SIPIs 2715 + * and leave the INIT pending. 2712 2716 */ 2713 - if (is_smm(vcpu)) { 2717 + if (is_smm(vcpu) || kvm_x86_ops->apic_init_signal_blocked(vcpu)) { 2714 2718 WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED); 2715 2719 if (test_bit(KVM_APIC_SIPI, &apic->pending_events)) 2716 2720 clear_bit(KVM_APIC_SIPI, &apic->pending_events);

+39 -22

arch/x86/kvm/mmu.c

··· 214 214 static u64 __read_mostly shadow_dirty_mask; 215 215 static u64 __read_mostly shadow_mmio_mask; 216 216 static u64 __read_mostly shadow_mmio_value; 217 + static u64 __read_mostly shadow_mmio_access_mask; 217 218 static u64 __read_mostly shadow_present_mask; 218 219 static u64 __read_mostly shadow_me_mask; 219 220 ··· 300 299 kvm_flush_remote_tlbs_with_range(kvm, &range); 301 300 } 302 301 303 - void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value) 302 + void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value, u64 access_mask) 304 303 { 304 + BUG_ON((u64)(unsigned)access_mask != access_mask); 305 305 BUG_ON((mmio_mask & mmio_value) != mmio_value); 306 306 shadow_mmio_value = mmio_value | SPTE_SPECIAL_MASK; 307 307 shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK; 308 + shadow_mmio_access_mask = access_mask; 308 309 } 309 310 EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); 311 + 312 + static bool is_mmio_spte(u64 spte) 313 + { 314 + return (spte & shadow_mmio_mask) == shadow_mmio_value; 315 + } 310 316 311 317 static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) 312 318 { ··· 322 314 323 315 static inline bool spte_ad_enabled(u64 spte) 324 316 { 325 - MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value); 317 + MMU_WARN_ON(is_mmio_spte(spte)); 326 318 return !(spte & shadow_acc_track_value); 327 319 } 328 320 329 321 static inline u64 spte_shadow_accessed_mask(u64 spte) 330 322 { 331 - MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value); 323 + MMU_WARN_ON(is_mmio_spte(spte)); 332 324 return spte_ad_enabled(spte) ? shadow_accessed_mask : 0; 333 325 } 334 326 335 327 static inline u64 spte_shadow_dirty_mask(u64 spte) 336 328 { 337 - MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value); 329 + MMU_WARN_ON(is_mmio_spte(spte)); 338 330 return spte_ad_enabled(spte) ? shadow_dirty_mask : 0; 339 331 } 340 332 ··· 397 389 u64 mask = generation_mmio_spte_mask(gen); 398 390 u64 gpa = gfn << PAGE_SHIFT; 399 391 400 - access &= ACC_WRITE_MASK | ACC_USER_MASK; 392 + access &= shadow_mmio_access_mask; 401 393 mask |= shadow_mmio_value | access; 402 394 mask |= gpa | shadow_nonpresent_or_rsvd_mask; 403 395 mask |= (gpa & shadow_nonpresent_or_rsvd_mask) ··· 407 399 408 400 trace_mark_mmio_spte(sptep, gfn, access, gen); 409 401 mmu_spte_set(sptep, mask); 410 - } 411 - 412 - static bool is_mmio_spte(u64 spte) 413 - { 414 - return (spte & shadow_mmio_mask) == shadow_mmio_value; 415 402 } 416 403 417 404 static gfn_t get_mmio_spte_gfn(u64 spte) ··· 421 418 422 419 static unsigned get_mmio_spte_access(u64 spte) 423 420 { 424 - u64 mask = generation_mmio_spte_mask(MMIO_SPTE_GEN_MASK) | shadow_mmio_mask; 425 - return (spte & ~mask) & ~PAGE_MASK; 421 + return spte & shadow_mmio_access_mask; 426 422 } 427 423 428 424 static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, ··· 3304 3302 } 3305 3303 3306 3304 if (unlikely(is_noslot_pfn(pfn))) 3307 - vcpu_cache_mmio_info(vcpu, gva, gfn, access); 3305 + vcpu_cache_mmio_info(vcpu, gva, gfn, 3306 + access & shadow_mmio_access_mask); 3308 3307 3309 3308 return false; 3310 3309 } ··· 5614 5611 PT_PAGE_TABLE_LEVEL, lock_flush_tlb); 5615 5612 } 5616 5613 5617 - static void free_mmu_pages(struct kvm_vcpu *vcpu) 5614 + static void free_mmu_pages(struct kvm_mmu *mmu) 5618 5615 { 5619 - free_page((unsigned long)vcpu->arch.mmu->pae_root); 5620 - free_page((unsigned long)vcpu->arch.mmu->lm_root); 5616 + free_page((unsigned long)mmu->pae_root); 5617 + free_page((unsigned long)mmu->lm_root); 5621 5618 } 5622 5619 5623 - static int alloc_mmu_pages(struct kvm_vcpu *vcpu) 5620 + static int alloc_mmu_pages(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) 5624 5621 { 5625 5622 struct page *page; 5626 5623 int i; ··· 5641 5638 if (!page) 5642 5639 return -ENOMEM; 5643 5640 5644 - vcpu->arch.mmu->pae_root = page_address(page); 5641 + mmu->pae_root = page_address(page); 5645 5642 for (i = 0; i < 4; ++i) 5646 - vcpu->arch.mmu->pae_root[i] = INVALID_PAGE; 5643 + mmu->pae_root[i] = INVALID_PAGE; 5647 5644 5648 5645 return 0; 5649 5646 } ··· 5651 5648 int kvm_mmu_create(struct kvm_vcpu *vcpu) 5652 5649 { 5653 5650 uint i; 5651 + int ret; 5654 5652 5655 5653 vcpu->arch.mmu = &vcpu->arch.root_mmu; 5656 5654 vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; ··· 5669 5665 vcpu->arch.guest_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; 5670 5666 5671 5667 vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa; 5672 - return alloc_mmu_pages(vcpu); 5668 + 5669 + ret = alloc_mmu_pages(vcpu, &vcpu->arch.guest_mmu); 5670 + if (ret) 5671 + return ret; 5672 + 5673 + ret = alloc_mmu_pages(vcpu, &vcpu->arch.root_mmu); 5674 + if (ret) 5675 + goto fail_allocate_root; 5676 + 5677 + return ret; 5678 + fail_allocate_root: 5679 + free_mmu_pages(&vcpu->arch.guest_mmu); 5680 + return ret; 5673 5681 } 5674 5682 5675 5683 ··· 6110 6094 if (IS_ENABLED(CONFIG_X86_64) && shadow_phys_bits == 52) 6111 6095 mask &= ~1ull; 6112 6096 6113 - kvm_mmu_set_mmio_spte_mask(mask, mask); 6097 + kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK); 6114 6098 } 6115 6099 6116 6100 int kvm_mmu_module_init(void) ··· 6184 6168 void kvm_mmu_destroy(struct kvm_vcpu *vcpu) 6185 6169 { 6186 6170 kvm_mmu_unload(vcpu); 6187 - free_mmu_pages(vcpu); 6171 + free_mmu_pages(&vcpu->arch.root_mmu); 6172 + free_mmu_pages(&vcpu->arch.guest_mmu); 6188 6173 mmu_free_memory_caches(vcpu); 6189 6174 } 6190 6175

+1 -1

arch/x86/kvm/mmu.h

··· 51 51 return ((1ULL << (e - s + 1)) - 1) << s; 52 52 } 53 53 54 - void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value); 54 + void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value, u64 access_mask); 55 55 56 56 void 57 57 reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context);

+93 -105

arch/x86/kvm/svm.c

··· 68 68 #define SEG_TYPE_LDT 2 69 69 #define SEG_TYPE_BUSY_TSS16 3 70 70 71 - #define SVM_FEATURE_NPT (1 << 0) 72 71 #define SVM_FEATURE_LBRV (1 << 1) 73 72 #define SVM_FEATURE_SVML (1 << 2) 74 - #define SVM_FEATURE_NRIP (1 << 3) 75 73 #define SVM_FEATURE_TSC_RATE (1 << 4) 76 74 #define SVM_FEATURE_VMCB_CLEAN (1 << 5) 77 75 #define SVM_FEATURE_FLUSH_ASID (1 << 6) ··· 768 770 769 771 } 770 772 771 - static void skip_emulated_instruction(struct kvm_vcpu *vcpu) 773 + static int skip_emulated_instruction(struct kvm_vcpu *vcpu) 772 774 { 773 775 struct vcpu_svm *svm = to_svm(vcpu); 774 776 ··· 777 779 svm->next_rip = svm->vmcb->control.next_rip; 778 780 } 779 781 780 - if (!svm->next_rip) { 781 - if (kvm_emulate_instruction(vcpu, EMULTYPE_SKIP) != 782 - EMULATE_DONE) 783 - printk(KERN_DEBUG "%s: NOP\n", __func__); 784 - return; 785 - } 782 + if (!svm->next_rip) 783 + return kvm_emulate_instruction(vcpu, EMULTYPE_SKIP); 784 + 786 785 if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE) 787 786 printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n", 788 787 __func__, kvm_rip_read(vcpu), svm->next_rip); 789 788 790 789 kvm_rip_write(vcpu, svm->next_rip); 791 790 svm_set_interrupt_shadow(vcpu, 0); 791 + 792 + return EMULATE_DONE; 792 793 } 793 794 794 795 static void svm_queue_exception(struct kvm_vcpu *vcpu) ··· 818 821 * raises a fault that is not intercepted. Still better than 819 822 * failing in all cases. 820 823 */ 821 - skip_emulated_instruction(&svm->vcpu); 824 + (void)skip_emulated_instruction(&svm->vcpu); 822 825 rip = kvm_rip_read(&svm->vcpu); 823 826 svm->int3_rip = rip + svm->vmcb->save.cs.base; 824 827 svm->int3_injected = rip - old_rip; ··· 1266 1269 pause_filter_count_grow, 1267 1270 pause_filter_count_max); 1268 1271 1269 - if (control->pause_filter_count != old) 1272 + if (control->pause_filter_count != old) { 1270 1273 mark_dirty(svm->vmcb, VMCB_INTERCEPTS); 1271 - 1272 - trace_kvm_ple_window_grow(vcpu->vcpu_id, 1273 - control->pause_filter_count, old); 1274 + trace_kvm_ple_window_update(vcpu->vcpu_id, 1275 + control->pause_filter_count, old); 1276 + } 1274 1277 } 1275 1278 1276 1279 static void shrink_ple_window(struct kvm_vcpu *vcpu) ··· 1284 1287 pause_filter_count, 1285 1288 pause_filter_count_shrink, 1286 1289 pause_filter_count); 1287 - if (control->pause_filter_count != old) 1290 + if (control->pause_filter_count != old) { 1288 1291 mark_dirty(svm->vmcb, VMCB_INTERCEPTS); 1289 - 1290 - trace_kvm_ple_window_shrink(vcpu->vcpu_id, 1291 - control->pause_filter_count, old); 1292 + trace_kvm_ple_window_update(vcpu->vcpu_id, 1293 + control->pause_filter_count, old); 1294 + } 1292 1295 } 1293 1296 1294 1297 static __init int svm_hardware_setup(void) ··· 2133 2136 struct page *nested_msrpm_pages; 2134 2137 int err; 2135 2138 2139 + BUILD_BUG_ON_MSG(offsetof(struct vcpu_svm, vcpu) != 0, 2140 + "struct kvm_vcpu must be at offset 0 for arch usercopy region"); 2141 + 2136 2142 svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); 2137 2143 if (!svm) { 2138 2144 err = -ENOMEM; ··· 2903 2903 2904 2904 static int halt_interception(struct vcpu_svm *svm) 2905 2905 { 2906 - svm->next_rip = kvm_rip_read(&svm->vcpu) + 1; 2907 2906 return kvm_emulate_halt(&svm->vcpu); 2908 2907 } 2909 2908 2910 2909 static int vmmcall_interception(struct vcpu_svm *svm) 2911 2910 { 2912 - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 2913 2911 return kvm_emulate_hypercall(&svm->vcpu); 2914 2912 } 2915 2913 ··· 3586 3588 mark_all_dirty(svm->vmcb); 3587 3589 } 3588 3590 3589 - static bool nested_svm_vmrun(struct vcpu_svm *svm) 3591 + static int nested_svm_vmrun(struct vcpu_svm *svm) 3590 3592 { 3591 - int rc; 3593 + int ret; 3592 3594 struct vmcb *nested_vmcb; 3593 3595 struct vmcb *hsave = svm->nested.hsave; 3594 3596 struct vmcb *vmcb = svm->vmcb; ··· 3597 3599 3598 3600 vmcb_gpa = svm->vmcb->save.rax; 3599 3601 3600 - rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb_gpa), &map); 3601 - if (rc) { 3602 - if (rc == -EINVAL) 3603 - kvm_inject_gp(&svm->vcpu, 0); 3604 - return false; 3602 + ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb_gpa), &map); 3603 + if (ret == -EINVAL) { 3604 + kvm_inject_gp(&svm->vcpu, 0); 3605 + return 1; 3606 + } else if (ret) { 3607 + return kvm_skip_emulated_instruction(&svm->vcpu); 3605 3608 } 3609 + 3610 + ret = kvm_skip_emulated_instruction(&svm->vcpu); 3606 3611 3607 3612 nested_vmcb = map.hva; 3608 3613 ··· 3617 3616 3618 3617 kvm_vcpu_unmap(&svm->vcpu, &map, true); 3619 3618 3620 - return false; 3619 + return ret; 3621 3620 } 3622 3621 3623 3622 trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa, ··· 3661 3660 3662 3661 enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, &map); 3663 3662 3664 - return true; 3663 + if (!nested_svm_vmrun_msrpm(svm)) { 3664 + svm->vmcb->control.exit_code = SVM_EXIT_ERR; 3665 + svm->vmcb->control.exit_code_hi = 0; 3666 + svm->vmcb->control.exit_info_1 = 0; 3667 + svm->vmcb->control.exit_info_2 = 0; 3668 + 3669 + nested_svm_vmexit(svm); 3670 + } 3671 + 3672 + return ret; 3665 3673 } 3666 3674 3667 3675 static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) ··· 3707 3697 3708 3698 nested_vmcb = map.hva; 3709 3699 3710 - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 3711 3700 ret = kvm_skip_emulated_instruction(&svm->vcpu); 3712 3701 3713 3702 nested_svm_vmloadsave(nested_vmcb, svm->vmcb); ··· 3733 3724 3734 3725 nested_vmcb = map.hva; 3735 3726 3736 - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 3737 3727 ret = kvm_skip_emulated_instruction(&svm->vcpu); 3738 3728 3739 3729 nested_svm_vmloadsave(svm->vmcb, nested_vmcb); ··· 3746 3738 if (nested_svm_check_permissions(svm)) 3747 3739 return 1; 3748 3740 3749 - /* Save rip after vmrun instruction */ 3750 - kvm_rip_write(&svm->vcpu, kvm_rip_read(&svm->vcpu) + 3); 3751 - 3752 - if (!nested_svm_vmrun(svm)) 3753 - return 1; 3754 - 3755 - if (!nested_svm_vmrun_msrpm(svm)) 3756 - goto failed; 3757 - 3758 - return 1; 3759 - 3760 - failed: 3761 - 3762 - svm->vmcb->control.exit_code = SVM_EXIT_ERR; 3763 - svm->vmcb->control.exit_code_hi = 0; 3764 - svm->vmcb->control.exit_info_1 = 0; 3765 - svm->vmcb->control.exit_info_2 = 0; 3766 - 3767 - nested_svm_vmexit(svm); 3768 - 3769 - return 1; 3741 + return nested_svm_vmrun(svm); 3770 3742 } 3771 3743 3772 3744 static int stgi_interception(struct vcpu_svm *svm) ··· 3763 3775 if (vgif_enabled(svm)) 3764 3776 clr_intercept(svm, INTERCEPT_STGI); 3765 3777 3766 - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 3767 3778 ret = kvm_skip_emulated_instruction(&svm->vcpu); 3768 3779 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); 3769 3780 ··· 3778 3791 if (nested_svm_check_permissions(svm)) 3779 3792 return 1; 3780 3793 3781 - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 3782 3794 ret = kvm_skip_emulated_instruction(&svm->vcpu); 3783 3795 3784 3796 disable_gif(svm); ··· 3802 3816 /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */ 3803 3817 kvm_mmu_invlpg(vcpu, kvm_rax_read(&svm->vcpu)); 3804 3818 3805 - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 3806 3819 return kvm_skip_emulated_instruction(&svm->vcpu); 3807 3820 } 3808 3821 ··· 3824 3839 u32 index = kvm_rcx_read(&svm->vcpu); 3825 3840 3826 3841 if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) { 3827 - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 3828 3842 return kvm_skip_emulated_instruction(&svm->vcpu); 3829 3843 } 3830 3844 ··· 3882 3898 if (reason != TASK_SWITCH_GATE || 3883 3899 int_type == SVM_EXITINTINFO_TYPE_SOFT || 3884 3900 (int_type == SVM_EXITINTINFO_TYPE_EXEPT && 3885 - (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) 3886 - skip_emulated_instruction(&svm->vcpu); 3901 + (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) { 3902 + if (skip_emulated_instruction(&svm->vcpu) != EMULATE_DONE) 3903 + goto fail; 3904 + } 3887 3905 3888 3906 if (int_type != SVM_EXITINTINFO_TYPE_SOFT) 3889 3907 int_vec = -1; 3890 3908 3891 3909 if (kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason, 3892 - has_error_code, error_code) == EMULATE_FAIL) { 3893 - svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3894 - svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 3895 - svm->vcpu.run->internal.ndata = 0; 3896 - return 0; 3897 - } 3910 + has_error_code, error_code) == EMULATE_FAIL) 3911 + goto fail; 3912 + 3898 3913 return 1; 3914 + 3915 + fail: 3916 + svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3917 + svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 3918 + svm->vcpu.run->internal.ndata = 0; 3919 + return 0; 3899 3920 } 3900 3921 3901 3922 static int cpuid_interception(struct vcpu_svm *svm) 3902 3923 { 3903 - svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; 3904 3924 return kvm_emulate_cpuid(&svm->vcpu); 3905 3925 } 3906 3926 ··· 4220 4232 4221 4233 static int rdmsr_interception(struct vcpu_svm *svm) 4222 4234 { 4223 - u32 ecx = kvm_rcx_read(&svm->vcpu); 4224 - struct msr_data msr_info; 4225 - 4226 - msr_info.index = ecx; 4227 - msr_info.host_initiated = false; 4228 - if (svm_get_msr(&svm->vcpu, &msr_info)) { 4229 - trace_kvm_msr_read_ex(ecx); 4230 - kvm_inject_gp(&svm->vcpu, 0); 4231 - return 1; 4232 - } else { 4233 - trace_kvm_msr_read(ecx, msr_info.data); 4234 - 4235 - kvm_rax_write(&svm->vcpu, msr_info.data & 0xffffffff); 4236 - kvm_rdx_write(&svm->vcpu, msr_info.data >> 32); 4237 - svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; 4238 - return kvm_skip_emulated_instruction(&svm->vcpu); 4239 - } 4235 + return kvm_emulate_rdmsr(&svm->vcpu); 4240 4236 } 4241 4237 4242 4238 static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data) ··· 4410 4438 4411 4439 static int wrmsr_interception(struct vcpu_svm *svm) 4412 4440 { 4413 - struct msr_data msr; 4414 - u32 ecx = kvm_rcx_read(&svm->vcpu); 4415 - u64 data = kvm_read_edx_eax(&svm->vcpu); 4416 - 4417 - msr.data = data; 4418 - msr.index = ecx; 4419 - msr.host_initiated = false; 4420 - 4421 - svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; 4422 - if (kvm_set_msr(&svm->vcpu, &msr)) { 4423 - trace_kvm_msr_write_ex(ecx, data); 4424 - kvm_inject_gp(&svm->vcpu, 0); 4425 - return 1; 4426 - } else { 4427 - trace_kvm_msr_write(ecx, data); 4428 - return kvm_skip_emulated_instruction(&svm->vcpu); 4429 - } 4441 + return kvm_emulate_wrmsr(&svm->vcpu); 4430 4442 } 4431 4443 4432 4444 static int msr_interception(struct vcpu_svm *svm) ··· 4981 5025 4982 5026 if (exit_code >= ARRAY_SIZE(svm_exit_handlers) 4983 5027 || !svm_exit_handlers[exit_code]) { 4984 - WARN_ONCE(1, "svm: unexpected exit reason 0x%x\n", exit_code); 4985 - kvm_queue_exception(vcpu, UD_VECTOR); 4986 - return 1; 5028 + vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%x\n", exit_code); 5029 + dump_vmcb(vcpu); 5030 + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 5031 + vcpu->run->internal.suberror = 5032 + KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; 5033 + vcpu->run->internal.ndata = 1; 5034 + vcpu->run->internal.data[0] = exit_code; 5035 + return 0; 4987 5036 } 4988 5037 4989 5038 return svm_exit_handlers[exit_code](svm); ··· 5235 5274 5236 5275 kvm_set_msi_irq(kvm, e, &irq); 5237 5276 5238 - if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) { 5277 + if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) || 5278 + !kvm_irq_is_postable(&irq)) { 5239 5279 pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n", 5240 5280 __func__, irq.vector); 5241 5281 return -1; ··· 5290 5328 * 1. When cannot target interrupt to a specific vcpu. 5291 5329 * 2. Unsetting posted interrupt. 5292 5330 * 3. APIC virtialization is disabled for the vcpu. 5331 + * 4. IRQ has incompatible delivery mode (SMI, INIT, etc) 5293 5332 */ 5294 5333 if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set && 5295 5334 kvm_vcpu_apicv_active(&svm->vcpu)) { ··· 5896 5933 guest_cpuid_clear(vcpu, X86_FEATURE_X2APIC); 5897 5934 } 5898 5935 5936 + #define F(x) bit(X86_FEATURE_##x) 5937 + 5899 5938 static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) 5900 5939 { 5901 5940 switch (func) { ··· 5909 5944 if (nested) 5910 5945 entry->ecx |= (1 << 2); /* Set SVM bit */ 5911 5946 break; 5947 + case 0x80000008: 5948 + if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) || 5949 + boot_cpu_has(X86_FEATURE_AMD_SSBD)) 5950 + entry->ebx |= F(VIRT_SSBD); 5951 + break; 5912 5952 case 0x8000000A: 5913 5953 entry->eax = 1; /* SVM revision 1 */ 5914 5954 entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper ··· 5924 5954 5925 5955 /* Support next_rip if host supports it */ 5926 5956 if (boot_cpu_has(X86_FEATURE_NRIPS)) 5927 - entry->edx |= SVM_FEATURE_NRIP; 5957 + entry->edx |= F(NRIPS); 5928 5958 5929 5959 /* Support NPT for the guest if enabled */ 5930 5960 if (npt_enabled) 5931 - entry->edx |= SVM_FEATURE_NPT; 5961 + entry->edx |= F(NPT); 5932 5962 5933 5963 break; 5934 5964 case 0x8000001F: ··· 6037 6067 [x86_intercept_ins] = POST_EX(SVM_EXIT_IOIO), 6038 6068 [x86_intercept_out] = POST_EX(SVM_EXIT_IOIO), 6039 6069 [x86_intercept_outs] = POST_EX(SVM_EXIT_IOIO), 6070 + [x86_intercept_xsetbv] = PRE_EX(SVM_EXIT_XSETBV), 6040 6071 }; 6041 6072 6042 6073 #undef PRE_EX ··· 7164 7193 return false; 7165 7194 } 7166 7195 7196 + static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu) 7197 + { 7198 + struct vcpu_svm *svm = to_svm(vcpu); 7199 + 7200 + /* 7201 + * TODO: Last condition latch INIT signals on vCPU when 7202 + * vCPU is in guest-mode and vmcb12 defines intercept on INIT. 7203 + * To properly emulate the INIT intercept, SVM should implement 7204 + * kvm_x86_ops->check_nested_events() and call nested_svm_vmexit() 7205 + * there if an INIT signal is pending. 7206 + */ 7207 + return !gif_set(svm) || 7208 + (svm->vmcb->control.intercept & (1ULL << INTERCEPT_INIT)); 7209 + } 7210 + 7167 7211 static struct kvm_x86_ops svm_x86_ops __ro_after_init = { 7168 7212 .cpu_has_kvm_support = has_svm, 7169 7213 .disabled_by_bios = is_disabled, ··· 7315 7329 .nested_get_evmcs_version = NULL, 7316 7330 7317 7331 .need_emulation_on_page_fault = svm_need_emulation_on_page_fault, 7332 + 7333 + .apic_init_signal_blocked = svm_apic_init_signal_blocked, 7318 7334 }; 7319 7335 7320 7336 static int __init svm_init(void)

+54 -20

arch/x86/kvm/trace.h

··· 232 232 __field( u32, isa ) 233 233 __field( u64, info1 ) 234 234 __field( u64, info2 ) 235 + __field( unsigned int, vcpu_id ) 235 236 ), 236 237 237 238 TP_fast_assign( 238 239 __entry->exit_reason = exit_reason; 239 240 __entry->guest_rip = kvm_rip_read(vcpu); 240 241 __entry->isa = isa; 242 + __entry->vcpu_id = vcpu->vcpu_id; 241 243 kvm_x86_ops->get_exit_info(vcpu, &__entry->info1, 242 244 &__entry->info2); 243 245 ), 244 246 245 - TP_printk("reason %s rip 0x%lx info %llx %llx", 247 + TP_printk("vcpu %u reason %s rip 0x%lx info %llx %llx", 248 + __entry->vcpu_id, 246 249 (__entry->isa == KVM_ISA_VMX) ? 247 250 __print_symbolic(__entry->exit_reason, VMX_EXIT_REASONS) : 248 251 __print_symbolic(__entry->exit_reason, SVM_EXIT_REASONS), ··· 890 887 TP_printk("vcpu %d: PML full", __entry->vcpu_id) 891 888 ); 892 889 893 - TRACE_EVENT(kvm_ple_window, 894 - TP_PROTO(bool grow, unsigned int vcpu_id, int new, int old), 895 - TP_ARGS(grow, vcpu_id, new, old), 890 + TRACE_EVENT(kvm_ple_window_update, 891 + TP_PROTO(unsigned int vcpu_id, unsigned int new, unsigned int old), 892 + TP_ARGS(vcpu_id, new, old), 896 893 897 894 TP_STRUCT__entry( 898 - __field( bool, grow ) 899 895 __field( unsigned int, vcpu_id ) 900 - __field( int, new ) 901 - __field( int, old ) 896 + __field( unsigned int, new ) 897 + __field( unsigned int, old ) 902 898 ), 903 899 904 900 TP_fast_assign( 905 - __entry->grow = grow; 906 901 __entry->vcpu_id = vcpu_id; 907 902 __entry->new = new; 908 903 __entry->old = old; 909 904 ), 910 905 911 - TP_printk("vcpu %u: ple_window %d (%s %d)", 912 - __entry->vcpu_id, 913 - __entry->new, 914 - __entry->grow ? "grow" : "shrink", 915 - __entry->old) 906 + TP_printk("vcpu %u old %u new %u (%s)", 907 + __entry->vcpu_id, __entry->old, __entry->new, 908 + __entry->old < __entry->new ? "growed" : "shrinked") 916 909 ); 917 - 918 - #define trace_kvm_ple_window_grow(vcpu_id, new, old) \ 919 - trace_kvm_ple_window(true, vcpu_id, new, old) 920 - #define trace_kvm_ple_window_shrink(vcpu_id, new, old) \ 921 - trace_kvm_ple_window(false, vcpu_id, new, old) 922 910 923 911 TRACE_EVENT(kvm_pvclock_update, 924 912 TP_PROTO(unsigned int vcpu_id, struct pvclock_vcpu_time_info *pvclock), ··· 1314 1320 __entry->index = index; 1315 1321 ), 1316 1322 1317 - TP_printk("vcpu=%u, icrh:icrl=%#010x:%08x, id=%u, index=%u\n", 1323 + TP_printk("vcpu=%u, icrh:icrl=%#010x:%08x, id=%u, index=%u", 1318 1324 __entry->vcpu, __entry->icrh, __entry->icrl, 1319 1325 __entry->id, __entry->index) 1320 1326 ); ··· 1339 1345 __entry->vec = vec; 1340 1346 ), 1341 1347 1342 - TP_printk("vcpu=%u, offset=%#x(%s), %s, %s, vec=%#x\n", 1348 + TP_printk("vcpu=%u, offset=%#x(%s), %s, %s, vec=%#x", 1343 1349 __entry->vcpu, 1344 1350 __entry->offset, 1345 1351 __print_symbolic(__entry->offset, kvm_trace_symbol_apic), ··· 1456 1462 __entry->vector, __entry->format, 1457 1463 __entry->valid_bank_mask) 1458 1464 ); 1465 + 1466 + TRACE_EVENT(kvm_pv_tlb_flush, 1467 + TP_PROTO(unsigned int vcpu_id, bool need_flush_tlb), 1468 + TP_ARGS(vcpu_id, need_flush_tlb), 1469 + 1470 + TP_STRUCT__entry( 1471 + __field( unsigned int, vcpu_id ) 1472 + __field( bool, need_flush_tlb ) 1473 + ), 1474 + 1475 + TP_fast_assign( 1476 + __entry->vcpu_id = vcpu_id; 1477 + __entry->need_flush_tlb = need_flush_tlb; 1478 + ), 1479 + 1480 + TP_printk("vcpu %u need_flush_tlb %s", __entry->vcpu_id, 1481 + __entry->need_flush_tlb ? "true" : "false") 1482 + ); 1483 + 1484 + /* 1485 + * Tracepoint for failed nested VMX VM-Enter. 1486 + */ 1487 + TRACE_EVENT(kvm_nested_vmenter_failed, 1488 + TP_PROTO(const char *msg, u32 err), 1489 + TP_ARGS(msg, err), 1490 + 1491 + TP_STRUCT__entry( 1492 + __field(const char *, msg) 1493 + __field(u32, err) 1494 + ), 1495 + 1496 + TP_fast_assign( 1497 + __entry->msg = msg; 1498 + __entry->err = err; 1499 + ), 1500 + 1501 + TP_printk("%s%s", __entry->msg, !__entry->err ? "" : 1502 + __print_symbolic(__entry->err, VMX_VMENTER_INSTRUCTION_ERRORS)) 1503 + ); 1504 + 1459 1505 #endif /* _TRACE_KVM_H */ 1460 1506 1461 1507 #undef TRACE_INCLUDE_PATH

+162 -143

arch/x86/kvm/vmx/nested.c

··· 19 19 static bool __read_mostly nested_early_check = 0; 20 20 module_param(nested_early_check, bool, S_IRUGO); 21 21 22 + #define CC(consistency_check) \ 23 + ({ \ 24 + bool failed = (consistency_check); \ 25 + if (failed) \ 26 + trace_kvm_nested_vmenter_failed(#consistency_check, 0); \ 27 + failed; \ 28 + }) 29 + 22 30 /* 23 31 * Hyper-V requires all of these, so mark them as supported even though 24 32 * they are just treated the same as all-context. ··· 438 430 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 439 431 return 0; 440 432 441 - if (!page_address_valid(vcpu, vmcs12->io_bitmap_a) || 442 - !page_address_valid(vcpu, vmcs12->io_bitmap_b)) 433 + if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) || 434 + CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b))) 443 435 return -EINVAL; 444 436 445 437 return 0; ··· 451 443 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 452 444 return 0; 453 445 454 - if (!page_address_valid(vcpu, vmcs12->msr_bitmap)) 446 + if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap))) 455 447 return -EINVAL; 456 448 457 449 return 0; ··· 463 455 if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) 464 456 return 0; 465 457 466 - if (!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)) 458 + if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr))) 467 459 return -EINVAL; 468 460 469 461 return 0; ··· 696 688 struct vmcs12 *vmcs12) 697 689 { 698 690 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && 699 - !page_address_valid(vcpu, vmcs12->apic_access_addr)) 691 + CC(!page_address_valid(vcpu, vmcs12->apic_access_addr))) 700 692 return -EINVAL; 701 693 else 702 694 return 0; ··· 715 707 * If virtualize x2apic mode is enabled, 716 708 * virtualize apic access must be disabled. 717 709 */ 718 - if (nested_cpu_has_virt_x2apic_mode(vmcs12) && 719 - nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) 710 + if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) && 711 + nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))) 720 712 return -EINVAL; 721 713 722 714 /* 723 715 * If virtual interrupt delivery is enabled, 724 716 * we must exit on external interrupts. 725 717 */ 726 - if (nested_cpu_has_vid(vmcs12) && 727 - !nested_exit_on_intr(vcpu)) 718 + if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu))) 728 719 return -EINVAL; 729 720 730 721 /* ··· 734 727 * bits 5:0 of posted_intr_desc_addr should be zero. 735 728 */ 736 729 if (nested_cpu_has_posted_intr(vmcs12) && 737 - (!nested_cpu_has_vid(vmcs12) || 738 - !nested_exit_intr_ack_set(vcpu) || 739 - (vmcs12->posted_intr_nv & 0xff00) || 740 - (vmcs12->posted_intr_desc_addr & 0x3f) || 741 - (vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu)))) 730 + (CC(!nested_cpu_has_vid(vmcs12)) || 731 + CC(!nested_exit_intr_ack_set(vcpu)) || 732 + CC((vmcs12->posted_intr_nv & 0xff00)) || 733 + CC((vmcs12->posted_intr_desc_addr & 0x3f)) || 734 + CC((vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu))))) 742 735 return -EINVAL; 743 736 744 737 /* tpr shadow is needed by all apicv features. */ 745 - if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) 738 + if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))) 746 739 return -EINVAL; 747 740 748 741 return 0; ··· 766 759 static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu, 767 760 struct vmcs12 *vmcs12) 768 761 { 769 - if (nested_vmx_check_msr_switch(vcpu, vmcs12->vm_exit_msr_load_count, 770 - vmcs12->vm_exit_msr_load_addr) || 771 - nested_vmx_check_msr_switch(vcpu, vmcs12->vm_exit_msr_store_count, 772 - vmcs12->vm_exit_msr_store_addr)) 762 + if (CC(nested_vmx_check_msr_switch(vcpu, 763 + vmcs12->vm_exit_msr_load_count, 764 + vmcs12->vm_exit_msr_load_addr)) || 765 + CC(nested_vmx_check_msr_switch(vcpu, 766 + vmcs12->vm_exit_msr_store_count, 767 + vmcs12->vm_exit_msr_store_addr))) 773 768 return -EINVAL; 774 769 775 770 return 0; ··· 780 771 static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu, 781 772 struct vmcs12 *vmcs12) 782 773 { 783 - if (nested_vmx_check_msr_switch(vcpu, vmcs12->vm_entry_msr_load_count, 784 - vmcs12->vm_entry_msr_load_addr)) 774 + if (CC(nested_vmx_check_msr_switch(vcpu, 775 + vmcs12->vm_entry_msr_load_count, 776 + vmcs12->vm_entry_msr_load_addr))) 785 777 return -EINVAL; 786 778 787 779 return 0; ··· 794 784 if (!nested_cpu_has_pml(vmcs12)) 795 785 return 0; 796 786 797 - if (!nested_cpu_has_ept(vmcs12) || 798 - !page_address_valid(vcpu, vmcs12->pml_address)) 787 + if (CC(!nested_cpu_has_ept(vmcs12)) || 788 + CC(!page_address_valid(vcpu, vmcs12->pml_address))) 799 789 return -EINVAL; 800 790 801 791 return 0; ··· 804 794 static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu, 805 795 struct vmcs12 *vmcs12) 806 796 { 807 - if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) && 808 - !nested_cpu_has_ept(vmcs12)) 797 + if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) && 798 + !nested_cpu_has_ept(vmcs12))) 809 799 return -EINVAL; 810 800 return 0; 811 801 } ··· 813 803 static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu, 814 804 struct vmcs12 *vmcs12) 815 805 { 816 - if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) && 817 - !nested_cpu_has_ept(vmcs12)) 806 + if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) && 807 + !nested_cpu_has_ept(vmcs12))) 818 808 return -EINVAL; 819 809 return 0; 820 810 } ··· 825 815 if (!nested_cpu_has_shadow_vmcs(vmcs12)) 826 816 return 0; 827 817 828 - if (!page_address_valid(vcpu, vmcs12->vmread_bitmap) || 829 - !page_address_valid(vcpu, vmcs12->vmwrite_bitmap)) 818 + if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) || 819 + CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap))) 830 820 return -EINVAL; 831 821 832 822 return 0; ··· 836 826 struct vmx_msr_entry *e) 837 827 { 838 828 /* x2APIC MSR accesses are not allowed */ 839 - if (vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8) 829 + if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8)) 840 830 return -EINVAL; 841 - if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */ 842 - e->index == MSR_IA32_UCODE_REV) 831 + if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */ 832 + CC(e->index == MSR_IA32_UCODE_REV)) 843 833 return -EINVAL; 844 - if (e->reserved != 0) 834 + if (CC(e->reserved != 0)) 845 835 return -EINVAL; 846 836 return 0; 847 837 } ··· 849 839 static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu, 850 840 struct vmx_msr_entry *e) 851 841 { 852 - if (e->index == MSR_FS_BASE || 853 - e->index == MSR_GS_BASE || 854 - e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not supported */ 842 + if (CC(e->index == MSR_FS_BASE) || 843 + CC(e->index == MSR_GS_BASE) || 844 + CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */ 855 845 nested_vmx_msr_check_common(vcpu, e)) 856 846 return -EINVAL; 857 847 return 0; ··· 860 850 static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu, 861 851 struct vmx_msr_entry *e) 862 852 { 863 - if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */ 853 + if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */ 864 854 nested_vmx_msr_check_common(vcpu, e)) 865 855 return -EINVAL; 866 856 return 0; ··· 874 864 { 875 865 u32 i; 876 866 struct vmx_msr_entry e; 877 - struct msr_data msr; 878 867 879 - msr.host_initiated = false; 880 868 for (i = 0; i < count; i++) { 881 869 if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), 882 870 &e, sizeof(e))) { ··· 889 881 __func__, i, e.index, e.reserved); 890 882 goto fail; 891 883 } 892 - msr.index = e.index; 893 - msr.data = e.value; 894 - if (kvm_set_msr(vcpu, &msr)) { 884 + if (kvm_set_msr(vcpu, e.index, e.value)) { 895 885 pr_debug_ratelimited( 896 886 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", 897 887 __func__, i, e.index, e.value); ··· 903 897 904 898 static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) 905 899 { 900 + u64 data; 906 901 u32 i; 907 902 struct vmx_msr_entry e; 908 903 909 904 for (i = 0; i < count; i++) { 910 - struct msr_data msr_info; 911 905 if (kvm_vcpu_read_guest(vcpu, 912 906 gpa + i * sizeof(e), 913 907 &e, 2 * sizeof(u32))) { ··· 922 916 __func__, i, e.index, e.reserved); 923 917 return -EINVAL; 924 918 } 925 - msr_info.host_initiated = false; 926 - msr_info.index = e.index; 927 - if (kvm_get_msr(vcpu, &msr_info)) { 919 + if (kvm_get_msr(vcpu, e.index, &data)) { 928 920 pr_debug_ratelimited( 929 921 "%s cannot read MSR (%u, 0x%x)\n", 930 922 __func__, i, e.index); ··· 931 927 if (kvm_vcpu_write_guest(vcpu, 932 928 gpa + i * sizeof(e) + 933 929 offsetof(struct vmx_msr_entry, value), 934 - &msr_info.data, sizeof(msr_info.data))) { 930 + &data, sizeof(data))) { 935 931 pr_debug_ratelimited( 936 932 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", 937 - __func__, i, e.index, msr_info.data); 933 + __func__, i, e.index, data); 938 934 return -EINVAL; 939 935 } 940 936 } ··· 959 955 u32 *entry_failure_code) 960 956 { 961 957 if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) { 962 - if (!nested_cr3_valid(vcpu, cr3)) { 958 + if (CC(!nested_cr3_valid(vcpu, cr3))) { 963 959 *entry_failure_code = ENTRY_FAIL_DEFAULT; 964 960 return -EINVAL; 965 961 } ··· 969 965 * must not be dereferenced. 970 966 */ 971 967 if (is_pae_paging(vcpu) && !nested_ept) { 972 - if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) { 968 + if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) { 973 969 *entry_failure_code = ENTRY_FAIL_PDPTE; 974 970 return -EINVAL; 975 971 } ··· 2415 2411 2416 2412 static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) 2417 2413 { 2418 - if (!nested_cpu_has_nmi_exiting(vmcs12) && 2419 - nested_cpu_has_virtual_nmis(vmcs12)) 2414 + if (CC(!nested_cpu_has_nmi_exiting(vmcs12) && 2415 + nested_cpu_has_virtual_nmis(vmcs12))) 2420 2416 return -EINVAL; 2421 2417 2422 - if (!nested_cpu_has_virtual_nmis(vmcs12) && 2423 - nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING)) 2418 + if (CC(!nested_cpu_has_virtual_nmis(vmcs12) && 2419 + nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING))) 2424 2420 return -EINVAL; 2425 2421 2426 2422 return 0; ··· 2434 2430 /* Check for memory type validity */ 2435 2431 switch (address & VMX_EPTP_MT_MASK) { 2436 2432 case VMX_EPTP_MT_UC: 2437 - if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT)) 2433 + if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))) 2438 2434 return false; 2439 2435 break; 2440 2436 case VMX_EPTP_MT_WB: 2441 - if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT)) 2437 + if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT))) 2442 2438 return false; 2443 2439 break; 2444 2440 default: ··· 2446 2442 } 2447 2443 2448 2444 /* only 4 levels page-walk length are valid */ 2449 - if ((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4) 2445 + if (CC((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4)) 2450 2446 return false; 2451 2447 2452 2448 /* Reserved bits should not be set */ 2453 - if (address >> maxphyaddr || ((address >> 7) & 0x1f)) 2449 + if (CC(address >> maxphyaddr || ((address >> 7) & 0x1f))) 2454 2450 return false; 2455 2451 2456 2452 /* AD, if set, should be supported */ 2457 2453 if (address & VMX_EPTP_AD_ENABLE_BIT) { 2458 - if (!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT)) 2454 + if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))) 2459 2455 return false; 2460 2456 } 2461 2457 ··· 2470 2466 { 2471 2467 struct vcpu_vmx *vmx = to_vmx(vcpu); 2472 2468 2473 - if (!vmx_control_verify(vmcs12->pin_based_vm_exec_control, 2474 - vmx->nested.msrs.pinbased_ctls_low, 2475 - vmx->nested.msrs.pinbased_ctls_high) || 2476 - !vmx_control_verify(vmcs12->cpu_based_vm_exec_control, 2477 - vmx->nested.msrs.procbased_ctls_low, 2478 - vmx->nested.msrs.procbased_ctls_high)) 2469 + if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control, 2470 + vmx->nested.msrs.pinbased_ctls_low, 2471 + vmx->nested.msrs.pinbased_ctls_high)) || 2472 + CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, 2473 + vmx->nested.msrs.procbased_ctls_low, 2474 + vmx->nested.msrs.procbased_ctls_high))) 2479 2475 return -EINVAL; 2480 2476 2481 2477 if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && 2482 - !vmx_control_verify(vmcs12->secondary_vm_exec_control, 2483 - vmx->nested.msrs.secondary_ctls_low, 2484 - vmx->nested.msrs.secondary_ctls_high)) 2478 + CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control, 2479 + vmx->nested.msrs.secondary_ctls_low, 2480 + vmx->nested.msrs.secondary_ctls_high))) 2485 2481 return -EINVAL; 2486 2482 2487 - if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu) || 2483 + if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) || 2488 2484 nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) || 2489 2485 nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) || 2490 2486 nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) || ··· 2495 2491 nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) || 2496 2492 nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) || 2497 2493 nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) || 2498 - (nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)) 2494 + CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)) 2499 2495 return -EINVAL; 2500 2496 2501 2497 if (!nested_cpu_has_preemption_timer(vmcs12) && ··· 2503 2499 return -EINVAL; 2504 2500 2505 2501 if (nested_cpu_has_ept(vmcs12) && 2506 - !valid_ept_address(vcpu, vmcs12->ept_pointer)) 2502 + CC(!valid_ept_address(vcpu, vmcs12->ept_pointer))) 2507 2503 return -EINVAL; 2508 2504 2509 2505 if (nested_cpu_has_vmfunc(vmcs12)) { 2510 - if (vmcs12->vm_function_control & 2511 - ~vmx->nested.msrs.vmfunc_controls) 2506 + if (CC(vmcs12->vm_function_control & 2507 + ~vmx->nested.msrs.vmfunc_controls)) 2512 2508 return -EINVAL; 2513 2509 2514 2510 if (nested_cpu_has_eptp_switching(vmcs12)) { 2515 - if (!nested_cpu_has_ept(vmcs12) || 2516 - !page_address_valid(vcpu, vmcs12->eptp_list_address)) 2511 + if (CC(!nested_cpu_has_ept(vmcs12)) || 2512 + CC(!page_address_valid(vcpu, vmcs12->eptp_list_address))) 2517 2513 return -EINVAL; 2518 2514 } 2519 2515 } ··· 2529 2525 { 2530 2526 struct vcpu_vmx *vmx = to_vmx(vcpu); 2531 2527 2532 - if (!vmx_control_verify(vmcs12->vm_exit_controls, 2533 - vmx->nested.msrs.exit_ctls_low, 2534 - vmx->nested.msrs.exit_ctls_high) || 2535 - nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12)) 2528 + if (CC(!vmx_control_verify(vmcs12->vm_exit_controls, 2529 + vmx->nested.msrs.exit_ctls_low, 2530 + vmx->nested.msrs.exit_ctls_high)) || 2531 + CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12))) 2536 2532 return -EINVAL; 2537 2533 2538 2534 return 0; ··· 2546 2542 { 2547 2543 struct vcpu_vmx *vmx = to_vmx(vcpu); 2548 2544 2549 - if (!vmx_control_verify(vmcs12->vm_entry_controls, 2550 - vmx->nested.msrs.entry_ctls_low, 2551 - vmx->nested.msrs.entry_ctls_high)) 2545 + if (CC(!vmx_control_verify(vmcs12->vm_entry_controls, 2546 + vmx->nested.msrs.entry_ctls_low, 2547 + vmx->nested.msrs.entry_ctls_high))) 2552 2548 return -EINVAL; 2553 2549 2554 2550 /* ··· 2568 2564 bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; 2569 2565 2570 2566 /* VM-entry interruption-info field: interruption type */ 2571 - if (intr_type == INTR_TYPE_RESERVED || 2572 - (intr_type == INTR_TYPE_OTHER_EVENT && 2573 - !nested_cpu_supports_monitor_trap_flag(vcpu))) 2567 + if (CC(intr_type == INTR_TYPE_RESERVED) || 2568 + CC(intr_type == INTR_TYPE_OTHER_EVENT && 2569 + !nested_cpu_supports_monitor_trap_flag(vcpu))) 2574 2570 return -EINVAL; 2575 2571 2576 2572 /* VM-entry interruption-info field: vector */ 2577 - if ((intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || 2578 - (intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || 2579 - (intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) 2573 + if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || 2574 + CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || 2575 + CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) 2580 2576 return -EINVAL; 2581 2577 2582 2578 /* VM-entry interruption-info field: deliver error code */ 2583 2579 should_have_error_code = 2584 2580 intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode && 2585 2581 x86_exception_has_error_code(vector); 2586 - if (has_error_code != should_have_error_code) 2582 + if (CC(has_error_code != should_have_error_code)) 2587 2583 return -EINVAL; 2588 2584 2589 2585 /* VM-entry exception error code */ 2590 - if (has_error_code && 2591 - vmcs12->vm_entry_exception_error_code & GENMASK(31, 15)) 2586 + if (CC(has_error_code && 2587 + vmcs12->vm_entry_exception_error_code & GENMASK(31, 15))) 2592 2588 return -EINVAL; 2593 2589 2594 2590 /* VM-entry interruption-info field: reserved bits */ 2595 - if (intr_info & INTR_INFO_RESVD_BITS_MASK) 2591 + if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK)) 2596 2592 return -EINVAL; 2597 2593 2598 2594 /* VM-entry instruction length */ ··· 2600 2596 case INTR_TYPE_SOFT_EXCEPTION: 2601 2597 case INTR_TYPE_SOFT_INTR: 2602 2598 case INTR_TYPE_PRIV_SW_EXCEPTION: 2603 - if ((vmcs12->vm_entry_instruction_len > 15) || 2604 - (vmcs12->vm_entry_instruction_len == 0 && 2605 - !nested_cpu_has_zero_length_injection(vcpu))) 2599 + if (CC(vmcs12->vm_entry_instruction_len > 15) || 2600 + CC(vmcs12->vm_entry_instruction_len == 0 && 2601 + CC(!nested_cpu_has_zero_length_injection(vcpu)))) 2606 2602 return -EINVAL; 2607 2603 } 2608 2604 } ··· 2629 2625 { 2630 2626 bool ia32e; 2631 2627 2632 - if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) || 2633 - !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) || 2634 - !nested_cr3_valid(vcpu, vmcs12->host_cr3)) 2628 + if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) || 2629 + CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) || 2630 + CC(!nested_cr3_valid(vcpu, vmcs12->host_cr3))) 2635 2631 return -EINVAL; 2636 2632 2637 - if (is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu) || 2638 - is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu)) 2633 + if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || 2634 + CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu))) 2639 2635 return -EINVAL; 2640 2636 2641 2637 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) && 2642 - !kvm_pat_valid(vmcs12->host_ia32_pat)) 2638 + CC(!kvm_pat_valid(vmcs12->host_ia32_pat))) 2643 2639 return -EINVAL; 2644 2640 2645 2641 ia32e = (vmcs12->vm_exit_controls & 2646 2642 VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0; 2647 2643 2648 - if (vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) || 2649 - vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) || 2650 - vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) || 2651 - vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) || 2652 - vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) || 2653 - vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) || 2654 - vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) || 2655 - vmcs12->host_cs_selector == 0 || 2656 - vmcs12->host_tr_selector == 0 || 2657 - (vmcs12->host_ss_selector == 0 && !ia32e)) 2644 + if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2645 + CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2646 + CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2647 + CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2648 + CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2649 + CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2650 + CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2651 + CC(vmcs12->host_cs_selector == 0) || 2652 + CC(vmcs12->host_tr_selector == 0) || 2653 + CC(vmcs12->host_ss_selector == 0 && !ia32e)) 2658 2654 return -EINVAL; 2659 2655 2660 2656 #ifdef CONFIG_X86_64 2661 - if (is_noncanonical_address(vmcs12->host_fs_base, vcpu) || 2662 - is_noncanonical_address(vmcs12->host_gs_base, vcpu) || 2663 - is_noncanonical_address(vmcs12->host_gdtr_base, vcpu) || 2664 - is_noncanonical_address(vmcs12->host_idtr_base, vcpu) || 2665 - is_noncanonical_address(vmcs12->host_tr_base, vcpu)) 2657 + if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) || 2658 + CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) || 2659 + CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) || 2660 + CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) || 2661 + CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu))) 2666 2662 return -EINVAL; 2667 2663 #endif 2668 2664 ··· 2673 2669 * the host address-space size VM-exit control. 2674 2670 */ 2675 2671 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { 2676 - if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) || 2677 - ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) || 2678 - ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) 2672 + if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) || 2673 + CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) || 2674 + CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))) 2679 2675 return -EINVAL; 2680 2676 } 2681 2677 ··· 2692 2688 if (vmcs12->vmcs_link_pointer == -1ull) 2693 2689 return 0; 2694 2690 2695 - if (!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)) 2691 + if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))) 2696 2692 return -EINVAL; 2697 2693 2698 - if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map)) 2694 + if (CC(kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))) 2699 2695 return -EINVAL; 2700 2696 2701 2697 shadow = map.hva; 2702 2698 2703 - if (shadow->hdr.revision_id != VMCS12_REVISION || 2704 - shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)) 2699 + if (CC(shadow->hdr.revision_id != VMCS12_REVISION) || 2700 + CC(shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))) 2705 2701 r = -EINVAL; 2706 2702 2707 2703 kvm_vcpu_unmap(vcpu, &map, false); ··· 2713 2709 */ 2714 2710 static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) 2715 2711 { 2716 - if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && 2717 - vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) 2712 + if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && 2713 + vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)) 2718 2714 return -EINVAL; 2719 2715 2720 2716 return 0; ··· 2728 2724 2729 2725 *exit_qual = ENTRY_FAIL_DEFAULT; 2730 2726 2731 - if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) || 2732 - !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)) 2727 + if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) || 2728 + CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))) 2733 2729 return -EINVAL; 2734 2730 2735 2731 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && 2736 - !kvm_pat_valid(vmcs12->guest_ia32_pat)) 2732 + CC(!kvm_pat_valid(vmcs12->guest_ia32_pat))) 2737 2733 return -EINVAL; 2738 2734 2739 2735 if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { ··· 2753 2749 if (to_vmx(vcpu)->nested.nested_run_pending && 2754 2750 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { 2755 2751 ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0; 2756 - if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) || 2757 - ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) || 2758 - ((vmcs12->guest_cr0 & X86_CR0_PG) && 2759 - ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) 2752 + if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) || 2753 + CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) || 2754 + CC(((vmcs12->guest_cr0 & X86_CR0_PG) && 2755 + ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))) 2760 2756 return -EINVAL; 2761 2757 } 2762 2758 2763 2759 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && 2764 - (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) || 2765 - (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))) 2760 + (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) || 2761 + CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))) 2766 2762 return -EINVAL; 2767 2763 2768 2764 if (nested_check_guest_non_reg_state(vmcs12)) ··· 2845 2841 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 2846 2842 2847 2843 if (vm_fail) { 2844 + u32 error = vmcs_read32(VM_INSTRUCTION_ERROR); 2845 + 2848 2846 preempt_enable(); 2849 - WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != 2850 - VMXERR_ENTRY_INVALID_CONTROL_FIELD); 2847 + 2848 + trace_kvm_nested_vmenter_failed( 2849 + "early hardware check VM-instruction error: ", error); 2850 + WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD); 2851 2851 return 1; 2852 2852 } 2853 2853 ··· 3409 3401 unsigned long exit_qual; 3410 3402 bool block_nested_events = 3411 3403 vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu); 3404 + struct kvm_lapic *apic = vcpu->arch.apic; 3405 + 3406 + if (lapic_in_kernel(vcpu) && 3407 + test_bit(KVM_APIC_INIT, &apic->pending_events)) { 3408 + if (block_nested_events) 3409 + return -EBUSY; 3410 + nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0); 3411 + return 0; 3412 + } 3412 3413 3413 3414 if (vcpu->arch.exception.pending && 3414 3415 nested_vmx_check_exception(vcpu, &exit_qual)) { ··· 3906 3889 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3907 3890 struct vcpu_vmx *vmx = to_vmx(vcpu); 3908 3891 struct vmx_msr_entry g, h; 3909 - struct msr_data msr; 3910 3892 gpa_t gpa; 3911 3893 u32 i, j; 3912 3894 ··· 3965 3949 * from the guest value. The intent is to stuff host state as 3966 3950 * silently as possible, not to fully process the exit load list. 3967 3951 */ 3968 - msr.host_initiated = false; 3969 3952 for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) { 3970 3953 gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g)); 3971 3954 if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) { ··· 3994 3979 goto vmabort; 3995 3980 } 3996 3981 3997 - msr.index = h.index; 3998 - msr.data = h.value; 3999 - if (kvm_set_msr(vcpu, &msr)) { 3982 + if (kvm_set_msr(vcpu, h.index, h.value)) { 4000 3983 pr_debug_ratelimited( 4001 3984 "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", 4002 3985 __func__, j, h.index, h.value); ··· 4479 4466 { 4480 4467 if (!nested_vmx_check_permission(vcpu)) 4481 4468 return 1; 4469 + 4482 4470 free_nested(vcpu); 4471 + 4472 + /* Process a latched INIT during time CPU was in VMX operation */ 4473 + kvm_make_request(KVM_REQ_EVENT, vcpu); 4474 + 4483 4475 return nested_vmx_succeed(vcpu); 4484 4476 } 4485 4477 ··· 5279 5261 return false; 5280 5262 5281 5263 if (unlikely(vmx->fail)) { 5282 - pr_info_ratelimited("%s failed vm entry %x\n", __func__, 5283 - vmcs_read32(VM_INSTRUCTION_ERROR)); 5264 + trace_kvm_nested_vmenter_failed( 5265 + "hardware VM-instruction error: ", 5266 + vmcs_read32(VM_INSTRUCTION_ERROR)); 5284 5267 return true; 5285 5268 } 5286 5269

+2 -2

arch/x86/kvm/vmx/vmenter.S

··· 94 94 95 95 /** 96 96 * __vmx_vcpu_run - Run a vCPU via a transition to VMX guest mode 97 - * @vmx: struct vcpu_vmx * 97 + * @vmx: struct vcpu_vmx * (forwarded to vmx_update_host_rsp) 98 98 * @regs: unsigned long * (to guest registers) 99 99 * @launched: %true if the VMCS has been launched 100 100 * ··· 151 151 mov VCPU_R14(%_ASM_AX), %r14 152 152 mov VCPU_R15(%_ASM_AX), %r15 153 153 #endif 154 - /* Load guest RAX. This kills the vmx_vcpu pointer! */ 154 + /* Load guest RAX. This kills the @regs pointer! */ 155 155 mov VCPU_RAX(%_ASM_AX), %_ASM_AX 156 156 157 157 /* Enter guest mode */

+48 -46

arch/x86/kvm/vmx/vmx.c

··· 1472 1472 return 0; 1473 1473 } 1474 1474 1475 - 1476 - static void skip_emulated_instruction(struct kvm_vcpu *vcpu) 1475 + /* 1476 + * Returns an int to be compatible with SVM implementation (which can fail). 1477 + * Do not use directly, use skip_emulated_instruction() instead. 1478 + */ 1479 + static int __skip_emulated_instruction(struct kvm_vcpu *vcpu) 1477 1480 { 1478 1481 unsigned long rip; 1479 1482 ··· 1486 1483 1487 1484 /* skipping an emulated instruction also counts */ 1488 1485 vmx_set_interrupt_shadow(vcpu, 0); 1486 + 1487 + return EMULATE_DONE; 1488 + } 1489 + 1490 + static inline void skip_emulated_instruction(struct kvm_vcpu *vcpu) 1491 + { 1492 + (void)__skip_emulated_instruction(vcpu); 1489 1493 } 1490 1494 1491 1495 static void vmx_clear_hlt(struct kvm_vcpu *vcpu) ··· 4036 4026 * of an EPT paging-structure entry is 110b (write/execute). 4037 4027 */ 4038 4028 kvm_mmu_set_mmio_spte_mask(VMX_EPT_RWX_MASK, 4039 - VMX_EPT_MISCONFIG_WX_VALUE); 4029 + VMX_EPT_MISCONFIG_WX_VALUE, 0); 4040 4030 } 4041 4031 4042 4032 #define VMX_XSS_EXIT_BITMAP 0 ··· 4162 4152 4163 4153 vcpu->arch.microcode_version = 0x100000000ULL; 4164 4154 vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); 4155 + vmx->hv_deadline_tsc = -1; 4165 4156 kvm_set_cr8(vcpu, 0); 4166 4157 4167 4158 if (!init_event) { ··· 4867 4856 4868 4857 static int handle_rdmsr(struct kvm_vcpu *vcpu) 4869 4858 { 4870 - u32 ecx = kvm_rcx_read(vcpu); 4871 - struct msr_data msr_info; 4872 - 4873 - msr_info.index = ecx; 4874 - msr_info.host_initiated = false; 4875 - if (vmx_get_msr(vcpu, &msr_info)) { 4876 - trace_kvm_msr_read_ex(ecx); 4877 - kvm_inject_gp(vcpu, 0); 4878 - return 1; 4879 - } 4880 - 4881 - trace_kvm_msr_read(ecx, msr_info.data); 4882 - 4883 - kvm_rax_write(vcpu, msr_info.data & -1u); 4884 - kvm_rdx_write(vcpu, (msr_info.data >> 32) & -1u); 4885 - return kvm_skip_emulated_instruction(vcpu); 4859 + return kvm_emulate_rdmsr(vcpu); 4886 4860 } 4887 4861 4888 4862 static int handle_wrmsr(struct kvm_vcpu *vcpu) 4889 4863 { 4890 - struct msr_data msr; 4891 - u32 ecx = kvm_rcx_read(vcpu); 4892 - u64 data = kvm_read_edx_eax(vcpu); 4893 - 4894 - msr.data = data; 4895 - msr.index = ecx; 4896 - msr.host_initiated = false; 4897 - if (kvm_set_msr(vcpu, &msr) != 0) { 4898 - trace_kvm_msr_write_ex(ecx, data); 4899 - kvm_inject_gp(vcpu, 0); 4900 - return 1; 4901 - } 4902 - 4903 - trace_kvm_msr_write(ecx, data); 4904 - return kvm_skip_emulated_instruction(vcpu); 4864 + return kvm_emulate_wrmsr(vcpu); 4905 4865 } 4906 4866 4907 4867 static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) ··· 5209 5227 static void grow_ple_window(struct kvm_vcpu *vcpu) 5210 5228 { 5211 5229 struct vcpu_vmx *vmx = to_vmx(vcpu); 5212 - int old = vmx->ple_window; 5230 + unsigned int old = vmx->ple_window; 5213 5231 5214 5232 vmx->ple_window = __grow_ple_window(old, ple_window, 5215 5233 ple_window_grow, 5216 5234 ple_window_max); 5217 5235 5218 - if (vmx->ple_window != old) 5236 + if (vmx->ple_window != old) { 5219 5237 vmx->ple_window_dirty = true; 5220 - 5221 - trace_kvm_ple_window_grow(vcpu->vcpu_id, vmx->ple_window, old); 5238 + trace_kvm_ple_window_update(vcpu->vcpu_id, 5239 + vmx->ple_window, old); 5240 + } 5222 5241 } 5223 5242 5224 5243 static void shrink_ple_window(struct kvm_vcpu *vcpu) 5225 5244 { 5226 5245 struct vcpu_vmx *vmx = to_vmx(vcpu); 5227 - int old = vmx->ple_window; 5246 + unsigned int old = vmx->ple_window; 5228 5247 5229 5248 vmx->ple_window = __shrink_ple_window(old, ple_window, 5230 5249 ple_window_shrink, 5231 5250 ple_window); 5232 5251 5233 - if (vmx->ple_window != old) 5252 + if (vmx->ple_window != old) { 5234 5253 vmx->ple_window_dirty = true; 5235 - 5236 - trace_kvm_ple_window_shrink(vcpu->vcpu_id, vmx->ple_window, old); 5254 + trace_kvm_ple_window_update(vcpu->vcpu_id, 5255 + vmx->ple_window, old); 5256 + } 5237 5257 } 5238 5258 5239 5259 /* ··· 5871 5887 else { 5872 5888 vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", 5873 5889 exit_reason); 5874 - kvm_queue_exception(vcpu, UD_VECTOR); 5875 - return 1; 5890 + dump_vmcs(); 5891 + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 5892 + vcpu->run->internal.suberror = 5893 + KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; 5894 + vcpu->run->internal.ndata = 1; 5895 + vcpu->run->internal.data[0] = exit_reason; 5896 + return 0; 5876 5897 } 5877 5898 } 5878 5899 ··· 6603 6614 struct vcpu_vmx *vmx; 6604 6615 unsigned long *msr_bitmap; 6605 6616 int cpu; 6617 + 6618 + BUILD_BUG_ON_MSG(offsetof(struct vcpu_vmx, vcpu) != 0, 6619 + "struct kvm_vcpu must be at offset 0 for arch usercopy region"); 6606 6620 6607 6621 vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); 6608 6622 if (!vmx) ··· 7361 7369 * irqbalance to make the interrupts single-CPU. 7362 7370 * 7363 7371 * We will support full lowest-priority interrupt later. 7372 + * 7373 + * In addition, we can only inject generic interrupts using 7374 + * the PI mechanism, refuse to route others through it. 7364 7375 */ 7365 7376 7366 7377 kvm_set_msi_irq(kvm, e, &irq); 7367 - if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) { 7378 + if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) || 7379 + !kvm_irq_is_postable(&irq)) { 7368 7380 /* 7369 7381 * Make sure the IRTE is in remapped mode if 7370 7382 * we don't handle it in posted mode. ··· 7468 7472 static bool vmx_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) 7469 7473 { 7470 7474 return false; 7475 + } 7476 + 7477 + static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu) 7478 + { 7479 + return to_vmx(vcpu)->nested.vmxon; 7471 7480 } 7472 7481 7473 7482 static __init int hardware_setup(void) ··· 7706 7705 7707 7706 .run = vmx_vcpu_run, 7708 7707 .handle_exit = vmx_handle_exit, 7709 - .skip_emulated_instruction = skip_emulated_instruction, 7708 + .skip_emulated_instruction = __skip_emulated_instruction, 7710 7709 .set_interrupt_shadow = vmx_set_interrupt_shadow, 7711 7710 .get_interrupt_shadow = vmx_get_interrupt_shadow, 7712 7711 .patch_hypercall = vmx_patch_hypercall, ··· 7800 7799 .nested_enable_evmcs = NULL, 7801 7800 .nested_get_evmcs_version = NULL, 7802 7801 .need_emulation_on_page_fault = vmx_need_emulation_on_page_fault, 7802 + .apic_init_signal_blocked = vmx_apic_init_signal_blocked, 7803 7803 }; 7804 7804 7805 7805 static void vmx_cleanup_l1d_flush(void)

+1 -1

arch/x86/kvm/vmx/vmx.h

··· 253 253 struct nested_vmx nested; 254 254 255 255 /* Dynamic PLE window. */ 256 - int ple_window; 256 + unsigned int ple_window; 257 257 bool ple_window_dirty; 258 258 259 259 bool req_immediate_exit;

+129 -70

arch/x86/kvm/x86.c

··· 674 674 data, offset, len, access); 675 675 } 676 676 677 + static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu) 678 + { 679 + return rsvd_bits(cpuid_maxphyaddr(vcpu), 63) | rsvd_bits(5, 8) | 680 + rsvd_bits(1, 2); 681 + } 682 + 677 683 /* 678 - * Load the pae pdptrs. Return true is they are all valid. 684 + * Load the pae pdptrs. Return 1 if they are all valid, 0 otherwise. 679 685 */ 680 686 int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) 681 687 { ··· 700 694 } 701 695 for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { 702 696 if ((pdpte[i] & PT_PRESENT_MASK) && 703 - (pdpte[i] & 704 - vcpu->arch.mmu->guest_rsvd_check.rsvd_bits_mask[0][2])) { 697 + (pdpte[i] & pdptr_rsvd_bits(vcpu))) { 705 698 ret = 0; 706 699 goto out; 707 700 } ··· 1259 1254 if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER) 1260 1255 data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH; 1261 1256 1257 + if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) 1258 + data |= ARCH_CAP_RDCL_NO; 1259 + if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) 1260 + data |= ARCH_CAP_SSB_NO; 1261 + if (!boot_cpu_has_bug(X86_BUG_MDS)) 1262 + data |= ARCH_CAP_MDS_NO; 1263 + 1262 1264 return data; 1263 1265 } 1264 1266 ··· 1363 1351 EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); 1364 1352 1365 1353 /* 1366 - * Writes msr value into into the appropriate "register". 1354 + * Write @data into the MSR specified by @index. Select MSR specific fault 1355 + * checks are bypassed if @host_initiated is %true. 1367 1356 * Returns 0 on success, non-0 otherwise. 1368 1357 * Assumes vcpu_load() was already called. 1369 1358 */ 1370 - int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) 1359 + static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, 1360 + bool host_initiated) 1371 1361 { 1372 - switch (msr->index) { 1362 + struct msr_data msr; 1363 + 1364 + switch (index) { 1373 1365 case MSR_FS_BASE: 1374 1366 case MSR_GS_BASE: 1375 1367 case MSR_KERNEL_GS_BASE: 1376 1368 case MSR_CSTAR: 1377 1369 case MSR_LSTAR: 1378 - if (is_noncanonical_address(msr->data, vcpu)) 1370 + if (is_noncanonical_address(data, vcpu)) 1379 1371 return 1; 1380 1372 break; 1381 1373 case MSR_IA32_SYSENTER_EIP: ··· 1396 1380 * value, and that something deterministic happens if the guest 1397 1381 * invokes 64-bit SYSENTER. 1398 1382 */ 1399 - msr->data = get_canonical(msr->data, vcpu_virt_addr_bits(vcpu)); 1383 + data = get_canonical(data, vcpu_virt_addr_bits(vcpu)); 1400 1384 } 1401 - return kvm_x86_ops->set_msr(vcpu, msr); 1385 + 1386 + msr.data = data; 1387 + msr.index = index; 1388 + msr.host_initiated = host_initiated; 1389 + 1390 + return kvm_x86_ops->set_msr(vcpu, &msr); 1391 + } 1392 + 1393 + /* 1394 + * Read the MSR specified by @index into @data. Select MSR specific fault 1395 + * checks are bypassed if @host_initiated is %true. 1396 + * Returns 0 on success, non-0 otherwise. 1397 + * Assumes vcpu_load() was already called. 1398 + */ 1399 + static int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, 1400 + bool host_initiated) 1401 + { 1402 + struct msr_data msr; 1403 + int ret; 1404 + 1405 + msr.index = index; 1406 + msr.host_initiated = host_initiated; 1407 + 1408 + ret = kvm_x86_ops->get_msr(vcpu, &msr); 1409 + if (!ret) 1410 + *data = msr.data; 1411 + return ret; 1412 + } 1413 + 1414 + int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data) 1415 + { 1416 + return __kvm_get_msr(vcpu, index, data, false); 1417 + } 1418 + EXPORT_SYMBOL_GPL(kvm_get_msr); 1419 + 1420 + int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) 1421 + { 1422 + return __kvm_set_msr(vcpu, index, data, false); 1402 1423 } 1403 1424 EXPORT_SYMBOL_GPL(kvm_set_msr); 1425 + 1426 + int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu) 1427 + { 1428 + u32 ecx = kvm_rcx_read(vcpu); 1429 + u64 data; 1430 + 1431 + if (kvm_get_msr(vcpu, ecx, &data)) { 1432 + trace_kvm_msr_read_ex(ecx); 1433 + kvm_inject_gp(vcpu, 0); 1434 + return 1; 1435 + } 1436 + 1437 + trace_kvm_msr_read(ecx, data); 1438 + 1439 + kvm_rax_write(vcpu, data & -1u); 1440 + kvm_rdx_write(vcpu, (data >> 32) & -1u); 1441 + return kvm_skip_emulated_instruction(vcpu); 1442 + } 1443 + EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr); 1444 + 1445 + int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) 1446 + { 1447 + u32 ecx = kvm_rcx_read(vcpu); 1448 + u64 data = kvm_read_edx_eax(vcpu); 1449 + 1450 + if (kvm_set_msr(vcpu, ecx, data)) { 1451 + trace_kvm_msr_write_ex(ecx, data); 1452 + kvm_inject_gp(vcpu, 0); 1453 + return 1; 1454 + } 1455 + 1456 + trace_kvm_msr_write(ecx, data); 1457 + return kvm_skip_emulated_instruction(vcpu); 1458 + } 1459 + EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr); 1404 1460 1405 1461 /* 1406 1462 * Adapt set_msr() to msr_io()'s calling convention 1407 1463 */ 1408 1464 static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) 1409 1465 { 1410 - struct msr_data msr; 1411 - int r; 1412 - 1413 - msr.index = index; 1414 - msr.host_initiated = true; 1415 - r = kvm_get_msr(vcpu, &msr); 1416 - if (r) 1417 - return r; 1418 - 1419 - *data = msr.data; 1420 - return 0; 1466 + return __kvm_get_msr(vcpu, index, data, true); 1421 1467 } 1422 1468 1423 1469 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) 1424 1470 { 1425 - struct msr_data msr; 1426 - 1427 - msr.data = *data; 1428 - msr.index = index; 1429 - msr.host_initiated = true; 1430 - return kvm_set_msr(vcpu, &msr); 1471 + return __kvm_set_msr(vcpu, index, *data, true); 1431 1472 } 1432 1473 1433 1474 #ifdef CONFIG_X86_64 ··· 2525 2452 * Doing a TLB flush here, on the guest's behalf, can avoid 2526 2453 * expensive IPIs. 2527 2454 */ 2455 + trace_kvm_pv_tlb_flush(vcpu->vcpu_id, 2456 + vcpu->arch.st.steal.preempted & KVM_VCPU_FLUSH_TLB); 2528 2457 if (xchg(&vcpu->arch.st.steal.preempted, 0) & KVM_VCPU_FLUSH_TLB) 2529 2458 kvm_vcpu_flush_tlb(vcpu, false); 2530 2459 ··· 2822 2747 return 0; 2823 2748 } 2824 2749 EXPORT_SYMBOL_GPL(kvm_set_msr_common); 2825 - 2826 - 2827 - /* 2828 - * Reads an msr value (of 'msr_index') into 'pdata'. 2829 - * Returns 0 on success, non-0 otherwise. 2830 - * Assumes vcpu_load() was already called. 2831 - */ 2832 - int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) 2833 - { 2834 - return kvm_x86_ops->get_msr(vcpu, msr); 2835 - } 2836 - EXPORT_SYMBOL_GPL(kvm_get_msr); 2837 2750 2838 2751 static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) 2839 2752 { ··· 3569 3506 for (bank = 0; bank < bank_num; bank++) 3570 3507 vcpu->arch.mce_banks[bank*4] = ~(u64)0; 3571 3508 3572 - if (kvm_x86_ops->setup_mce) 3573 - kvm_x86_ops->setup_mce(vcpu); 3509 + kvm_x86_ops->setup_mce(vcpu); 3574 3510 out: 3575 3511 return r; 3576 3512 } ··· 5439 5377 */ 5440 5378 if (vcpu_match_mmio_gva(vcpu, gva) 5441 5379 && !permission_fault(vcpu, vcpu->arch.walk_mmu, 5442 - vcpu->arch.access, 0, access)) { 5380 + vcpu->arch.mmio_access, 0, access)) { 5443 5381 *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | 5444 5382 (gva & (PAGE_SIZE - 1)); 5445 5383 trace_vcpu_match_mmio(gva, *gpa, write, false); ··· 6033 5971 static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, 6034 5972 u32 msr_index, u64 *pdata) 6035 5973 { 6036 - struct msr_data msr; 6037 - int r; 6038 - 6039 - msr.index = msr_index; 6040 - msr.host_initiated = false; 6041 - r = kvm_get_msr(emul_to_vcpu(ctxt), &msr); 6042 - if (r) 6043 - return r; 6044 - 6045 - *pdata = msr.data; 6046 - return 0; 5974 + return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata); 6047 5975 } 6048 5976 6049 5977 static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, 6050 5978 u32 msr_index, u64 data) 6051 5979 { 6052 - struct msr_data msr; 6053 - 6054 - msr.data = data; 6055 - msr.index = msr_index; 6056 - msr.host_initiated = false; 6057 - return kvm_set_msr(emul_to_vcpu(ctxt), &msr); 5980 + return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data); 6058 5981 } 6059 5982 6060 5983 static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt) ··· 6122 6075 kvm_smm_changed(emul_to_vcpu(ctxt)); 6123 6076 } 6124 6077 6078 + static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr) 6079 + { 6080 + return __kvm_set_xcr(emul_to_vcpu(ctxt), index, xcr); 6081 + } 6082 + 6125 6083 static const struct x86_emulate_ops emulate_ops = { 6126 6084 .read_gpr = emulator_read_gpr, 6127 6085 .write_gpr = emulator_write_gpr, ··· 6168 6116 .set_hflags = emulator_set_hflags, 6169 6117 .pre_leave_smm = emulator_pre_leave_smm, 6170 6118 .post_leave_smm = emulator_post_leave_smm, 6119 + .set_xcr = emulator_set_xcr, 6171 6120 }; 6172 6121 6173 6122 static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) ··· 6443 6390 int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) 6444 6391 { 6445 6392 unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); 6446 - int r = EMULATE_DONE; 6393 + int r; 6447 6394 6448 - kvm_x86_ops->skip_emulated_instruction(vcpu); 6395 + r = kvm_x86_ops->skip_emulated_instruction(vcpu); 6396 + if (unlikely(r != EMULATE_DONE)) 6397 + return 0; 6449 6398 6450 6399 /* 6451 6400 * rflags is the old, "raw" value of the flags. The new value has ··· 6583 6528 if (reexecute_instruction(vcpu, cr2, write_fault_to_spt, 6584 6529 emulation_type)) 6585 6530 return EMULATE_DONE; 6586 - if (ctxt->have_exception && inject_emulated_exception(vcpu)) 6531 + if (ctxt->have_exception) { 6532 + /* 6533 + * #UD should result in just EMULATION_FAILED, and trap-like 6534 + * exception should not be encountered during decode. 6535 + */ 6536 + WARN_ON_ONCE(ctxt->exception.vector == UD_VECTOR || 6537 + exception_type(ctxt->exception.vector) == EXCPT_TRAP); 6538 + inject_emulated_exception(vcpu); 6587 6539 return EMULATE_DONE; 6540 + } 6588 6541 if (emulation_type & EMULTYPE_SKIP) 6589 6542 return EMULATE_FAIL; 6590 6543 return handle_emulation_failure(vcpu, emulation_type); ··· 6607 6544 kvm_rip_write(vcpu, ctxt->_eip); 6608 6545 if (ctxt->eflags & X86_EFLAGS_RF) 6609 6546 kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF); 6547 + kvm_x86_ops->set_interrupt_shadow(vcpu, 0); 6610 6548 return EMULATE_DONE; 6611 6549 } 6612 6550 ··· 9386 9322 kvm_page_track_init(kvm); 9387 9323 kvm_mmu_init_vm(kvm); 9388 9324 9389 - if (kvm_x86_ops->vm_init) 9390 - return kvm_x86_ops->vm_init(kvm); 9391 - 9392 - return 0; 9325 + return kvm_x86_ops->vm_init(kvm); 9393 9326 } 9394 9327 9395 9328 static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) ··· 10078 10017 10079 10018 bool kvm_arch_has_irq_bypass(void) 10080 10019 { 10081 - return kvm_x86_ops->update_pi_irte != NULL; 10020 + return true; 10082 10021 } 10083 10022 10084 10023 int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, ··· 10118 10057 int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, 10119 10058 uint32_t guest_irq, bool set) 10120 10059 { 10121 - if (!kvm_x86_ops->update_pi_irte) 10122 - return -EINVAL; 10123 - 10124 10060 return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set); 10125 10061 } 10126 10062 ··· 10144 10086 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit); 10145 10087 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject); 10146 10088 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit); 10089 + EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmenter_failed); 10147 10090 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga); 10148 10091 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit); 10149 10092 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts); 10150 10093 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset); 10151 - EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window); 10094 + EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window_update); 10152 10095 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full); 10153 10096 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update); 10154 10097 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);

+1 -1

arch/x86/kvm/x86.h

··· 196 196 * actually a nGPA. 197 197 */ 198 198 vcpu->arch.mmio_gva = mmu_is_nested(vcpu) ? 0 : gva & PAGE_MASK; 199 - vcpu->arch.access = access; 199 + vcpu->arch.mmio_access = access; 200 200 vcpu->arch.mmio_gfn = gfn; 201 201 vcpu->arch.mmio_gen = gen; 202 202 }

+3 -1

include/kvm/arm_vgic.h

··· 249 249 struct list_head lpi_list_head; 250 250 int lpi_list_count; 251 251 252 + /* LPI translation cache */ 253 + struct list_head lpi_translation_cache; 254 + 252 255 /* used by vgic-debug */ 253 256 struct vgic_state_iter *iter; 254 257 ··· 314 311 * parts of the redistributor. 315 312 */ 316 313 struct vgic_io_device rd_iodev; 317 - struct vgic_io_device sgi_iodev; 318 314 struct vgic_redist_region *rdreg; 319 315 320 316 /* Contains the attributes and gpa of the LPI pending tables. */

+3

include/uapi/linux/kvm.h

··· 243 243 #define KVM_INTERNAL_ERROR_SIMUL_EX 2 244 244 /* Encounter unexpected vm-exit due to delivery event. */ 245 245 #define KVM_INTERNAL_ERROR_DELIVERY_EV 3 246 + /* Encounter unexpected vm-exit reason */ 247 + #define KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON 4 246 248 247 249 /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */ 248 250 struct kvm_run { ··· 998 996 #define KVM_CAP_ARM_PTRAUTH_ADDRESS 171 999 997 #define KVM_CAP_ARM_PTRAUTH_GENERIC 172 1000 998 #define KVM_CAP_PMU_EVENT_FILTER 173 999 + #define KVM_CAP_ARM_IRQ_LINE_LAYOUT_2 174 1001 1000 1002 1001 #ifdef KVM_CAP_IRQ_ROUTING 1003 1002

+6 -4

tools/testing/selftests/kvm/Makefile

··· 7 7 KSFT_KHDR_INSTALL := 1 8 8 UNAME_M := $(shell uname -m) 9 9 10 - LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/ucall.c lib/sparsebit.c 11 - LIBKVM_x86_64 = lib/x86_64/processor.c lib/x86_64/vmx.c 12 - LIBKVM_aarch64 = lib/aarch64/processor.c 13 - LIBKVM_s390x = lib/s390x/processor.c 10 + LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/sparsebit.c 11 + LIBKVM_x86_64 = lib/x86_64/processor.c lib/x86_64/vmx.c lib/x86_64/ucall.c 12 + LIBKVM_aarch64 = lib/aarch64/processor.c lib/aarch64/ucall.c 13 + LIBKVM_s390x = lib/s390x/processor.c lib/s390x/ucall.c 14 14 15 15 TEST_GEN_PROGS_x86_64 = x86_64/cr4_cpuid_sync_test 16 16 TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test ··· 32 32 TEST_GEN_PROGS_aarch64 += dirty_log_test 33 33 TEST_GEN_PROGS_aarch64 += kvm_create_max_vcpus 34 34 35 + TEST_GEN_PROGS_s390x = s390x/memop 35 36 TEST_GEN_PROGS_s390x += s390x/sync_regs_test 37 + TEST_GEN_PROGS_s390x += dirty_log_test 36 38 TEST_GEN_PROGS_s390x += kvm_create_max_vcpus 37 39 38 40 TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(UNAME_M))

+53 -8

tools/testing/selftests/kvm/dirty_log_test.c

··· 26 26 /* The memory slot index to track dirty pages */ 27 27 #define TEST_MEM_SLOT_INDEX 1 28 28 29 - /* Default guest test memory offset, 1G */ 30 - #define DEFAULT_GUEST_TEST_MEM 0x40000000 29 + /* Default guest test virtual memory offset */ 30 + #define DEFAULT_GUEST_TEST_MEM 0xc0000000 31 31 32 32 /* How many pages to dirty for each guest loop */ 33 33 #define TEST_PAGES_PER_LOOP 1024 ··· 37 37 38 38 /* Interval for each host loop (ms) */ 39 39 #define TEST_HOST_LOOP_INTERVAL 10UL 40 + 41 + /* Dirty bitmaps are always little endian, so we need to swap on big endian */ 42 + #if defined(__s390x__) 43 + # define BITOP_LE_SWIZZLE ((BITS_PER_LONG-1) & ~0x7) 44 + # define test_bit_le(nr, addr) \ 45 + test_bit((nr) ^ BITOP_LE_SWIZZLE, addr) 46 + # define set_bit_le(nr, addr) \ 47 + set_bit((nr) ^ BITOP_LE_SWIZZLE, addr) 48 + # define clear_bit_le(nr, addr) \ 49 + clear_bit((nr) ^ BITOP_LE_SWIZZLE, addr) 50 + # define test_and_set_bit_le(nr, addr) \ 51 + test_and_set_bit((nr) ^ BITOP_LE_SWIZZLE, addr) 52 + # define test_and_clear_bit_le(nr, addr) \ 53 + test_and_clear_bit((nr) ^ BITOP_LE_SWIZZLE, addr) 54 + #else 55 + # define test_bit_le test_bit 56 + # define set_bit_le set_bit 57 + # define clear_bit_le clear_bit 58 + # define test_and_set_bit_le test_and_set_bit 59 + # define test_and_clear_bit_le test_and_clear_bit 60 + #endif 40 61 41 62 /* 42 63 * Guest/Host shared variables. Ensure addr_gva2hva() and/or ··· 90 69 */ 91 70 static void guest_code(void) 92 71 { 72 + uint64_t addr; 93 73 int i; 74 + 75 + /* 76 + * On s390x, all pages of a 1M segment are initially marked as dirty 77 + * when a page of the segment is written to for the very first time. 78 + * To compensate this specialty in this test, we need to touch all 79 + * pages during the first iteration. 80 + */ 81 + for (i = 0; i < guest_num_pages; i++) { 82 + addr = guest_test_virt_mem + i * guest_page_size; 83 + *(uint64_t *)addr = READ_ONCE(iteration); 84 + } 94 85 95 86 while (true) { 96 87 for (i = 0; i < TEST_PAGES_PER_LOOP; i++) { 97 - uint64_t addr = guest_test_virt_mem; 88 + addr = guest_test_virt_mem; 98 89 addr += (READ_ONCE(random_array[i]) % guest_num_pages) 99 90 * guest_page_size; 100 91 addr &= ~(host_page_size - 1); ··· 191 158 value_ptr = host_test_mem + page * host_page_size; 192 159 193 160 /* If this is a special page that we were tracking... */ 194 - if (test_and_clear_bit(page, host_bmap_track)) { 161 + if (test_and_clear_bit_le(page, host_bmap_track)) { 195 162 host_track_next_count++; 196 - TEST_ASSERT(test_bit(page, bmap), 163 + TEST_ASSERT(test_bit_le(page, bmap), 197 164 "Page %"PRIu64" should have its dirty bit " 198 165 "set in this iteration but it is missing", 199 166 page); 200 167 } 201 168 202 - if (test_bit(page, bmap)) { 169 + if (test_bit_le(page, bmap)) { 203 170 host_dirty_count++; 204 171 /* 205 172 * If the bit is set, the value written onto ··· 242 209 * should report its dirtyness in the 243 210 * next run 244 211 */ 245 - set_bit(page, host_bmap_track); 212 + set_bit_le(page, host_bmap_track); 246 213 } 247 214 } 248 215 } ··· 326 293 * case where the size is not aligned to 64 pages. 327 294 */ 328 295 guest_num_pages = (1ul << (30 - guest_page_shift)) + 16; 296 + #ifdef __s390x__ 297 + /* Round up to multiple of 1M (segment size) */ 298 + guest_num_pages = (guest_num_pages + 0xff) & ~0xffUL; 299 + #endif 329 300 host_page_size = getpagesize(); 330 301 host_num_pages = (guest_num_pages * guest_page_size) / host_page_size + 331 302 !!((guest_num_pages * guest_page_size) % host_page_size); ··· 340 303 } else { 341 304 guest_test_phys_mem = phys_offset; 342 305 } 306 + 307 + #ifdef __s390x__ 308 + /* Align to 1M (segment size) */ 309 + guest_test_phys_mem &= ~((1 << 20) - 1); 310 + #endif 343 311 344 312 DEBUG("guest physical test memory offset: 0x%lx\n", guest_test_phys_mem); 345 313 ··· 379 337 vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); 380 338 #endif 381 339 #ifdef __aarch64__ 382 - ucall_init(vm, UCALL_MMIO, NULL); 340 + ucall_init(vm, NULL); 383 341 #endif 384 342 385 343 /* Export the shared variables to the guest */ ··· 495 453 vm_guest_mode_params_init(VM_MODE_P48V48_4K, true, true); 496 454 vm_guest_mode_params_init(VM_MODE_P48V48_64K, true, true); 497 455 } 456 + #endif 457 + #ifdef __s390x__ 458 + vm_guest_mode_params_init(VM_MODE_P40V48_4K, true, true); 498 459 #endif 499 460 500 461 while ((opt = getopt(argc, argv, "hi:I:p:m:")) != -1) {

+1 -7

tools/testing/selftests/kvm/include/kvm_util.h

··· 165 165 memcpy(&(g), _p, sizeof(g)); \ 166 166 }) 167 167 168 - /* ucall implementation types */ 169 - typedef enum { 170 - UCALL_PIO, 171 - UCALL_MMIO, 172 - } ucall_type_t; 173 - 174 168 /* Common ucalls */ 175 169 enum { 176 170 UCALL_NONE, ··· 180 186 uint64_t args[UCALL_MAX_ARGS]; 181 187 }; 182 188 183 - void ucall_init(struct kvm_vm *vm, ucall_type_t type, void *arg); 189 + void ucall_init(struct kvm_vm *vm, void *arg); 184 190 void ucall_uninit(struct kvm_vm *vm); 185 191 void ucall(uint64_t cmd, int nargs, ...); 186 192 uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc);

+112

tools/testing/selftests/kvm/lib/aarch64/ucall.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * ucall support. A ucall is a "hypercall to userspace". 4 + * 5 + * Copyright (C) 2018, Red Hat, Inc. 6 + */ 7 + #include "kvm_util.h" 8 + #include "../kvm_util_internal.h" 9 + 10 + static vm_vaddr_t *ucall_exit_mmio_addr; 11 + 12 + static bool ucall_mmio_init(struct kvm_vm *vm, vm_paddr_t gpa) 13 + { 14 + if (kvm_userspace_memory_region_find(vm, gpa, gpa + 1)) 15 + return false; 16 + 17 + virt_pg_map(vm, gpa, gpa, 0); 18 + 19 + ucall_exit_mmio_addr = (vm_vaddr_t *)gpa; 20 + sync_global_to_guest(vm, ucall_exit_mmio_addr); 21 + 22 + return true; 23 + } 24 + 25 + void ucall_init(struct kvm_vm *vm, void *arg) 26 + { 27 + vm_paddr_t gpa, start, end, step, offset; 28 + unsigned int bits; 29 + bool ret; 30 + 31 + if (arg) { 32 + gpa = (vm_paddr_t)arg; 33 + ret = ucall_mmio_init(vm, gpa); 34 + TEST_ASSERT(ret, "Can't set ucall mmio address to %lx", gpa); 35 + return; 36 + } 37 + 38 + /* 39 + * Find an address within the allowed physical and virtual address 40 + * spaces, that does _not_ have a KVM memory region associated with 41 + * it. Identity mapping an address like this allows the guest to 42 + * access it, but as KVM doesn't know what to do with it, it 43 + * will assume it's something userspace handles and exit with 44 + * KVM_EXIT_MMIO. Well, at least that's how it works for AArch64. 45 + * Here we start with a guess that the addresses around 5/8th 46 + * of the allowed space are unmapped and then work both down and 47 + * up from there in 1/16th allowed space sized steps. 48 + * 49 + * Note, we need to use VA-bits - 1 when calculating the allowed 50 + * virtual address space for an identity mapping because the upper 51 + * half of the virtual address space is the two's complement of the 52 + * lower and won't match physical addresses. 53 + */ 54 + bits = vm->va_bits - 1; 55 + bits = vm->pa_bits < bits ? vm->pa_bits : bits; 56 + end = 1ul << bits; 57 + start = end * 5 / 8; 58 + step = end / 16; 59 + for (offset = 0; offset < end - start; offset += step) { 60 + if (ucall_mmio_init(vm, start - offset)) 61 + return; 62 + if (ucall_mmio_init(vm, start + offset)) 63 + return; 64 + } 65 + TEST_ASSERT(false, "Can't find a ucall mmio address"); 66 + } 67 + 68 + void ucall_uninit(struct kvm_vm *vm) 69 + { 70 + ucall_exit_mmio_addr = 0; 71 + sync_global_to_guest(vm, ucall_exit_mmio_addr); 72 + } 73 + 74 + void ucall(uint64_t cmd, int nargs, ...) 75 + { 76 + struct ucall uc = { 77 + .cmd = cmd, 78 + }; 79 + va_list va; 80 + int i; 81 + 82 + nargs = nargs <= UCALL_MAX_ARGS ? nargs : UCALL_MAX_ARGS; 83 + 84 + va_start(va, nargs); 85 + for (i = 0; i < nargs; ++i) 86 + uc.args[i] = va_arg(va, uint64_t); 87 + va_end(va); 88 + 89 + *ucall_exit_mmio_addr = (vm_vaddr_t)&uc; 90 + } 91 + 92 + uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc) 93 + { 94 + struct kvm_run *run = vcpu_state(vm, vcpu_id); 95 + struct ucall ucall = {}; 96 + 97 + if (run->exit_reason == KVM_EXIT_MMIO && 98 + run->mmio.phys_addr == (uint64_t)ucall_exit_mmio_addr) { 99 + vm_vaddr_t gva; 100 + 101 + TEST_ASSERT(run->mmio.is_write && run->mmio.len == 8, 102 + "Unexpected ucall exit mmio address access"); 103 + memcpy(&gva, run->mmio.data, sizeof(gva)); 104 + memcpy(&ucall, addr_gva2hva(vm, gva), sizeof(ucall)); 105 + 106 + vcpu_run_complete_io(vm, vcpu_id); 107 + if (uc) 108 + memcpy(uc, &ucall, sizeof(ucall)); 109 + } 110 + 111 + return ucall.cmd; 112 + }

+56

tools/testing/selftests/kvm/lib/s390x/ucall.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * ucall support. A ucall is a "hypercall to userspace". 4 + * 5 + * Copyright (C) 2019 Red Hat, Inc. 6 + */ 7 + #include "kvm_util.h" 8 + 9 + void ucall_init(struct kvm_vm *vm, void *arg) 10 + { 11 + } 12 + 13 + void ucall_uninit(struct kvm_vm *vm) 14 + { 15 + } 16 + 17 + void ucall(uint64_t cmd, int nargs, ...) 18 + { 19 + struct ucall uc = { 20 + .cmd = cmd, 21 + }; 22 + va_list va; 23 + int i; 24 + 25 + nargs = nargs <= UCALL_MAX_ARGS ? nargs : UCALL_MAX_ARGS; 26 + 27 + va_start(va, nargs); 28 + for (i = 0; i < nargs; ++i) 29 + uc.args[i] = va_arg(va, uint64_t); 30 + va_end(va); 31 + 32 + /* Exit via DIAGNOSE 0x501 (normally used for breakpoints) */ 33 + asm volatile ("diag 0,%0,0x501" : : "a"(&uc) : "memory"); 34 + } 35 + 36 + uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc) 37 + { 38 + struct kvm_run *run = vcpu_state(vm, vcpu_id); 39 + struct ucall ucall = {}; 40 + 41 + if (run->exit_reason == KVM_EXIT_S390_SIEIC && 42 + run->s390_sieic.icptcode == 4 && 43 + (run->s390_sieic.ipa >> 8) == 0x83 && /* 0x83 means DIAGNOSE */ 44 + (run->s390_sieic.ipb >> 16) == 0x501) { 45 + int reg = run->s390_sieic.ipa & 0xf; 46 + 47 + memcpy(&ucall, addr_gva2hva(vm, run->s.regs.gprs[reg]), 48 + sizeof(ucall)); 49 + 50 + vcpu_run_complete_io(vm, vcpu_id); 51 + if (uc) 52 + memcpy(uc, &ucall, sizeof(ucall)); 53 + } 54 + 55 + return ucall.cmd; 56 + }

-157

tools/testing/selftests/kvm/lib/ucall.c

··· 1 - // SPDX-License-Identifier: GPL-2.0 2 - /* 3 - * ucall support. A ucall is a "hypercall to userspace". 4 - * 5 - * Copyright (C) 2018, Red Hat, Inc. 6 - */ 7 - #include "kvm_util.h" 8 - #include "kvm_util_internal.h" 9 - 10 - #define UCALL_PIO_PORT ((uint16_t)0x1000) 11 - 12 - static ucall_type_t ucall_type; 13 - static vm_vaddr_t *ucall_exit_mmio_addr; 14 - 15 - static bool ucall_mmio_init(struct kvm_vm *vm, vm_paddr_t gpa) 16 - { 17 - if (kvm_userspace_memory_region_find(vm, gpa, gpa + 1)) 18 - return false; 19 - 20 - virt_pg_map(vm, gpa, gpa, 0); 21 - 22 - ucall_exit_mmio_addr = (vm_vaddr_t *)gpa; 23 - sync_global_to_guest(vm, ucall_exit_mmio_addr); 24 - 25 - return true; 26 - } 27 - 28 - void ucall_init(struct kvm_vm *vm, ucall_type_t type, void *arg) 29 - { 30 - ucall_type = type; 31 - sync_global_to_guest(vm, ucall_type); 32 - 33 - if (type == UCALL_PIO) 34 - return; 35 - 36 - if (type == UCALL_MMIO) { 37 - vm_paddr_t gpa, start, end, step, offset; 38 - unsigned bits; 39 - bool ret; 40 - 41 - if (arg) { 42 - gpa = (vm_paddr_t)arg; 43 - ret = ucall_mmio_init(vm, gpa); 44 - TEST_ASSERT(ret, "Can't set ucall mmio address to %lx", gpa); 45 - return; 46 - } 47 - 48 - /* 49 - * Find an address within the allowed physical and virtual address 50 - * spaces, that does _not_ have a KVM memory region associated with 51 - * it. Identity mapping an address like this allows the guest to 52 - * access it, but as KVM doesn't know what to do with it, it 53 - * will assume it's something userspace handles and exit with 54 - * KVM_EXIT_MMIO. Well, at least that's how it works for AArch64. 55 - * Here we start with a guess that the addresses around 5/8th 56 - * of the allowed space are unmapped and then work both down and 57 - * up from there in 1/16th allowed space sized steps. 58 - * 59 - * Note, we need to use VA-bits - 1 when calculating the allowed 60 - * virtual address space for an identity mapping because the upper 61 - * half of the virtual address space is the two's complement of the 62 - * lower and won't match physical addresses. 63 - */ 64 - bits = vm->va_bits - 1; 65 - bits = vm->pa_bits < bits ? vm->pa_bits : bits; 66 - end = 1ul << bits; 67 - start = end * 5 / 8; 68 - step = end / 16; 69 - for (offset = 0; offset < end - start; offset += step) { 70 - if (ucall_mmio_init(vm, start - offset)) 71 - return; 72 - if (ucall_mmio_init(vm, start + offset)) 73 - return; 74 - } 75 - TEST_ASSERT(false, "Can't find a ucall mmio address"); 76 - } 77 - } 78 - 79 - void ucall_uninit(struct kvm_vm *vm) 80 - { 81 - ucall_type = 0; 82 - sync_global_to_guest(vm, ucall_type); 83 - ucall_exit_mmio_addr = 0; 84 - sync_global_to_guest(vm, ucall_exit_mmio_addr); 85 - } 86 - 87 - static void ucall_pio_exit(struct ucall *uc) 88 - { 89 - #ifdef __x86_64__ 90 - asm volatile("in %[port], %%al" 91 - : : [port] "d" (UCALL_PIO_PORT), "D" (uc) : "rax"); 92 - #endif 93 - } 94 - 95 - static void ucall_mmio_exit(struct ucall *uc) 96 - { 97 - *ucall_exit_mmio_addr = (vm_vaddr_t)uc; 98 - } 99 - 100 - void ucall(uint64_t cmd, int nargs, ...) 101 - { 102 - struct ucall uc = { 103 - .cmd = cmd, 104 - }; 105 - va_list va; 106 - int i; 107 - 108 - nargs = nargs <= UCALL_MAX_ARGS ? nargs : UCALL_MAX_ARGS; 109 - 110 - va_start(va, nargs); 111 - for (i = 0; i < nargs; ++i) 112 - uc.args[i] = va_arg(va, uint64_t); 113 - va_end(va); 114 - 115 - switch (ucall_type) { 116 - case UCALL_PIO: 117 - ucall_pio_exit(&uc); 118 - break; 119 - case UCALL_MMIO: 120 - ucall_mmio_exit(&uc); 121 - break; 122 - }; 123 - } 124 - 125 - uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc) 126 - { 127 - struct kvm_run *run = vcpu_state(vm, vcpu_id); 128 - struct ucall ucall = {}; 129 - bool got_ucall = false; 130 - 131 - #ifdef __x86_64__ 132 - if (ucall_type == UCALL_PIO && run->exit_reason == KVM_EXIT_IO && 133 - run->io.port == UCALL_PIO_PORT) { 134 - struct kvm_regs regs; 135 - vcpu_regs_get(vm, vcpu_id, &regs); 136 - memcpy(&ucall, addr_gva2hva(vm, (vm_vaddr_t)regs.rdi), sizeof(ucall)); 137 - got_ucall = true; 138 - } 139 - #endif 140 - if (ucall_type == UCALL_MMIO && run->exit_reason == KVM_EXIT_MMIO && 141 - run->mmio.phys_addr == (uint64_t)ucall_exit_mmio_addr) { 142 - vm_vaddr_t gva; 143 - TEST_ASSERT(run->mmio.is_write && run->mmio.len == 8, 144 - "Unexpected ucall exit mmio address access"); 145 - memcpy(&gva, run->mmio.data, sizeof(gva)); 146 - memcpy(&ucall, addr_gva2hva(vm, gva), sizeof(ucall)); 147 - got_ucall = true; 148 - } 149 - 150 - if (got_ucall) { 151 - vcpu_run_complete_io(vm, vcpu_id); 152 - if (uc) 153 - memcpy(uc, &ucall, sizeof(ucall)); 154 - } 155 - 156 - return ucall.cmd; 157 - }

+56

tools/testing/selftests/kvm/lib/x86_64/ucall.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * ucall support. A ucall is a "hypercall to userspace". 4 + * 5 + * Copyright (C) 2018, Red Hat, Inc. 6 + */ 7 + #include "kvm_util.h" 8 + 9 + #define UCALL_PIO_PORT ((uint16_t)0x1000) 10 + 11 + void ucall_init(struct kvm_vm *vm, void *arg) 12 + { 13 + } 14 + 15 + void ucall_uninit(struct kvm_vm *vm) 16 + { 17 + } 18 + 19 + void ucall(uint64_t cmd, int nargs, ...) 20 + { 21 + struct ucall uc = { 22 + .cmd = cmd, 23 + }; 24 + va_list va; 25 + int i; 26 + 27 + nargs = nargs <= UCALL_MAX_ARGS ? nargs : UCALL_MAX_ARGS; 28 + 29 + va_start(va, nargs); 30 + for (i = 0; i < nargs; ++i) 31 + uc.args[i] = va_arg(va, uint64_t); 32 + va_end(va); 33 + 34 + asm volatile("in %[port], %%al" 35 + : : [port] "d" (UCALL_PIO_PORT), "D" (&uc) : "rax"); 36 + } 37 + 38 + uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc) 39 + { 40 + struct kvm_run *run = vcpu_state(vm, vcpu_id); 41 + struct ucall ucall = {}; 42 + 43 + if (run->exit_reason == KVM_EXIT_IO && run->io.port == UCALL_PIO_PORT) { 44 + struct kvm_regs regs; 45 + 46 + vcpu_regs_get(vm, vcpu_id, &regs); 47 + memcpy(&ucall, addr_gva2hva(vm, (vm_vaddr_t)regs.rdi), 48 + sizeof(ucall)); 49 + 50 + vcpu_run_complete_io(vm, vcpu_id); 51 + if (uc) 52 + memcpy(uc, &ucall, sizeof(ucall)); 53 + } 54 + 55 + return ucall.cmd; 56 + }

+166

tools/testing/selftests/kvm/s390x/memop.c

··· 1 + // SPDX-License-Identifier: GPL-2.0-or-later 2 + /* 3 + * Test for s390x KVM_S390_MEM_OP 4 + * 5 + * Copyright (C) 2019, Red Hat, Inc. 6 + */ 7 + 8 + #include <stdio.h> 9 + #include <stdlib.h> 10 + #include <string.h> 11 + #include <sys/ioctl.h> 12 + 13 + #include "test_util.h" 14 + #include "kvm_util.h" 15 + 16 + #define VCPU_ID 1 17 + 18 + static uint8_t mem1[65536]; 19 + static uint8_t mem2[65536]; 20 + 21 + static void guest_code(void) 22 + { 23 + int i; 24 + 25 + for (;;) { 26 + for (i = 0; i < sizeof(mem2); i++) 27 + mem2[i] = mem1[i]; 28 + GUEST_SYNC(0); 29 + } 30 + } 31 + 32 + int main(int argc, char *argv[]) 33 + { 34 + struct kvm_vm *vm; 35 + struct kvm_run *run; 36 + struct kvm_s390_mem_op ksmo; 37 + int rv, i, maxsize; 38 + 39 + setbuf(stdout, NULL); /* Tell stdout not to buffer its content */ 40 + 41 + maxsize = kvm_check_cap(KVM_CAP_S390_MEM_OP); 42 + if (!maxsize) { 43 + fprintf(stderr, "CAP_S390_MEM_OP not supported -> skip test\n"); 44 + exit(KSFT_SKIP); 45 + } 46 + if (maxsize > sizeof(mem1)) 47 + maxsize = sizeof(mem1); 48 + 49 + /* Create VM */ 50 + vm = vm_create_default(VCPU_ID, 0, guest_code); 51 + run = vcpu_state(vm, VCPU_ID); 52 + 53 + for (i = 0; i < sizeof(mem1); i++) 54 + mem1[i] = i * i + i; 55 + 56 + /* Set the first array */ 57 + ksmo.gaddr = addr_gva2gpa(vm, (uintptr_t)mem1); 58 + ksmo.flags = 0; 59 + ksmo.size = maxsize; 60 + ksmo.op = KVM_S390_MEMOP_LOGICAL_WRITE; 61 + ksmo.buf = (uintptr_t)mem1; 62 + ksmo.ar = 0; 63 + vcpu_ioctl(vm, VCPU_ID, KVM_S390_MEM_OP, &ksmo); 64 + 65 + /* Let the guest code copy the first array to the second */ 66 + vcpu_run(vm, VCPU_ID); 67 + TEST_ASSERT(run->exit_reason == KVM_EXIT_S390_SIEIC, 68 + "Unexpected exit reason: %u (%s)\n", 69 + run->exit_reason, 70 + exit_reason_str(run->exit_reason)); 71 + 72 + memset(mem2, 0xaa, sizeof(mem2)); 73 + 74 + /* Get the second array */ 75 + ksmo.gaddr = (uintptr_t)mem2; 76 + ksmo.flags = 0; 77 + ksmo.size = maxsize; 78 + ksmo.op = KVM_S390_MEMOP_LOGICAL_READ; 79 + ksmo.buf = (uintptr_t)mem2; 80 + ksmo.ar = 0; 81 + vcpu_ioctl(vm, VCPU_ID, KVM_S390_MEM_OP, &ksmo); 82 + 83 + TEST_ASSERT(!memcmp(mem1, mem2, maxsize), 84 + "Memory contents do not match!"); 85 + 86 + /* Check error conditions - first bad size: */ 87 + ksmo.gaddr = (uintptr_t)mem1; 88 + ksmo.flags = 0; 89 + ksmo.size = -1; 90 + ksmo.op = KVM_S390_MEMOP_LOGICAL_WRITE; 91 + ksmo.buf = (uintptr_t)mem1; 92 + ksmo.ar = 0; 93 + rv = _vcpu_ioctl(vm, VCPU_ID, KVM_S390_MEM_OP, &ksmo); 94 + TEST_ASSERT(rv == -1 && errno == E2BIG, "ioctl allows insane sizes"); 95 + 96 + /* Zero size: */ 97 + ksmo.gaddr = (uintptr_t)mem1; 98 + ksmo.flags = 0; 99 + ksmo.size = 0; 100 + ksmo.op = KVM_S390_MEMOP_LOGICAL_WRITE; 101 + ksmo.buf = (uintptr_t)mem1; 102 + ksmo.ar = 0; 103 + rv = _vcpu_ioctl(vm, VCPU_ID, KVM_S390_MEM_OP, &ksmo); 104 + TEST_ASSERT(rv == -1 && (errno == EINVAL || errno == ENOMEM), 105 + "ioctl allows 0 as size"); 106 + 107 + /* Bad flags: */ 108 + ksmo.gaddr = (uintptr_t)mem1; 109 + ksmo.flags = -1; 110 + ksmo.size = maxsize; 111 + ksmo.op = KVM_S390_MEMOP_LOGICAL_WRITE; 112 + ksmo.buf = (uintptr_t)mem1; 113 + ksmo.ar = 0; 114 + rv = _vcpu_ioctl(vm, VCPU_ID, KVM_S390_MEM_OP, &ksmo); 115 + TEST_ASSERT(rv == -1 && errno == EINVAL, "ioctl allows all flags"); 116 + 117 + /* Bad operation: */ 118 + ksmo.gaddr = (uintptr_t)mem1; 119 + ksmo.flags = 0; 120 + ksmo.size = maxsize; 121 + ksmo.op = -1; 122 + ksmo.buf = (uintptr_t)mem1; 123 + ksmo.ar = 0; 124 + rv = _vcpu_ioctl(vm, VCPU_ID, KVM_S390_MEM_OP, &ksmo); 125 + TEST_ASSERT(rv == -1 && errno == EINVAL, "ioctl allows bad operations"); 126 + 127 + /* Bad guest address: */ 128 + ksmo.gaddr = ~0xfffUL; 129 + ksmo.flags = KVM_S390_MEMOP_F_CHECK_ONLY; 130 + ksmo.size = maxsize; 131 + ksmo.op = KVM_S390_MEMOP_LOGICAL_WRITE; 132 + ksmo.buf = (uintptr_t)mem1; 133 + ksmo.ar = 0; 134 + rv = _vcpu_ioctl(vm, VCPU_ID, KVM_S390_MEM_OP, &ksmo); 135 + TEST_ASSERT(rv > 0, "ioctl does not report bad guest memory access"); 136 + 137 + /* Bad host address: */ 138 + ksmo.gaddr = (uintptr_t)mem1; 139 + ksmo.flags = 0; 140 + ksmo.size = maxsize; 141 + ksmo.op = KVM_S390_MEMOP_LOGICAL_WRITE; 142 + ksmo.buf = 0; 143 + ksmo.ar = 0; 144 + rv = _vcpu_ioctl(vm, VCPU_ID, KVM_S390_MEM_OP, &ksmo); 145 + TEST_ASSERT(rv == -1 && errno == EFAULT, 146 + "ioctl does not report bad host memory address"); 147 + 148 + /* Bad access register: */ 149 + run->psw_mask &= ~(3UL << (63 - 17)); 150 + run->psw_mask |= 1UL << (63 - 17); /* Enable AR mode */ 151 + vcpu_run(vm, VCPU_ID); /* To sync new state to SIE block */ 152 + ksmo.gaddr = (uintptr_t)mem1; 153 + ksmo.flags = 0; 154 + ksmo.size = maxsize; 155 + ksmo.op = KVM_S390_MEMOP_LOGICAL_WRITE; 156 + ksmo.buf = (uintptr_t)mem1; 157 + ksmo.ar = 17; 158 + rv = _vcpu_ioctl(vm, VCPU_ID, KVM_S390_MEM_OP, &ksmo); 159 + TEST_ASSERT(rv == -1 && errno == EINVAL, "ioctl allows ARs > 15"); 160 + run->psw_mask &= ~(3UL << (63 - 17)); /* Disable AR mode */ 161 + vcpu_run(vm, VCPU_ID); /* Run to sync new state */ 162 + 163 + kvm_vm_free(vm); 164 + 165 + return 0; 166 + }

+34 -2

tools/testing/selftests/kvm/s390x/sync_regs_test.c

··· 25 25 26 26 static void guest_code(void) 27 27 { 28 + register u64 stage asm("11") = 0; 29 + 28 30 for (;;) { 29 - asm volatile ("diag 0,0,0x501"); 30 - asm volatile ("ahi 11,1"); 31 + GUEST_SYNC(0); 32 + asm volatile ("ahi %0,1" : : "r"(stage)); 31 33 } 32 34 } 33 35 ··· 84 82 vm = vm_create_default(VCPU_ID, 0, guest_code); 85 83 86 84 run = vcpu_state(vm, VCPU_ID); 85 + 86 + /* Request reading invalid register set from VCPU. */ 87 + run->kvm_valid_regs = INVALID_SYNC_FIELD; 88 + rv = _vcpu_run(vm, VCPU_ID); 89 + TEST_ASSERT(rv < 0 && errno == EINVAL, 90 + "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d\n", 91 + rv); 92 + vcpu_state(vm, VCPU_ID)->kvm_valid_regs = 0; 93 + 94 + run->kvm_valid_regs = INVALID_SYNC_FIELD | TEST_SYNC_FIELDS; 95 + rv = _vcpu_run(vm, VCPU_ID); 96 + TEST_ASSERT(rv < 0 && errno == EINVAL, 97 + "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d\n", 98 + rv); 99 + vcpu_state(vm, VCPU_ID)->kvm_valid_regs = 0; 100 + 101 + /* Request setting invalid register set into VCPU. */ 102 + run->kvm_dirty_regs = INVALID_SYNC_FIELD; 103 + rv = _vcpu_run(vm, VCPU_ID); 104 + TEST_ASSERT(rv < 0 && errno == EINVAL, 105 + "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d\n", 106 + rv); 107 + vcpu_state(vm, VCPU_ID)->kvm_dirty_regs = 0; 108 + 109 + run->kvm_dirty_regs = INVALID_SYNC_FIELD | TEST_SYNC_FIELDS; 110 + rv = _vcpu_run(vm, VCPU_ID); 111 + TEST_ASSERT(rv < 0 && errno == EINVAL, 112 + "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d\n", 113 + rv); 114 + vcpu_state(vm, VCPU_ID)->kvm_dirty_regs = 0; 87 115 88 116 /* Request and verify all valid register sets. */ 89 117 run->kvm_valid_regs = TEST_SYNC_FIELDS;

+2

virt/kvm/arm/arm.c

··· 196 196 case KVM_CAP_MP_STATE: 197 197 case KVM_CAP_IMMEDIATE_EXIT: 198 198 case KVM_CAP_VCPU_EVENTS: 199 + case KVM_CAP_ARM_IRQ_LINE_LAYOUT_2: 199 200 r = 1; 200 201 break; 201 202 case KVM_CAP_ARM_SET_DEVICE_ADDR: ··· 889 888 890 889 irq_type = (irq >> KVM_ARM_IRQ_TYPE_SHIFT) & KVM_ARM_IRQ_TYPE_MASK; 891 890 vcpu_idx = (irq >> KVM_ARM_IRQ_VCPU_SHIFT) & KVM_ARM_IRQ_VCPU_MASK; 891 + vcpu_idx += ((irq >> KVM_ARM_IRQ_VCPU2_SHIFT) & KVM_ARM_IRQ_VCPU2_MASK) * (KVM_ARM_IRQ_VCPU_MASK + 1); 892 892 irq_num = (irq >> KVM_ARM_IRQ_NUM_SHIFT) & KVM_ARM_IRQ_NUM_MASK; 893 893 894 894 trace_kvm_irq_line(irq_type, vcpu_idx, irq_num, irq_level->level);

+6 -2

virt/kvm/arm/vgic/vgic-init.c

··· 54 54 struct vgic_dist *dist = &kvm->arch.vgic; 55 55 56 56 INIT_LIST_HEAD(&dist->lpi_list_head); 57 + INIT_LIST_HEAD(&dist->lpi_translation_cache); 57 58 raw_spin_lock_init(&dist->lpi_list_lock); 58 59 } 59 60 ··· 200 199 int i; 201 200 202 201 vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF; 203 - vgic_cpu->sgi_iodev.base_addr = VGIC_ADDR_UNDEF; 204 202 205 203 INIT_LIST_HEAD(&vgic_cpu->ap_list_head); 206 204 raw_spin_lock_init(&vgic_cpu->ap_list_lock); ··· 304 304 } 305 305 306 306 if (vgic_has_its(kvm)) { 307 + vgic_lpi_translation_cache_init(kvm); 307 308 ret = vgic_v4_init(kvm); 308 309 if (ret) 309 310 goto out; ··· 345 344 } 346 345 INIT_LIST_HEAD(&dist->rd_regions); 347 346 } 347 + 348 + if (vgic_has_its(kvm)) 349 + vgic_lpi_translation_cache_destroy(kvm); 348 350 349 351 if (vgic_supports_direct_msis(kvm)) 350 352 vgic_v4_teardown(kvm); ··· 519 515 break; 520 516 default: 521 517 ret = -ENODEV; 522 - }; 518 + } 523 519 524 520 if (ret) 525 521 return ret;

+30 -6

virt/kvm/arm/vgic/vgic-irqfd.c

··· 66 66 return r; 67 67 } 68 68 69 + static void kvm_populate_msi(struct kvm_kernel_irq_routing_entry *e, 70 + struct kvm_msi *msi) 71 + { 72 + msi->address_lo = e->msi.address_lo; 73 + msi->address_hi = e->msi.address_hi; 74 + msi->data = e->msi.data; 75 + msi->flags = e->msi.flags; 76 + msi->devid = e->msi.devid; 77 + } 69 78 /** 70 79 * kvm_set_msi: inject the MSI corresponding to the 71 80 * MSI routing entry ··· 88 79 { 89 80 struct kvm_msi msi; 90 81 91 - msi.address_lo = e->msi.address_lo; 92 - msi.address_hi = e->msi.address_hi; 93 - msi.data = e->msi.data; 94 - msi.flags = e->msi.flags; 95 - msi.devid = e->msi.devid; 96 - 97 82 if (!vgic_has_its(kvm)) 98 83 return -ENODEV; 99 84 100 85 if (!level) 101 86 return -1; 102 87 88 + kvm_populate_msi(e, &msi); 103 89 return vgic_its_inject_msi(kvm, &msi); 90 + } 91 + 92 + /** 93 + * kvm_arch_set_irq_inatomic: fast-path for irqfd injection 94 + * 95 + * Currently only direct MSI injection is supported. 96 + */ 97 + int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, 98 + struct kvm *kvm, int irq_source_id, int level, 99 + bool line_status) 100 + { 101 + if (e->type == KVM_IRQ_ROUTING_MSI && vgic_has_its(kvm) && level) { 102 + struct kvm_msi msi; 103 + 104 + kvm_populate_msi(e, &msi); 105 + if (!vgic_its_inject_cached_translation(kvm, &msi)) 106 + return 0; 107 + } 108 + 109 + return -EWOULDBLOCK; 104 110 } 105 111 106 112 int kvm_vgic_setup_default_irq_routing(struct kvm *kvm)

+207

virt/kvm/arm/vgic/vgic-its.c

··· 138 138 u32 event_id; 139 139 }; 140 140 141 + struct vgic_translation_cache_entry { 142 + struct list_head entry; 143 + phys_addr_t db; 144 + u32 devid; 145 + u32 eventid; 146 + struct vgic_irq *irq; 147 + }; 148 + 141 149 /** 142 150 * struct vgic_its_abi - ITS abi ops and settings 143 151 * @cte_esz: collection table entry size ··· 535 527 return 0; 536 528 } 537 529 530 + static struct vgic_irq *__vgic_its_check_cache(struct vgic_dist *dist, 531 + phys_addr_t db, 532 + u32 devid, u32 eventid) 533 + { 534 + struct vgic_translation_cache_entry *cte; 535 + 536 + list_for_each_entry(cte, &dist->lpi_translation_cache, entry) { 537 + /* 538 + * If we hit a NULL entry, there is nothing after this 539 + * point. 540 + */ 541 + if (!cte->irq) 542 + break; 543 + 544 + if (cte->db != db || cte->devid != devid || 545 + cte->eventid != eventid) 546 + continue; 547 + 548 + /* 549 + * Move this entry to the head, as it is the most 550 + * recently used. 551 + */ 552 + if (!list_is_first(&cte->entry, &dist->lpi_translation_cache)) 553 + list_move(&cte->entry, &dist->lpi_translation_cache); 554 + 555 + return cte->irq; 556 + } 557 + 558 + return NULL; 559 + } 560 + 561 + static struct vgic_irq *vgic_its_check_cache(struct kvm *kvm, phys_addr_t db, 562 + u32 devid, u32 eventid) 563 + { 564 + struct vgic_dist *dist = &kvm->arch.vgic; 565 + struct vgic_irq *irq; 566 + unsigned long flags; 567 + 568 + raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); 569 + irq = __vgic_its_check_cache(dist, db, devid, eventid); 570 + raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); 571 + 572 + return irq; 573 + } 574 + 575 + static void vgic_its_cache_translation(struct kvm *kvm, struct vgic_its *its, 576 + u32 devid, u32 eventid, 577 + struct vgic_irq *irq) 578 + { 579 + struct vgic_dist *dist = &kvm->arch.vgic; 580 + struct vgic_translation_cache_entry *cte; 581 + unsigned long flags; 582 + phys_addr_t db; 583 + 584 + /* Do not cache a directly injected interrupt */ 585 + if (irq->hw) 586 + return; 587 + 588 + raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); 589 + 590 + if (unlikely(list_empty(&dist->lpi_translation_cache))) 591 + goto out; 592 + 593 + /* 594 + * We could have raced with another CPU caching the same 595 + * translation behind our back, so let's check it is not in 596 + * already 597 + */ 598 + db = its->vgic_its_base + GITS_TRANSLATER; 599 + if (__vgic_its_check_cache(dist, db, devid, eventid)) 600 + goto out; 601 + 602 + /* Always reuse the last entry (LRU policy) */ 603 + cte = list_last_entry(&dist->lpi_translation_cache, 604 + typeof(*cte), entry); 605 + 606 + /* 607 + * Caching the translation implies having an extra reference 608 + * to the interrupt, so drop the potential reference on what 609 + * was in the cache, and increment it on the new interrupt. 610 + */ 611 + if (cte->irq) 612 + __vgic_put_lpi_locked(kvm, cte->irq); 613 + 614 + vgic_get_irq_kref(irq); 615 + 616 + cte->db = db; 617 + cte->devid = devid; 618 + cte->eventid = eventid; 619 + cte->irq = irq; 620 + 621 + /* Move the new translation to the head of the list */ 622 + list_move(&cte->entry, &dist->lpi_translation_cache); 623 + 624 + out: 625 + raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); 626 + } 627 + 628 + void vgic_its_invalidate_cache(struct kvm *kvm) 629 + { 630 + struct vgic_dist *dist = &kvm->arch.vgic; 631 + struct vgic_translation_cache_entry *cte; 632 + unsigned long flags; 633 + 634 + raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); 635 + 636 + list_for_each_entry(cte, &dist->lpi_translation_cache, entry) { 637 + /* 638 + * If we hit a NULL entry, there is nothing after this 639 + * point. 640 + */ 641 + if (!cte->irq) 642 + break; 643 + 644 + __vgic_put_lpi_locked(kvm, cte->irq); 645 + cte->irq = NULL; 646 + } 647 + 648 + raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); 649 + } 650 + 538 651 int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its, 539 652 u32 devid, u32 eventid, struct vgic_irq **irq) 540 653 { ··· 675 546 676 547 if (!vcpu->arch.vgic_cpu.lpis_enabled) 677 548 return -EBUSY; 549 + 550 + vgic_its_cache_translation(kvm, its, devid, eventid, ite->irq); 678 551 679 552 *irq = ite->irq; 680 553 return 0; ··· 739 608 return 0; 740 609 } 741 610 611 + int vgic_its_inject_cached_translation(struct kvm *kvm, struct kvm_msi *msi) 612 + { 613 + struct vgic_irq *irq; 614 + unsigned long flags; 615 + phys_addr_t db; 616 + 617 + db = (u64)msi->address_hi << 32 | msi->address_lo; 618 + irq = vgic_its_check_cache(kvm, db, msi->devid, msi->data); 619 + 620 + if (!irq) 621 + return -1; 622 + 623 + raw_spin_lock_irqsave(&irq->irq_lock, flags); 624 + irq->pending_latch = true; 625 + vgic_queue_irq_unlock(kvm, irq, flags); 626 + 627 + return 0; 628 + } 629 + 742 630 /* 743 631 * Queries the KVM IO bus framework to get the ITS pointer from the given 744 632 * doorbell address. ··· 768 618 { 769 619 struct vgic_its *its; 770 620 int ret; 621 + 622 + if (!vgic_its_inject_cached_translation(kvm, msi)) 623 + return 1; 771 624 772 625 its = vgic_msi_to_its(kvm, msi); 773 626 if (IS_ERR(its)) ··· 844 691 * don't bother here since we clear the ITTE anyway and the 845 692 * pending state is a property of the ITTE struct. 846 693 */ 694 + vgic_its_invalidate_cache(kvm); 695 + 847 696 its_free_ite(kvm, ite); 848 697 return 0; 849 698 } ··· 880 725 881 726 ite->collection = collection; 882 727 vcpu = kvm_get_vcpu(kvm, collection->target_addr); 728 + 729 + vgic_its_invalidate_cache(kvm); 883 730 884 731 return update_affinity(ite->irq, vcpu); 885 732 } ··· 1111 954 list_for_each_entry_safe(ite, temp, &device->itt_head, ite_list) 1112 955 its_free_ite(kvm, ite); 1113 956 957 + vgic_its_invalidate_cache(kvm); 958 + 1114 959 list_del(&device->dev_list); 1115 960 kfree(device); 1116 961 } ··· 1218 1059 1219 1060 if (!valid) { 1220 1061 vgic_its_free_collection(its, coll_id); 1062 + vgic_its_invalidate_cache(kvm); 1221 1063 } else { 1222 1064 collection = find_collection(its, coll_id); 1223 1065 ··· 1366 1206 1367 1207 vgic_put_irq(kvm, irq); 1368 1208 } 1209 + 1210 + vgic_its_invalidate_cache(kvm); 1369 1211 1370 1212 kfree(intids); 1371 1213 return 0; ··· 1719 1557 goto out; 1720 1558 1721 1559 its->enabled = !!(val & GITS_CTLR_ENABLE); 1560 + if (!its->enabled) 1561 + vgic_its_invalidate_cache(kvm); 1722 1562 1723 1563 /* 1724 1564 * Try to process any pending commands. This function bails out early ··· 1821 1657 return ret; 1822 1658 } 1823 1659 1660 + /* Default is 16 cached LPIs per vcpu */ 1661 + #define LPI_DEFAULT_PCPU_CACHE_SIZE 16 1662 + 1663 + void vgic_lpi_translation_cache_init(struct kvm *kvm) 1664 + { 1665 + struct vgic_dist *dist = &kvm->arch.vgic; 1666 + unsigned int sz; 1667 + int i; 1668 + 1669 + if (!list_empty(&dist->lpi_translation_cache)) 1670 + return; 1671 + 1672 + sz = atomic_read(&kvm->online_vcpus) * LPI_DEFAULT_PCPU_CACHE_SIZE; 1673 + 1674 + for (i = 0; i < sz; i++) { 1675 + struct vgic_translation_cache_entry *cte; 1676 + 1677 + /* An allocation failure is not fatal */ 1678 + cte = kzalloc(sizeof(*cte), GFP_KERNEL); 1679 + if (WARN_ON(!cte)) 1680 + break; 1681 + 1682 + INIT_LIST_HEAD(&cte->entry); 1683 + list_add(&cte->entry, &dist->lpi_translation_cache); 1684 + } 1685 + } 1686 + 1687 + void vgic_lpi_translation_cache_destroy(struct kvm *kvm) 1688 + { 1689 + struct vgic_dist *dist = &kvm->arch.vgic; 1690 + struct vgic_translation_cache_entry *cte, *tmp; 1691 + 1692 + vgic_its_invalidate_cache(kvm); 1693 + 1694 + list_for_each_entry_safe(cte, tmp, 1695 + &dist->lpi_translation_cache, entry) { 1696 + list_del(&cte->entry); 1697 + kfree(cte); 1698 + } 1699 + } 1700 + 1824 1701 #define INITIAL_BASER_VALUE \ 1825 1702 (GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWb) | \ 1826 1703 GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, SameAsInner) | \ ··· 1890 1685 kfree(its); 1891 1686 return ret; 1892 1687 } 1688 + 1689 + vgic_lpi_translation_cache_init(dev->kvm); 1893 1690 } 1894 1691 1895 1692 mutex_init(&its->its_lock);

+27 -58

virt/kvm/arm/vgic/vgic-mmio-v3.c

··· 192 192 193 193 vgic_cpu->lpis_enabled = val & GICR_CTLR_ENABLE_LPIS; 194 194 195 - if (was_enabled && !vgic_cpu->lpis_enabled) 195 + if (was_enabled && !vgic_cpu->lpis_enabled) { 196 196 vgic_flush_pending_lpis(vcpu); 197 + vgic_its_invalidate_cache(vcpu->kvm); 198 + } 197 199 198 200 if (!was_enabled && vgic_cpu->lpis_enabled) 199 201 vgic_enable_lpis(vcpu); ··· 517 515 VGIC_ACCESS_32bit), 518 516 }; 519 517 520 - static const struct vgic_register_region vgic_v3_rdbase_registers[] = { 518 + static const struct vgic_register_region vgic_v3_rd_registers[] = { 519 + /* RD_base registers */ 521 520 REGISTER_DESC_WITH_LENGTH(GICR_CTLR, 522 521 vgic_mmio_read_v3r_ctlr, vgic_mmio_write_v3r_ctlr, 4, 523 522 VGIC_ACCESS_32bit), ··· 543 540 REGISTER_DESC_WITH_LENGTH(GICR_IDREGS, 544 541 vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48, 545 542 VGIC_ACCESS_32bit), 546 - }; 547 - 548 - static const struct vgic_register_region vgic_v3_sgibase_registers[] = { 549 - REGISTER_DESC_WITH_LENGTH(GICR_IGROUPR0, 543 + /* SGI_base registers */ 544 + REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_IGROUPR0, 550 545 vgic_mmio_read_group, vgic_mmio_write_group, 4, 551 546 VGIC_ACCESS_32bit), 552 - REGISTER_DESC_WITH_LENGTH(GICR_ISENABLER0, 547 + REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_ISENABLER0, 553 548 vgic_mmio_read_enable, vgic_mmio_write_senable, 4, 554 549 VGIC_ACCESS_32bit), 555 - REGISTER_DESC_WITH_LENGTH(GICR_ICENABLER0, 550 + REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_ICENABLER0, 556 551 vgic_mmio_read_enable, vgic_mmio_write_cenable, 4, 557 552 VGIC_ACCESS_32bit), 558 - REGISTER_DESC_WITH_LENGTH_UACCESS(GICR_ISPENDR0, 553 + REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ISPENDR0, 559 554 vgic_mmio_read_pending, vgic_mmio_write_spending, 560 555 vgic_v3_uaccess_read_pending, vgic_v3_uaccess_write_pending, 4, 561 556 VGIC_ACCESS_32bit), 562 - REGISTER_DESC_WITH_LENGTH_UACCESS(GICR_ICPENDR0, 557 + REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ICPENDR0, 563 558 vgic_mmio_read_pending, vgic_mmio_write_cpending, 564 559 vgic_mmio_read_raz, vgic_mmio_uaccess_write_wi, 4, 565 560 VGIC_ACCESS_32bit), 566 - REGISTER_DESC_WITH_LENGTH_UACCESS(GICR_ISACTIVER0, 561 + REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ISACTIVER0, 567 562 vgic_mmio_read_active, vgic_mmio_write_sactive, 568 563 NULL, vgic_mmio_uaccess_write_sactive, 569 564 4, VGIC_ACCESS_32bit), 570 - REGISTER_DESC_WITH_LENGTH_UACCESS(GICR_ICACTIVER0, 565 + REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ICACTIVER0, 571 566 vgic_mmio_read_active, vgic_mmio_write_cactive, 572 567 NULL, vgic_mmio_uaccess_write_cactive, 573 568 4, VGIC_ACCESS_32bit), 574 - REGISTER_DESC_WITH_LENGTH(GICR_IPRIORITYR0, 569 + REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_IPRIORITYR0, 575 570 vgic_mmio_read_priority, vgic_mmio_write_priority, 32, 576 571 VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), 577 - REGISTER_DESC_WITH_LENGTH(GICR_ICFGR0, 572 + REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_ICFGR0, 578 573 vgic_mmio_read_config, vgic_mmio_write_config, 8, 579 574 VGIC_ACCESS_32bit), 580 - REGISTER_DESC_WITH_LENGTH(GICR_IGRPMODR0, 575 + REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_IGRPMODR0, 581 576 vgic_mmio_read_raz, vgic_mmio_write_wi, 4, 582 577 VGIC_ACCESS_32bit), 583 - REGISTER_DESC_WITH_LENGTH(GICR_NSACR, 578 + REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_NSACR, 584 579 vgic_mmio_read_raz, vgic_mmio_write_wi, 4, 585 580 VGIC_ACCESS_32bit), 586 581 }; ··· 608 607 struct vgic_dist *vgic = &kvm->arch.vgic; 609 608 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; 610 609 struct vgic_io_device *rd_dev = &vcpu->arch.vgic_cpu.rd_iodev; 611 - struct vgic_io_device *sgi_dev = &vcpu->arch.vgic_cpu.sgi_iodev; 612 610 struct vgic_redist_region *rdreg; 613 - gpa_t rd_base, sgi_base; 611 + gpa_t rd_base; 614 612 int ret; 615 613 616 614 if (!IS_VGIC_ADDR_UNDEF(vgic_cpu->rd_iodev.base_addr)) ··· 631 631 vgic_cpu->rdreg = rdreg; 632 632 633 633 rd_base = rdreg->base + rdreg->free_index * KVM_VGIC_V3_REDIST_SIZE; 634 - sgi_base = rd_base + SZ_64K; 635 634 636 635 kvm_iodevice_init(&rd_dev->dev, &kvm_io_gic_ops); 637 636 rd_dev->base_addr = rd_base; 638 637 rd_dev->iodev_type = IODEV_REDIST; 639 - rd_dev->regions = vgic_v3_rdbase_registers; 640 - rd_dev->nr_regions = ARRAY_SIZE(vgic_v3_rdbase_registers); 638 + rd_dev->regions = vgic_v3_rd_registers; 639 + rd_dev->nr_regions = ARRAY_SIZE(vgic_v3_rd_registers); 641 640 rd_dev->redist_vcpu = vcpu; 642 641 643 642 mutex_lock(&kvm->slots_lock); 644 643 ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, rd_base, 645 - SZ_64K, &rd_dev->dev); 644 + 2 * SZ_64K, &rd_dev->dev); 646 645 mutex_unlock(&kvm->slots_lock); 647 646 648 647 if (ret) 649 648 return ret; 650 649 651 - kvm_iodevice_init(&sgi_dev->dev, &kvm_io_gic_ops); 652 - sgi_dev->base_addr = sgi_base; 653 - sgi_dev->iodev_type = IODEV_REDIST; 654 - sgi_dev->regions = vgic_v3_sgibase_registers; 655 - sgi_dev->nr_regions = ARRAY_SIZE(vgic_v3_sgibase_registers); 656 - sgi_dev->redist_vcpu = vcpu; 657 - 658 - mutex_lock(&kvm->slots_lock); 659 - ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, sgi_base, 660 - SZ_64K, &sgi_dev->dev); 661 - if (ret) { 662 - kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, 663 - &rd_dev->dev); 664 - goto out; 665 - } 666 - 667 650 rdreg->free_index++; 668 - out: 669 - mutex_unlock(&kvm->slots_lock); 670 - return ret; 651 + return 0; 671 652 } 672 653 673 654 static void vgic_unregister_redist_iodev(struct kvm_vcpu *vcpu) 674 655 { 675 656 struct vgic_io_device *rd_dev = &vcpu->arch.vgic_cpu.rd_iodev; 676 - struct vgic_io_device *sgi_dev = &vcpu->arch.vgic_cpu.sgi_iodev; 677 657 678 658 kvm_io_bus_unregister_dev(vcpu->kvm, KVM_MMIO_BUS, &rd_dev->dev); 679 - kvm_io_bus_unregister_dev(vcpu->kvm, KVM_MMIO_BUS, &sgi_dev->dev); 680 659 } 681 660 682 661 static int vgic_register_all_redist_iodevs(struct kvm *kvm) ··· 805 826 iodev.base_addr = 0; 806 827 break; 807 828 case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS:{ 808 - iodev.regions = vgic_v3_rdbase_registers; 809 - iodev.nr_regions = ARRAY_SIZE(vgic_v3_rdbase_registers); 829 + iodev.regions = vgic_v3_rd_registers; 830 + iodev.nr_regions = ARRAY_SIZE(vgic_v3_rd_registers); 810 831 iodev.base_addr = 0; 811 832 break; 812 833 } ··· 964 985 int offset, u32 *val) 965 986 { 966 987 struct vgic_io_device rd_dev = { 967 - .regions = vgic_v3_rdbase_registers, 968 - .nr_regions = ARRAY_SIZE(vgic_v3_rdbase_registers), 988 + .regions = vgic_v3_rd_registers, 989 + .nr_regions = ARRAY_SIZE(vgic_v3_rd_registers), 969 990 }; 970 991 971 - struct vgic_io_device sgi_dev = { 972 - .regions = vgic_v3_sgibase_registers, 973 - .nr_regions = ARRAY_SIZE(vgic_v3_sgibase_registers), 974 - }; 975 - 976 - /* SGI_base is the next 64K frame after RD_base */ 977 - if (offset >= SZ_64K) 978 - return vgic_uaccess(vcpu, &sgi_dev, is_write, offset - SZ_64K, 979 - val); 980 - else 981 - return vgic_uaccess(vcpu, &rd_dev, is_write, offset, val); 992 + return vgic_uaccess(vcpu, &rd_dev, is_write, offset, val); 982 993 } 983 994 984 995 int vgic_v3_line_level_info_uaccess(struct kvm_vcpu *vcpu, bool is_write,

+4 -3

virt/kvm/arm/vgic/vgic-v2.c

··· 357 357 DEFINE_STATIC_KEY_FALSE(vgic_v2_cpuif_trap); 358 358 359 359 /** 360 - * vgic_v2_probe - probe for a GICv2 compatible interrupt controller in DT 361 - * @node: pointer to the DT node 360 + * vgic_v2_probe - probe for a VGICv2 compatible interrupt controller 361 + * @info: pointer to the GIC description 362 362 * 363 - * Returns 0 if a GICv2 has been found, returns an error code otherwise 363 + * Returns 0 if the VGICv2 has been probed successfully, returns an error code 364 + * otherwise 364 365 */ 365 366 int vgic_v2_probe(const struct gic_kvm_info *info) 366 367 {

+4 -3

virt/kvm/arm/vgic/vgic-v3.c

··· 573 573 early_param("kvm-arm.vgic_v4_enable", early_gicv4_enable); 574 574 575 575 /** 576 - * vgic_v3_probe - probe for a GICv3 compatible interrupt controller in DT 577 - * @node: pointer to the DT node 576 + * vgic_v3_probe - probe for a VGICv3 compatible interrupt controller 577 + * @info: pointer to the GIC description 578 578 * 579 - * Returns 0 if a GICv3 has been found, returns an error code otherwise 579 + * Returns 0 if the VGICv3 has been probed successfully, returns an error code 580 + * otherwise 580 581 */ 581 582 int vgic_v3_probe(const struct gic_kvm_info *info) 582 583 {

+17 -9

virt/kvm/arm/vgic/vgic.c

··· 119 119 { 120 120 } 121 121 122 + /* 123 + * Drop the refcount on the LPI. Must be called with lpi_list_lock held. 124 + */ 125 + void __vgic_put_lpi_locked(struct kvm *kvm, struct vgic_irq *irq) 126 + { 127 + struct vgic_dist *dist = &kvm->arch.vgic; 128 + 129 + if (!kref_put(&irq->refcount, vgic_irq_release)) 130 + return; 131 + 132 + list_del(&irq->lpi_list); 133 + dist->lpi_list_count--; 134 + 135 + kfree(irq); 136 + } 137 + 122 138 void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) 123 139 { 124 140 struct vgic_dist *dist = &kvm->arch.vgic; ··· 144 128 return; 145 129 146 130 raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); 147 - if (!kref_put(&irq->refcount, vgic_irq_release)) { 148 - raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); 149 - return; 150 - }; 151 - 152 - list_del(&irq->lpi_list); 153 - dist->lpi_list_count--; 131 + __vgic_put_lpi_locked(kvm, irq); 154 132 raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); 155 - 156 - kfree(irq); 157 133 } 158 134 159 135 void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu)

+5

virt/kvm/arm/vgic/vgic.h

··· 161 161 gpa_t addr, int len); 162 162 struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, 163 163 u32 intid); 164 + void __vgic_put_lpi_locked(struct kvm *kvm, struct vgic_irq *irq); 164 165 void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq); 165 166 bool vgic_get_phys_line_level(struct vgic_irq *irq); 166 167 void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending); ··· 308 307 int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its, 309 308 u32 devid, u32 eventid, struct vgic_irq **irq); 310 309 struct vgic_its *vgic_msi_to_its(struct kvm *kvm, struct kvm_msi *msi); 310 + int vgic_its_inject_cached_translation(struct kvm *kvm, struct kvm_msi *msi); 311 + void vgic_lpi_translation_cache_init(struct kvm *kvm); 312 + void vgic_lpi_translation_cache_destroy(struct kvm *kvm); 313 + void vgic_its_invalidate_cache(struct kvm *kvm); 311 314 312 315 bool vgic_supports_direct_msis(struct kvm *kvm); 313 316 int vgic_v4_init(struct kvm *kvm);

+3 -4

virt/kvm/kvm_main.c

··· 2321 2321 bool waited = false; 2322 2322 u64 block_ns; 2323 2323 2324 + kvm_arch_vcpu_blocking(vcpu); 2325 + 2324 2326 start = cur = ktime_get(); 2325 2327 if (vcpu->halt_poll_ns && !kvm_arch_no_poll(vcpu)) { 2326 2328 ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns); ··· 2343 2341 } while (single_task_running() && ktime_before(cur, stop)); 2344 2342 } 2345 2343 2346 - kvm_arch_vcpu_blocking(vcpu); 2347 - 2348 2344 for (;;) { 2349 2345 prepare_to_swait_exclusive(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); 2350 2346 ··· 2355 2355 2356 2356 finish_swait(&vcpu->wq, &wait); 2357 2357 cur = ktime_get(); 2358 - 2359 - kvm_arch_vcpu_unblocking(vcpu); 2360 2358 out: 2359 + kvm_arch_vcpu_unblocking(vcpu); 2361 2360 block_ns = ktime_to_ns(cur) - ktime_to_ns(start); 2362 2361 2363 2362 if (!vcpu_valid_wakeup(vcpu))

Configure Feed

Configure Feed