Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

tjh.dev / kernel

fork

Configure Feed

Issues Pull Requests Commits Tags

Feed URL

Select the types of activity you want to include in your feed.

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

fork

Configure Feed

Issues Pull Requests Commits Tags

Feed URL

Select the types of activity you want to include in your feed.

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull KVM fixes from Paolo Bonzini:
"ARM and x86 bugfixes of all kinds.

The most visible one is that migrating a nested hypervisor has always
been busted on Broadwell and newer processors, and that has finally
been fixed"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (22 commits)
KVM: x86: omit "impossible" pmu MSRs from MSR list
KVM: nVMX: Fix consistency check on injected exception error code
KVM: x86: omit absent pmu MSRs from MSR list
selftests: kvm: Fix libkvm build error
kvm: vmx: Limit guest PMCs to those supported on the host
kvm: x86, powerpc: do not allow clearing largepages debugfs entry
KVM: selftests: x86: clarify what is reported on KVM_GET_MSRS failure
KVM: VMX: Set VMENTER_L1D_FLUSH_NOT_REQUIRED if !X86_BUG_L1TF
selftests: kvm: add test for dirty logging inside nested guests
KVM: x86: fix nested guest live migration with PML
KVM: x86: assign two bits to track SPTE kinds
KVM: x86: Expose XSAVEERPTR to the guest
kvm: x86: Enumerate support for CLZERO instruction
kvm: x86: Use AMD CPUID semantics for AMD vCPUs
kvm: x86: Improve emulation of CPUID leaves 0BH and 1FH
KVM: X86: Fix userspace set invalid CR4
kvm: x86: Fix a spurious -E2BIG in __do_cpuid_func
KVM: LAPIC: Loosen filter for adaptive tuning of lapic_timer_advance_ns
KVM: arm/arm64: vgic: Use the appropriate TRACE_INCLUDE_PATH
arm64: KVM: Kill hyp_alternate_select()
...

Linus Torvalds 6 years ago b145b0eb 50dfd03d

+593 -191

23 changed files

expand all collapse all

arch

arm64

include

asm

kvm_hyp.h

kvm

hyp

switch.c

tlb.c

powerpc

kvm

book3s.c

x86

include

asm

kvm_host.h

kvm

cpuid.c

lapic.c

mmu.c

vmx

nested.c

pmu_intel.c

vmx.c

x86.c

include

linux

kvm_host.h

tools

testing

selftests

kvm

Makefile

include

x86_64

processor.h

vmx.h

lib

kvm_util.c

kvm_util_internal.h

x86_64

processor.c

vmx.c

x86_64

vmx_dirty_log_test.c

virt

kvm

arm

vgic

trace.h

kvm_main.c

-24

arch/arm64/include/asm/kvm_hyp.h

reviewed

··· 47 47 #define read_sysreg_el2(r) read_sysreg_elx(r, _EL2, _EL1) 48 48 #define write_sysreg_el2(v,r) write_sysreg_elx(v, r, _EL2, _EL1) 49 49 50 50 - /** 51 51 - * hyp_alternate_select - Generates patchable code sequences that are 52 52 - * used to switch between two implementations of a function, depending 53 53 - * on the availability of a feature. 54 54 - * 55 55 - * @fname: a symbol name that will be defined as a function returning a 56 56 - * function pointer whose type will match @orig and @alt 57 57 - * @orig: A pointer to the default function, as returned by @fname when 58 58 - * @cond doesn't hold 59 59 - * @alt: A pointer to the alternate function, as returned by @fname 60 60 - * when @cond holds 61 61 - * @cond: a CPU feature (as described in asm/cpufeature.h) 62 62 - */ 63 63 - #define hyp_alternate_select(fname, orig, alt, cond) \ 64 64 - typeof(orig) * __hyp_text fname(void) \ 65 65 - { \ 66 66 - typeof(alt) *val = orig; \ 67 67 - asm volatile(ALTERNATIVE("nop \n", \ 68 68 - "mov %0, %1 \n", \ 69 69 - cond) \ 70 70 - : "+r" (val) : "r" (alt)); \ 71 71 - return val; \ 72 72 - } 73 73 - 74 50 int __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu); 75 51 76 52 void __vgic_v3_save_state(struct kvm_vcpu *vcpu);

+2 -15

arch/arm64/kvm/hyp/switch.c

reviewed

··· 229 229 } 230 230 } 231 231 232 232 - static bool __hyp_text __true_value(void) 233 233 - { 234 234 - return true; 235 235 - } 236 236 - 237 237 - static bool __hyp_text __false_value(void) 238 238 - { 239 239 - return false; 240 240 - } 241 241 - 242 242 - static hyp_alternate_select(__check_arm_834220, 243 243 - __false_value, __true_value, 244 244 - ARM64_WORKAROUND_834220); 245 245 - 246 232 static bool __hyp_text __translate_far_to_hpfar(u64 far, u64 *hpfar) 247 233 { 248 234 u64 par, tmp; ··· 284 298 * resolve the IPA using the AT instruction. 285 299 */ 286 300 if (!(esr & ESR_ELx_S1PTW) && 287 287 - (__check_arm_834220()() || (esr & ESR_ELx_FSC_TYPE) == FSC_PERM)) { 301 301 + (cpus_have_const_cap(ARM64_WORKAROUND_834220) || 302 302 + (esr & ESR_ELx_FSC_TYPE) == FSC_PERM)) { 288 303 if (!__translate_far_to_hpfar(far, &hpfar)) 289 304 return false; 290 305 } else {

+22 -14

arch/arm64/kvm/hyp/tlb.c

reviewed

··· 67 67 isb(); 68 68 } 69 69 70 70 - static hyp_alternate_select(__tlb_switch_to_guest, 71 71 - __tlb_switch_to_guest_nvhe, 72 72 - __tlb_switch_to_guest_vhe, 73 73 - ARM64_HAS_VIRT_HOST_EXTN); 70 70 + static void __hyp_text __tlb_switch_to_guest(struct kvm *kvm, 71 71 + struct tlb_inv_context *cxt) 72 72 + { 73 73 + if (has_vhe()) 74 74 + __tlb_switch_to_guest_vhe(kvm, cxt); 75 75 + else 76 76 + __tlb_switch_to_guest_nvhe(kvm, cxt); 77 77 + } 74 78 75 79 static void __hyp_text __tlb_switch_to_host_vhe(struct kvm *kvm, 76 80 struct tlb_inv_context *cxt) ··· 102 98 write_sysreg(0, vttbr_el2); 103 99 } 104 100 105 105 - static hyp_alternate_select(__tlb_switch_to_host, 106 106 - __tlb_switch_to_host_nvhe, 107 107 - __tlb_switch_to_host_vhe, 108 108 - ARM64_HAS_VIRT_HOST_EXTN); 101 101 + static void __hyp_text __tlb_switch_to_host(struct kvm *kvm, 102 102 + struct tlb_inv_context *cxt) 103 103 + { 104 104 + if (has_vhe()) 105 105 + __tlb_switch_to_host_vhe(kvm, cxt); 106 106 + else 107 107 + __tlb_switch_to_host_nvhe(kvm, cxt); 108 108 + } 109 109 110 110 void __hyp_text __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) 111 111 { ··· 119 111 120 112 /* Switch to requested VMID */ 121 113 kvm = kern_hyp_va(kvm); 122 122 - __tlb_switch_to_guest()(kvm, &cxt); 114 114 + __tlb_switch_to_guest(kvm, &cxt); 123 115 124 116 /* 125 117 * We could do so much better if we had the VA as well. 
··· 162 154 if (!has_vhe() && icache_is_vpipt()) 163 155 __flush_icache_all(); 164 156 165 165 - __tlb_switch_to_host()(kvm, &cxt); 157 157 + __tlb_switch_to_host(kvm, &cxt); 166 158 } 167 159 168 160 void __hyp_text __kvm_tlb_flush_vmid(struct kvm *kvm) ··· 173 165 174 166 /* Switch to requested VMID */ 175 167 kvm = kern_hyp_va(kvm); 176 176 - __tlb_switch_to_guest()(kvm, &cxt); 168 168 + __tlb_switch_to_guest(kvm, &cxt); 177 169 178 170 __tlbi(vmalls12e1is); 179 171 dsb(ish); 180 172 isb(); 181 173 182 182 - __tlb_switch_to_host()(kvm, &cxt); 174 174 + __tlb_switch_to_host(kvm, &cxt); 183 175 } 184 176 185 177 void __hyp_text __kvm_tlb_flush_local_vmid(struct kvm_vcpu *vcpu) ··· 188 180 struct tlb_inv_context cxt; 189 181 190 182 /* Switch to requested VMID */ 191 191 - __tlb_switch_to_guest()(kvm, &cxt); 183 183 + __tlb_switch_to_guest(kvm, &cxt); 192 184 193 185 __tlbi(vmalle1); 194 186 dsb(nsh); 195 187 isb(); 196 188 197 197 - __tlb_switch_to_host()(kvm, &cxt); 189 189 + __tlb_switch_to_host(kvm, &cxt); 198 190 } 199 191 200 192 void __hyp_text __kvm_flush_vm_context(void)

+4 -4

arch/powerpc/kvm/book3s.c

reviewed

··· 36 36 #include "book3s.h" 37 37 #include "trace.h" 38 38 39 39 - #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM 40 40 - #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU 39 39 + #define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__ 40 40 + #define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__ 41 41 42 42 /* #define EXIT_DEBUG */ 43 43 ··· 69 69 { "pthru_all", VCPU_STAT(pthru_all) }, 70 70 { "pthru_host", VCPU_STAT(pthru_host) }, 71 71 { "pthru_bad_aff", VCPU_STAT(pthru_bad_aff) }, 72 72 - { "largepages_2M", VM_STAT(num_2M_pages) }, 73 73 - { "largepages_1G", VM_STAT(num_1G_pages) }, 72 72 + { "largepages_2M", VM_STAT(num_2M_pages, .mode = 0444) }, 73 73 + { "largepages_1G", VM_STAT(num_1G_pages, .mode = 0444) }, 74 74 { NULL } 75 75 }; 76 76

-7

arch/x86/include/asm/kvm_host.h

reviewed

··· 219 219 PFERR_WRITE_MASK | \ 220 220 PFERR_PRESENT_MASK) 221 221 222 222 - /* 223 223 - * The mask used to denote special SPTEs, which can be either MMIO SPTEs or 224 224 - * Access Tracking SPTEs. We use bit 62 instead of bit 63 to avoid conflicting 225 225 - * with the SVE bit in EPT PTEs. 226 226 - */ 227 227 - #define SPTE_SPECIAL_MASK (1ULL << 62) 228 228 - 229 222 /* apic attention bits */ 230 223 #define KVM_APIC_CHECK_VAPIC 0 231 224 /*

+59 -41

arch/x86/kvm/cpuid.c

reviewed

··· 485 485 486 486 /* cpuid 0x80000008.ebx */ 487 487 const u32 kvm_cpuid_8000_0008_ebx_x86_features = 488 488 + F(CLZERO) | F(XSAVEERPTR) | 488 489 F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | 489 490 F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON); 490 491 ··· 619 618 */ 620 619 case 0x1f: 621 620 case 0xb: { 622 622 - int i, level_type; 621 621 + int i; 623 622 624 624 - /* read more entries until level_type is zero */ 625 625 - for (i = 1; ; ++i) { 623 623 + /* 624 624 + * We filled in entry[0] for CPUID(EAX=<function>, 625 625 + * ECX=00H) above. If its level type (ECX[15:8]) is 626 626 + * zero, then the leaf is unimplemented, and we're 627 627 + * done. Otherwise, continue to populate entries 628 628 + * until the level type (ECX[15:8]) of the previously 629 629 + * added entry is zero. 630 630 + */ 631 631 + for (i = 1; entry[i - 1].ecx & 0xff00; ++i) { 626 632 if (*nent >= maxnent) 627 633 goto out; 628 634 629 629 - level_type = entry[i - 1].ecx & 0xff00; 630 630 - if (!level_type) 631 631 - break; 632 635 do_host_cpuid(&entry[i], function, i); 633 636 ++*nent; 634 637 } ··· 974 969 EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); 975 970 976 971 /* 977 977 - * If no match is found, check whether we exceed the vCPU's limit 978 978 - * and return the content of the highest valid _standard_ leaf instead. 979 979 - * This is to satisfy the CPUID specification. 972 972 + * If the basic or extended CPUID leaf requested is higher than the 973 973 + * maximum supported basic or extended leaf, respectively, then it is 974 974 + * out of range. 
980 975 */ 981 981 - static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu, 982 982 - u32 function, u32 index) 976 976 + static bool cpuid_function_in_range(struct kvm_vcpu *vcpu, u32 function) 983 977 { 984 984 - struct kvm_cpuid_entry2 *maxlevel; 978 978 + struct kvm_cpuid_entry2 *max; 985 979 986 986 - maxlevel = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0); 987 987 - if (!maxlevel || maxlevel->eax >= function) 988 988 - return NULL; 989 989 - if (function & 0x80000000) { 990 990 - maxlevel = kvm_find_cpuid_entry(vcpu, 0, 0); 991 991 - if (!maxlevel) 992 992 - return NULL; 993 993 - } 994 994 - return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index); 980 980 + max = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0); 981 981 + return max && function <= max->eax; 995 982 } 996 983 997 984 bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, 998 985 u32 *ecx, u32 *edx, bool check_limit) 999 986 { 1000 987 u32 function = *eax, index = *ecx; 1001 1001 - struct kvm_cpuid_entry2 *best; 1002 1002 - bool entry_found = true; 988 988 + struct kvm_cpuid_entry2 *entry; 989 989 + struct kvm_cpuid_entry2 *max; 990 990 + bool found; 1003 991 1004 1004 - best = kvm_find_cpuid_entry(vcpu, function, index); 1005 1005 - 1006 1006 - if (!best) { 1007 1007 - entry_found = false; 1008 1008 - if (!check_limit) 1009 1009 - goto out; 1010 1010 - 1011 1011 - best = check_cpuid_limit(vcpu, function, index); 992 992 + entry = kvm_find_cpuid_entry(vcpu, function, index); 993 993 + found = entry; 994 994 + /* 995 995 + * Intel CPUID semantics treats any query for an out-of-range 996 996 + * leaf as if the highest basic leaf (i.e. CPUID.0H:EAX) were 997 997 + * requested. AMD CPUID semantics returns all zeroes for any 998 998 + * undefined leaf, whether or not the leaf is in range. 
999 999 + */ 1000 1000 + if (!entry && check_limit && !guest_cpuid_is_amd(vcpu) && 1001 1001 + !cpuid_function_in_range(vcpu, function)) { 1002 1002 + max = kvm_find_cpuid_entry(vcpu, 0, 0); 1003 1003 + if (max) { 1004 1004 + function = max->eax; 1005 1005 + entry = kvm_find_cpuid_entry(vcpu, function, index); 1006 1006 + } 1012 1007 } 1013 1013 - 1014 1014 - out: 1015 1015 - if (best) { 1016 1016 - *eax = best->eax; 1017 1017 - *ebx = best->ebx; 1018 1018 - *ecx = best->ecx; 1019 1019 - *edx = best->edx; 1020 1020 - } else 1008 1008 + if (entry) { 1009 1009 + *eax = entry->eax; 1010 1010 + *ebx = entry->ebx; 1011 1011 + *ecx = entry->ecx; 1012 1012 + *edx = entry->edx; 1013 1013 + } else { 1021 1014 *eax = *ebx = *ecx = *edx = 0; 1022 1022 - trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx, entry_found); 1023 1023 - return entry_found; 1015 1015 + /* 1016 1016 + * When leaf 0BH or 1FH is defined, CL is pass-through 1017 1017 + * and EDX is always the x2APIC ID, even for undefined 1018 1018 + * subleaves. Index 1 will exist iff the leaf is 1019 1019 + * implemented, so we pass through CL iff leaf 1 1020 1020 + * exists. EDX can be copied from any existing index. 1021 1021 + */ 1022 1022 + if (function == 0xb || function == 0x1f) { 1023 1023 + entry = kvm_find_cpuid_entry(vcpu, function, 1); 1024 1024 + if (entry) { 1025 1025 + *ecx = index & 0xff; 1026 1026 + *edx = entry->edx; 1027 1027 + } 1028 1028 + } 1029 1029 + } 1030 1030 + trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx, found); 1031 1031 + return found; 1024 1032 } 1025 1033 EXPORT_SYMBOL_GPL(kvm_cpuid); 1026 1034

+7 -6

arch/x86/kvm/lapic.c

reviewed

··· 66 66 #define X2APIC_BROADCAST 0xFFFFFFFFul 67 67 68 68 static bool lapic_timer_advance_dynamic __read_mostly; 69 69 - #define LAPIC_TIMER_ADVANCE_ADJUST_MIN 100 70 70 - #define LAPIC_TIMER_ADVANCE_ADJUST_MAX 5000 71 71 - #define LAPIC_TIMER_ADVANCE_ADJUST_INIT 1000 69 69 + #define LAPIC_TIMER_ADVANCE_ADJUST_MIN 100 /* clock cycles */ 70 70 + #define LAPIC_TIMER_ADVANCE_ADJUST_MAX 10000 /* clock cycles */ 71 71 + #define LAPIC_TIMER_ADVANCE_NS_INIT 1000 72 72 + #define LAPIC_TIMER_ADVANCE_NS_MAX 5000 72 73 /* step-by-step approximation to mitigate fluctuation */ 73 74 #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8 74 75 ··· 1505 1504 timer_advance_ns += ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP; 1506 1505 } 1507 1506 1508 1508 - if (unlikely(timer_advance_ns > LAPIC_TIMER_ADVANCE_ADJUST_MAX)) 1509 1509 - timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT; 1507 1507 + if (unlikely(timer_advance_ns > LAPIC_TIMER_ADVANCE_NS_MAX)) 1508 1508 + timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT; 1510 1509 apic->lapic_timer.timer_advance_ns = timer_advance_ns; 1511 1510 } 1512 1511 ··· 2303 2302 HRTIMER_MODE_ABS_HARD); 2304 2303 apic->lapic_timer.timer.function = apic_timer_fn; 2305 2304 if (timer_advance_ns == -1) { 2306 2306 - apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT; 2305 2305 + apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT; 2307 2306 lapic_timer_advance_dynamic = true; 2308 2307 } else { 2309 2308 apic->lapic_timer.timer_advance_ns = timer_advance_ns;

+49 -16

arch/x86/kvm/mmu.c

reviewed

··· 83 83 #define PTE_PREFETCH_NUM 8 84 84 85 85 #define PT_FIRST_AVAIL_BITS_SHIFT 10 86 86 - #define PT64_SECOND_AVAIL_BITS_SHIFT 52 86 86 + #define PT64_SECOND_AVAIL_BITS_SHIFT 54 87 87 + 88 88 + /* 89 89 + * The mask used to denote special SPTEs, which can be either MMIO SPTEs or 90 90 + * Access Tracking SPTEs. 91 91 + */ 92 92 + #define SPTE_SPECIAL_MASK (3ULL << 52) 93 93 + #define SPTE_AD_ENABLED_MASK (0ULL << 52) 94 94 + #define SPTE_AD_DISABLED_MASK (1ULL << 52) 95 95 + #define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52) 96 96 + #define SPTE_MMIO_MASK (3ULL << 52) 87 97 88 98 #define PT64_LEVEL_BITS 9 89 99 ··· 229 219 static u64 __read_mostly shadow_me_mask; 230 220 231 221 /* 232 232 - * SPTEs used by MMUs without A/D bits are marked with shadow_acc_track_value. 233 233 - * Non-present SPTEs with shadow_acc_track_value set are in place for access 234 234 - * tracking. 222 222 + * SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK; 223 223 + * shadow_acc_track_mask is the set of bits to be cleared in non-accessed 224 224 + * pages. 235 225 */ 236 226 static u64 __read_mostly shadow_acc_track_mask; 237 237 - static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK; 238 227 239 228 /* 240 229 * The mask/shift to use for saving the original R/X bits when marking the PTE ··· 313 304 { 314 305 BUG_ON((u64)(unsigned)access_mask != access_mask); 315 306 BUG_ON((mmio_mask & mmio_value) != mmio_value); 316 316 - shadow_mmio_value = mmio_value | SPTE_SPECIAL_MASK; 307 307 + shadow_mmio_value = mmio_value | SPTE_MMIO_MASK; 317 308 shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK; 318 309 shadow_mmio_access_mask = access_mask; 319 310 } ··· 329 320 return sp->role.ad_disabled; 330 321 } 331 322 323 323 + static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu) 324 324 + { 325 325 + /* 326 326 + * When using the EPT page-modification log, the GPAs in the log 327 327 + * would come from L2 rather than L1. 
Therefore, we need to rely 328 328 + * on write protection to record dirty pages. This also bypasses 329 329 + * PML, since writes now result in a vmexit. 330 330 + */ 331 331 + return vcpu->arch.mmu == &vcpu->arch.guest_mmu; 332 332 + } 333 333 + 332 334 static inline bool spte_ad_enabled(u64 spte) 333 335 { 334 336 MMU_WARN_ON(is_mmio_spte(spte)); 335 335 - return !(spte & shadow_acc_track_value); 337 337 + return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_DISABLED_MASK; 338 338 + } 339 339 + 340 340 + static inline bool spte_ad_need_write_protect(u64 spte) 341 341 + { 342 342 + MMU_WARN_ON(is_mmio_spte(spte)); 343 343 + return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK; 336 344 } 337 345 338 346 static inline u64 spte_shadow_accessed_mask(u64 spte) ··· 487 461 { 488 462 BUG_ON(!dirty_mask != !accessed_mask); 489 463 BUG_ON(!accessed_mask && !acc_track_mask); 490 490 - BUG_ON(acc_track_mask & shadow_acc_track_value); 464 464 + BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK); 491 465 492 466 shadow_user_mask = user_mask; 493 467 shadow_accessed_mask = accessed_mask; ··· 1615 1589 1616 1590 rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep); 1617 1591 1592 1592 + MMU_WARN_ON(!spte_ad_enabled(spte)); 1618 1593 spte &= ~shadow_dirty_mask; 1619 1619 - 1620 1594 return mmu_spte_update(sptep, spte); 1621 1595 } 1622 1596 1623 1623 - static bool wrprot_ad_disabled_spte(u64 *sptep) 1597 1597 + static bool spte_wrprot_for_clear_dirty(u64 *sptep) 1624 1598 { 1625 1599 bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT, 1626 1600 (unsigned long *)sptep); 1627 1627 - if (was_writable) 1601 1601 + if (was_writable && !spte_ad_enabled(*sptep)) 1628 1602 kvm_set_pfn_dirty(spte_to_pfn(*sptep)); 1629 1603 1630 1604 return was_writable; ··· 1643 1617 bool flush = false; 1644 1618 1645 1619 for_each_rmap_spte(rmap_head, &iter, sptep) 1646 1646 - if (spte_ad_enabled(*sptep)) 1647 1647 - flush |= spte_clear_dirty(sptep); 1620 1620 + if 
(spte_ad_need_write_protect(*sptep)) 1621 1621 + flush |= spte_wrprot_for_clear_dirty(sptep); 1648 1622 else 1649 1649 - flush |= wrprot_ad_disabled_spte(sptep); 1623 1623 + flush |= spte_clear_dirty(sptep); 1650 1624 1651 1625 return flush; 1652 1626 } ··· 1657 1631 1658 1632 rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep); 1659 1633 1634 1634 + /* 1635 1635 + * Similar to the !kvm_x86_ops->slot_disable_log_dirty case, 1636 1636 + * do not bother adding back write access to pages marked 1637 1637 + * SPTE_AD_WRPROT_ONLY_MASK. 1638 1638 + */ 1660 1639 spte |= shadow_dirty_mask; 1661 1640 1662 1641 return mmu_spte_update(sptep, spte); ··· 2653 2622 shadow_user_mask | shadow_x_mask | shadow_me_mask; 2654 2623 2655 2624 if (sp_ad_disabled(sp)) 2656 2656 - spte |= shadow_acc_track_value; 2625 2625 + spte |= SPTE_AD_DISABLED_MASK; 2657 2626 else 2658 2627 spte |= shadow_accessed_mask; 2659 2628 ··· 2999 2968 3000 2969 sp = page_header(__pa(sptep)); 3001 2970 if (sp_ad_disabled(sp)) 3002 3002 - spte |= shadow_acc_track_value; 2971 2971 + spte |= SPTE_AD_DISABLED_MASK; 2972 2972 + else if (kvm_vcpu_ad_need_write_protect(vcpu)) 2973 2973 + spte |= SPTE_AD_WRPROT_ONLY_MASK; 3003 2974 3004 2975 /* 3005 2976 * For the EPT case, shadow_present_mask is 0 if hardware

+1 -1

arch/x86/kvm/vmx/nested.c

reviewed

··· 2610 2610 2611 2611 /* VM-entry exception error code */ 2612 2612 if (CC(has_error_code && 2613 2613 - vmcs12->vm_entry_exception_error_code & GENMASK(31, 15))) 2613 2613 + vmcs12->vm_entry_exception_error_code & GENMASK(31, 16))) 2614 2614 return -EINVAL; 2615 2615 2616 2616 /* VM-entry interruption-info field: reserved bits */

+5 -2

arch/x86/kvm/vmx/pmu_intel.c

reviewed

··· 262 262 static void intel_pmu_refresh(struct kvm_vcpu *vcpu) 263 263 { 264 264 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); 265 265 + struct x86_pmu_capability x86_pmu; 265 266 struct kvm_cpuid_entry2 *entry; 266 267 union cpuid10_eax eax; 267 268 union cpuid10_edx edx; ··· 284 283 if (!pmu->version) 285 284 return; 286 285 286 286 + perf_get_x86_pmu_capability(&x86_pmu); 287 287 + 287 288 pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters, 288 288 - INTEL_PMC_MAX_GENERIC); 289 289 + x86_pmu.num_counters_gp); 289 290 pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1; 290 291 pmu->available_event_types = ~entry->ebx & 291 292 ((1ull << eax.split.mask_length) - 1); ··· 297 294 } else { 298 295 pmu->nr_arch_fixed_counters = 299 296 min_t(int, edx.split.num_counters_fixed, 300 300 - INTEL_PMC_MAX_FIXED); 297 297 + x86_pmu.num_counters_fixed); 301 298 pmu->counter_bitmask[KVM_PMC_FIXED] = 302 299 ((u64)1 << edx.split.bit_width_fixed) - 1; 303 300 }

+9 -6

arch/x86/kvm/vmx/vmx.c

reviewed

··· 209 209 struct page *page; 210 210 unsigned int i; 211 211 212 212 + if (!boot_cpu_has_bug(X86_BUG_L1TF)) { 213 213 + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; 214 214 + return 0; 215 215 + } 216 216 + 212 217 if (!enable_ept) { 213 218 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED; 214 219 return 0; ··· 8000 7995 * contain 'auto' which will be turned into the default 'cond' 8001 7996 * mitigation mode. 8002 7997 */ 8003 8003 - if (boot_cpu_has(X86_BUG_L1TF)) { 8004 8004 - r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); 8005 8005 - if (r) { 8006 8006 - vmx_exit(); 8007 8007 - return r; 8008 8008 - } 7998 7998 + r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); 7999 7999 + if (r) { 8000 8000 + vmx_exit(); 8001 8001 + return r; 8009 8002 } 8010 8003 8011 8004 #ifdef CONFIG_KEXEC_CORE

+46 -46

arch/x86/kvm/x86.c

reviewed

··· 92 92 static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); 93 93 #endif 94 94 95 95 - #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM 96 96 - #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU 95 95 + #define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__ 96 96 + #define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__ 97 97 98 98 #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \ 99 99 KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) ··· 212 212 { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, 213 213 { "mmu_unsync", VM_STAT(mmu_unsync) }, 214 214 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, 215 215 - { "largepages", VM_STAT(lpages) }, 215 215 + { "largepages", VM_STAT(lpages, .mode = 0444) }, 216 216 { "max_mmu_page_hash_collisions", 217 217 VM_STAT(max_mmu_page_hash_collisions) }, 218 218 { NULL } ··· 885 885 } 886 886 EXPORT_SYMBOL_GPL(kvm_set_xcr); 887 887 888 888 + static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 889 889 + { 890 890 + if (cr4 & CR4_RESERVED_BITS) 891 891 + return -EINVAL; 892 892 + 893 893 + if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE)) 894 894 + return -EINVAL; 895 895 + 896 896 + if (!guest_cpuid_has(vcpu, X86_FEATURE_SMEP) && (cr4 & X86_CR4_SMEP)) 897 897 + return -EINVAL; 898 898 + 899 899 + if (!guest_cpuid_has(vcpu, X86_FEATURE_SMAP) && (cr4 & X86_CR4_SMAP)) 900 900 + return -EINVAL; 901 901 + 902 902 + if (!guest_cpuid_has(vcpu, X86_FEATURE_FSGSBASE) && (cr4 & X86_CR4_FSGSBASE)) 903 903 + return -EINVAL; 904 904 + 905 905 + if (!guest_cpuid_has(vcpu, X86_FEATURE_PKU) && (cr4 & X86_CR4_PKE)) 906 906 + return -EINVAL; 907 907 + 908 908 + if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57)) 909 909 + return -EINVAL; 910 910 + 911 911 + if (!guest_cpuid_has(vcpu, X86_FEATURE_UMIP) && (cr4 & X86_CR4_UMIP)) 912 912 + return -EINVAL; 913 913 + 914 914 + return 
0; 915 915 + } 916 916 + 888 917 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 889 918 { 890 919 unsigned long old_cr4 = kvm_read_cr4(vcpu); 891 920 unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | 892 921 X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE; 893 922 894 894 - if (cr4 & CR4_RESERVED_BITS) 895 895 - return 1; 896 896 - 897 897 - if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE)) 898 898 - return 1; 899 899 - 900 900 - if (!guest_cpuid_has(vcpu, X86_FEATURE_SMEP) && (cr4 & X86_CR4_SMEP)) 901 901 - return 1; 902 902 - 903 903 - if (!guest_cpuid_has(vcpu, X86_FEATURE_SMAP) && (cr4 & X86_CR4_SMAP)) 904 904 - return 1; 905 905 - 906 906 - if (!guest_cpuid_has(vcpu, X86_FEATURE_FSGSBASE) && (cr4 & X86_CR4_FSGSBASE)) 907 907 - return 1; 908 908 - 909 909 - if (!guest_cpuid_has(vcpu, X86_FEATURE_PKU) && (cr4 & X86_CR4_PKE)) 910 910 - return 1; 911 911 - 912 912 - if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57)) 913 913 - return 1; 914 914 - 915 915 - if (!guest_cpuid_has(vcpu, X86_FEATURE_UMIP) && (cr4 & X86_CR4_UMIP)) 923 923 + if (kvm_valid_cr4(vcpu, cr4)) 916 924 return 1; 917 925 918 926 if (is_long_mode(vcpu)) { ··· 1169 1161 MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13, 1170 1162 MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15, 1171 1163 MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17, 1172 1172 - MSR_ARCH_PERFMON_PERFCTR0 + 18, MSR_ARCH_PERFMON_PERFCTR0 + 19, 1173 1173 - MSR_ARCH_PERFMON_PERFCTR0 + 20, MSR_ARCH_PERFMON_PERFCTR0 + 21, 1174 1174 - MSR_ARCH_PERFMON_PERFCTR0 + 22, MSR_ARCH_PERFMON_PERFCTR0 + 23, 1175 1175 - MSR_ARCH_PERFMON_PERFCTR0 + 24, MSR_ARCH_PERFMON_PERFCTR0 + 25, 1176 1176 - MSR_ARCH_PERFMON_PERFCTR0 + 26, MSR_ARCH_PERFMON_PERFCTR0 + 27, 1177 1177 - MSR_ARCH_PERFMON_PERFCTR0 + 28, MSR_ARCH_PERFMON_PERFCTR0 + 29, 1178 1178 - MSR_ARCH_PERFMON_PERFCTR0 + 30, MSR_ARCH_PERFMON_PERFCTR0 + 31, 1179 1164 
MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1, 1180 1165 MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3, 1181 1166 MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5, ··· 1178 1177 MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13, 1179 1178 MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15, 1180 1179 MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17, 1181 1181 - MSR_ARCH_PERFMON_EVENTSEL0 + 18, MSR_ARCH_PERFMON_EVENTSEL0 + 19, 1182 1182 - MSR_ARCH_PERFMON_EVENTSEL0 + 20, MSR_ARCH_PERFMON_EVENTSEL0 + 21, 1183 1183 - MSR_ARCH_PERFMON_EVENTSEL0 + 22, MSR_ARCH_PERFMON_EVENTSEL0 + 23, 1184 1184 - MSR_ARCH_PERFMON_EVENTSEL0 + 24, MSR_ARCH_PERFMON_EVENTSEL0 + 25, 1185 1185 - MSR_ARCH_PERFMON_EVENTSEL0 + 26, MSR_ARCH_PERFMON_EVENTSEL0 + 27, 1186 1186 - MSR_ARCH_PERFMON_EVENTSEL0 + 28, MSR_ARCH_PERFMON_EVENTSEL0 + 29, 1187 1187 - MSR_ARCH_PERFMON_EVENTSEL0 + 30, MSR_ARCH_PERFMON_EVENTSEL0 + 31, 1188 1180 }; 1189 1181 1190 1182 static unsigned num_msrs_to_save; ··· 5091 5097 5092 5098 static void kvm_init_msr_list(void) 5093 5099 { 5100 5100 + struct x86_pmu_capability x86_pmu; 5094 5101 u32 dummy[2]; 5095 5102 unsigned i, j; 5096 5103 5097 5104 BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 4, 5098 5105 "Please update the fixed PMCs in msrs_to_save[]"); 5099 5099 - BUILD_BUG_ON_MSG(INTEL_PMC_MAX_GENERIC != 32, 5100 5100 - "Please update the generic perfctr/eventsel MSRs in msrs_to_save[]"); 5106 5106 + 5107 5107 + perf_get_x86_pmu_capability(&x86_pmu); 5101 5108 5102 5109 for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { 5103 5110 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) ··· 5140 5145 intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2) 5141 5146 continue; 5142 5147 break; 5148 5148 + case MSR_ARCH_PERFMON_PERFCTR0 ... 
MSR_ARCH_PERFMON_PERFCTR0 + 17: 5149 5149 + if (msrs_to_save[i] - MSR_ARCH_PERFMON_PERFCTR0 >= 5150 5150 + min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) 5151 5151 + continue; 5152 5152 + break; 5153 5153 + case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 17: 5154 5154 + if (msrs_to_save[i] - MSR_ARCH_PERFMON_EVENTSEL0 >= 5155 5155 + min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) 5156 5156 + continue; 5143 5157 } 5144 5158 default: 5145 5159 break; ··· 8718 8714 8719 8715 static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) 8720 8716 { 8721 8721 - if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && 8722 8722 - (sregs->cr4 & X86_CR4_OSXSAVE)) 8723 8723 - return -EINVAL; 8724 8724 - 8725 8717 if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) { 8726 8718 /* 8727 8719 * When EFER.LME and CR0.PG are set, the processor is in ··· 8736 8736 return -EINVAL; 8737 8737 } 8738 8738 8739 8739 - return 0; 8739 8739 + return kvm_valid_cr4(vcpu, sregs->cr4); 8740 8740 } 8741 8741 8742 8742 static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)

include/linux/kvm_host.h

reviewed

··· 1090 1090 1091 1091 struct kvm_stat_data { 1092 1092 int offset; 1093 1093 + int mode; 1093 1094 struct kvm *kvm; 1094 1095 }; 1095 1096 ··· 1098 1097 const char *name; 1099 1098 int offset; 1100 1099 enum kvm_stat_kind kind; 1100 1100 + int mode; 1101 1101 }; 1102 1102 extern struct kvm_stats_debugfs_item debugfs_entries[]; 1103 1103 extern struct dentry *kvm_debugfs_dir;

+2 -1

tools/testing/selftests/kvm/Makefile

reviewed

··· 22 22 TEST_GEN_PROGS_x86_64 += x86_64/state_test 23 23 TEST_GEN_PROGS_x86_64 += x86_64/sync_regs_test 24 24 TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test 25 25 + TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test 25 26 TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test 26 27 TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test 27 28 TEST_GEN_PROGS_x86_64 += clear_dirty_log_test ··· 49 48 -I$(LINUX_HDR_PATH) -Iinclude -I$(<D) -Iinclude/$(UNAME_M) -I.. 50 49 51 50 no-pie-option := $(call try-run, echo 'int main() { return 0; }' | \ 52 52 - $(CC) -Werror $(KBUILD_CPPFLAGS) $(CC_OPTION_CFLAGS) -no-pie -x c - -o "$$TMP", -no-pie) 51 51 + $(CC) -Werror -no-pie -x c - -o "$$TMP", -no-pie) 53 52 54 53 # On s390, build the testcases KVM-enabled 55 54 pgste-option = $(call try-run, echo 'int main() { return 0; }' | \

tools/testing/selftests/kvm/include/x86_64/processor.h

reviewed

··· 1083 1083 #define VMX_BASIC_MEM_TYPE_WB 6LLU 1084 1084 #define VMX_BASIC_INOUT 0x0040000000000000LLU 1085 1085 1086 1086 + /* VMX_EPT_VPID_CAP bits */ 1087 1087 + #define VMX_EPT_VPID_CAP_AD_BITS (1ULL << 21) 1088 1088 + 1086 1089 /* MSR_IA32_VMX_MISC bits */ 1087 1090 #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29) 1088 1091 #define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE 0x1F

+14

tools/testing/selftests/kvm/include/x86_64/vmx.h

reviewed

··· 569 569 void *enlightened_vmcs_hva; 570 570 uint64_t enlightened_vmcs_gpa; 571 571 void *enlightened_vmcs; 572 572 + 573 573 + void *eptp_hva; 574 574 + uint64_t eptp_gpa; 575 575 + void *eptp; 572 576 }; 573 577 574 578 struct vmx_pages *vcpu_alloc_vmx(struct kvm_vm *vm, vm_vaddr_t *p_vmx_gva); 575 579 bool prepare_for_vmx_operation(struct vmx_pages *vmx); 576 580 void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp); 577 581 bool load_vmcs(struct vmx_pages *vmx); 582 582 + 583 583 + void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, 584 584 + uint64_t nested_paddr, uint64_t paddr, uint32_t eptp_memslot); 585 585 + void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, 586 586 + uint64_t nested_paddr, uint64_t paddr, uint64_t size, 587 587 + uint32_t eptp_memslot); 588 588 + void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm, 589 589 + uint32_t memslot, uint32_t eptp_memslot); 590 590 + void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm, 591 591 + uint32_t eptp_memslot); 578 592 579 593 #endif /* SELFTEST_KVM_VMX_H */

+1 -1

tools/testing/selftests/kvm/lib/kvm_util.c

reviewed

··· 705 705 * on error (e.g. currently no memory region using memslot as a KVM 706 706 * memory slot ID). 707 707 */ 708 708 - static struct userspace_mem_region * 708 708 + struct userspace_mem_region * 709 709 memslot2region(struct kvm_vm *vm, uint32_t memslot) 710 710 { 711 711 struct userspace_mem_region *region;

tools/testing/selftests/kvm/lib/kvm_util_internal.h

reviewed

··· 68 68 void regs_dump(FILE *stream, struct kvm_regs *regs, uint8_t indent); 69 69 void sregs_dump(FILE *stream, struct kvm_sregs *sregs, uint8_t indent); 70 70 71 71 + struct userspace_mem_region * 72 72 + memslot2region(struct kvm_vm *vm, uint32_t memslot); 73 73 + 71 74 #endif /* SELFTEST_KVM_UTIL_INTERNAL_H */

+1 -1

tools/testing/selftests/kvm/lib/x86_64/processor.c

reviewed

··· 1085 1085 for (i = 0; i < nmsrs; i++) 1086 1086 state->msrs.entries[i].index = list->indices[i]; 1087 1087 r = ioctl(vcpu->fd, KVM_GET_MSRS, &state->msrs); 1088 1088 - TEST_ASSERT(r == nmsrs, "Unexpected result from KVM_GET_MSRS, r: %i (failed at %x)", 1088 1088 + TEST_ASSERT(r == nmsrs, "Unexpected result from KVM_GET_MSRS, r: %i (failed MSR was 0x%x)", 1089 1089 r, r == nmsrs ? -1 : list->indices[r]); 1090 1090 1091 1091 r = ioctl(vcpu->fd, KVM_GET_DEBUGREGS, &state->debugregs);

+199 -2

tools/testing/selftests/kvm/lib/x86_64/vmx.c

reviewed

··· 7 7 8 8 #include "test_util.h" 9 9 #include "kvm_util.h" 10 10 + #include "../kvm_util_internal.h" 10 11 #include "processor.h" 11 12 #include "vmx.h" 12 13 14 14 + #define PAGE_SHIFT_4K 12 15 15 + 16 16 + #define KVM_EPT_PAGE_TABLE_MIN_PADDR 0x1c0000 17 17 + 13 18 bool enable_evmcs; 14 19 20 20 + struct eptPageTableEntry { 21 21 + uint64_t readable:1; 22 22 + uint64_t writable:1; 23 23 + uint64_t executable:1; 24 24 + uint64_t memory_type:3; 25 25 + uint64_t ignore_pat:1; 26 26 + uint64_t page_size:1; 27 27 + uint64_t accessed:1; 28 28 + uint64_t dirty:1; 29 29 + uint64_t ignored_11_10:2; 30 30 + uint64_t address:40; 31 31 + uint64_t ignored_62_52:11; 32 32 + uint64_t suppress_ve:1; 33 33 + }; 34 34 + 35 35 + struct eptPageTablePointer { 36 36 + uint64_t memory_type:3; 37 37 + uint64_t page_walk_length:3; 38 38 + uint64_t ad_enabled:1; 39 39 + uint64_t reserved_11_07:5; 40 40 + uint64_t address:40; 41 41 + uint64_t reserved_63_52:12; 42 42 + }; 15 43 int vcpu_enable_evmcs(struct kvm_vm *vm, int vcpu_id) 16 44 { 17 45 uint16_t evmcs_ver; ··· 202 174 */ 203 175 static inline void init_vmcs_control_fields(struct vmx_pages *vmx) 204 176 { 177 177 + uint32_t sec_exec_ctl = 0; 178 178 + 205 179 vmwrite(VIRTUAL_PROCESSOR_ID, 0); 206 180 vmwrite(POSTED_INTR_NV, 0); 207 181 208 182 vmwrite(PIN_BASED_VM_EXEC_CONTROL, rdmsr(MSR_IA32_VMX_TRUE_PINBASED_CTLS)); 209 209 - if (!vmwrite(SECONDARY_VM_EXEC_CONTROL, 0)) 183 183 + 184 184 + if (vmx->eptp_gpa) { 185 185 + uint64_t ept_paddr; 186 186 + struct eptPageTablePointer eptp = { 187 187 + .memory_type = VMX_BASIC_MEM_TYPE_WB, 188 188 + .page_walk_length = 3, /* + 1 */ 189 189 + .ad_enabled = !!(rdmsr(MSR_IA32_VMX_EPT_VPID_CAP) & VMX_EPT_VPID_CAP_AD_BITS), 190 190 + .address = vmx->eptp_gpa >> PAGE_SHIFT_4K, 191 191 + }; 192 192 + 193 193 + memcpy(&ept_paddr, &eptp, sizeof(ept_paddr)); 194 194 + vmwrite(EPT_POINTER, ept_paddr); 195 195 + sec_exec_ctl |= SECONDARY_EXEC_ENABLE_EPT; 196 196 + } 197 197 + 198 198 + if 
(!vmwrite(SECONDARY_VM_EXEC_CONTROL, sec_exec_ctl)) 210 199 vmwrite(CPU_BASED_VM_EXEC_CONTROL, 211 200 rdmsr(MSR_IA32_VMX_TRUE_PROCBASED_CTLS) | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS); 212 212 - else 201 201 + else { 213 202 vmwrite(CPU_BASED_VM_EXEC_CONTROL, rdmsr(MSR_IA32_VMX_TRUE_PROCBASED_CTLS)); 203 203 + GUEST_ASSERT(!sec_exec_ctl); 204 204 + } 205 205 + 214 206 vmwrite(EXCEPTION_BITMAP, 0); 215 207 vmwrite(PAGE_FAULT_ERROR_CODE_MASK, 0); 216 208 vmwrite(PAGE_FAULT_ERROR_CODE_MATCH, -1); /* Never match */ ··· 374 326 init_vmcs_control_fields(vmx); 375 327 init_vmcs_host_state(); 376 328 init_vmcs_guest_state(guest_rip, guest_rsp); 329 329 + } 330 330 + 331 331 + void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, 332 332 + uint64_t nested_paddr, uint64_t paddr, uint32_t eptp_memslot) 333 333 + { 334 334 + uint16_t index[4]; 335 335 + struct eptPageTableEntry *pml4e; 336 336 + 337 337 + TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " 338 338 + "unknown or unsupported guest mode, mode: 0x%x", vm->mode); 339 339 + 340 340 + TEST_ASSERT((nested_paddr % vm->page_size) == 0, 341 341 + "Nested physical address not on page boundary,\n" 342 342 + " nested_paddr: 0x%lx vm->page_size: 0x%x", 343 343 + nested_paddr, vm->page_size); 344 344 + TEST_ASSERT((nested_paddr >> vm->page_shift) <= vm->max_gfn, 345 345 + "Physical address beyond beyond maximum supported,\n" 346 346 + " nested_paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", 347 347 + paddr, vm->max_gfn, vm->page_size); 348 348 + TEST_ASSERT((paddr % vm->page_size) == 0, 349 349 + "Physical address not on page boundary,\n" 350 350 + " paddr: 0x%lx vm->page_size: 0x%x", 351 351 + paddr, vm->page_size); 352 352 + TEST_ASSERT((paddr >> vm->page_shift) <= vm->max_gfn, 353 353 + "Physical address beyond beyond maximum supported,\n" 354 354 + " paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", 355 355 + paddr, vm->max_gfn, vm->page_size); 356 356 + 357 357 + index[0] = (nested_paddr >> 12) 
& 0x1ffu; 358 358 + index[1] = (nested_paddr >> 21) & 0x1ffu; 359 359 + index[2] = (nested_paddr >> 30) & 0x1ffu; 360 360 + index[3] = (nested_paddr >> 39) & 0x1ffu; 361 361 + 362 362 + /* Allocate page directory pointer table if not present. */ 363 363 + pml4e = vmx->eptp_hva; 364 364 + if (!pml4e[index[3]].readable) { 365 365 + pml4e[index[3]].address = vm_phy_page_alloc(vm, 366 366 + KVM_EPT_PAGE_TABLE_MIN_PADDR, eptp_memslot) 367 367 + >> vm->page_shift; 368 368 + pml4e[index[3]].writable = true; 369 369 + pml4e[index[3]].readable = true; 370 370 + pml4e[index[3]].executable = true; 371 371 + } 372 372 + 373 373 + /* Allocate page directory table if not present. */ 374 374 + struct eptPageTableEntry *pdpe; 375 375 + pdpe = addr_gpa2hva(vm, pml4e[index[3]].address * vm->page_size); 376 376 + if (!pdpe[index[2]].readable) { 377 377 + pdpe[index[2]].address = vm_phy_page_alloc(vm, 378 378 + KVM_EPT_PAGE_TABLE_MIN_PADDR, eptp_memslot) 379 379 + >> vm->page_shift; 380 380 + pdpe[index[2]].writable = true; 381 381 + pdpe[index[2]].readable = true; 382 382 + pdpe[index[2]].executable = true; 383 383 + } 384 384 + 385 385 + /* Allocate page table if not present. */ 386 386 + struct eptPageTableEntry *pde; 387 387 + pde = addr_gpa2hva(vm, pdpe[index[2]].address * vm->page_size); 388 388 + if (!pde[index[1]].readable) { 389 389 + pde[index[1]].address = vm_phy_page_alloc(vm, 390 390 + KVM_EPT_PAGE_TABLE_MIN_PADDR, eptp_memslot) 391 391 + >> vm->page_shift; 392 392 + pde[index[1]].writable = true; 393 393 + pde[index[1]].readable = true; 394 394 + pde[index[1]].executable = true; 395 395 + } 396 396 + 397 397 + /* Fill in page table entry. 
*/ 398 398 + struct eptPageTableEntry *pte; 399 399 + pte = addr_gpa2hva(vm, pde[index[1]].address * vm->page_size); 400 400 + pte[index[0]].address = paddr >> vm->page_shift; 401 401 + pte[index[0]].writable = true; 402 402 + pte[index[0]].readable = true; 403 403 + pte[index[0]].executable = true; 404 404 + 405 405 + /* 406 406 + * For now mark these as accessed and dirty because the only 407 407 + * testcase we have needs that. Can be reconsidered later. 408 408 + */ 409 409 + pte[index[0]].accessed = true; 410 410 + pte[index[0]].dirty = true; 411 411 + } 412 412 + 413 413 + /* 414 414 + * Map a range of EPT guest physical addresses to the VM's physical address 415 415 + * 416 416 + * Input Args: 417 417 + * vm - Virtual Machine 418 418 + * nested_paddr - Nested guest physical address to map 419 419 + * paddr - VM Physical Address 420 420 + * size - The size of the range to map 421 421 + * eptp_memslot - Memory region slot for new virtual translation tables 422 422 + * 423 423 + * Output Args: None 424 424 + * 425 425 + * Return: None 426 426 + * 427 427 + * Within the VM given by vm, creates a nested guest translation for the 428 428 + * page range starting at nested_paddr to the page range starting at paddr. 429 429 + */ 430 430 + void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, 431 431 + uint64_t nested_paddr, uint64_t paddr, uint64_t size, 432 432 + uint32_t eptp_memslot) 433 433 + { 434 434 + size_t page_size = vm->page_size; 435 435 + size_t npages = size / page_size; 436 436 + 437 437 + TEST_ASSERT(nested_paddr + size > nested_paddr, "Vaddr overflow"); 438 438 + TEST_ASSERT(paddr + size > paddr, "Paddr overflow"); 439 439 + 440 440 + while (npages--) { 441 441 + nested_pg_map(vmx, vm, nested_paddr, paddr, eptp_memslot); 442 442 + nested_paddr += page_size; 443 443 + paddr += page_size; 444 444 + } 445 445 + } 446 446 + 447 447 + /* Prepare an identity extended page table that maps all the 448 448 + * physical pages in VM. 
449 449 + */ 450 450 + void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm, 451 451 + uint32_t memslot, uint32_t eptp_memslot) 452 452 + { 453 453 + sparsebit_idx_t i, last; 454 454 + struct userspace_mem_region *region = 455 455 + memslot2region(vm, memslot); 456 456 + 457 457 + i = (region->region.guest_phys_addr >> vm->page_shift) - 1; 458 458 + last = i + (region->region.memory_size >> vm->page_shift); 459 459 + for (;;) { 460 460 + i = sparsebit_next_clear(region->unused_phy_pages, i); 461 461 + if (i > last) 462 462 + break; 463 463 + 464 464 + nested_map(vmx, vm, 465 465 + (uint64_t)i << vm->page_shift, 466 466 + (uint64_t)i << vm->page_shift, 467 467 + 1 << vm->page_shift, 468 468 + eptp_memslot); 469 469 + } 470 470 + } 471 471 + 472 472 + void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm, 473 473 + uint32_t eptp_memslot) 474 474 + { 475 475 + vmx->eptp = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); 476 476 + vmx->eptp_hva = addr_gva2hva(vm, (uintptr_t)vmx->eptp); 477 477 + vmx->eptp_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->eptp); 377 478 }

+156

tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c

reviewed

··· 1 1 + // SPDX-License-Identifier: GPL-2.0 2 2 + /* 3 3 + * KVM dirty page logging test 4 4 + * 5 5 + * Copyright (C) 2018, Red Hat, Inc. 6 6 + */ 7 7 + 8 8 + #define _GNU_SOURCE /* for program_invocation_name */ 9 9 + 10 10 + #include <stdio.h> 11 11 + #include <stdlib.h> 12 12 + #include <linux/bitmap.h> 13 13 + #include <linux/bitops.h> 14 14 + 15 15 + #include "test_util.h" 16 16 + #include "kvm_util.h" 17 17 + #include "processor.h" 18 18 + #include "vmx.h" 19 19 + 20 20 + #define VCPU_ID 1 21 21 + 22 22 + /* The memory slot index to track dirty pages */ 23 23 + #define TEST_MEM_SLOT_INDEX 1 24 24 + #define TEST_MEM_SIZE 3 25 25 + 26 26 + /* L1 guest test virtual memory offset */ 27 27 + #define GUEST_TEST_MEM 0xc0000000 28 28 + 29 29 + /* L2 guest test virtual memory offset */ 30 30 + #define NESTED_TEST_MEM1 0xc0001000 31 31 + #define NESTED_TEST_MEM2 0xc0002000 32 32 + 33 33 + static void l2_guest_code(void) 34 34 + { 35 35 + *(volatile uint64_t *)NESTED_TEST_MEM1; 36 36 + *(volatile uint64_t *)NESTED_TEST_MEM1 = 1; 37 37 + GUEST_SYNC(true); 38 38 + GUEST_SYNC(false); 39 39 + 40 40 + *(volatile uint64_t *)NESTED_TEST_MEM2 = 1; 41 41 + GUEST_SYNC(true); 42 42 + *(volatile uint64_t *)NESTED_TEST_MEM2 = 1; 43 43 + GUEST_SYNC(true); 44 44 + GUEST_SYNC(false); 45 45 + 46 46 + /* Exit to L1 and never come back. 
*/ 47 47 + vmcall(); 48 48 + } 49 49 + 50 50 + void l1_guest_code(struct vmx_pages *vmx) 51 51 + { 52 52 + #define L2_GUEST_STACK_SIZE 64 53 53 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; 54 54 + 55 55 + GUEST_ASSERT(vmx->vmcs_gpa); 56 56 + GUEST_ASSERT(prepare_for_vmx_operation(vmx)); 57 57 + GUEST_ASSERT(load_vmcs(vmx)); 58 58 + 59 59 + prepare_vmcs(vmx, l2_guest_code, 60 60 + &l2_guest_stack[L2_GUEST_STACK_SIZE]); 61 61 + 62 62 + GUEST_SYNC(false); 63 63 + GUEST_ASSERT(!vmlaunch()); 64 64 + GUEST_SYNC(false); 65 65 + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); 66 66 + GUEST_DONE(); 67 67 + } 68 68 + 69 69 + int main(int argc, char *argv[]) 70 70 + { 71 71 + vm_vaddr_t vmx_pages_gva = 0; 72 72 + struct vmx_pages *vmx; 73 73 + unsigned long *bmap; 74 74 + uint64_t *host_test_mem; 75 75 + 76 76 + struct kvm_vm *vm; 77 77 + struct kvm_run *run; 78 78 + struct ucall uc; 79 79 + bool done = false; 80 80 + 81 81 + /* Create VM */ 82 82 + vm = vm_create_default(VCPU_ID, 0, l1_guest_code); 83 83 + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); 84 84 + vmx = vcpu_alloc_vmx(vm, &vmx_pages_gva); 85 85 + vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); 86 86 + run = vcpu_state(vm, VCPU_ID); 87 87 + 88 88 + /* Add an extra memory slot for testing dirty logging */ 89 89 + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 90 90 + GUEST_TEST_MEM, 91 91 + TEST_MEM_SLOT_INDEX, 92 92 + TEST_MEM_SIZE, 93 93 + KVM_MEM_LOG_DIRTY_PAGES); 94 94 + 95 95 + /* 96 96 + * Add an identity map for GVA range [0xc0000000, 0xc0003000). This 97 97 + * affects both L1 and L2. However... 98 98 + */ 99 99 + virt_map(vm, GUEST_TEST_MEM, GUEST_TEST_MEM, 100 100 + TEST_MEM_SIZE * 4096, 0); 101 101 + 102 102 + /* 103 103 + * ... pages in the L2 GPA range [0xc0001000, 0xc0003000) will map to 104 104 + * 0xc0000000. 105 105 + * 106 106 + * Note that prepare_eptp should be called only after L1's GPA map is done, 107 107 + * meaning after the last call to virt_map. 
108 108 + */ 109 109 + prepare_eptp(vmx, vm, 0); 110 110 + nested_map_memslot(vmx, vm, 0, 0); 111 111 + nested_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, 4096, 0); 112 112 + nested_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, 4096, 0); 113 113 + 114 114 + bmap = bitmap_alloc(TEST_MEM_SIZE); 115 115 + host_test_mem = addr_gpa2hva(vm, GUEST_TEST_MEM); 116 116 + 117 117 + while (!done) { 118 118 + memset(host_test_mem, 0xaa, TEST_MEM_SIZE * 4096); 119 119 + _vcpu_run(vm, VCPU_ID); 120 120 + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, 121 121 + "Unexpected exit reason: %u (%s),\n", 122 122 + run->exit_reason, 123 123 + exit_reason_str(run->exit_reason)); 124 124 + 125 125 + switch (get_ucall(vm, VCPU_ID, &uc)) { 126 126 + case UCALL_ABORT: 127 127 + TEST_ASSERT(false, "%s at %s:%d", (const char *)uc.args[0], 128 128 + __FILE__, uc.args[1]); 129 129 + /* NOT REACHED */ 130 130 + case UCALL_SYNC: 131 131 + /* 132 132 + * The nested guest wrote at offset 0x1000 in the memslot, but the 133 133 + * dirty bitmap must be filled in according to L1 GPA, not L2. 
134 134 + */ 135 135 + kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap); 136 136 + if (uc.args[1]) { 137 137 + TEST_ASSERT(test_bit(0, bmap), "Page 0 incorrectly reported clean\n"); 138 138 + TEST_ASSERT(host_test_mem[0] == 1, "Page 0 not written by guest\n"); 139 139 + } else { 140 140 + TEST_ASSERT(!test_bit(0, bmap), "Page 0 incorrectly reported dirty\n"); 141 141 + TEST_ASSERT(host_test_mem[0] == 0xaaaaaaaaaaaaaaaaULL, "Page 0 written by guest\n"); 142 142 + } 143 143 + 144 144 + TEST_ASSERT(!test_bit(1, bmap), "Page 1 incorrectly reported dirty\n"); 145 145 + TEST_ASSERT(host_test_mem[4096 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 1 written by guest\n"); 146 146 + TEST_ASSERT(!test_bit(2, bmap), "Page 2 incorrectly reported dirty\n"); 147 147 + TEST_ASSERT(host_test_mem[8192 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 2 written by guest\n"); 148 148 + break; 149 149 + case UCALL_DONE: 150 150 + done = true; 151 151 + break; 152 152 + default: 153 153 + TEST_ASSERT(false, "Unknown ucall 0x%x.", uc.cmd); 154 154 + } 155 155 + } 156 156 + }

+1 -1

virt/kvm/arm/vgic/trace.h

reviewed

··· 30 30 #endif /* _TRACE_VGIC_H */ 31 31 32 32 #undef TRACE_INCLUDE_PATH 33 33 - #define TRACE_INCLUDE_PATH ../../../virt/kvm/arm/vgic 33 33 + #define TRACE_INCLUDE_PATH ../../virt/kvm/arm/vgic 34 34 #undef TRACE_INCLUDE_FILE 35 35 #define TRACE_INCLUDE_FILE trace 36 36

+7 -3

virt/kvm/kvm_main.c

reviewed

··· 617 617 618 618 stat_data->kvm = kvm; 619 619 stat_data->offset = p->offset; 620 620 + stat_data->mode = p->mode ? p->mode : 0644; 620 621 kvm->debugfs_stat_data[p - debugfs_entries] = stat_data; 621 621 - debugfs_create_file(p->name, 0644, kvm->debugfs_dentry, 622 622 + debugfs_create_file(p->name, stat_data->mode, kvm->debugfs_dentry, 622 623 stat_data, stat_fops_per_vm[p->kind]); 623 624 } 624 625 return 0; ··· 3930 3929 if (!refcount_inc_not_zero(&stat_data->kvm->users_count)) 3931 3930 return -ENOENT; 3932 3931 3933 3933 - if (simple_attr_open(inode, file, get, set, fmt)) { 3932 3932 + if (simple_attr_open(inode, file, get, 3933 3933 + stat_data->mode & S_IWUGO ? set : NULL, 3934 3934 + fmt)) { 3934 3935 kvm_put_kvm(stat_data->kvm); 3935 3936 return -ENOMEM; 3936 3937 } ··· 4180 4177 4181 4178 kvm_debugfs_num_entries = 0; 4182 4179 for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) { 4183 4183 - debugfs_create_file(p->name, 0644, kvm_debugfs_dir, 4180 4180 + int mode = p->mode ? p->mode : 0644; 4181 4181 + debugfs_create_file(p->name, mode, kvm_debugfs_dir, 4184 4182 (void *)(long)p->offset, 4185 4183 stat_fops[p->kind]); 4186 4184 }