Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

tjh.dev / kernel

fork

Configure Feed

Issues Pull Requests Commits Tags

Feed URL

Select the types of activity you want to include in your feed.

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

fork

Configure Feed

Issues Pull Requests Commits Tags

Feed URL

Select the types of activity you want to include in your feed.

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull KVM fixes from Paolo Bonzini:

- PPC and ARM bugfixes from submaintainers

- Fix old Windows versions on AMD (recent regression)

- Fix old Linux versions on processors without EPT

- Fixes for LAPIC timer optimizations

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (21 commits)
KVM: nVMX: Fix size checks in vmx_set_nested_state
KVM: selftests: make hyperv_cpuid test pass on AMD
KVM: lapic: Check for in-kernel LAPIC before dereferencing apic pointer
KVM: fix KVM_CLEAR_DIRTY_LOG for memory slots of unaligned size
x86/kvm/mmu: reset MMU context when 32-bit guest switches PAE
KVM: x86: Whitelist port 0x7e for pre-incrementing %rip
Documentation: kvm: fix dirty log ioctl arch lists
KVM: VMX: Move RSB stuffing to before the first RET after VM-Exit
KVM: arm/arm64: Don't emulate virtual timers on userspace ioctls
kvm: arm: Skip stage2 huge mappings for unaligned ipa backed by THP
KVM: arm/arm64: Ensure vcpu target is unset on reset failure
KVM: lapic: Convert guest TSC to host time domain if necessary
KVM: lapic: Allow user to disable adaptive tuning of timer advancement
KVM: lapic: Track lapic timer advance per vCPU
KVM: lapic: Disable timer advancement if adaptive tuning goes haywire
x86: kvm: hyper-v: deal with buggy TLB flush requests from WS2012
KVM: x86: Consider LAPIC TSC-Deadline timer expired if deadline too short
KVM: PPC: Book3S: Protect memslots while validating user address
KVM: PPC: Book3S HV: Preserve PSSCR FAKE_SUSPEND bit on guest exit
KVM: arm/arm64: vgic-v3: Retire pending interrupts on disabling LPIs
...

Linus Torvalds 7 years ago aa1be08f 82463436

+192 -65

23 changed files

expand all collapse all

Documentation

virtual

kvm

api.txt

arch

powerpc

kvm

book3s_64_vio.c

book3s_hv.c

x86

include

asm

kvm_host.h

uapi

asm

kvm.h

kvm

hyperv.c

lapic.c

lapic.h

mmu.c

vmx

nested.c

vmenter.S

vmx.c

x86.c

x86.h

tools

testing

selftests

kvm

dirty_log_test.c

x86_64

hyperv_cpuid.c

virt

kvm

arm

arch_timer.c

arm.c

mmu.c

vgic

vgic-mmio-v3.c

vgic.c

vgic.h

kvm_main.c

+6 -5

Documentation/virtual/kvm/api.txt

reviewed

··· 321 321 4.8 KVM_GET_DIRTY_LOG (vm ioctl) 322 322 323 323 Capability: basic 324 324 - Architectures: x86 324 324 + Architectures: all 325 325 Type: vm ioctl 326 326 Parameters: struct kvm_dirty_log (in/out) 327 327 Returns: 0 on success, -1 on error ··· 3810 3810 4.117 KVM_CLEAR_DIRTY_LOG (vm ioctl) 3811 3811 3812 3812 Capability: KVM_CAP_MANUAL_DIRTY_LOG_PROTECT 3813 3813 - Architectures: x86 3813 3813 + Architectures: x86, arm, arm64, mips 3814 3814 Type: vm ioctl 3815 3815 Parameters: struct kvm_dirty_log (in) 3816 3816 Returns: 0 on success, -1 on error ··· 3830 3830 the bitmap that is passed in struct kvm_clear_dirty_log's dirty_bitmap 3831 3831 field. Bit 0 of the bitmap corresponds to page "first_page" in the 3832 3832 memory slot, and num_pages is the size in bits of the input bitmap. 3833 3833 - Both first_page and num_pages must be a multiple of 64. For each bit 3834 3834 - that is set in the input bitmap, the corresponding page is marked "clean" 3833 3833 + first_page must be a multiple of 64; num_pages must also be a multiple of 3834 3834 + 64 unless first_page + num_pages is the size of the memory slot. For each 3835 3835 + bit that is set in the input bitmap, the corresponding page is marked "clean" 3835 3836 in KVM's dirty bitmap, and dirty tracking is re-enabled for that page 3836 3837 (for example via write-protection, or by clearing the dirty bit in 3837 3838 a page table entry). ··· 4800 4799 4801 4800 7.18 KVM_CAP_MANUAL_DIRTY_LOG_PROTECT 4802 4801 4803 4803 - Architectures: all 4802 4802 + Architectures: x86, arm, arm64, mips 4804 4803 Parameters: args[0] whether feature should be enabled or not 4805 4804 4806 4805 With this capability enabled, KVM_GET_DIRTY_LOG will not automatically

+3 -3

arch/powerpc/kvm/book3s_64_vio.c

reviewed

··· 543 543 if (ret != H_SUCCESS) 544 544 return ret; 545 545 546 546 + idx = srcu_read_lock(&vcpu->kvm->srcu); 547 547 + 546 548 ret = kvmppc_tce_validate(stt, tce); 547 549 if (ret != H_SUCCESS) 548 548 - return ret; 550 550 + goto unlock_exit; 549 551 550 552 dir = iommu_tce_direction(tce); 551 551 - 552 552 - idx = srcu_read_lock(&vcpu->kvm->srcu); 553 553 554 554 if ((dir != DMA_NONE) && kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) { 555 555 ret = H_PARAMETER;

+3 -1

arch/powerpc/kvm/book3s_hv.c

reviewed

··· 3423 3423 vcpu->arch.shregs.sprg2 = mfspr(SPRN_SPRG2); 3424 3424 vcpu->arch.shregs.sprg3 = mfspr(SPRN_SPRG3); 3425 3425 3426 3426 - mtspr(SPRN_PSSCR, host_psscr); 3426 3426 + /* Preserve PSSCR[FAKE_SUSPEND] until we've called kvmppc_save_tm_hv */ 3427 3427 + mtspr(SPRN_PSSCR, host_psscr | 3428 3428 + (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG)); 3427 3429 mtspr(SPRN_HFSCR, host_hfscr); 3428 3430 mtspr(SPRN_CIABR, host_ciabr); 3429 3431 mtspr(SPRN_DAWR, host_dawr);

arch/x86/include/asm/kvm_host.h

reviewed

··· 295 295 unsigned int valid:1; 296 296 unsigned int execonly:1; 297 297 unsigned int cr0_pg:1; 298 298 + unsigned int cr4_pae:1; 298 299 unsigned int cr4_pse:1; 299 300 unsigned int cr4_pke:1; 300 301 unsigned int cr4_smap:1;

arch/x86/include/uapi/asm/kvm.h

reviewed

··· 381 381 #define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0) 382 382 #define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1) 383 383 #define KVM_X86_QUIRK_LAPIC_MMIO_HOLE (1 << 2) 384 384 + #define KVM_X86_QUIRK_OUT_7E_INC_RIP (1 << 3) 384 385 385 386 #define KVM_STATE_NESTED_GUEST_MODE 0x00000001 386 387 #define KVM_STATE_NESTED_RUN_PENDING 0x00000002

+10 -1

arch/x86/kvm/hyperv.c

reviewed

··· 1371 1371 1372 1372 valid_bank_mask = BIT_ULL(0); 1373 1373 sparse_banks[0] = flush.processor_mask; 1374 1374 - all_cpus = flush.flags & HV_FLUSH_ALL_PROCESSORS; 1374 1374 + 1375 1375 + /* 1376 1376 + * Work around possible WS2012 bug: it sends hypercalls 1377 1377 + * with processor_mask = 0x0 and HV_FLUSH_ALL_PROCESSORS clear, 1378 1378 + * while also expecting us to flush something and crashing if 1379 1379 + * we don't. Let's treat processor_mask == 0 same as 1380 1380 + * HV_FLUSH_ALL_PROCESSORS. 1381 1381 + */ 1382 1382 + all_cpus = (flush.flags & HV_FLUSH_ALL_PROCESSORS) || 1383 1383 + flush.processor_mask == 0; 1375 1384 } else { 1376 1385 if (unlikely(kvm_read_guest(kvm, ingpa, &flush_ex, 1377 1386 sizeof(flush_ex))))

+52 -21

arch/x86/kvm/lapic.c

reviewed

··· 70 70 #define APIC_BROADCAST 0xFF 71 71 #define X2APIC_BROADCAST 0xFFFFFFFFul 72 72 73 73 - static bool lapic_timer_advance_adjust_done = false; 74 73 #define LAPIC_TIMER_ADVANCE_ADJUST_DONE 100 75 74 /* step-by-step approximation to mitigate fluctuation */ 76 75 #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8 ··· 1481 1482 return false; 1482 1483 } 1483 1484 1485 1485 + static inline void __wait_lapic_expire(struct kvm_vcpu *vcpu, u64 guest_cycles) 1486 1486 + { 1487 1487 + u64 timer_advance_ns = vcpu->arch.apic->lapic_timer.timer_advance_ns; 1488 1488 + 1489 1489 + /* 1490 1490 + * If the guest TSC is running at a different ratio than the host, then 1491 1491 + * convert the delay to nanoseconds to achieve an accurate delay. Note 1492 1492 + * that __delay() uses delay_tsc whenever the hardware has TSC, thus 1493 1493 + * always for VMX enabled hardware. 1494 1494 + */ 1495 1495 + if (vcpu->arch.tsc_scaling_ratio == kvm_default_tsc_scaling_ratio) { 1496 1496 + __delay(min(guest_cycles, 1497 1497 + nsec_to_cycles(vcpu, timer_advance_ns))); 1498 1498 + } else { 1499 1499 + u64 delay_ns = guest_cycles * 1000000ULL; 1500 1500 + do_div(delay_ns, vcpu->arch.virtual_tsc_khz); 1501 1501 + ndelay(min_t(u32, delay_ns, timer_advance_ns)); 1502 1502 + } 1503 1503 + } 1504 1504 + 1484 1505 void wait_lapic_expire(struct kvm_vcpu *vcpu) 1485 1506 { 1486 1507 struct kvm_lapic *apic = vcpu->arch.apic; 1508 1508 + u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns; 1487 1509 u64 guest_tsc, tsc_deadline, ns; 1488 1488 - 1489 1489 - if (!lapic_in_kernel(vcpu)) 1490 1490 - return; 1491 1510 1492 1511 if (apic->lapic_timer.expired_tscdeadline == 0) 1493 1512 return; ··· 1518 1501 guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); 1519 1502 trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline); 1520 1503 1521 1521 - /* __delay is delay_tsc whenever the hardware has TSC, thus always. 
*/ 1522 1504 if (guest_tsc < tsc_deadline) 1523 1523 - __delay(min(tsc_deadline - guest_tsc, 1524 1524 - nsec_to_cycles(vcpu, lapic_timer_advance_ns))); 1505 1505 + __wait_lapic_expire(vcpu, tsc_deadline - guest_tsc); 1525 1506 1526 1526 - if (!lapic_timer_advance_adjust_done) { 1507 1507 + if (!apic->lapic_timer.timer_advance_adjust_done) { 1527 1508 /* too early */ 1528 1509 if (guest_tsc < tsc_deadline) { 1529 1510 ns = (tsc_deadline - guest_tsc) * 1000000ULL; 1530 1511 do_div(ns, vcpu->arch.virtual_tsc_khz); 1531 1531 - lapic_timer_advance_ns -= min((unsigned int)ns, 1532 1532 - lapic_timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); 1512 1512 + timer_advance_ns -= min((u32)ns, 1513 1513 + timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); 1533 1514 } else { 1534 1515 /* too late */ 1535 1516 ns = (guest_tsc - tsc_deadline) * 1000000ULL; 1536 1517 do_div(ns, vcpu->arch.virtual_tsc_khz); 1537 1537 - lapic_timer_advance_ns += min((unsigned int)ns, 1538 1538 - lapic_timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); 1518 1518 + timer_advance_ns += min((u32)ns, 1519 1519 + timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); 1539 1520 } 1540 1521 if (abs(guest_tsc - tsc_deadline) < LAPIC_TIMER_ADVANCE_ADJUST_DONE) 1541 1541 - lapic_timer_advance_adjust_done = true; 1522 1522 + apic->lapic_timer.timer_advance_adjust_done = true; 1523 1523 + if (unlikely(timer_advance_ns > 5000)) { 1524 1524 + timer_advance_ns = 0; 1525 1525 + apic->lapic_timer.timer_advance_adjust_done = true; 1526 1526 + } 1527 1527 + apic->lapic_timer.timer_advance_ns = timer_advance_ns; 1542 1528 } 1543 1529 } 1544 1530 1545 1531 static void start_sw_tscdeadline(struct kvm_lapic *apic) 1546 1532 { 1547 1547 - u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline; 1533 1533 + struct kvm_timer *ktimer = &apic->lapic_timer; 1534 1534 + u64 guest_tsc, tscdeadline = ktimer->tscdeadline; 1548 1535 u64 ns = 0; 1549 1536 ktime_t expire; 1550 1537 struct kvm_vcpu *vcpu = apic->vcpu; ··· 1563 
1542 1564 1543 now = ktime_get(); 1565 1544 guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); 1566 1566 - if (likely(tscdeadline > guest_tsc)) { 1567 1567 - ns = (tscdeadline - guest_tsc) * 1000000ULL; 1568 1568 - do_div(ns, this_tsc_khz); 1545 1545 + 1546 1546 + ns = (tscdeadline - guest_tsc) * 1000000ULL; 1547 1547 + do_div(ns, this_tsc_khz); 1548 1548 + 1549 1549 + if (likely(tscdeadline > guest_tsc) && 1550 1550 + likely(ns > apic->lapic_timer.timer_advance_ns)) { 1569 1551 expire = ktime_add_ns(now, ns); 1570 1570 - expire = ktime_sub_ns(expire, lapic_timer_advance_ns); 1571 1571 - hrtimer_start(&apic->lapic_timer.timer, 1572 1572 - expire, HRTIMER_MODE_ABS_PINNED); 1552 1552 + expire = ktime_sub_ns(expire, ktimer->timer_advance_ns); 1553 1553 + hrtimer_start(&ktimer->timer, expire, HRTIMER_MODE_ABS_PINNED); 1573 1554 } else 1574 1555 apic_timer_expired(apic); 1575 1556 ··· 2278 2255 return HRTIMER_NORESTART; 2279 2256 } 2280 2257 2281 2281 - int kvm_create_lapic(struct kvm_vcpu *vcpu) 2258 2258 + int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) 2282 2259 { 2283 2260 struct kvm_lapic *apic; 2284 2261 ··· 2302 2279 hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, 2303 2280 HRTIMER_MODE_ABS_PINNED); 2304 2281 apic->lapic_timer.timer.function = apic_timer_fn; 2282 2282 + if (timer_advance_ns == -1) { 2283 2283 + apic->lapic_timer.timer_advance_ns = 1000; 2284 2284 + apic->lapic_timer.timer_advance_adjust_done = false; 2285 2285 + } else { 2286 2286 + apic->lapic_timer.timer_advance_ns = timer_advance_ns; 2287 2287 + apic->lapic_timer.timer_advance_adjust_done = true; 2288 2288 + } 2289 2289 + 2305 2290 2306 2291 /* 2307 2292 * APIC is created enabled. This will prevent kvm_lapic_set_base from

+3 -1

arch/x86/kvm/lapic.h

reviewed

··· 31 31 u32 timer_mode_mask; 32 32 u64 tscdeadline; 33 33 u64 expired_tscdeadline; 34 34 + u32 timer_advance_ns; 34 35 atomic_t pending; /* accumulated triggered timers */ 35 36 bool hv_timer_in_use; 37 37 + bool timer_advance_adjust_done; 36 38 }; 37 39 38 40 struct kvm_lapic { ··· 64 62 65 63 struct dest_map; 66 64 67 67 - int kvm_create_lapic(struct kvm_vcpu *vcpu); 65 65 + int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns); 68 66 void kvm_free_lapic(struct kvm_vcpu *vcpu); 69 67 70 68 int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);

arch/x86/kvm/mmu.c

reviewed

··· 4781 4781 union kvm_mmu_extended_role ext = {0}; 4782 4782 4783 4783 ext.cr0_pg = !!is_paging(vcpu); 4784 4784 + ext.cr4_pae = !!is_pae(vcpu); 4784 4785 ext.cr4_smep = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); 4785 4786 ext.cr4_smap = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMAP); 4786 4787 ext.cr4_pse = !!is_pse(vcpu);

+2 -2

arch/x86/kvm/vmx/nested.c

reviewed

··· 5423 5423 return ret; 5424 5424 5425 5425 /* Empty 'VMXON' state is permitted */ 5426 5426 - if (kvm_state->size < sizeof(kvm_state) + sizeof(*vmcs12)) 5426 5426 + if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) 5427 5427 return 0; 5428 5428 5429 5429 if (kvm_state->vmx.vmcs_pa != -1ull) { ··· 5467 5467 vmcs12->vmcs_link_pointer != -1ull) { 5468 5468 struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); 5469 5469 5470 5470 - if (kvm_state->size < sizeof(kvm_state) + 2 * sizeof(*vmcs12)) 5470 5470 + if (kvm_state->size < sizeof(*kvm_state) + 2 * sizeof(*vmcs12)) 5471 5471 return -EINVAL; 5472 5472 5473 5473 if (copy_from_user(shadow_vmcs12,

+12

arch/x86/kvm/vmx/vmenter.S

reviewed

··· 3 3 #include <asm/asm.h> 4 4 #include <asm/bitsperlong.h> 5 5 #include <asm/kvm_vcpu_regs.h> 6 6 + #include <asm/nospec-branch.h> 6 7 7 8 #define WORD_SIZE (BITS_PER_LONG / 8) 8 9 ··· 78 77 * referred to by VMCS.HOST_RIP. 79 78 */ 80 79 ENTRY(vmx_vmexit) 80 80 + #ifdef CONFIG_RETPOLINE 81 81 + ALTERNATIVE "jmp .Lvmexit_skip_rsb", "", X86_FEATURE_RETPOLINE 82 82 + /* Preserve guest's RAX, it's used to stuff the RSB. */ 83 83 + push %_ASM_AX 84 84 + 85 85 + /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ 86 86 + FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE 87 87 + 88 88 + pop %_ASM_AX 89 89 + .Lvmexit_skip_rsb: 90 90 + #endif 81 91 ret 82 92 ENDPROC(vmx_vmexit) 83 93

+3 -4

arch/x86/kvm/vmx/vmx.c

reviewed

··· 6462 6462 6463 6463 x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0); 6464 6464 6465 6465 - /* Eliminate branch target predictions from guest mode */ 6466 6466 - vmexit_fill_RSB(); 6467 6467 - 6468 6465 /* All fields are clean at this point */ 6469 6466 if (static_branch_unlikely(&enable_evmcs)) 6470 6467 current_evmcs->hv_clean_fields |= ··· 7029 7032 { 7030 7033 struct vcpu_vmx *vmx; 7031 7034 u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles; 7035 7035 + struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer; 7032 7036 7033 7037 if (kvm_mwait_in_guest(vcpu->kvm)) 7034 7038 return -EOPNOTSUPP; ··· 7038 7040 tscl = rdtsc(); 7039 7041 guest_tscl = kvm_read_l1_tsc(vcpu, tscl); 7040 7042 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl; 7041 7041 - lapic_timer_advance_cycles = nsec_to_cycles(vcpu, lapic_timer_advance_ns); 7043 7043 + lapic_timer_advance_cycles = nsec_to_cycles(vcpu, 7044 7044 + ktimer->timer_advance_ns); 7042 7045 7043 7046 if (delta_tsc > lapic_timer_advance_cycles) 7044 7047 delta_tsc -= lapic_timer_advance_cycles;

+29 -7

arch/x86/kvm/x86.c

reviewed

··· 136 136 static u32 __read_mostly tsc_tolerance_ppm = 250; 137 137 module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); 138 138 139 139 - /* lapic timer advance (tscdeadline mode only) in nanoseconds */ 140 140 - unsigned int __read_mostly lapic_timer_advance_ns = 1000; 139 139 + /* 140 140 + * lapic timer advance (tscdeadline mode only) in nanoseconds. '-1' enables 141 141 + * adaptive tuning starting from default advancment of 1000ns. '0' disables 142 142 + * advancement entirely. Any other value is used as-is and disables adaptive 143 143 + * tuning, i.e. allows priveleged userspace to set an exact advancement time. 144 144 + */ 145 145 + static int __read_mostly lapic_timer_advance_ns = -1; 141 146 module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR); 142 142 - EXPORT_SYMBOL_GPL(lapic_timer_advance_ns); 143 147 144 148 static bool __read_mostly vector_hashing = true; 145 149 module_param(vector_hashing, bool, S_IRUGO); ··· 6539 6535 } 6540 6536 EXPORT_SYMBOL_GPL(kvm_emulate_instruction_from_buffer); 6541 6537 6538 6538 + static int complete_fast_pio_out_port_0x7e(struct kvm_vcpu *vcpu) 6539 6539 + { 6540 6540 + vcpu->arch.pio.count = 0; 6541 6541 + return 1; 6542 6542 + } 6543 6543 + 6542 6544 static int complete_fast_pio_out(struct kvm_vcpu *vcpu) 6543 6545 { 6544 6546 vcpu->arch.pio.count = 0; ··· 6561 6551 unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX); 6562 6552 int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt, 6563 6553 size, port, &val, 1); 6554 6554 + if (ret) 6555 6555 + return ret; 6564 6556 6565 6565 - if (!ret) { 6557 6557 + /* 6558 6558 + * Workaround userspace that relies on old KVM behavior of %rip being 6559 6559 + * incremented prior to exiting to userspace to handle "OUT 0x7e". 
6560 6560 + */ 6561 6561 + if (port == 0x7e && 6562 6562 + kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_OUT_7E_INC_RIP)) { 6563 6563 + vcpu->arch.complete_userspace_io = 6564 6564 + complete_fast_pio_out_port_0x7e; 6565 6565 + kvm_skip_emulated_instruction(vcpu); 6566 6566 + } else { 6566 6567 vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu); 6567 6568 vcpu->arch.complete_userspace_io = complete_fast_pio_out; 6568 6569 } 6569 6569 - return ret; 6570 6570 + return 0; 6570 6571 } 6571 6572 6572 6573 static int complete_fast_pio_in(struct kvm_vcpu *vcpu) ··· 7894 7873 } 7895 7874 7896 7875 trace_kvm_entry(vcpu->vcpu_id); 7897 7897 - if (lapic_timer_advance_ns) 7876 7876 + if (lapic_in_kernel(vcpu) && 7877 7877 + vcpu->arch.apic->lapic_timer.timer_advance_ns) 7898 7878 wait_lapic_expire(vcpu); 7899 7879 guest_enter_irqoff(); 7900 7880 ··· 9083 9061 9084 9062 if (irqchip_in_kernel(vcpu->kvm)) { 9085 9063 vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu); 9086 9086 - r = kvm_create_lapic(vcpu); 9064 9064 + r = kvm_create_lapic(vcpu, lapic_timer_advance_ns); 9087 9065 if (r < 0) 9088 9066 goto fail_mmu_destroy; 9089 9067 } else

-2

arch/x86/kvm/x86.h

reviewed

··· 294 294 295 295 extern unsigned int min_timer_period_us; 296 296 297 297 - extern unsigned int lapic_timer_advance_ns; 298 298 - 299 297 extern bool enable_vmware_backdoor; 300 298 301 299 extern struct static_key kvm_no_apic_vcpu;

+6 -3

tools/testing/selftests/kvm/dirty_log_test.c

reviewed

··· 288 288 #endif 289 289 max_gfn = (1ul << (guest_pa_bits - guest_page_shift)) - 1; 290 290 guest_page_size = (1ul << guest_page_shift); 291 291 - /* 1G of guest page sized pages */ 292 292 - guest_num_pages = (1ul << (30 - guest_page_shift)); 291 291 + /* 292 292 + * A little more than 1G of guest page sized pages. Cover the 293 293 + * case where the size is not aligned to 64 pages. 294 294 + */ 295 295 + guest_num_pages = (1ul << (30 - guest_page_shift)) + 3; 293 296 host_page_size = getpagesize(); 294 297 host_num_pages = (guest_num_pages * guest_page_size) / host_page_size + 295 298 !!((guest_num_pages * guest_page_size) % host_page_size); ··· 362 359 kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap); 363 360 #ifdef USE_CLEAR_DIRTY_LOG 364 361 kvm_vm_clear_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap, 0, 365 365 - DIV_ROUND_UP(host_num_pages, 64) * 64); 362 362 + host_num_pages); 366 363 #endif 367 364 vm_dirty_log_verify(bmap); 368 365 iteration++;

+8 -1

tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c

reviewed

··· 141 141 142 142 free(hv_cpuid_entries); 143 143 144 144 - vcpu_ioctl(vm, VCPU_ID, KVM_ENABLE_CAP, &enable_evmcs_cap); 144 144 + rv = _vcpu_ioctl(vm, VCPU_ID, KVM_ENABLE_CAP, &enable_evmcs_cap); 145 145 + 146 146 + if (rv) { 147 147 + fprintf(stderr, 148 148 + "Enlightened VMCS is unsupported, skip related test\n"); 149 149 + goto vm_free; 150 150 + } 145 151 146 152 hv_cpuid_entries = kvm_get_supported_hv_cpuid(vm); 147 153 if (!hv_cpuid_entries) ··· 157 151 158 152 free(hv_cpuid_entries); 159 153 154 154 + vm_free: 160 155 kvm_vm_free(vm); 161 156 162 157 return 0;

+10 -7

virt/kvm/arm/arch_timer.c

reviewed

··· 508 508 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); 509 509 510 510 /* 511 511 + * Update the timer output so that it is likely to match the 512 512 + * state we're about to restore. If the timer expires between 513 513 + * this point and the register restoration, we'll take the 514 514 + * interrupt anyway. 515 515 + */ 516 516 + kvm_timer_update_irq(vcpu, kvm_timer_should_fire(vtimer), vtimer); 517 517 + 518 518 + /* 511 519 * When using a userspace irqchip with the architected timers and a 512 520 * host interrupt controller that doesn't support an active state, we 513 521 * must still prevent continuously exiting from the guest, and ··· 738 730 int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value) 739 731 { 740 732 struct arch_timer_context *timer; 741 741 - bool level; 742 733 743 734 switch (regid) { 744 735 case KVM_REG_ARM_TIMER_CTL: ··· 764 757 default: 765 758 return -1; 766 759 } 767 767 - 768 768 - level = kvm_timer_should_fire(timer); 769 769 - kvm_timer_update_irq(vcpu, level, timer); 770 770 - timer_emulate(timer); 771 760 772 761 return 0; 773 762 } ··· 815 812 816 813 switch (treg) { 817 814 case TIMER_REG_TVAL: 818 818 - val = kvm_phys_timer_read() - timer->cntvoff - timer->cnt_cval; 815 815 + val = timer->cnt_cval - kvm_phys_timer_read() + timer->cntvoff; 819 816 break; 820 817 821 818 case TIMER_REG_CTL: ··· 861 858 { 862 859 switch (treg) { 863 860 case TIMER_REG_TVAL: 864 864 - timer->cnt_cval = val - kvm_phys_timer_read() - timer->cntvoff; 861 861 + timer->cnt_cval = kvm_phys_timer_read() - timer->cntvoff + val; 865 862 break; 866 863 867 864 case TIMER_REG_CTL:

+8 -3

virt/kvm/arm/arm.c

reviewed

··· 934 934 static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu, 935 935 const struct kvm_vcpu_init *init) 936 936 { 937 937 - unsigned int i; 937 937 + unsigned int i, ret; 938 938 int phys_target = kvm_target_cpu(); 939 939 940 940 if (init->target != phys_target) ··· 969 969 vcpu->arch.target = phys_target; 970 970 971 971 /* Now we know what it is, we can reset it. */ 972 972 - return kvm_reset_vcpu(vcpu); 973 973 - } 972 972 + ret = kvm_reset_vcpu(vcpu); 973 973 + if (ret) { 974 974 + vcpu->arch.target = -1; 975 975 + bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES); 976 976 + } 974 977 978 978 + return ret; 979 979 + } 975 980 976 981 static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu, 977 982 struct kvm_vcpu_init *init)

+5 -1

virt/kvm/arm/mmu.c

reviewed

··· 1781 1781 * Only PMD_SIZE transparent hugepages(THP) are 1782 1782 * currently supported. This code will need to be 1783 1783 * updated to support other THP sizes. 1784 1784 + * 1785 1785 + * Make sure the host VA and the guest IPA are sufficiently 1786 1786 + * aligned and that the block is contained within the memslot. 1784 1787 */ 1785 1785 - if (transparent_hugepage_adjust(&pfn, &fault_ipa)) 1788 1788 + if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) && 1789 1789 + transparent_hugepage_adjust(&pfn, &fault_ipa)) 1786 1790 vma_pagesize = PMD_SIZE; 1787 1791 } 1788 1792

virt/kvm/arm/vgic/vgic-mmio-v3.c

reviewed

··· 200 200 201 201 vgic_cpu->lpis_enabled = val & GICR_CTLR_ENABLE_LPIS; 202 202 203 203 + if (was_enabled && !vgic_cpu->lpis_enabled) 204 204 + vgic_flush_pending_lpis(vcpu); 205 205 + 203 206 if (!was_enabled && vgic_cpu->lpis_enabled) 204 207 vgic_enable_lpis(vcpu); 205 208 }

+21

virt/kvm/arm/vgic/vgic.c

reviewed

··· 151 151 kfree(irq); 152 152 } 153 153 154 154 + void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu) 155 155 + { 156 156 + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; 157 157 + struct vgic_irq *irq, *tmp; 158 158 + unsigned long flags; 159 159 + 160 160 + raw_spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags); 161 161 + 162 162 + list_for_each_entry_safe(irq, tmp, &vgic_cpu->ap_list_head, ap_list) { 163 163 + if (irq->intid >= VGIC_MIN_LPI) { 164 164 + raw_spin_lock(&irq->irq_lock); 165 165 + list_del(&irq->ap_list); 166 166 + irq->vcpu = NULL; 167 167 + raw_spin_unlock(&irq->irq_lock); 168 168 + vgic_put_irq(vcpu->kvm, irq); 169 169 + } 170 170 + } 171 171 + 172 172 + raw_spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags); 173 173 + } 174 174 + 154 175 void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending) 155 176 { 156 177 WARN_ON(irq_set_irqchip_state(irq->host_irq,

virt/kvm/arm/vgic/vgic.h

reviewed

··· 238 238 bool vgic_has_its(struct kvm *kvm); 239 239 int kvm_vgic_register_its_device(void); 240 240 void vgic_enable_lpis(struct kvm_vcpu *vcpu); 241 241 + void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu); 241 242 int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi); 242 243 int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr); 243 244 int vgic_v3_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,

+4 -3

virt/kvm/kvm_main.c

reviewed

··· 1240 1240 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) 1241 1241 return -EINVAL; 1242 1242 1243 1243 - if ((log->first_page & 63) || (log->num_pages & 63)) 1243 1243 + if (log->first_page & 63) 1244 1244 return -EINVAL; 1245 1245 1246 1246 slots = __kvm_memslots(kvm, as_id); ··· 1253 1253 n = kvm_dirty_bitmap_bytes(memslot); 1254 1254 1255 1255 if (log->first_page > memslot->npages || 1256 1256 - log->num_pages > memslot->npages - log->first_page) 1257 1257 - return -EINVAL; 1256 1256 + log->num_pages > memslot->npages - log->first_page || 1257 1257 + (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63))) 1258 1258 + return -EINVAL; 1258 1259 1259 1260 *flush = false; 1260 1261 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);