Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

KVM: LAPIC: Inject timer interrupt via posted interrupt

Dedicated instances are currently disturbed by unnecessary jitter due
to the emulated lapic timers firing on the same pCPUs where the
vCPUs reside. Unlike ARM, Intel has no hardware virtual timer for the
guest, so both programming the timer in the guest and the firing of the
emulated timer incur vmexits. This patch tries to avoid the vmexit when the
emulated timer fires, at least in the dedicated instance scenario when
nohz_full is enabled.

In that case, the emulated timers can be offloaded to the nearest busy
housekeeping cpus, since APICv has been available for several years in server
processors. The guest timer interrupt can then be injected via posted interrupts,
which are delivered by the housekeeping cpu once the emulated timer fires.

The host should be tuned so that vCPUs are placed on isolated physical
processors, with several surplus pCPUs left for busy housekeeping.
If mwait/hlt/pause vmexits are disabled so that the vCPUs stay in non-root mode,
a ~3% redis performance benefit can be observed on a Skylake server, and the
number of external interrupt vmexits drops substantially. Without the patch:

VM-EXIT Samples Samples% Time% Min Time Max Time Avg time
EXTERNAL_INTERRUPT 42916 49.43% 39.30% 0.47us 106.09us 0.71us ( +- 1.09% )

While with patch:

VM-EXIT Samples Samples% Time% Min Time Max Time Avg time
EXTERNAL_INTERRUPT 6871 9.29% 2.96% 0.44us 57.88us 0.72us ( +- 4.02% )

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>

authored by

Wanpeng Li and committed by
Paolo Bonzini
0c5f81da 4d151bf3

+87 -36
+64 -35
arch/x86/kvm/lapic.c
··· 118 118 return apic->vcpu->vcpu_id; 119 119 } 120 120 121 + bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu) 122 + { 123 + return pi_inject_timer && kvm_vcpu_apicv_active(vcpu); 124 + } 125 + EXPORT_SYMBOL_GPL(kvm_can_post_timer_interrupt); 126 + 127 + static bool kvm_use_posted_timer_interrupt(struct kvm_vcpu *vcpu) 128 + { 129 + return kvm_can_post_timer_interrupt(vcpu) && vcpu->mode == IN_GUEST_MODE; 130 + } 131 + 121 132 static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map, 122 133 u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) { 123 134 switch (map->mode) { ··· 1432 1421 } 1433 1422 } 1434 1423 1435 - static void apic_timer_expired(struct kvm_lapic *apic) 1436 - { 1437 - struct kvm_vcpu *vcpu = apic->vcpu; 1438 - struct swait_queue_head *q = &vcpu->wq; 1439 - struct kvm_timer *ktimer = &apic->lapic_timer; 1440 - 1441 - if (atomic_read(&apic->lapic_timer.pending)) 1442 - return; 1443 - 1444 - atomic_inc(&apic->lapic_timer.pending); 1445 - kvm_set_pending_timer(vcpu); 1446 - 1447 - /* 1448 - * For x86, the atomic_inc() is serialized, thus 1449 - * using swait_active() is safe. 1450 - */ 1451 - if (swait_active(q)) 1452 - swake_up_one(q); 1453 - 1454 - if (apic_lvtt_tscdeadline(apic) || ktimer->hv_timer_in_use) 1455 - ktimer->expired_tscdeadline = ktimer->tscdeadline; 1456 - } 1457 - 1458 1424 /* 1459 1425 * On APICv, this test will cause a busy wait 1460 1426 * during a higher-priority task. 
··· 1505 1517 apic->lapic_timer.timer_advance_ns = timer_advance_ns; 1506 1518 } 1507 1519 1508 - void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) 1520 + static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) 1509 1521 { 1510 1522 struct kvm_lapic *apic = vcpu->arch.apic; 1511 1523 u64 guest_tsc, tsc_deadline; 1512 1524 1513 1525 if (apic->lapic_timer.expired_tscdeadline == 0) 1514 - return; 1515 - 1516 - if (!lapic_timer_int_injected(vcpu)) 1517 1526 return; 1518 1527 1519 1528 tsc_deadline = apic->lapic_timer.expired_tscdeadline; ··· 1524 1539 if (unlikely(!apic->lapic_timer.timer_advance_adjust_done)) 1525 1540 adjust_lapic_timer_advance(vcpu, apic->lapic_timer.advance_expire_delta); 1526 1541 } 1542 + 1543 + void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) 1544 + { 1545 + if (lapic_timer_int_injected(vcpu)) 1546 + __kvm_wait_lapic_expire(vcpu); 1547 + } 1527 1548 EXPORT_SYMBOL_GPL(kvm_wait_lapic_expire); 1549 + 1550 + static void kvm_apic_inject_pending_timer_irqs(struct kvm_lapic *apic) 1551 + { 1552 + struct kvm_timer *ktimer = &apic->lapic_timer; 1553 + 1554 + kvm_apic_local_deliver(apic, APIC_LVTT); 1555 + if (apic_lvtt_tscdeadline(apic)) 1556 + ktimer->tscdeadline = 0; 1557 + if (apic_lvtt_oneshot(apic)) { 1558 + ktimer->tscdeadline = 0; 1559 + ktimer->target_expiration = 0; 1560 + } 1561 + } 1562 + 1563 + static void apic_timer_expired(struct kvm_lapic *apic) 1564 + { 1565 + struct kvm_vcpu *vcpu = apic->vcpu; 1566 + struct swait_queue_head *q = &vcpu->wq; 1567 + struct kvm_timer *ktimer = &apic->lapic_timer; 1568 + 1569 + if (atomic_read(&apic->lapic_timer.pending)) 1570 + return; 1571 + 1572 + if (apic_lvtt_tscdeadline(apic) || ktimer->hv_timer_in_use) 1573 + ktimer->expired_tscdeadline = ktimer->tscdeadline; 1574 + 1575 + if (kvm_use_posted_timer_interrupt(apic->vcpu)) { 1576 + if (apic->lapic_timer.timer_advance_ns) 1577 + __kvm_wait_lapic_expire(vcpu); 1578 + kvm_apic_inject_pending_timer_irqs(apic); 1579 + return; 1580 + } 1581 + 1582 + 
atomic_inc(&apic->lapic_timer.pending); 1583 + kvm_set_pending_timer(vcpu); 1584 + 1585 + /* 1586 + * For x86, the atomic_inc() is serialized, thus 1587 + * using swait_active() is safe. 1588 + */ 1589 + if (swait_active(q)) 1590 + swake_up_one(q); 1591 + } 1528 1592 1529 1593 static void start_sw_tscdeadline(struct kvm_lapic *apic) 1530 1594 { ··· 2359 2325 struct kvm_lapic *apic = vcpu->arch.apic; 2360 2326 2361 2327 if (atomic_read(&apic->lapic_timer.pending) > 0) { 2362 - kvm_apic_local_deliver(apic, APIC_LVTT); 2363 - if (apic_lvtt_tscdeadline(apic)) 2364 - apic->lapic_timer.tscdeadline = 0; 2365 - if (apic_lvtt_oneshot(apic)) { 2366 - apic->lapic_timer.tscdeadline = 0; 2367 - apic->lapic_timer.target_expiration = 0; 2368 - } 2328 + kvm_apic_inject_pending_timer_irqs(apic); 2369 2329 atomic_set(&apic->lapic_timer.pending, 0); 2370 2330 } 2371 2331 } ··· 2481 2453 { 2482 2454 struct hrtimer *timer; 2483 2455 2484 - if (!lapic_in_kernel(vcpu)) 2456 + if (!lapic_in_kernel(vcpu) || 2457 + kvm_can_post_timer_interrupt(vcpu)) 2485 2458 return; 2486 2459 2487 2460 timer = &vcpu->arch.apic->lapic_timer.timer;
+1
arch/x86/kvm/lapic.h
··· 236 236 void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu); 237 237 bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu); 238 238 void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu); 239 + bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu); 239 240 240 241 static inline enum lapic_mode kvm_apic_mode(u64 apic_base) 241 242 {
+2 -1
arch/x86/kvm/vmx/vmx.c
··· 7064 7064 u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles; 7065 7065 struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer; 7066 7066 7067 - if (kvm_mwait_in_guest(vcpu->kvm)) 7067 + if (kvm_mwait_in_guest(vcpu->kvm) || 7068 + kvm_can_post_timer_interrupt(vcpu)) 7068 7069 return -EOPNOTSUPP; 7069 7070 7070 7071 vmx = to_vmx(vcpu);
+6
arch/x86/kvm/x86.c
··· 51 51 #include <linux/kvm_irqfd.h> 52 52 #include <linux/irqbypass.h> 53 53 #include <linux/sched/stat.h> 54 + #include <linux/sched/isolation.h> 54 55 #include <linux/mem_encrypt.h> 55 56 56 57 #include <trace/events/kvm.h> ··· 153 152 154 153 static bool __read_mostly force_emulation_prefix = false; 155 154 module_param(force_emulation_prefix, bool, S_IRUGO); 155 + 156 + int __read_mostly pi_inject_timer = -1; 157 + module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR); 156 158 157 159 #define KVM_NR_SHARED_MSRS 16 158 160 ··· 7062 7058 host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); 7063 7059 7064 7060 kvm_lapic_init(); 7061 + if (pi_inject_timer == -1) 7062 + pi_inject_timer = housekeeping_enabled(HK_FLAG_TIMER); 7065 7063 #ifdef CONFIG_X86_64 7066 7064 pvclock_gtod_register_notifier(&pvclock_gtod_notifier); 7067 7065
+2
arch/x86/kvm/x86.h
··· 301 301 302 302 extern bool enable_vmware_backdoor; 303 303 304 + extern int pi_inject_timer; 305 + 304 306 extern struct static_key kvm_no_apic_vcpu; 305 307 306 308 static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
+6
include/linux/sched/isolation.h
··· 19 19 DECLARE_STATIC_KEY_FALSE(housekeeping_overridden); 20 20 extern int housekeeping_any_cpu(enum hk_flags flags); 21 21 extern const struct cpumask *housekeeping_cpumask(enum hk_flags flags); 22 + extern bool housekeeping_enabled(enum hk_flags flags); 22 23 extern void housekeeping_affine(struct task_struct *t, enum hk_flags flags); 23 24 extern bool housekeeping_test_cpu(int cpu, enum hk_flags flags); 24 25 extern void __init housekeeping_init(void); ··· 34 33 static inline const struct cpumask *housekeeping_cpumask(enum hk_flags flags) 35 34 { 36 35 return cpu_possible_mask; 36 + } 37 + 38 + static inline bool housekeeping_enabled(enum hk_flags flags) 39 + { 40 + return false; 37 41 } 38 42 39 43 static inline void housekeeping_affine(struct task_struct *t,
+6
kernel/sched/isolation.c
··· 14 14 static cpumask_var_t housekeeping_mask; 15 15 static unsigned int housekeeping_flags; 16 16 17 + bool housekeeping_enabled(enum hk_flags flags) 18 + { 19 + return !!(housekeeping_flags & flags); 20 + } 21 + EXPORT_SYMBOL_GPL(housekeeping_enabled); 22 + 17 23 int housekeeping_any_cpu(enum hk_flags flags) 18 24 { 19 25 if (static_branch_unlikely(&housekeeping_overridden))