Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge branch 'kvm-updates-2.6.26' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm

* 'kvm-updates-2.6.26' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm:
KVM: Remove now unused structs from kvm_para.h
x86: KVM guest: Use the paravirt clocksource structs and functions
KVM: Make kvm host use the paravirt clocksource structs
x86: Make xen use the paravirt clocksource structs and functions
x86: Add structs and functions for paravirt clocksource
KVM: VMX: Fix host msr corruption with preemption enabled
KVM: ioapic: fix lost interrupt when changing a device's irq
KVM: MMU: Fix oops on guest userspace access to guest pagetable
KVM: MMU: large page update_pte issue with non-PAE 32-bit guests (resend)
KVM: MMU: Fix rmap_write_protect() hugepage iteration bug
KVM: close timer injection race window in __vcpu_run
KVM: Fix race between timer migration and vcpu migration

+358 -266
+5
arch/x86/Kconfig
···
 config KVM_CLOCK
         bool "KVM paravirtualized clock"
         select PARAVIRT
+        select PARAVIRT_CLOCK
         depends on !(X86_VISWS || X86_VOYAGER)
         help
           Turning on this option will allow you to run a paravirtualized clock
···
           under a hypervisor, potentially improving performance significantly
           over full virtualization. However, when run without a hypervisor
           the kernel is theoretically slower and slightly larger.
+
+config PARAVIRT_CLOCK
+        bool
+        default n
 
 endif
+1
arch/x86/kernel/Makefile
···
 obj-$(CONFIG_KVM_GUEST)         += kvm.o
 obj-$(CONFIG_KVM_CLOCK)         += kvmclock.o
 obj-$(CONFIG_PARAVIRT)          += paravirt.o paravirt_patch_$(BITS).o
+obj-$(CONFIG_PARAVIRT_CLOCK)    += pvclock.o
 
 obj-$(CONFIG_PCSPKR_PLATFORM)   += pcspeaker.o
+33 -56
arch/x86/kernel/kvmclock.c
···
 #include <linux/clocksource.h>
 #include <linux/kvm_para.h>
+#include <asm/pvclock.h>
 #include <asm/arch_hooks.h>
 #include <asm/msr.h>
 #include <asm/apic.h>
···
 early_param("no-kvmclock", parse_no_kvmclock);
 
 /* The hypervisor will put information about time periodically here */
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct kvm_vcpu_time_info, hv_clock);
-#define get_clock(cpu, field) per_cpu(hv_clock, cpu).field
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct pvclock_vcpu_time_info, hv_clock);
+static struct pvclock_wall_clock wall_clock;
 
-static inline u64 kvm_get_delta(u64 last_tsc)
-{
-        int cpu = smp_processor_id();
-        u64 delta = native_read_tsc() - last_tsc;
-        return (delta * get_clock(cpu, tsc_to_system_mul)) >> KVM_SCALE;
-}
-
-static struct kvm_wall_clock wall_clock;
-static cycle_t kvm_clock_read(void);
 /*
  * The wallclock is the time of day when we booted. Since then, some time may
  * have elapsed since the hypervisor wrote the data. So we try to account for
···
  */
 static unsigned long kvm_get_wallclock(void)
 {
-        u32 wc_sec, wc_nsec;
-        u64 delta;
+        struct pvclock_vcpu_time_info *vcpu_time;
         struct timespec ts;
-        int version, nsec;
         int low, high;
 
         low = (int)__pa(&wall_clock);
         high = ((u64)__pa(&wall_clock) >> 32);
-
-        delta = kvm_clock_read();
-
         native_write_msr(MSR_KVM_WALL_CLOCK, low, high);
-        do {
-                version = wall_clock.wc_version;
-                rmb();
-                wc_sec = wall_clock.wc_sec;
-                wc_nsec = wall_clock.wc_nsec;
-                rmb();
-        } while ((wall_clock.wc_version != version) || (version & 1));
 
-        delta = kvm_clock_read() - delta;
-        delta += wc_nsec;
-        nsec = do_div(delta, NSEC_PER_SEC);
-        set_normalized_timespec(&ts, wc_sec + delta, nsec);
-        /*
-         * Of all mechanisms of time adjustment I've tested, this one
-         * was the champion!
-         */
-        return ts.tv_sec + 1;
+        vcpu_time = &get_cpu_var(hv_clock);
+        pvclock_read_wallclock(&wall_clock, vcpu_time, &ts);
+        put_cpu_var(hv_clock);
+
+        return ts.tv_sec;
 }
 
 static int kvm_set_wallclock(unsigned long now)
 {
-        return 0;
+        return -1;
 }
 
-/*
- * This is our read_clock function. The host puts an tsc timestamp each time
- * it updates a new time. Without the tsc adjustment, we can have a situation
- * in which a vcpu starts to run earlier (smaller system_time), but probes
- * time later (compared to another vcpu), leading to backwards time
- */
 static cycle_t kvm_clock_read(void)
 {
-        u64 last_tsc, now;
-        int cpu;
+        struct pvclock_vcpu_time_info *src;
+        cycle_t ret;
 
-        preempt_disable();
-        cpu = smp_processor_id();
-
-        last_tsc = get_clock(cpu, tsc_timestamp);
-        now = get_clock(cpu, system_time);
-
-        now += kvm_get_delta(last_tsc);
-        preempt_enable();
-
-        return now;
+        src = &get_cpu_var(hv_clock);
+        ret = pvclock_clocksource_read(src);
+        put_cpu_var(hv_clock);
+        return ret;
 }
+
 static struct clocksource kvm_clock = {
         .name = "kvm-clock",
         .read = kvm_clock_read,
···
         .flags = CLOCK_SOURCE_IS_CONTINUOUS,
 };
 
-static int kvm_register_clock(void)
+static int kvm_register_clock(char *txt)
 {
         int cpu = smp_processor_id();
         int low, high;
         low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1;
         high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
-
+        printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
+               cpu, high, low, txt);
         return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high);
 }
 
···
          * Now that the first cpu already had this clocksource initialized,
          * we shouldn't fail.
          */
-        WARN_ON(kvm_register_clock());
+        WARN_ON(kvm_register_clock("secondary cpu clock"));
         /* ok, done with our trickery, call native */
         setup_secondary_APIC_clock();
+}
+#endif
+
+#ifdef CONFIG_SMP
+void __init kvm_smp_prepare_boot_cpu(void)
+{
+        WARN_ON(kvm_register_clock("primary cpu clock"));
+        native_smp_prepare_boot_cpu();
 }
 #endif
 
···
                 return;
 
         if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
-                if (kvm_register_clock())
+                if (kvm_register_clock("boot clock"))
                         return;
                 pv_time_ops.get_wallclock = kvm_get_wallclock;
                 pv_time_ops.set_wallclock = kvm_set_wallclock;
                 pv_time_ops.sched_clock = kvm_clock_read;
 #ifdef CONFIG_X86_LOCAL_APIC
                 pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;
+#endif
+#ifdef CONFIG_SMP
+                smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
 #endif
                 machine_ops.shutdown = kvm_shutdown;
 #ifdef CONFIG_KEXEC
+141
arch/x86/kernel/pvclock.c
···
+/* paravirtual clock -- common code used by kvm/xen
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <asm/pvclock.h>
+
+/*
+ * These are periodically updated
+ *    xen: magic shared_info page
+ *    kvm: gpa registered via msr
+ * and then copied here.
+ */
+struct pvclock_shadow_time {
+        u64 tsc_timestamp;     /* TSC at last update of time vals.  */
+        u64 system_timestamp;  /* Time, in nanosecs, since boot.    */
+        u32 tsc_to_nsec_mul;
+        int tsc_shift;
+        u32 version;
+};
+
+/*
+ * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
+ * yielding a 64-bit result.
+ */
+static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
+{
+        u64 product;
+#ifdef __i386__
+        u32 tmp1, tmp2;
+#endif
+
+        if (shift < 0)
+                delta >>= -shift;
+        else
+                delta <<= shift;
+
+#ifdef __i386__
+        __asm__ (
+                "mul  %5       ; "
+                "mov  %4,%%eax ; "
+                "mov  %%edx,%4 ; "
+                "mul  %5       ; "
+                "xor  %5,%5    ; "
+                "add  %4,%%eax ; "
+                "adc  %5,%%edx ; "
+                : "=A" (product), "=r" (tmp1), "=r" (tmp2)
+                : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
+#elif __x86_64__
+        __asm__ (
+                "mul %%rdx ; shrd $32,%%rdx,%%rax"
+                : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
+#else
+#error implement me!
+#endif
+
+        return product;
+}
+
+static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow)
+{
+        u64 delta = native_read_tsc() - shadow->tsc_timestamp;
+        return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
+}
+
+/*
+ * Reads a consistent set of time-base values from hypervisor,
+ * into a shadow data area.
+ */
+static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
+                                        struct pvclock_vcpu_time_info *src)
+{
+        do {
+                dst->version = src->version;
+                rmb();          /* fetch version before data */
+                dst->tsc_timestamp     = src->tsc_timestamp;
+                dst->system_timestamp  = src->system_time;
+                dst->tsc_to_nsec_mul   = src->tsc_to_system_mul;
+                dst->tsc_shift         = src->tsc_shift;
+                rmb();          /* test version after fetching data */
+        } while ((src->version & 1) || (dst->version != src->version));
+
+        return dst->version;
+}
+
+cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
+{
+        struct pvclock_shadow_time shadow;
+        unsigned version;
+        cycle_t ret, offset;
+
+        do {
+                version = pvclock_get_time_values(&shadow, src);
+                barrier();
+                offset = pvclock_get_nsec_offset(&shadow);
+                ret = shadow.system_timestamp + offset;
+                barrier();
+        } while (version != src->version);
+
+        return ret;
+}
+
+void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
+                            struct pvclock_vcpu_time_info *vcpu_time,
+                            struct timespec *ts)
+{
+        u32 version;
+        u64 delta;
+        struct timespec now;
+
+        /* get wallclock at system boot */
+        do {
+                version = wall_clock->version;
+                rmb();          /* fetch version before time */
+                now.tv_sec  = wall_clock->sec;
+                now.tv_nsec = wall_clock->nsec;
+                rmb();          /* fetch time before checking version */
+        } while ((wall_clock->version & 1) || (version != wall_clock->version));
+
+        delta = pvclock_clocksource_read(vcpu_time);    /* time since system boot */
+        delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
+
+        now.tv_nsec = do_div(delta, NSEC_PER_SEC);
+        now.tv_sec = delta;
+
+        set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
+}
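A note on the fixed-point math in scale_delta() above: it computes ns = ((delta << tsc_shift) * tsc_to_nsec_mul) >> 32, keeping the full intermediate product in the inline assembly. A rough userspace equivalent, for illustration only (not part of this commit; unsigned __int128 is a GCC/Clang extension):

#include <stdint.h>

/* Illustration of scale_delta(): ns = (delta << shift) * mul_frac >> 32.
 * The kernel needs inline asm because 32-bit x86 has no cheap
 * 64x32 -> 96-bit multiply; the 128-bit intermediate is a userspace
 * convenience here. */
static uint64_t scale_delta_portable(uint64_t delta, uint32_t mul_frac,
                                     int shift)
{
        if (shift < 0)
                delta >>= -shift;
        else
                delta <<= shift;

        return (uint64_t)(((unsigned __int128)delta * mul_frac) >> 32);
}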
+6 -3
arch/x86/kvm/i8254.c
···
 
         atomic_inc(&pt->pending);
         smp_mb__after_atomic_inc();
-        if (vcpu0 && waitqueue_active(&vcpu0->wq)) {
-                vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE;
-                wake_up_interruptible(&vcpu0->wq);
+        if (vcpu0) {
+                set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
+                if (waitqueue_active(&vcpu0->wq)) {
+                        vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+                        wake_up_interruptible(&vcpu0->wq);
+                }
         }
 
         pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period);
+1
arch/x86/kvm/lapic.c
···
         wait_queue_head_t *q = &apic->vcpu->wq;
 
         atomic_inc(&apic->timer.pending);
+        set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests);
         if (waitqueue_active(q)) {
                 apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
                 wake_up_interruptible(q);
+8 -11
arch/x86/kvm/mmu.c
···
                         rmap_remove(kvm, spte);
                         --kvm->stat.lpages;
                         set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+                        spte = NULL;
                         write_protected = 1;
                 }
                 spte = rmap_next(kvm, rmapp, spte);
···
                 struct kvm_mmu_page *shadow;
 
                 spte |= PT_WRITABLE_MASK;
-                if (user_fault) {
-                        mmu_unshadow(vcpu->kvm, gfn);
-                        goto unshadowed;
-                }
 
                 shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
                 if (shadow ||
···
                                 *ptwrite = 1;
                         }
                 }
-
-unshadowed:
 
         if (pte_access & ACC_WRITE_MASK)
                 mark_page_dirty(vcpu->kvm, gfn);
···
                                   u64 *spte,
                                   const void *new)
 {
-        if ((sp->role.level != PT_PAGE_TABLE_LEVEL)
-            && !vcpu->arch.update_pte.largepage) {
-                ++vcpu->kvm->stat.mmu_pde_zapped;
-                return;
-        }
+        if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
+                if (!vcpu->arch.update_pte.largepage ||
+                    sp->role.glevels == PT32_ROOT_LEVEL) {
+                        ++vcpu->kvm->stat.mmu_pde_zapped;
+                        return;
+                }
+        }
 
         ++vcpu->kvm->stat.mmu_pte_updated;
         if (sp->role.glevels == PT32_ROOT_LEVEL)
+11 -8
arch/x86/kvm/vmx.c
···
         load_transition_efer(vmx);
 }
 
-static void vmx_load_host_state(struct vcpu_vmx *vmx)
+static void __vmx_load_host_state(struct vcpu_vmx *vmx)
 {
         unsigned long flags;
 
···
         save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
         load_msrs(vmx->host_msrs, vmx->save_nmsrs);
         reload_host_efer(vmx);
+}
+
+static void vmx_load_host_state(struct vcpu_vmx *vmx)
+{
+        preempt_disable();
+        __vmx_load_host_state(vmx);
+        preempt_enable();
 }
 
 /*
···
 static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
 {
-        vmx_load_host_state(to_vmx(vcpu));
+        __vmx_load_host_state(to_vmx(vcpu));
 }
 
 static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
···
         switch (msr_index) {
 #ifdef CONFIG_X86_64
         case MSR_EFER:
+                vmx_load_host_state(vmx);
                 ret = kvm_set_msr_common(vcpu, msr_index, data);
-                if (vmx->host_state.loaded) {
-                        reload_host_efer(vmx);
-                        load_transition_efer(vmx);
-                }
                 break;
         case MSR_FS_BASE:
                 vmcs_writel(GUEST_FS_BASE, data);
···
                 guest_write_tsc(data);
                 break;
         default:
+                vmx_load_host_state(vmx);
                 msr = find_msr_entry(vmx, msr_index);
                 if (msr) {
                         msr->data = data;
-                        if (vmx->host_state.loaded)
-                                load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
                         break;
                 }
                 ret = kvm_set_msr_common(vcpu, msr_index, data);
+66 -25
arch/x86/kvm/x86.c
···
 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
 {
         static int version;
-        struct kvm_wall_clock wc;
-        struct timespec wc_ts;
+        struct pvclock_wall_clock wc;
+        struct timespec now, sys, boot;
 
         if (!wall_clock)
                 return;
···
         kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
 
-        wc_ts = current_kernel_time();
-        wc.wc_sec = wc_ts.tv_sec;
-        wc.wc_nsec = wc_ts.tv_nsec;
-        wc.wc_version = version;
+        /*
+         * The guest calculates current wall clock time by adding
+         * system time (updated by kvm_write_guest_time below) to the
+         * wall clock specified here.  guest system time equals host
+         * system time for us, thus we must fill in host boot time here.
+         */
+        now = current_kernel_time();
+        ktime_get_ts(&sys);
+        boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys));
+
+        wc.sec = boot.tv_sec;
+        wc.nsec = boot.tv_nsec;
+        wc.version = version;
 
         kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
 
         version++;
         kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
+}
+
+static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
+{
+        uint32_t quotient, remainder;
+
+        /* Don't try to replace with do_div(), this one calculates
+         * "(dividend << 32) / divisor" */
+        __asm__ ( "divl %4"
+                  : "=a" (quotient), "=d" (remainder)
+                  : "0" (0), "1" (dividend), "r" (divisor) );
+        return quotient;
+}
+
+static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
+{
+        uint64_t nsecs = 1000000000LL;
+        int32_t  shift = 0;
+        uint64_t tps64;
+        uint32_t tps32;
+
+        tps64 = tsc_khz * 1000LL;
+        while (tps64 > nsecs*2) {
+                tps64 >>= 1;
+                shift--;
+        }
+
+        tps32 = (uint32_t)tps64;
+        while (tps32 <= (uint32_t)nsecs) {
+                tps32 <<= 1;
+                shift++;
+        }
+
+        hv_clock->tsc_shift = shift;
+        hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
+
+        pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
+                 __FUNCTION__, tsc_khz, hv_clock->tsc_shift,
+                 hv_clock->tsc_to_system_mul);
 }
 
 static void kvm_write_guest_time(struct kvm_vcpu *v)
···
 
         if ((!vcpu->time_page))
                 return;
+
+        if (unlikely(vcpu->hv_clock_tsc_khz != tsc_khz)) {
+                kvm_set_time_scale(tsc_khz, &vcpu->hv_clock);
+                vcpu->hv_clock_tsc_khz = tsc_khz;
+        }
 
         /* Keep irq disabled to prevent changes to the clock */
         local_irq_save(flags);
···
         /*
          * The interface expects us to write an even number signaling that the
          * update is finished. Since the guest won't see the intermediate
-         * state, we just write "2" at the end
+         * state, we just increase by 2 at the end.
          */
-        vcpu->hv_clock.version = 2;
+        vcpu->hv_clock.version += 2;
 
         shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
 
         memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
-                sizeof(vcpu->hv_clock));
+               sizeof(vcpu->hv_clock));
 
         kunmap_atomic(shared_kaddr, KM_USER0);
···
 
         /* ...but clean it before doing the actual write */
         vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
-
-        vcpu->arch.hv_clock.tsc_to_system_mul =
-                        clocksource_khz2mult(tsc_khz, 22);
-        vcpu->arch.hv_clock.tsc_shift = 22;
 
         down_read(&current->mm->mmap_sem);
         vcpu->arch.time_page =
···
         if (vcpu->requests) {
                 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
                         __kvm_migrate_timers(vcpu);
+                if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
+                        kvm_x86_ops->tlb_flush(vcpu);
                 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
                                        &vcpu->requests)) {
                         kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS;
···
                 }
         }
 
+        clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
         kvm_inject_pending_timer_irqs(vcpu);
 
         preempt_disable();
···
 
         local_irq_disable();
 
-        if (need_resched()) {
+        if (vcpu->requests || need_resched()) {
                 local_irq_enable();
                 preempt_enable();
                 r = 1;
                 goto out;
         }
-
-        if (vcpu->requests)
-                if (test_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) {
-                        local_irq_enable();
-                        preempt_enable();
-                        r = 1;
-                        goto out;
-                }
 
         if (signal_pending(current)) {
                 local_irq_enable();
···
 
         kvm_guest_enter();
 
-        if (vcpu->requests)
-                if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
-                        kvm_x86_ops->tlb_flush(vcpu);
 
         KVMTRACE_0D(VMENTRY, vcpu, entryexit);
         kvm_x86_ops->run(vcpu, kvm_run);
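The new kvm_set_time_scale() above picks a (tsc_shift, tsc_to_system_mul) pair so that the guest-side scale_delta() turns TSC ticks into nanoseconds: it halves the tick rate until it falls at or below 2e9, doubles until it exceeds 1e9, then takes (1e9 << 32) / rate as the 32-bit fraction. A standalone re-derivation, assuming a hypothetical 2.6 GHz TSC (tsc_khz = 2600000); the loop halves 2.6e9 once to 1.3e9 (shift = -1) and mul comes out near 0.77 * 2^32:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t tsc_khz = 2600000;             /* hypothetical 2.6 GHz TSC */
        uint64_t nsecs = 1000000000ULL;
        int32_t shift = 0;
        uint64_t tps64 = tsc_khz * 1000ULL;     /* ticks per second */
        uint32_t tps32;

        while (tps64 > nsecs * 2) {             /* halve until rate <= 2e9 */
                tps64 >>= 1;
                shift--;
        }
        tps32 = (uint32_t)tps64;
        while (tps32 <= (uint32_t)nsecs) {      /* double until rate > 1e9 */
                tps32 <<= 1;
                shift++;
        }

        /* same result as div_frac(): (1e9 << 32) / scaled rate */
        uint32_t mul = (uint32_t)((nsecs << 32) / tps32);

        /* round-trip one second's worth of ticks through scale_delta() */
        uint64_t delta = tsc_khz * 1000ULL;
        delta = (shift < 0) ? (delta >> -shift) : (delta << shift);
        uint64_t ns = (uint64_t)(((unsigned __int128)delta * mul) >> 32);

        printf("shift=%d mul=%u ns=%llu\n",     /* shift=-1, ns ~= 1e9 */
               shift, mul, (unsigned long long)ns);
        return 0;
}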
+1
arch/x86/xen/Kconfig
···
 config XEN
         bool "Xen guest support"
         select PARAVIRT
+        select PARAVIRT_CLOCK
         depends on X86_32
         depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER)
         help
+12 -120
arch/x86/xen/time.c
···
 #include <linux/kernel_stat.h>
 #include <linux/math64.h>
 
+#include <asm/pvclock.h>
 #include <asm/xen/hypervisor.h>
 #include <asm/xen/hypercall.h>
 
···
 #define NS_PER_TICK (1000000000LL / HZ)
 
 static cycle_t xen_clocksource_read(void);
-
-/* These are perodically updated in shared_info, and then copied here. */
-struct shadow_time_info {
-        u64 tsc_timestamp;     /* TSC at last update of time vals.  */
-        u64 system_timestamp;  /* Time, in nanosecs, since boot.    */
-        u32 tsc_to_nsec_mul;
-        int tsc_shift;
-        u32 version;
-};
-
-static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
 
 /* runstate info updated by Xen */
 static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
···
 unsigned long xen_cpu_khz(void)
 {
         u64 xen_khz = 1000000ULL << 32;
-        const struct vcpu_time_info *info =
+        const struct pvclock_vcpu_time_info *info =
                 &HYPERVISOR_shared_info->vcpu_info[0].time;
 
         do_div(xen_khz, info->tsc_to_system_mul);
···
         return xen_khz;
 }
 
-/*
- * Reads a consistent set of time-base values from Xen, into a shadow data
- * area.
- */
-static unsigned get_time_values_from_xen(void)
-{
-        struct vcpu_time_info   *src;
-        struct shadow_time_info *dst;
-
-        /* src is shared memory with the hypervisor, so we need to
-           make sure we get a consistent snapshot, even in the face of
-           being preempted. */
-        src = &__get_cpu_var(xen_vcpu)->time;
-        dst = &__get_cpu_var(shadow_time);
-
-        do {
-                dst->version = src->version;
-                rmb();          /* fetch version before data */
-                dst->tsc_timestamp     = src->tsc_timestamp;
-                dst->system_timestamp  = src->system_time;
-                dst->tsc_to_nsec_mul   = src->tsc_to_system_mul;
-                dst->tsc_shift         = src->tsc_shift;
-                rmb();          /* test version after fetching data */
-        } while ((src->version & 1) | (dst->version ^ src->version));
-
-        return dst->version;
-}
-
-/*
- * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
- * yielding a 64-bit result.
- */
-static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
-{
-        u64 product;
-#ifdef __i386__
-        u32 tmp1, tmp2;
-#endif
-
-        if (shift < 0)
-                delta >>= -shift;
-        else
-                delta <<= shift;
-
-#ifdef __i386__
-        __asm__ (
-                "mul  %5       ; "
-                "mov  %4,%%eax ; "
-                "mov  %%edx,%4 ; "
-                "mul  %5       ; "
-                "xor  %5,%5    ; "
-                "add  %4,%%eax ; "
-                "adc  %5,%%edx ; "
-                : "=A" (product), "=r" (tmp1), "=r" (tmp2)
-                : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
-#elif __x86_64__
-        __asm__ (
-                "mul %%rdx ; shrd $32,%%rdx,%%rax"
-                : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
-#else
-#error implement me!
-#endif
-
-        return product;
-}
-
-static u64 get_nsec_offset(struct shadow_time_info *shadow)
-{
-        u64 now, delta;
-        now = native_read_tsc();
-        delta = now - shadow->tsc_timestamp;
-        return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
-}
-
 static cycle_t xen_clocksource_read(void)
 {
-        struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
+        struct pvclock_vcpu_time_info *src;
         cycle_t ret;
-        unsigned version;
 
-        do {
-                version = get_time_values_from_xen();
-                barrier();
-                ret = shadow->system_timestamp + get_nsec_offset(shadow);
-                barrier();
-        } while (version != __get_cpu_var(xen_vcpu)->time.version);
-
-        put_cpu_var(shadow_time);
-
+        src = &get_cpu_var(xen_vcpu)->time;
+        ret = pvclock_clocksource_read(src);
+        put_cpu_var(xen_vcpu);
         return ret;
 }
 
 static void xen_read_wallclock(struct timespec *ts)
 {
-        const struct shared_info *s = HYPERVISOR_shared_info;
-        u32 version;
-        u64 delta;
-        struct timespec now;
+        struct shared_info *s = HYPERVISOR_shared_info;
+        struct pvclock_wall_clock *wall_clock = &(s->wc);
+        struct pvclock_vcpu_time_info *vcpu_time;
 
-        /* get wallclock at system boot */
-        do {
-                version = s->wc_version;
-                rmb();          /* fetch version before time */
-                now.tv_sec  = s->wc_sec;
-                now.tv_nsec = s->wc_nsec;
-                rmb();          /* fetch time before checking version */
-        } while ((s->wc_version & 1) | (version ^ s->wc_version));
-
-        delta = xen_clocksource_read();  /* time since system boot */
-        delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
-
-        now.tv_nsec = do_div(delta, NSEC_PER_SEC);
-        now.tv_sec = delta;
-
-        set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
+        vcpu_time = &get_cpu_var(xen_vcpu)->time;
+        pvclock_read_wallclock(wall_clock, vcpu_time, ts);
+        put_cpu_var(xen_vcpu);
 }
 
 unsigned long xen_get_wallclock(void)
···
         struct timespec ts;
 
         xen_read_wallclock(&ts);
-
         return ts.tv_sec;
 }
···
 __init void xen_time_init(void)
 {
         int cpu = smp_processor_id();
-
-        get_time_values_from_xen();
 
         clocksource_register(&xen_clocksource);
 
+3 -1
include/asm-x86/kvm_host.h
···
 #include <linux/kvm_para.h>
 #include <linux/kvm_types.h>
 
+#include <asm/pvclock-abi.h>
 #include <asm/desc.h>
 
 #define KVM_MAX_VCPUS 16
···
         struct x86_emulate_ctxt emulate_ctxt;
 
         gpa_t time;
-        struct kvm_vcpu_time_info hv_clock;
+        struct pvclock_vcpu_time_info hv_clock;
+        unsigned int hv_clock_tsc_khz;
         unsigned int time_offset;
         struct page *time_page;
 };
-18
include/asm-x86/kvm_para.h
···
 #ifdef __KERNEL__
 #include <asm/processor.h>
 
-/* xen binary-compatible interface. See xen headers for details */
-struct kvm_vcpu_time_info {
-        uint32_t version;
-        uint32_t pad0;
-        uint64_t tsc_timestamp;
-        uint64_t system_time;
-        uint32_t tsc_to_system_mul;
-        int8_t   tsc_shift;
-        int8_t   pad[3];
-} __attribute__((__packed__)); /* 32 bytes */
-
-struct kvm_wall_clock {
-        uint32_t wc_version;
-        uint32_t wc_sec;
-        uint32_t wc_nsec;
-} __attribute__((__packed__));
-
-
 extern void kvmclock_init(void);
 
+42
include/asm-x86/pvclock-abi.h
···
+#ifndef _ASM_X86_PVCLOCK_ABI_H_
+#define _ASM_X86_PVCLOCK_ABI_H_
+#ifndef __ASSEMBLY__
+
+/*
+ * These structs MUST NOT be changed.
+ * They are the ABI between hypervisor and guest OS.
+ * Both Xen and KVM are using this.
+ *
+ * pvclock_vcpu_time_info holds the system time and the tsc timestamp
+ * of the last update. So the guest can use the tsc delta to get a
+ * more precise system time.  There is one per virtual cpu.
+ *
+ * pvclock_wall_clock references the point in time when the system
+ * time was zero (usually boot time), thus the guest calculates the
+ * current wall clock by adding the system time.
+ *
+ * Protocol for the "version" fields is: hypervisor raises it (making
+ * it uneven) before it starts updating the fields and raises it again
+ * (making it even) when it is done.  Thus the guest can make sure the
+ * time values it got are consistent by checking the version before
+ * and after reading them.
+ */
+
+struct pvclock_vcpu_time_info {
+        u32   version;
+        u32   pad0;
+        u64   tsc_timestamp;
+        u64   system_time;
+        u32   tsc_to_system_mul;
+        s8    tsc_shift;
+        u8    pad[3];
+} __attribute__((__packed__)); /* 32 bytes */
+
+struct pvclock_wall_clock {
+        u32   version;
+        u32   sec;
+        u32   nsec;
+} __attribute__((__packed__));
+
+#endif /* __ASSEMBLY__ */
+#endif /* _ASM_X86_PVCLOCK_ABI_H_ */
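The header comment spells out the even/odd version protocol; the guest-side reader loop lives in pvclock.c above. For completeness, the writer side follows this shape (a hypothetical sketch, not code from this commit; in KVM that role is played by kvm_write_guest_time() in arch/x86/kvm/x86.c):

/* Hypothetical hypervisor-side update, sketched for illustration:
 * make version odd, write the payload, make version even again. */
static void pvclock_update(struct pvclock_vcpu_time_info *dst,
                           u64 tsc, u64 system_time)
{
        dst->version++;         /* odd: update in progress */
        smp_wmb();              /* order version bump before payload */

        dst->tsc_timestamp = tsc;
        dst->system_time = system_time;

        smp_wmb();              /* order payload before final bump */
        dst->version++;         /* even: snapshot is consistent */
}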
+13
include/asm-x86/pvclock.h
···
+#ifndef _ASM_X86_PVCLOCK_H_
+#define _ASM_X86_PVCLOCK_H_
+
+#include <linux/clocksource.h>
+#include <asm/pvclock-abi.h>
+
+/* some helper functions for xen and kvm pv clock sources */
+cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
+void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
+                            struct pvclock_vcpu_time_info *vcpu,
+                            struct timespec *ts);
+
+#endif /* _ASM_X86_PVCLOCK_H_ */
+1
include/linux/kvm_host.h
···
 #define KVM_REQ_REPORT_TPR_ACCESS  2
 #define KVM_REQ_MMU_RELOAD         3
 #define KVM_REQ_TRIPLE_FAULT       4
+#define KVM_REQ_PENDING_TIMER      5
 
 struct kvm_vcpu;
 extern struct kmem_cache *kvm_vcpu_cache;
+3 -4
include/xen/interface/xen.h
···
 #define __XEN_PUBLIC_XEN_H__
 
 #include <asm/xen/interface.h>
+#include <asm/pvclock-abi.h>
 
 /*
  * XEN "SYSTEM CALLS" (a.k.a. HYPERCALLS).
···
         uint8_t evtchn_upcall_mask;
         unsigned long evtchn_pending_sel;
         struct arch_vcpu_info arch;
-        struct vcpu_time_info time;
+        struct pvclock_vcpu_time_info time;
 }; /* 64 bytes (x86) */
 
 /*
···
  * Wallclock time: updated only by control software. Guests should base
  * their gettimeofday() syscall on this wallclock-base value.
  */
-        uint32_t wc_version;    /* Version counter: see vcpu_time_info_t. */
-        uint32_t wc_sec;        /* Secs  00:00:00 UTC, Jan 1, 1970.  */
-        uint32_t wc_nsec;       /* Nsecs 00:00:00 UTC, Jan 1, 1970.  */
+        struct pvclock_wall_clock wc;
 
         struct arch_shared_info arch;
 
+11 -20
virt/kvm/ioapic.c
···
         }
 }
 
-static int get_eoi_gsi(struct kvm_ioapic *ioapic, int vector)
+static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int gsi)
 {
-        int i;
-
-        for (i = 0; i < IOAPIC_NUM_PINS; i++)
-                if (ioapic->redirtbl[i].fields.vector == vector)
-                        return i;
-        return -1;
-}
-
-void kvm_ioapic_update_eoi(struct kvm *kvm, int vector)
-{
-        struct kvm_ioapic *ioapic = kvm->arch.vioapic;
         union ioapic_redir_entry *ent;
-        int gsi;
-
-        gsi = get_eoi_gsi(ioapic, vector);
-        if (gsi == -1) {
-                printk(KERN_WARNING "Can't find redir item for %d EOI\n",
-                       vector);
-                return;
-        }
 
         ent = &ioapic->redirtbl[gsi];
         ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
···
         ent->fields.remote_irr = 0;
         if (!ent->fields.mask && (ioapic->irr & (1 << gsi)))
                 ioapic_deliver(ioapic, gsi);
+}
+
+void kvm_ioapic_update_eoi(struct kvm *kvm, int vector)
+{
+        struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+        int i;
+
+        for (i = 0; i < IOAPIC_NUM_PINS; i++)
+                if (ioapic->redirtbl[i].fields.vector == vector)
+                        __kvm_ioapic_update_eoi(ioapic, i);
 }
 
 static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr)