Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm changes from Paolo Bonzini:
"Remove the handling of task migration during a pvclock read from the
guest code; instead, use the correct protocol in KVM.

This removes the need for task migration notifiers in core scheduler
code"

[ The scheduler people really hated the migration notifiers, so this was
kind of required - Linus ]

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
x86: pvclock: Really remove the sched notifier for cross-cpu migrations
kvm: x86: fix kvmclock update protocol

+43 -92
-1
arch/x86/include/asm/pvclock.h
··· 95 95 96 96 struct pvclock_vsyscall_time_info { 97 97 struct pvclock_vcpu_time_info pvti; 98 - u32 migrate_count; 99 98 } __attribute__((__aligned__(SMP_CACHE_BYTES))); 100 99 101 100 #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info)
-44
arch/x86/kernel/pvclock.c
··· 141 141 set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); 142 142 } 143 143 144 - static struct pvclock_vsyscall_time_info *pvclock_vdso_info; 145 - 146 - static struct pvclock_vsyscall_time_info * 147 - pvclock_get_vsyscall_user_time_info(int cpu) 148 - { 149 - if (!pvclock_vdso_info) { 150 - BUG(); 151 - return NULL; 152 - } 153 - 154 - return &pvclock_vdso_info[cpu]; 155 - } 156 - 157 - struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu) 158 - { 159 - return &pvclock_get_vsyscall_user_time_info(cpu)->pvti; 160 - } 161 - 162 144 #ifdef CONFIG_X86_64 163 - static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l, 164 - void *v) 165 - { 166 - struct task_migration_notifier *mn = v; 167 - struct pvclock_vsyscall_time_info *pvti; 168 - 169 - pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu); 170 - 171 - /* this is NULL when pvclock vsyscall is not initialized */ 172 - if (unlikely(pvti == NULL)) 173 - return NOTIFY_DONE; 174 - 175 - pvti->migrate_count++; 176 - 177 - return NOTIFY_DONE; 178 - } 179 - 180 - static struct notifier_block pvclock_migrate = { 181 - .notifier_call = pvclock_task_migrate, 182 - }; 183 - 184 145 /* 185 146 * Initialize the generic pvclock vsyscall state. This will allocate 186 147 * a/some page(s) for the per-vcpu pvclock information, set up a ··· 155 194 156 195 WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE); 157 196 158 - pvclock_vdso_info = i; 159 - 160 197 for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) { 161 198 __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx, 162 199 __pa(i) + (idx*PAGE_SIZE), 163 200 PAGE_KERNEL_VVAR); 164 201 } 165 - 166 - 167 - register_task_migration_notifier(&pvclock_migrate); 168 202 169 203 return 0; 170 204 }
+28 -5
arch/x86/kvm/x86.c
··· 1669 1669 &guest_hv_clock, sizeof(guest_hv_clock)))) 1670 1670 return 0; 1671 1671 1672 - /* 1673 - * The interface expects us to write an even number signaling that the 1674 - * update is finished. Since the guest won't see the intermediate 1675 - * state, we just increase by 2 at the end. 1672 + /* This VCPU is paused, but it's legal for a guest to read another 1673 + * VCPU's kvmclock, so we really have to follow the specification where 1674 + * it says that version is odd if data is being modified, and even after 1675 + * it is consistent. 1676 + * 1677 + * Version field updates must be kept separate. This is because 1678 + * kvm_write_guest_cached might use a "rep movs" instruction, and 1679 + * writes within a string instruction are weakly ordered. So there 1680 + * are three writes overall. 1681 + * 1682 + * As a small optimization, only write the version field in the first 1683 + * and third write. The vcpu->pv_time cache is still valid, because the 1684 + * version field is the first in the struct. 1676 1685 */ 1677 - vcpu->hv_clock.version = guest_hv_clock.version + 2; 1686 + BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0); 1687 + 1688 + vcpu->hv_clock.version = guest_hv_clock.version + 1; 1689 + kvm_write_guest_cached(v->kvm, &vcpu->pv_time, 1690 + &vcpu->hv_clock, 1691 + sizeof(vcpu->hv_clock.version)); 1692 + 1693 + smp_wmb(); 1678 1694 1679 1695 /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */ 1680 1696 pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED); ··· 1711 1695 kvm_write_guest_cached(v->kvm, &vcpu->pv_time, 1712 1696 &vcpu->hv_clock, 1713 1697 sizeof(vcpu->hv_clock)); 1698 + 1699 + smp_wmb(); 1700 + 1701 + vcpu->hv_clock.version++; 1702 + kvm_write_guest_cached(v->kvm, &vcpu->pv_time, 1703 + &vcpu->hv_clock, 1704 + sizeof(vcpu->hv_clock.version)); 1714 1705 return 0; 1715 1706 } 1716 1707
+15 -19
arch/x86/vdso/vclock_gettime.c
··· 82 82 cycle_t ret; 83 83 u64 last; 84 84 u32 version; 85 - u32 migrate_count; 86 85 u8 flags; 87 86 unsigned cpu, cpu1; 88 87 89 88 90 89 /* 91 - * When looping to get a consistent (time-info, tsc) pair, we 92 - * also need to deal with the possibility we can switch vcpus, 93 - * so make sure we always re-fetch time-info for the current vcpu. 90 + * Note: hypervisor must guarantee that: 91 + * 1. cpu ID number maps 1:1 to per-CPU pvclock time info. 92 + * 2. that per-CPU pvclock time info is updated if the 93 + * underlying CPU changes. 94 + * 3. that version is increased whenever underlying CPU 95 + * changes. 96 + * 94 97 */ 95 98 do { 96 99 cpu = __getcpu() & VGETCPU_CPU_MASK; ··· 102 99 * __getcpu() calls (Gleb). 103 100 */ 104 101 105 - /* Make sure migrate_count will change if we leave the VCPU. */ 106 - do { 107 - pvti = get_pvti(cpu); 108 - migrate_count = pvti->migrate_count; 109 - 110 - cpu1 = cpu; 111 - cpu = __getcpu() & VGETCPU_CPU_MASK; 112 - } while (unlikely(cpu != cpu1)); 102 + pvti = get_pvti(cpu); 113 103 114 104 version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags); 115 105 116 106 /* 117 107 * Test we're still on the cpu as well as the version. 118 - * - We must read TSC of pvti's VCPU. 119 - * - KVM doesn't follow the versioning protocol, so data could 120 - * change before version if we left the VCPU. 108 + * We could have been migrated just after the first 109 + * vgetcpu but before fetching the version, so we 110 + * wouldn't notice a version change. 121 111 */ 122 - smp_rmb(); 123 - } while (unlikely((pvti->pvti.version & 1) || 124 - pvti->pvti.version != version || 125 - pvti->migrate_count != migrate_count)); 112 + cpu1 = __getcpu() & VGETCPU_CPU_MASK; 113 + } while (unlikely(cpu != cpu1 || 114 + (pvti->pvti.version & 1) || 115 + pvti->pvti.version != version)); 126 116 127 117 if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT))) 128 118 *mode = VCLOCK_NONE;
-8
include/linux/sched.h
··· 175 175 extern void calc_global_load(unsigned long ticks); 176 176 extern void update_cpu_load_nohz(void); 177 177 178 - /* Notifier for when a task gets migrated to a new CPU */ 179 - struct task_migration_notifier { 180 - struct task_struct *task; 181 - int from_cpu; 182 - int to_cpu; 183 - }; 184 - extern void register_task_migration_notifier(struct notifier_block *n); 185 - 186 178 extern unsigned long get_parent_ip(unsigned long addr); 187 179 188 180 extern void dump_cpu_task(int cpu);
-15
kernel/sched/core.c
··· 1016 1016 rq_clock_skip_update(rq, true); 1017 1017 } 1018 1018 1019 - static ATOMIC_NOTIFIER_HEAD(task_migration_notifier); 1020 - 1021 - void register_task_migration_notifier(struct notifier_block *n) 1022 - { 1023 - atomic_notifier_chain_register(&task_migration_notifier, n); 1024 - } 1025 - 1026 1019 #ifdef CONFIG_SMP 1027 1020 void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 1028 1021 { ··· 1046 1053 trace_sched_migrate_task(p, new_cpu); 1047 1054 1048 1055 if (task_cpu(p) != new_cpu) { 1049 - struct task_migration_notifier tmn; 1050 - 1051 1056 if (p->sched_class->migrate_task_rq) 1052 1057 p->sched_class->migrate_task_rq(p, new_cpu); 1053 1058 p->se.nr_migrations++; 1054 1059 perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); 1055 - 1056 - tmn.task = p; 1057 - tmn.from_cpu = task_cpu(p); 1058 - tmn.to_cpu = new_cpu; 1059 - 1060 - atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn); 1061 1060 } 1062 1061 1063 1062 __set_task_cpu(p, new_cpu);