Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'kvm-x86-dirty_ring-6.17' of https://github.com/kvm-x86/linux into HEAD

KVM Dirty Ring changes for 6.17

Fix issues with dirty ring harvesting where KVM doesn't bound the processing
of entries in any way, which allows userspace to keep KVM in a tight loop
indefinitely. Clean up code and comments along the way.

+87 -47
+5 -13
include/linux/kvm_dirty_ring.h
···
49 49 }
50 50
51 51 static inline int kvm_dirty_ring_reset(struct kvm *kvm,
52 - struct kvm_dirty_ring *ring)
52 + struct kvm_dirty_ring *ring,
53 + int *nr_entries_reset)
53 54 {
54 - return 0;
55 + return -ENOENT;
55 56 }
56 57
57 58 static inline void kvm_dirty_ring_push(struct kvm_vcpu *vcpu,
···
78 77 u32 kvm_dirty_ring_get_rsvd_entries(struct kvm *kvm);
79 78 int kvm_dirty_ring_alloc(struct kvm *kvm, struct kvm_dirty_ring *ring,
80 79 int index, u32 size);
81 -
82 - /*
83 - * called with kvm->slots_lock held, returns the number of
84 - * processed pages.
85 - */
86 - int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring);
87 -
88 - /*
89 - * returns =0: successfully pushed
90 - * <0: unable to push, need to wait
91 - */
80 + int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring,
81 + int *nr_entries_reset);
92 82 void kvm_dirty_ring_push(struct kvm_vcpu *vcpu, u32 slot, u64 offset);
93 83
94 84 bool kvm_dirty_ring_check_request(struct kvm_vcpu *vcpu);
+76 -31
virt/kvm/dirty_ring.c
···
55 55 struct kvm_memory_slot *memslot;
56 56 int as_id, id;
57 57
58 - if (!mask)
59 - return;
60 -
61 58 as_id = slot >> 16;
62 59 id = (u16)slot;
63 60
···
102 105 return smp_load_acquire(&gfn->flags) & KVM_DIRTY_GFN_F_RESET;
103 106 }
104 107
105 - int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring)
108 + int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring,
109 + int *nr_entries_reset)
106 110 {
111 + /*
112 + * To minimize mmu_lock contention, batch resets for harvested entries
113 + * whose gfns are in the same slot, and are within N frame numbers of
114 + * each other, where N is the number of bits in an unsigned long. For
115 + * simplicity, process the current set of entries when the next entry
116 + * can't be included in the batch.
117 + *
118 + * Track the current batch slot, the gfn offset into the slot for the
119 + * batch, and the bitmask of gfns that need to be reset (relative to
120 + * offset). Note, the offset may be adjusted backwards, e.g. so that
121 + * a sequence of gfns X, X-1, ... X-N-1 can be batched.
122 + */
107 123 u32 cur_slot, next_slot;
108 124 u64 cur_offset, next_offset;
109 - unsigned long mask;
110 - int count = 0;
125 + unsigned long mask = 0;
111 126 struct kvm_dirty_gfn *entry;
112 - bool first_round = true;
113 127
114 - /* This is only needed to make compilers happy */
115 - cur_slot = cur_offset = mask = 0;
128 + /*
129 + * Ensure concurrent calls to KVM_RESET_DIRTY_RINGS are serialized,
130 + * e.g. so that KVM fully resets all entries processed by a given call
131 + * before returning to userspace. Holding slots_lock also protects
132 + * the various memslot accesses.
133 + */
134 + lockdep_assert_held(&kvm->slots_lock);
116 135
117 - while (true) {
136 + while (likely((*nr_entries_reset) < INT_MAX)) {
137 + if (signal_pending(current))
138 + return -EINTR;
139 +
118 140 entry = &ring->dirty_gfns[ring->reset_index & (ring->size - 1)];
119 141
120 142 if (!kvm_dirty_gfn_harvested(entry))
···
146 130 kvm_dirty_gfn_set_invalid(entry);
147 131
148 132 ring->reset_index++;
149 - count++;
150 - /*
151 - * Try to coalesce the reset operations when the guest is
152 - * scanning pages in the same slot.
153 - */
154 - if (!first_round && next_slot == cur_slot) {
155 - s64 delta = next_offset - cur_offset;
133 + (*nr_entries_reset)++;
156 134
157 - if (delta >= 0 && delta < BITS_PER_LONG) {
158 - mask |= 1ull << delta;
159 - continue;
135 + if (mask) {
136 + /*
137 + * While the size of each ring is fixed, it's possible
138 + * for the ring to be constantly re-dirtied/harvested
139 + * while the reset is in-progress (the hard limit exists
140 + * only to guard against the count becoming negative).
141 + */
142 + cond_resched();
143 +
144 + /*
145 + * Try to coalesce the reset operations when the guest
146 + * is scanning pages in the same slot.
147 + */
148 + if (next_slot == cur_slot) {
149 + s64 delta = next_offset - cur_offset;
150 +
151 + if (delta >= 0 && delta < BITS_PER_LONG) {
152 + mask |= 1ull << delta;
153 + continue;
154 + }
155 +
156 + /* Backwards visit, careful about overflows! */
157 + if (delta > -BITS_PER_LONG && delta < 0 &&
158 + (mask << -delta >> -delta) == mask) {
159 + cur_offset = next_offset;
160 + mask = (mask << -delta) | 1;
161 + continue;
162 + }
160 163 }
161 164
162 - /* Backwards visit, careful about overflows! */
163 - if (delta > -BITS_PER_LONG && delta < 0 &&
164 - (mask << -delta >> -delta) == mask) {
165 - cur_offset = next_offset;
166 - mask = (mask << -delta) | 1;
167 - continue;
168 - }
165 + /*
166 + * Reset the slot for all the harvested entries that
167 + * have been gathered, but not yet fully processed.
168 + */
169 + kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
169 170 }
170 - kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
171 +
172 + /*
173 + * The current slot was reset or this is the first harvested
174 + * entry, (re)initialize the metadata.
175 + */
171 176 cur_slot = next_slot;
172 177 cur_offset = next_offset;
173 178 mask = 1;
174 - first_round = false;
175 179 }
176 180
177 - kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
181 + /*
182 + * Perform a final reset if there are harvested entries that haven't
183 + * been processed, which is guaranteed if at least one harvested was
184 + * found. The loop only performs a reset when the "next" entry can't
185 + * be batched with the "current" entry(s), and that reset processes the
186 + * _current_ entry(s); i.e. the last harvested entry, a.k.a. next, will
187 + * always be left pending.
188 + */
189 + if (mask)
190 + kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
178 191
179 192 /*
180 193 * The request KVM_REQ_DIRTY_RING_SOFT_FULL will be cleared
···
212 167
213 168 trace_kvm_dirty_ring_reset(ring);
214 169
215 - return count;
170 + return 0;
171 216 }
172 217
173 218 void kvm_dirty_ring_push(struct kvm_vcpu *vcpu, u32 slot, u64 offset)
+6 -3
virt/kvm/kvm_main.c
···
4967 4967 {
4968 4968 unsigned long i;
4969 4969 struct kvm_vcpu *vcpu;
4970 - int cleared = 0;
4970 + int cleared = 0, r;
4971 4971
4972 4972 if (!kvm->dirty_ring_size)
4973 4973 return -EINVAL;
4974 4974
4975 4975 mutex_lock(&kvm->slots_lock);
4976 4976
4977 - kvm_for_each_vcpu(i, vcpu, kvm)
4978 - cleared += kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring);
4977 + kvm_for_each_vcpu(i, vcpu, kvm) {
4978 + r = kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring, &cleared);
4979 + if (r)
4980 + break;
4981 + }
4979 4982
4980 4983 mutex_unlock(&kvm->slots_lock);
4981 4984