Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

mm: change vma_start_read() to drop RCU lock on failure

vma_start_read() can drop and reacquire the RCU read lock in certain
failure cases. It is not apparent to callers that the RCU read-side
critical section they started might be interrupted when vma_start_read()
fails to lock the vma. This could become a source of subtle bugs. To
prevent that, change the locking rules for vma_start_read() so that it
drops the RCU read lock upon failure. This way it is more obvious that
RCU-protected objects are unsafe to access after vma locking fails.

Link: https://lkml.kernel.org/r/20250804233349.1278678-2-surenb@google.com
Suggested-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Tested-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Suren Baghdasaryan and committed by
Andrew Morton
0b16f8be cc483b32

+45 -39
+45 -39
mm/mmap_lock.c
··· 136 136 * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got 137 137 * detached. 138 138 * 139 - * WARNING! The vma passed to this function cannot be used if the function 140 - * fails to lock it because in certain cases RCU lock is dropped and then 141 - * reacquired. Once RCU lock is dropped the vma can be concurently freed. 139 + * IMPORTANT: RCU lock must be held upon entering the function, but upon error 140 + * IT IS RELEASED. The caller must handle this correctly. 142 141 */ 143 142 static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, 144 143 struct vm_area_struct *vma) 145 144 { 145 + struct mm_struct *other_mm; 146 146 int oldcnt; 147 147 148 + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu lock held"); 148 149 /* 149 150 * Check before locking. A race might cause false locked result. 150 151 * We can use READ_ONCE() for the mm_lock_seq here, and don't need ··· 153 152 * we don't rely on for anything - the mm_lock_seq read against which we 154 153 * need ordering is below. 155 154 */ 156 - if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) 157 - return NULL; 155 + if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) { 156 + vma = NULL; 157 + goto err; 158 + } 158 159 159 160 /* 160 161 * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire() ··· 167 164 if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, 168 165 VMA_REF_LIMIT))) { 169 166 /* return EAGAIN if vma got detached from under us */ 170 - return oldcnt ? NULL : ERR_PTR(-EAGAIN); 167 + vma = oldcnt ? NULL : ERR_PTR(-EAGAIN); 168 + goto err; 171 169 } 172 170 173 171 rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); 174 172 175 - /* 176 - * If vma got attached to another mm from under us, that mm is not 177 - * stable and can be freed in the narrow window after vma->vm_refcnt 178 - * is dropped and before rcuwait_wake_up(mm) is called. 
Grab it before 179 - * releasing vma->vm_refcnt. 180 - */ 181 - if (unlikely(vma->vm_mm != mm)) { 182 - /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */ 183 - struct mm_struct *other_mm = vma->vm_mm; 184 - 185 - /* 186 - * __mmdrop() is a heavy operation and we don't need RCU 187 - * protection here. Release RCU lock during these operations. 188 - * We reinstate the RCU read lock as the caller expects it to 189 - * be held when this function returns even on error. 190 - */ 191 - rcu_read_unlock(); 192 - mmgrab(other_mm); 193 - vma_refcount_put(vma); 194 - mmdrop(other_mm); 195 - rcu_read_lock(); 196 - return NULL; 197 - } 173 + if (unlikely(vma->vm_mm != mm)) 174 + goto err_unstable; 198 175 199 176 /* 200 177 * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result. ··· 189 206 */ 190 207 if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) { 191 208 vma_refcount_put(vma); 192 - return NULL; 209 + vma = NULL; 210 + goto err; 193 211 } 194 212 195 213 return vma; 214 + err: 215 + rcu_read_unlock(); 216 + 217 + return vma; 218 + err_unstable: 219 + /* 220 + * If vma got attached to another mm from under us, that mm is not 221 + * stable and can be freed in the narrow window after vma->vm_refcnt 222 + * is dropped and before rcuwait_wake_up(mm) is called. Grab it before 223 + * releasing vma->vm_refcnt. 224 + */ 225 + other_mm = vma->vm_mm; /* use a copy as vma can be freed after we drop vm_refcnt */ 226 + 227 + /* __mmdrop() is a heavy operation, do it after dropping RCU lock. 
*/ 228 + rcu_read_unlock(); 229 + mmgrab(other_mm); 230 + vma_refcount_put(vma); 231 + mmdrop(other_mm); 232 + 233 + return NULL; 196 234 } 197 235 198 236 /* ··· 227 223 MA_STATE(mas, &mm->mm_mt, address, address); 228 224 struct vm_area_struct *vma; 229 225 230 - rcu_read_lock(); 231 226 retry: 227 + rcu_read_lock(); 232 228 vma = mas_walk(&mas); 233 - if (!vma) 229 + if (!vma) { 230 + rcu_read_unlock(); 234 231 goto inval; 232 + } 235 233 236 234 vma = vma_start_read(mm, vma); 237 235 if (IS_ERR_OR_NULL(vma)) { ··· 253 247 * From here on, we can access the VMA without worrying about which 254 248 * fields are accessible for RCU readers. 255 249 */ 250 + rcu_read_unlock(); 256 251 257 252 /* Check if the vma we locked is the right one. */ 258 - if (unlikely(address < vma->vm_start || address >= vma->vm_end)) 259 - goto inval_end_read; 253 + if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { 254 + vma_end_read(vma); 255 + goto inval; 256 + } 260 257 261 - rcu_read_unlock(); 262 258 return vma; 263 259 264 - inval_end_read: 265 - vma_end_read(vma); 266 260 inval: 267 - rcu_read_unlock(); 268 261 count_vm_vma_lock_event(VMA_LOCK_ABORT); 269 262 return NULL; 270 263 } ··· 318 313 */ 319 314 if (PTR_ERR(vma) == -EAGAIN) { 320 315 /* reset to search from the last address */ 316 + rcu_read_lock(); 321 317 vma_iter_set(vmi, from_addr); 322 318 goto retry; 323 319 } ··· 348 342 return vma; 349 343 350 344 fallback_unlock: 345 + rcu_read_unlock(); 351 346 vma_end_read(vma); 352 347 fallback: 353 - rcu_read_unlock(); 354 348 vma = lock_next_vma_under_mmap_lock(mm, vmi, from_addr); 355 349 rcu_read_lock(); 356 350 /* Reinitialize the iterator after re-entering rcu read section */