Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel
os
linux
1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef _LINUX_MMAP_LOCK_H
3#define _LINUX_MMAP_LOCK_H
4
5/* Avoid a dependency loop by declaring here. */
6extern int rcuwait_wake_up(struct rcuwait *w);
7
8#include <linux/lockdep.h>
9#include <linux/mm_types.h>
10#include <linux/mmdebug.h>
11#include <linux/rwsem.h>
12#include <linux/tracepoint-defs.h>
13#include <linux/types.h>
14#include <linux/cleanup.h>
15#include <linux/sched/mm.h>
16
/* Statically initialise the mmap_lock field of an mm_struct. */
#define MMAP_LOCK_INITIALIZER(name) \
	.mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock),

/*
 * Lightweight tracepoint declarations so the inline wrappers below can
 * cheaply test whether a tracepoint is enabled before calling out of line.
 */
DECLARE_TRACEPOINT(mmap_lock_start_locking);
DECLARE_TRACEPOINT(mmap_lock_acquire_returned);
DECLARE_TRACEPOINT(mmap_lock_released);
23
#ifdef CONFIG_TRACING

void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write);
void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
					   bool success);
void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write);

/* Emit the start-locking tracepoint, if enabled, before an acquisition. */
static inline void __mmap_lock_trace_start_locking(struct mm_struct *mm,
						   bool write)
{
	if (tracepoint_enabled(mmap_lock_start_locking))
		__mmap_lock_do_trace_start_locking(mm, write);
}

/*
 * Emit the acquire-returned tracepoint, if enabled, after an acquisition
 * attempt completes; @success records whether the lock was obtained.
 */
static inline void __mmap_lock_trace_acquire_returned(struct mm_struct *mm,
						      bool write, bool success)
{
	if (tracepoint_enabled(mmap_lock_acquire_returned))
		__mmap_lock_do_trace_acquire_returned(mm, write, success);
}

/* Emit the released tracepoint, if enabled, when the lock is dropped. */
static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write)
{
	if (tracepoint_enabled(mmap_lock_released))
		__mmap_lock_do_trace_released(mm, write);
}

#else /* !CONFIG_TRACING */

/* No-op stubs so callers need not be conditional on CONFIG_TRACING. */
static inline void __mmap_lock_trace_start_locking(struct mm_struct *mm,
						   bool write)
{
}

static inline void __mmap_lock_trace_acquire_returned(struct mm_struct *mm,
						      bool write, bool success)
{
}

static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write)
{
}

#endif /* CONFIG_TRACING */
68
/* Assert that the mmap_lock of @mm is held, for read or write. */
static inline void mmap_assert_locked(const struct mm_struct *mm)
{
	rwsem_assert_held(&mm->mmap_lock);
}

/* Assert that the mmap_lock of @mm is held for write. */
static inline void mmap_assert_write_locked(const struct mm_struct *mm)
{
	rwsem_assert_held_write(&mm->mmap_lock);
}
78
#ifdef CONFIG_PER_VMA_LOCK

/* Resolve to the VMA's lockdep map, or NULL when lockdep is disabled. */
#ifdef CONFIG_LOCKDEP
#define __vma_lockdep_map(vma) (&vma->vmlock_dep_map)
#else
#define __vma_lockdep_map(vma) NULL
#endif

/*
 * VMA locks do not behave like most ordinary locks found in the kernel, so we
 * cannot quite have full lockdep tracking in the way we would ideally prefer.
 *
 * Read locks act as shared locks which exclude an exclusive lock being
 * taken. We therefore mark these accordingly on read lock acquire/release.
 *
 * Write locks are acquired exclusively per-VMA, but released in a shared
 * fashion, that is upon vma_end_write_all(), we update the mmap's seqcount such
 * that write lock is released.
 *
 * We therefore cannot track write locks per-VMA, nor do we try. Mitigating this
 * is the fact that, of course, we do lockdep-track the mmap lock rwsem which
 * must be held when taking a VMA write lock.
 *
 * We do, however, want to indicate that during either acquisition of a VMA
 * write lock or detachment of a VMA that we require the lock held be exclusive,
 * so we utilise lockdep to do so.
 */
#define __vma_lockdep_acquire_read(vma) \
	lock_acquire_shared(__vma_lockdep_map(vma), 0, 1, NULL, _RET_IP_)
#define __vma_lockdep_release_read(vma) \
	lock_release(__vma_lockdep_map(vma), _RET_IP_)
#define __vma_lockdep_acquire_exclusive(vma) \
	lock_acquire_exclusive(__vma_lockdep_map(vma), 0, 0, NULL, _RET_IP_)
#define __vma_lockdep_release_exclusive(vma) \
	lock_release(__vma_lockdep_map(vma), _RET_IP_)
/* Only meaningful if CONFIG_LOCK_STAT is defined. */
#define __vma_lockdep_stat_mark_acquired(vma) \
	lock_acquired(__vma_lockdep_map(vma), _RET_IP_)
117
/* Initialise the mmap lock sequence count used to track VMA write locking. */
static inline void mm_lock_seqcount_init(struct mm_struct *mm)
{
	seqcount_init(&mm->mm_lock_seq);
}

/* Begin the write side of mm->mm_lock_seq (raw - no lockdep association). */
static inline void mm_lock_seqcount_begin(struct mm_struct *mm)
{
	do_raw_write_seqcount_begin(&mm->mm_lock_seq);
}

/* End the write side of mm->mm_lock_seq; KCSAN-checks for a lone writer. */
static inline void mm_lock_seqcount_end(struct mm_struct *mm)
{
	ASSERT_EXCLUSIVE_WRITER(mm->mm_lock_seq);
	do_raw_write_seqcount_end(&mm->mm_lock_seq);
}

/*
 * Try to begin a speculative (lockless) read section against the mmap lock,
 * storing the observed sequence number in *seq. Returns false if a write is
 * in progress.
 */
static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq)
{
	/*
	 * Since mmap_lock is a sleeping lock, and waiting for it to become
	 * unlocked is more or less equivalent with taking it ourselves, don't
	 * bother with the speculative path if mmap_lock is already write-locked
	 * and take the slow path, which takes the lock.
	 */
	return raw_seqcount_try_begin(&mm->mm_lock_seq, *seq);
}

/*
 * Check whether a speculative section begun at sequence number @seq raced
 * with a writer. Returns true if the caller must retry.
 */
static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq)
{
	return read_seqcount_retry(&mm->mm_lock_seq, seq);
}
149
/*
 * Initialise the per-VMA lock state of @vma.
 *
 * @reset_refcnt: if true, also reset vm_refcnt to 0 (detached). vm_lock_seq
 * is set to the sentinel UINT_MAX so the VMA does not appear write-locked
 * against the current mm_lock_seq.
 */
static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt)
{
#ifdef CONFIG_DEBUG_LOCK_ALLOC
	static struct lock_class_key lockdep_key;

	lockdep_init_map(__vma_lockdep_map(vma), "vm_lock", &lockdep_key, 0);
#endif
	if (reset_refcnt)
		refcount_set(&vma->vm_refcnt, 0);
	vma->vm_lock_seq = UINT_MAX;
}
161
162/*
163 * This function determines whether the input VMA reference count describes a
164 * VMA which has excluded all VMA read locks.
165 *
166 * In the case of a detached VMA, we may incorrectly indicate that readers are
167 * excluded when one remains, because in that scenario we target a refcount of
168 * VM_REFCNT_EXCLUDE_READERS_FLAG, rather than the attached target of
169 * VM_REFCNT_EXCLUDE_READERS_FLAG + 1.
170 *
171 * However, the race window for that is very small so it is unlikely.
172 *
173 * Returns: true if readers are excluded, false otherwise.
174 */
175static inline bool __vma_are_readers_excluded(int refcnt)
176{
177 /*
178 * See the comment describing the vm_area_struct->vm_refcnt field for
179 * details of possible refcnt values.
180 */
181 return (refcnt & VM_REFCNT_EXCLUDE_READERS_FLAG) &&
182 refcnt <= VM_REFCNT_EXCLUDE_READERS_FLAG + 1;
183}
184
/*
 * Actually decrement the VMA reference count.
 *
 * The function returns the reference count as it was immediately after the
 * decrement took place. If it returns zero, the VMA is now detached.
 */
static inline __must_check unsigned int
__vma_refcount_put_return(struct vm_area_struct *vma)
{
	int oldcnt;

	if (__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt))
		return 0;

	/* oldcnt is the value prior to the decrement. */
	return oldcnt - 1;
}

/**
 * vma_refcount_put() - Drop reference count in VMA vm_refcnt field due to a
 * read-lock being dropped.
 * @vma: The VMA whose reference count we wish to decrement.
 *
 * If we were the last reader, wake up threads waiting to obtain an exclusive
 * lock.
 */
static inline void vma_refcount_put(struct vm_area_struct *vma)
{
	/* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt. */
	struct mm_struct *mm = vma->vm_mm;
	int newcnt;

	/* Release the lockdep read acquisition before the count can hit 0. */
	__vma_lockdep_release_read(vma);
	newcnt = __vma_refcount_put_return(vma);

	/*
	 * __vma_start_exclude_readers() may be sleeping waiting for readers to
	 * drop their reference count, so wake it up if we were the last reader
	 * blocking it from being acquired.
	 *
	 * We may be raced by other readers temporarily incrementing the
	 * reference count, though the race window is very small, this might
	 * cause spurious wakeups.
	 */
	if (newcnt && __vma_are_readers_excluded(newcnt))
		rcuwait_wake_up(&mm->vma_writer_wait);
}
231
/*
 * Use only while holding mmap read lock which guarantees that locking will not
 * fail (nobody can concurrently write-lock the vma). vma_start_read() should
 * not be used in such cases because it might fail due to mm_lock_seq overflow.
 * This functionality is used to obtain vma read lock and drop the mmap read lock.
 *
 * Returns: false only if the reference count limit was hit, true otherwise.
 */
static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass)
{
	int oldcnt;

	mmap_assert_locked(vma->vm_mm);
	if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
							      VM_REFCNT_LIMIT)))
		return false;

	__vma_lockdep_acquire_read(vma);
	return true;
}

/*
 * Non-nested variant of vma_start_read_locked_nested() - see the comment
 * there for usage constraints (mmap read lock must be held).
 */
static inline bool vma_start_read_locked(struct vm_area_struct *vma)
{
	return vma_start_read_locked_nested(vma, 0);
}

/* Release a VMA read lock by dropping the reference it holds. */
static inline void vma_end_read(struct vm_area_struct *vma)
{
	vma_refcount_put(vma);
}
266
267static inline unsigned int __vma_raw_mm_seqnum(struct vm_area_struct *vma)
268{
269 const struct mm_struct *mm = vma->vm_mm;
270
271 /* We must hold an exclusive write lock for this access to be valid. */
272 mmap_assert_write_locked(vma->vm_mm);
273 return mm->mm_lock_seq.sequence;
274}
275
/*
 * Determine whether a VMA is write-locked. Must be invoked ONLY if the mmap
 * write lock is held.
 *
 * Returns true if write-locked, otherwise false.
 */
static inline bool __is_vma_write_locked(struct vm_area_struct *vma)
{
	/*
	 * current task is holding mmap_write_lock, both vma->vm_lock_seq and
	 * mm->mm_lock_seq can't be concurrently modified.
	 */
	return vma->vm_lock_seq == __vma_raw_mm_seqnum(vma);
}

/* Slow path; @state is the task state to sleep in while excluding readers. */
int __vma_start_write(struct vm_area_struct *vma, int state);

/*
 * Begin writing to a VMA.
 * Exclude concurrent readers under the per-VMA lock until the currently
 * write-locked mmap_lock is dropped or downgraded.
 */
static inline void vma_start_write(struct vm_area_struct *vma)
{
	/* Already write-locked in this mmap_lock cycle - nothing to do. */
	if (__is_vma_write_locked(vma))
		return;

	__vma_start_write(vma, TASK_UNINTERRUPTIBLE);
}
305
/**
 * vma_start_write_killable - Begin writing to a VMA.
 * @vma: The VMA we are going to modify.
 *
 * Exclude concurrent readers under the per-VMA lock until the currently
 * write-locked mmap_lock is dropped or downgraded.
 *
 * Context: May sleep while waiting for readers to drop the vma read lock.
 * Caller must already hold the mmap_lock for write.
 *
 * Return: 0 for a successful acquisition. -EINTR if a fatal signal was
 * received.
 */
static inline __must_check
int vma_start_write_killable(struct vm_area_struct *vma)
{
	/* Already write-locked in this mmap_lock cycle - nothing to do. */
	if (__is_vma_write_locked(vma))
		return 0;

	return __vma_start_write(vma, TASK_KILLABLE);
}

/**
 * vma_assert_write_locked() - assert that @vma holds a VMA write lock.
 * @vma: The VMA to assert.
 */
static inline void vma_assert_write_locked(struct vm_area_struct *vma)
{
	VM_WARN_ON_ONCE_VMA(!__is_vma_write_locked(vma), vma);
}
336
/**
 * vma_assert_locked() - assert that @vma holds either a VMA read or a VMA write
 * lock and is not detached.
 * @vma: The VMA to assert.
 */
static inline void vma_assert_locked(struct vm_area_struct *vma)
{
	unsigned int refcnt;

	/*
	 * With lockdep we can check precisely: either lockdep tracks our read
	 * lock, or we must hold the (lockdep-untracked) write lock.
	 */
	if (IS_ENABLED(CONFIG_LOCKDEP)) {
		if (!lock_is_held(__vma_lockdep_map(vma)))
			vma_assert_write_locked(vma);
		return;
	}

	/*
	 * See the comment describing the vm_area_struct->vm_refcnt field for
	 * details of possible refcnt values.
	 */
	refcnt = refcount_read(&vma->vm_refcnt);

	/*
	 * In this case we're either read-locked, write-locked with temporary
	 * readers, or in the midst of excluding readers, all of which means
	 * we're locked.
	 */
	if (refcnt > 1)
		return;

	/* It is a bug for the VMA to be detached here. */
	VM_WARN_ON_ONCE_VMA(!refcnt, vma);

	/*
	 * OK, the VMA has a reference count of 1 which means it is either
	 * unlocked and attached or write-locked, so assert that it is
	 * write-locked.
	 */
	vma_assert_write_locked(vma);
}
376
/**
 * vma_assert_stabilised() - assert that this VMA cannot be changed from
 * underneath us either by having a VMA or mmap lock held.
 * @vma: The VMA whose stability we wish to assess.
 *
 * If lockdep is enabled we can precisely ensure stability via either an mmap
 * lock owned by us or a specific VMA lock.
 *
 * With lockdep disabled we may sometimes race with other threads acquiring the
 * mmap read lock simultaneous with our VMA read lock.
 */
static inline void vma_assert_stabilised(struct vm_area_struct *vma)
{
	/*
	 * If another thread owns an mmap lock, it may go away at any time, and
	 * thus is no guarantee of stability.
	 *
	 * If lockdep is enabled we can accurately determine if an mmap lock is
	 * held and owned by us. Otherwise we must approximate.
	 *
	 * It doesn't necessarily mean we are not stabilised however, as we may
	 * hold a VMA read lock (not a write lock as this would require an owned
	 * mmap lock).
	 *
	 * If (assuming lockdep is not enabled) we were to assert a VMA read
	 * lock first we may also run into issues, as other threads can hold VMA
	 * read locks simultaneous to us.
	 *
	 * Therefore if lockdep is not enabled we risk a false negative (i.e. no
	 * assert fired). If accurate checking is required, enable lockdep.
	 */
	if (IS_ENABLED(CONFIG_LOCKDEP)) {
		if (lockdep_is_held(&vma->vm_mm->mmap_lock))
			return;
	} else {
		if (rwsem_is_locked(&vma->vm_mm->mmap_lock))
			return;
	}

	/*
	 * We're not stabilised by the mmap lock, so assert that we're
	 * stabilised by a VMA lock.
	 */
	vma_assert_locked(vma);
}
422
423static inline bool vma_is_attached(struct vm_area_struct *vma)
424{
425 return refcount_read(&vma->vm_refcnt);
426}
427
/*
 * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these
 * assertions should be made either under mmap_write_lock or when the object
 * has been isolated under mmap_write_lock, ensuring no competing writers.
 */
static inline void vma_assert_attached(struct vm_area_struct *vma)
{
	WARN_ON_ONCE(!vma_is_attached(vma));
}

static inline void vma_assert_detached(struct vm_area_struct *vma)
{
	WARN_ON_ONCE(vma_is_attached(vma));
}

/*
 * Mark a previously-detached, write-locked VMA as attached by installing
 * its initial vm_refcnt reference (with release ordering).
 */
static inline void vma_mark_attached(struct vm_area_struct *vma)
{
	vma_assert_write_locked(vma);
	vma_assert_detached(vma);
	refcount_set_release(&vma->vm_refcnt, 1);
}
449
/* Slow path for vma_mark_detached(): wait out any transient readers. */
void __vma_exclude_readers_for_detach(struct vm_area_struct *vma);

static inline void vma_mark_detached(struct vm_area_struct *vma)
{
	vma_assert_write_locked(vma);
	vma_assert_attached(vma);

	/*
	 * The VMA still being attached (refcnt > 0) after the put below is
	 * unlikely, because the vma has been already write-locked and readers
	 * can increment vm_refcnt only temporarily before they check
	 * vm_lock_seq, realize the vma is locked and drop back the vm_refcnt.
	 * That is a narrow window for observing a raised vm_refcnt.
	 *
	 * See the comment describing the vm_area_struct->vm_refcnt field for
	 * details of possible refcnt values.
	 */
	if (likely(!__vma_refcount_put_return(vma)))
		return;

	__vma_exclude_readers_for_detach(vma);
}
472
/* Attempt to read-lock the VMA covering @address; NULL if that fails. */
struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
					  unsigned long address);

/*
 * Locks next vma pointed by the iterator. Confirms the locked vma has not
 * been modified and will retry under mmap_lock protection if modification
 * was detected. Should be called from within an RCU read-side critical
 * section.
 * Returns either a valid locked VMA, NULL if no more VMAs or -EINTR if the
 * process was interrupted.
 */
struct vm_area_struct *lock_next_vma(struct mm_struct *mm,
				     struct vma_iterator *iter,
				     unsigned long address);
486
#else /* CONFIG_PER_VMA_LOCK */

/*
 * Without per-VMA locks the mmap_lock alone serialises VMA access, so most
 * of the per-VMA lock API reduces to no-ops or mmap_lock assertions.
 */
static inline void mm_lock_seqcount_init(struct mm_struct *mm) {}
static inline void mm_lock_seqcount_begin(struct mm_struct *mm) {}
static inline void mm_lock_seqcount_end(struct mm_struct *mm) {}

/* No speculation possible - callers must fall back to taking mmap_lock. */
static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq)
{
	return false;
}

static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq)
{
	return true;
}
static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {}
static inline void vma_end_read(struct vm_area_struct *vma) {}
static inline void vma_start_write(struct vm_area_struct *vma) {}
static inline __must_check
int vma_start_write_killable(struct vm_area_struct *vma) { return 0; }
static inline void vma_assert_write_locked(struct vm_area_struct *vma)
	{ mmap_assert_write_locked(vma->vm_mm); }
static inline void vma_assert_attached(struct vm_area_struct *vma) {}
static inline void vma_assert_detached(struct vm_area_struct *vma) {}
static inline void vma_mark_attached(struct vm_area_struct *vma) {}
static inline void vma_mark_detached(struct vm_area_struct *vma) {}

static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
		unsigned long address)
{
	return NULL;
}

static inline void vma_assert_locked(struct vm_area_struct *vma)
{
	mmap_assert_locked(vma->vm_mm);
}

static inline void vma_assert_stabilised(struct vm_area_struct *vma)
{
	/* If no VMA locks, then either mmap lock suffices to stabilise. */
	mmap_assert_locked(vma->vm_mm);
}

#endif /* CONFIG_PER_VMA_LOCK */
532
/* Acquire the mmap_lock for write and begin the VMA write-lock seqcount. */
static inline void mmap_write_lock(struct mm_struct *mm)
{
	__mmap_lock_trace_start_locking(mm, true);
	down_write(&mm->mmap_lock);
	mm_lock_seqcount_begin(mm);
	__mmap_lock_trace_acquire_returned(mm, true, true);
}

/* As mmap_write_lock(), with a lockdep subclass for nested acquisition. */
static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass)
{
	__mmap_lock_trace_start_locking(mm, true);
	down_write_nested(&mm->mmap_lock, subclass);
	mm_lock_seqcount_begin(mm);
	__mmap_lock_trace_acquire_returned(mm, true, true);
}

/*
 * As mmap_write_lock(), but the sleep may be interrupted by a fatal signal.
 * Returns 0 on success or a negative error if the wait was interrupted.
 */
static inline int __must_check mmap_write_lock_killable(struct mm_struct *mm)
{
	int ret;

	__mmap_lock_trace_start_locking(mm, true);
	ret = down_write_killable(&mm->mmap_lock);
	/* Only start the seqcount write if we actually got the lock. */
	if (!ret)
		mm_lock_seqcount_begin(mm);
	__mmap_lock_trace_acquire_returned(mm, true, ret == 0);
	return ret;
}
560
/*
 * Drop all currently-held per-VMA locks.
 * This is called from the mmap_lock implementation directly before releasing
 * a write-locked mmap_lock (or downgrading it to read-locked).
 * This should normally NOT be called manually from other places.
 * If you want to call this manually anyway, keep in mind that this will release
 * *all* VMA write locks, including ones from further up the stack.
 */
static inline void vma_end_write_all(struct mm_struct *mm)
{
	mmap_assert_write_locked(mm);
	mm_lock_seqcount_end(mm);
}

/* Release a write-locked mmap_lock, dropping all VMA write locks first. */
static inline void mmap_write_unlock(struct mm_struct *mm)
{
	__mmap_lock_trace_released(mm, true);
	vma_end_write_all(mm);
	up_write(&mm->mmap_lock);
}

/*
 * Downgrade a write-locked mmap_lock to read-locked. All VMA write locks are
 * dropped; the transition is traced as acquiring the lock for read.
 */
static inline void mmap_write_downgrade(struct mm_struct *mm)
{
	__mmap_lock_trace_acquire_returned(mm, false, true);
	vma_end_write_all(mm);
	downgrade_write(&mm->mmap_lock);
}
588
/* Acquire the mmap_lock for read, sleeping uninterruptibly if contended. */
static inline void mmap_read_lock(struct mm_struct *mm)
{
	__mmap_lock_trace_start_locking(mm, false);
	down_read(&mm->mmap_lock);
	__mmap_lock_trace_acquire_returned(mm, false, true);
}

/*
 * As mmap_read_lock(), but the sleep may be interrupted by a fatal signal.
 * Returns 0 on success or a negative error if the wait was interrupted.
 */
static inline int __must_check mmap_read_lock_killable(struct mm_struct *mm)
{
	int ret;

	__mmap_lock_trace_start_locking(mm, false);
	ret = down_read_killable(&mm->mmap_lock);
	__mmap_lock_trace_acquire_returned(mm, false, ret == 0);
	return ret;
}

/* Try to acquire the mmap_lock for read without sleeping. */
static inline bool __must_check mmap_read_trylock(struct mm_struct *mm)
{
	bool ret;

	__mmap_lock_trace_start_locking(mm, false);
	ret = down_read_trylock(&mm->mmap_lock) != 0;
	__mmap_lock_trace_acquire_returned(mm, false, ret);
	return ret;
}

/* Release a read-locked mmap_lock. */
static inline void mmap_read_unlock(struct mm_struct *mm)
{
	__mmap_lock_trace_released(mm, false);
	up_read(&mm->mmap_lock);
}

/* Scope-based guard: read-locks on entry, unlocks when it goes out of scope. */
DEFINE_GUARD(mmap_read_lock, struct mm_struct *,
	     mmap_read_lock(_T), mmap_read_unlock(_T))
624
/*
 * Release a read-locked mmap_lock on behalf of a task other than the one
 * that acquired it (skips rwsem owner bookkeeping).
 */
static inline void mmap_read_unlock_non_owner(struct mm_struct *mm)
{
	__mmap_lock_trace_released(mm, false);
	up_read_non_owner(&mm->mmap_lock);
}

/* Returns non-zero if tasks are currently waiting on the mmap_lock. */
static inline int mmap_lock_is_contended(struct mm_struct *mm)
{
	return rwsem_is_contended(&mm->mmap_lock);
}
635
636#endif /* _LINUX_MMAP_LOCK_H */