rqspinlock: Hardcode cond_acquire loops for arm64

Currently, for rqspinlock usage, the implementation of
smp_cond_load_acquire (and thus, atomic_cond_read_acquire) are
susceptible to stalls on arm64, because they do not guarantee that the
conditional expression will be repeatedly invoked if the address being
loaded from is not written to by other CPUs. When support for
event-streams is absent (which unblocks stuck WFE-based loops every
~100us), we may end up being stuck forever.

This causes a problem for us, as we need to repeatedly invoke the
RES_CHECK_TIMEOUT in the spin loop to break out when the timeout
expires.

Let us import the smp_cond_load_acquire_timewait implementation Ankur is
proposing in [0], and then fallback to it once it is merged.

While we rely on the implementation to amortize the cost of sampling
check_timeout for us, it will not happen when event stream support is
unavailable. This is not the common case, and it would be difficult to
fit our logic in the time_expr_ns >= time_limit_ns comparison, hence
just let it be.

[0]: https://lore.kernel.org/lkml/20250203214911.898276-1-ankur.a.arora@oracle.com

Cc: Ankur Arora <ankur.a.arora@oracle.com>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20250316040541.108729-9-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>

authored by

Kumar Kartikeya Dwivedi and committed by

Alexei Starovoitov 1 year ago ebababcd 14c48ee8

+108

2 changed files

expand all

arch

arm64

include

asm

rqspinlock.h

kernel

bpf

rqspinlock.c

+93

arch/arm64/include/asm/rqspinlock.h

··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef _ASM_RQSPINLOCK_H 3 + #define _ASM_RQSPINLOCK_H 4 + 5 + #include <asm/barrier.h> 6 + 7 + /* 8 + * Hardcode res_smp_cond_load_acquire implementations for arm64 to a custom 9 + * version based on [0]. In rqspinlock code, our conditional expression involves 10 + * checking the value _and_ additionally a timeout. However, on arm64, the 11 + * WFE-based implementation may never spin again if no stores occur to the 12 + * locked byte in the lock word. As such, we may be stuck forever if 13 + * event-stream based unblocking is not available on the platform for WFE spin 14 + * loops (arch_timer_evtstrm_available). 15 + * 16 + * Once support for smp_cond_load_acquire_timewait [0] lands, we can drop this 17 + * copy-paste. 18 + * 19 + * While we rely on the implementation to amortize the cost of sampling 20 + * cond_expr for us, it will not happen when event stream support is 21 + * unavailable, time_expr check is amortized. This is not the common case, and 22 + * it would be difficult to fit our logic in the time_expr_ns >= time_limit_ns 23 + * comparison, hence just let it be. In case of event-stream, the loop is woken 24 + * up at microsecond granularity. 25 + * 26 + * [0]: https://lore.kernel.org/lkml/20250203214911.898276-1-ankur.a.arora@oracle.com 27 + */ 28 + 29 + #ifndef smp_cond_load_acquire_timewait 30 + 31 + #define smp_cond_time_check_count 200 32 + 33 + #define __smp_cond_load_relaxed_spinwait(ptr, cond_expr, time_expr_ns, \ 34 + time_limit_ns) ({ \ 35 + typeof(ptr) __PTR = (ptr); \ 36 + __unqual_scalar_typeof(*ptr) VAL; \ 37 + unsigned int __count = 0; \ 38 + for (;;) { \ 39 + VAL = READ_ONCE(*__PTR); \ 40 + if (cond_expr) \ 41 + break; \ 42 + cpu_relax(); \ 43 + if (__count++ < smp_cond_time_check_count) \ 44 + continue; \ 45 + if ((time_expr_ns) >= (time_limit_ns)) \ 46 + break; \ 47 + __count = 0; \ 48 + } \ 49 + (typeof(*ptr))VAL; \ 50 + }) 51 + 52 + #define __smp_cond_load_acquire_timewait(ptr, cond_expr, \ 53 + time_expr_ns, time_limit_ns) \ 54 + ({ \ 55 + typeof(ptr) __PTR = (ptr); \ 56 + __unqual_scalar_typeof(*ptr) VAL; \ 57 + for (;;) { \ 58 + VAL = smp_load_acquire(__PTR); \ 59 + if (cond_expr) \ 60 + break; \ 61 + __cmpwait_relaxed(__PTR, VAL); \ 62 + if ((time_expr_ns) >= (time_limit_ns)) \ 63 + break; \ 64 + } \ 65 + (typeof(*ptr))VAL; \ 66 + }) 67 + 68 + #define smp_cond_load_acquire_timewait(ptr, cond_expr, \ 69 + time_expr_ns, time_limit_ns) \ 70 + ({ \ 71 + __unqual_scalar_typeof(*ptr) _val; \ 72 + int __wfe = arch_timer_evtstrm_available(); \ 73 + \ 74 + if (likely(__wfe)) { \ 75 + _val = __smp_cond_load_acquire_timewait(ptr, cond_expr, \ 76 + time_expr_ns, \ 77 + time_limit_ns); \ 78 + } else { \ 79 + _val = __smp_cond_load_relaxed_spinwait(ptr, cond_expr, \ 80 + time_expr_ns, \ 81 + time_limit_ns); \ 82 + smp_acquire__after_ctrl_dep(); \ 83 + } \ 84 + (typeof(*ptr))_val; \ 85 + }) 86 + 87 + #endif 88 + 89 + #define res_smp_cond_load_acquire_timewait(v, c) smp_cond_load_acquire_timewait(v, c, 0, 1) 90 + 91 + #include <asm-generic/rqspinlock.h> 92 + 93 + #endif /* _ASM_RQSPINLOCK_H */

+15

kernel/bpf/rqspinlock.c

··· 92 92 return 0; 93 93 } 94 94 95 + /* 96 + * Do not amortize with spins when res_smp_cond_load_acquire is defined, 97 + * as the macro does internal amortization for us. 98 + */ 99 + #ifndef res_smp_cond_load_acquire 95 100 #define RES_CHECK_TIMEOUT(ts, ret) \ 96 101 ({ \ 97 102 if (!(ts).spin++) \ 98 103 (ret) = check_timeout(&(ts)); \ 99 104 (ret); \ 100 105 }) 106 + #else 107 + #define RES_CHECK_TIMEOUT(ts, ret, mask) \ 108 + ({ (ret) = check_timeout(&(ts)); }) 109 + #endif 101 110 102 111 /* 103 112 * Initialize the 'spin' member. ··· 126 117 * Exactly fits one 64-byte cacheline on a 64-bit architecture. 127 118 */ 128 119 static DEFINE_PER_CPU_ALIGNED(struct qnode, rqnodes[_Q_MAX_NODES]); 120 + 121 + #ifndef res_smp_cond_load_acquire 122 + #define res_smp_cond_load_acquire(v, c) smp_cond_load_acquire(v, c) 123 + #endif 124 + 125 + #define res_atomic_cond_read_acquire(v, c) res_smp_cond_load_acquire(&(v)->counter, (c)) 129 126 130 127 /** 131 128 * resilient_queued_spin_lock_slowpath - acquire the queued spinlock

Configure Feed

Configure Feed