Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

tracing: Guard __DECLARE_TRACE() use of __DO_TRACE_CALL() with SRCU-fast

The current use of guard(preempt_notrace)() within __DECLARE_TRACE()
to protect invocation of __DO_TRACE_CALL() means that BPF programs
attached to tracepoints are non-preemptible. This is unhelpful in
real-time systems, whose users apparently wish to use BPF while also
achieving low latencies. (Who knew?)
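
For reference, guard(preempt_notrace)() is scope-based sugar for the notrace
preemption-control pair, so the tracepoint body runs non-preemptible for its
whole scope. A minimal open-coded sketch of the effect (not the kernel's
exact guard machinery):

	preempt_disable_notrace();		/* guard constructor */
	__DO_TRACE_CALL(name, TP_ARGS(args));	/* BPF programs run here, non-preemptible */
	preempt_enable_notrace();		/* guard cleanup, at scope exit */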

One option would be to use preemptible RCU, but this introduces
many opportunities for infinite recursion, which many consider to
be counterproductive, especially given the relatively small stacks
provided by the Linux kernel. These opportunities could be shut down
by sufficiently energetic duplication of code, but this sort of thing
is considered impolite in some circles.

Therefore, use the shiny new SRCU-fast API, which provides somewhat faster
readers than those of preemptible RCU, at least on Paul E. McKenney's
laptop, where task_struct access is more expensive than access to per-CPU
variables. And SRCU-fast provides way faster readers than does SRCU,
courtesy of being able to avoid the read-side use of smp_mb(). Also,
it is quite straightforward to create srcu_read_{,un}lock_fast_notrace()
functions.
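
As a rough sketch of the resulting read-side pattern (assuming the notrace
variants share the srcu_read_{,un}lock_fast() signatures, which traffic in a
per-CPU counter pointer rather than an index):

	struct srcu_ctr __percpu *scp;

	scp = srcu_read_lock_fast_notrace(&tracepoint_srcu);
	/* Preemptible read-side critical section, no smp_mb() on this path. */
	srcu_read_unlock_fast_notrace(&tracepoint_srcu, scp);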

Link: https://lore.kernel.org/all/20250613152218.1924093-1-bigeasy@linutronix.de/

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Alexei Starovoitov <ast@kernel.org>
Link: https://patch.msgid.link/20260126231256.499701982@kernel.org
Co-developed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>

3 files changed: +21 -10

include/linux/tracepoint.h (+5 -4)
···
  * An alternative is to use the following for batch reclaim associated
  * with a given tracepoint:
  *
- * - tracepoint_is_faultable() == false: call_rcu()
+ * - tracepoint_is_faultable() == false: call_srcu()
  * - tracepoint_is_faultable() == true: call_rcu_tasks_trace()
  */
 #ifdef CONFIG_TRACEPOINTS
+extern struct srcu_struct tracepoint_srcu;
 static inline void tracepoint_synchronize_unregister(void)
 {
 	synchronize_rcu_tasks_trace();
-	synchronize_rcu();
+	synchronize_srcu(&tracepoint_srcu);
 }
 static inline bool tracepoint_is_faultable(struct tracepoint *tp)
 {
···
 	return static_branch_unlikely(&__tracepoint_##name.key);\
 }
 
-#define __DECLARE_TRACE(name, proto, args, cond, data_proto) \
+#define __DECLARE_TRACE(name, proto, args, cond, data_proto) \
 	__DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), PARAMS(data_proto)) \
 	static inline void __do_trace_##name(proto) \
 	{ \
 		TRACEPOINT_CHECK(name) \
 		if (cond) { \
-			guard(preempt_notrace)(); \
+			guard(srcu_fast_notrace)(&tracepoint_srcu); \
 			__DO_TRACE_CALL(name, TP_ARGS(args)); \
 		} \
 	} \
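
The guard(srcu_fast_notrace)() above presumably comes from the kernel's
DEFINE_LOCK_GUARD_1() machinery, by analogy with the existing srcu_fast
guard; a hedged sketch of such a definition (not necessarily the exact
in-tree form):

	DEFINE_LOCK_GUARD_1(srcu_fast_notrace, struct srcu_struct,
			    _T->scp = srcu_read_lock_fast_notrace(_T->lock),
			    srcu_read_unlock_fast_notrace(_T->lock, _T->scp),
			    struct srcu_ctr __percpu *scp)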
include/trace/trace_events.h (+2 -2)
···
 static notrace void \
 trace_event_raw_event_##call(void *__data, proto) \
 { \
+	guard(preempt_notrace)(); \
 	do_trace_event_raw_event_##call(__data, args); \
 }
···
 trace_event_raw_event_##call(void *__data, proto) \
 { \
 	might_fault(); \
-	preempt_disable_notrace(); \
+	guard(preempt_notrace)(); \
 	do_trace_event_raw_event_##call(__data, args); \
-	preempt_enable_notrace(); \
 }
 
 /*
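
Note that switching the open-coded preempt_disable_notrace()/
preempt_enable_notrace() pair to a scope-based guard also guarantees that
preemption is re-enabled on every exit path from the generated function,
including any early return added later.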
kernel/tracepoint.c (+14 -4)
···
 
 struct tp_transition_snapshot {
 	unsigned long rcu;
+	unsigned long srcu_gp;
 	bool ongoing;
 };
+
+DEFINE_SRCU_FAST(tracepoint_srcu);
+EXPORT_SYMBOL_GPL(tracepoint_srcu);
 
 /* Protected by tracepoints_mutex */
 static struct tp_transition_snapshot tp_transition_snapshot[_NR_TP_TRANSITION_SYNC];
···
 
 	/* Keep the latest get_state snapshot. */
 	snapshot->rcu = get_state_synchronize_rcu();
+	snapshot->srcu_gp = start_poll_synchronize_srcu(&tracepoint_srcu);
 	snapshot->ongoing = true;
 }
···
 	if (!snapshot->ongoing)
 		return;
 	cond_synchronize_rcu(snapshot->rcu);
+	if (!poll_state_synchronize_srcu(&tracepoint_srcu, snapshot->srcu_gp))
+		synchronize_srcu(&tracepoint_srcu);
 	snapshot->ongoing = false;
 }
···
 	struct tp_probes *tp_probes = container_of(old,
 						   struct tp_probes, probes[0]);
 
-	if (tracepoint_is_faultable(tp))
-		call_rcu_tasks_trace(&tp_probes->rcu, rcu_free_old_probes);
-	else
-		call_rcu(&tp_probes->rcu, rcu_free_old_probes);
+	if (tracepoint_is_faultable(tp)) {
+		call_rcu_tasks_trace(&tp_probes->rcu,
+				     rcu_free_old_probes);
+	} else {
+		call_srcu(&tracepoint_srcu, &tp_probes->rcu,
+			  rcu_free_old_probes);
+	}
 }
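
The snapshot logic above uses SRCU's polled grace-period interface. The
general pattern, as a minimal sketch with a hypothetical srcu_struct named
my_srcu:

	unsigned long cookie;

	cookie = start_poll_synchronize_srcu(&my_srcu);	/* start a grace period, snapshot its state */
	/* ... time passes, readers come and go ... */
	if (!poll_state_synchronize_srcu(&my_srcu, cookie))
		synchronize_srcu(&my_srcu);		/* not yet elapsed: wait for it */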