Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

tracing/perf: disable preemption in syscall probe

In preparation for allowing system call enter/exit instrumentation to
handle page faults, make sure that perf can handle this change by
explicitly disabling preemption within the perf system call tracepoint
probes to respect the current expectations within perf ring buffer code.

This change does not yet allow perf to take page faults per se within
its probe, but allows its existing probes to adapt to the upcoming
change.

Cc: Michael Jeanson <mjeanson@efficios.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Yonghong Song <yhs@fb.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Cc: bpf@vger.kernel.org
Cc: Joel Fernandes <joel@joelfernandes.org>
Link: https://lore.kernel.org/20241009010718.2050182-4-mathieu.desnoyers@efficios.com
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>

Authored by Mathieu Desnoyers and committed by Steven Rostedt (Google)
65e7462a 13d750c2

+50 -4
+38 -4
include/trace/perf.h
··· 12 12 #undef __perf_task 13 13 #define __perf_task(t) (__task = (t)) 14 14 15 - #undef DECLARE_EVENT_CLASS 16 - #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ 15 + #undef __DECLARE_EVENT_CLASS 16 + #define __DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ 17 17 static notrace void \ 18 - perf_trace_##call(void *__data, proto) \ 18 + do_perf_trace_##call(void *__data, proto) \ 19 19 { \ 20 20 struct trace_event_call *event_call = __data; \ 21 21 struct trace_event_data_offsets_##call __maybe_unused __data_offsets;\ ··· 55 55 head, __task); \ 56 56 } 57 57 58 + /* 59 + * Define unused __count and __task variables to use @args to pass 60 + * arguments to do_perf_trace_##call. This is needed because the 61 + * macros __perf_count and __perf_task introduce the side-effect to 62 + * store copies into those local variables. 63 + */ 64 + #undef DECLARE_EVENT_CLASS 65 + #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ 66 + __DECLARE_EVENT_CLASS(call, PARAMS(proto), PARAMS(args), PARAMS(tstruct), \ 67 + PARAMS(assign), PARAMS(print)) \ 68 + static notrace void \ 69 + perf_trace_##call(void *__data, proto) \ 70 + { \ 71 + u64 __count __attribute__((unused)); \ 72 + struct task_struct *__task __attribute__((unused)); \ 73 + \ 74 + do_perf_trace_##call(__data, args); \ 75 + } 76 + 58 77 #undef DECLARE_EVENT_SYSCALL_CLASS 59 - #define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS 78 + #define DECLARE_EVENT_SYSCALL_CLASS(call, proto, args, tstruct, assign, print) \ 79 + __DECLARE_EVENT_CLASS(call, PARAMS(proto), PARAMS(args), PARAMS(tstruct), \ 80 + PARAMS(assign), PARAMS(print)) \ 81 + static notrace void \ 82 + perf_trace_##call(void *__data, proto) \ 83 + { \ 84 + u64 __count __attribute__((unused)); \ 85 + struct task_struct *__task __attribute__((unused)); \ 86 + \ 87 + preempt_disable_notrace(); \ 88 + do_perf_trace_##call(__data, args); \ 89 + preempt_enable_notrace(); \ 90 + } 60 91 61 92 /* 62 93 * 
This part is compiled out, it is only here as a build time check ··· 107 76 DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args)) 108 77 109 78 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) 79 + 80 + #undef __DECLARE_EVENT_CLASS 81 + 110 82 #endif /* CONFIG_PERF_EVENTS */
+12
kernel/trace/trace_syscalls.c
··· 596 596 int rctx; 597 597 int size; 598 598 599 + /* 600 + * Syscall probe called with preemption enabled, but the ring 601 + * buffer and per-cpu data require preemption to be disabled. 602 + */ 603 + guard(preempt_notrace)(); 604 + 599 605 syscall_nr = trace_get_syscall_nr(current, regs); 600 606 if (syscall_nr < 0 || syscall_nr >= NR_syscalls) 601 607 return; ··· 703 697 int syscall_nr; 704 698 int rctx; 705 699 int size; 700 + 701 + /* 702 + * Syscall probe called with preemption enabled, but the ring 703 + * buffer and per-cpu data require preemption to be disabled. 704 + */ 705 + guard(preempt_notrace)(); 706 706 707 707 syscall_nr = trace_get_syscall_nr(current, regs); 708 708 if (syscall_nr < 0 || syscall_nr >= NR_syscalls)