Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel
os
linux
1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef __LINUX_ENTRYCOMMON_H
3#define __LINUX_ENTRYCOMMON_H
4
5#include <linux/audit.h>
6#include <linux/irq-entry-common.h>
7#include <linux/livepatch.h>
8#include <linux/ptrace.h>
9#include <linux/resume_user_mode.h>
10#include <linux/seccomp.h>
11#include <linux/sched.h>
12
13#include <asm/entry-common.h>
14#include <asm/syscall.h>
15
16#ifndef _TIF_UPROBE
17# define _TIF_UPROBE (0)
18#endif
19
20/*
21 * SYSCALL_WORK flags handled in syscall_enter_from_user_mode()
22 */
23#define SYSCALL_WORK_ENTER (SYSCALL_WORK_SECCOMP | \
24 SYSCALL_WORK_SYSCALL_TRACEPOINT | \
25 SYSCALL_WORK_SYSCALL_TRACE | \
26 SYSCALL_WORK_SYSCALL_EMU | \
27 SYSCALL_WORK_SYSCALL_AUDIT | \
28 SYSCALL_WORK_SYSCALL_USER_DISPATCH | \
29 SYSCALL_WORK_SYSCALL_RSEQ_SLICE)
30/*
31 * SYSCALL_WORK flags handled in syscall_exit_to_user_mode()
32 */
33#define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT | \
34 SYSCALL_WORK_SYSCALL_TRACE | \
35 SYSCALL_WORK_SYSCALL_AUDIT | \
36 SYSCALL_WORK_SYSCALL_USER_DISPATCH | \
37 SYSCALL_WORK_SYSCALL_EXIT_TRAP)
38
39/**
40 * arch_ptrace_report_syscall_entry - Architecture specific ptrace_report_syscall_entry() wrapper
41 * @regs: Pointer to the register state at syscall entry
42 *
43 * Invoked from syscall_trace_enter() to wrap ptrace_report_syscall_entry().
44 *
45 * This allows architecture specific ptrace_report_syscall_entry()
 * implementations. If not defined by the architecture this falls back
 * to ptrace_report_syscall_entry().
48 */
/* Prototype so the kernel-doc above is attached even when an arch override exists. */
static __always_inline int arch_ptrace_report_syscall_entry(struct pt_regs *regs);

#ifndef arch_ptrace_report_syscall_entry
/* Generic fallback: report syscall entry directly to the core ptrace code. */
static __always_inline int arch_ptrace_report_syscall_entry(struct pt_regs *regs)
{
	return ptrace_report_syscall_entry(regs);
}
#endif
57
58bool syscall_user_dispatch(struct pt_regs *regs);
59long trace_syscall_enter(struct pt_regs *regs, long syscall);
60void trace_syscall_exit(struct pt_regs *regs, long ret);
61
62static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
63{
64 if (unlikely(audit_context())) {
65 unsigned long args[6];
66
67 syscall_get_arguments(current, regs, args);
68 audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
69 }
70}
71
static __always_inline long syscall_trace_enter(struct pt_regs *regs, unsigned long work)
{
	long syscall, ret = 0;

	/*
	 * Handle Syscall User Dispatch. This must come first, since
	 * the ABI here can be something that doesn't make sense for
	 * other syscall_work features.
	 */
	if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
		if (syscall_user_dispatch(regs))
			return -1L;
	}

	/*
	 * User space got a time slice extension granted and relinquishes
	 * the CPU. The work stops the slice timer to avoid an extra round
	 * through hrtimer_interrupt().
	 */
	if (work & SYSCALL_WORK_SYSCALL_RSEQ_SLICE)
		rseq_syscall_enter_work(syscall_get_nr(current, regs));

	/* Handle ptrace: under SYSCALL_EMU the syscall is never executed */
	if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
		ret = arch_ptrace_report_syscall_entry(regs);
		if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
			return -1L;
	}

	/* Do seccomp after ptrace, to catch any tracer changes. */
	if (work & SYSCALL_WORK_SECCOMP) {
		ret = __secure_computing();
		if (ret == -1L)
			return ret;
	}

	/* Either of the above might have changed the syscall number */
	syscall = syscall_get_nr(current, regs);

	if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT))
		syscall = trace_syscall_enter(regs, syscall);

	syscall_enter_audit(regs, syscall);

	/* ret can only be non-zero here via __secure_computing() above */
	return ret ? : syscall;
}
118
119/**
120 * syscall_enter_from_user_mode_work - Check and handle work before invoking
121 * a syscall
 * @regs:	Pointer to current's pt_regs
123 * @syscall: The syscall number
124 *
125 * Invoked from architecture specific syscall entry code with interrupts
126 * enabled after invoking enter_from_user_mode(), enabling interrupts and
127 * extra architecture specific work.
128 *
129 * Returns: The original or a modified syscall number
130 *
131 * If the returned syscall number is -1 then the syscall should be
132 * skipped. In this case the caller may invoke syscall_set_error() or
133 * syscall_set_return_value() first. If neither of those are called and -1
134 * is returned, then the syscall will fail with ENOSYS.
135 *
136 * It handles the following work items:
137 *
138 * 1) syscall_work flag dependent invocations of
139 * ptrace_report_syscall_entry(), __secure_computing(), trace_sys_enter()
140 * 2) Invocation of audit_syscall_entry()
141 */
142static __always_inline long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
143{
144 unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
145
146 if (work & SYSCALL_WORK_ENTER)
147 syscall = syscall_trace_enter(regs, work);
148
149 return syscall;
150}
151
152/**
153 * syscall_enter_from_user_mode - Establish state and check and handle work
154 * before invoking a syscall
 * @regs:	Pointer to current's pt_regs
156 * @syscall: The syscall number
157 *
158 * Invoked from architecture specific syscall entry code with interrupts
159 * disabled. The calling code has to be non-instrumentable. When the
160 * function returns all state is correct, interrupts are enabled and the
161 * subsequent functions can be instrumented.
162 *
163 * This is the combination of enter_from_user_mode() and
164 * syscall_enter_from_user_mode_work() to be used when there is no
165 * architecture specific work to be done between the two.
166 *
167 * Returns: The original or a modified syscall number. See
168 * syscall_enter_from_user_mode_work() for further explanation.
169 */
static __always_inline long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
{
	long ret;

	/* Establish kernel state (context tracking etc.) before anything else */
	enter_from_user_mode(regs);

	instrumentation_begin();
	/* Entry work runs with interrupts enabled */
	local_irq_enable();
	ret = syscall_enter_from_user_mode_work(regs, syscall);
	instrumentation_end();

	return ret;
}
183
184/*
185 * If SYSCALL_EMU is set, then the only reason to report is when
186 * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
187 * instruction has been already reported in syscall_enter_from_user_mode().
188 */
189static __always_inline bool report_single_step(unsigned long work)
190{
191 if (work & SYSCALL_WORK_SYSCALL_EMU)
192 return false;
193
194 return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
195}
196
197/**
198 * arch_ptrace_report_syscall_exit - Architecture specific ptrace_report_syscall_exit()
199 * @regs: Pointer to the register state at syscall exit
200 * @step: Indicates a single-step exit rather than a normal syscall exit
201 *
202 * This allows architecture specific ptrace_report_syscall_exit()
 * implementations. If not defined by the architecture this falls back
 * to ptrace_report_syscall_exit().
205 */
/* Prototype so the kernel-doc above is attached even when an arch override exists. */
static __always_inline void arch_ptrace_report_syscall_exit(struct pt_regs *regs,
							    int step);

#ifndef arch_ptrace_report_syscall_exit
/* Generic fallback: report syscall exit directly to the core ptrace code. */
static __always_inline void arch_ptrace_report_syscall_exit(struct pt_regs *regs,
							    int step)
{
	ptrace_report_syscall_exit(regs, step);
}
#endif
216
217/**
218 * syscall_exit_work - Handle work before returning to user mode
219 * @regs: Pointer to current pt_regs
220 * @work: Current thread syscall work
221 *
222 * Do one-time syscall specific work.
223 */
static __always_inline void syscall_exit_work(struct pt_regs *regs, unsigned long work)
{
	bool step;

	/*
	 * If the syscall was rolled back due to syscall user dispatching,
	 * then the tracers below are not invoked for the same reason as
	 * the entry side was not invoked in syscall_trace_enter(): The ABI
	 * of these syscalls is unknown.
	 */
	if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
		if (unlikely(current->syscall_dispatch.on_dispatch)) {
			current->syscall_dispatch.on_dispatch = false;
			return;
		}
	}

	audit_syscall_exit(regs);

	if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
		trace_syscall_exit(regs, syscall_get_return_value(current, regs));

	/* Report to ptrace on single step or explicit syscall exit tracing */
	step = report_single_step(work);
	if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
		arch_ptrace_report_syscall_exit(regs, step);
}
250
251/**
252 * syscall_exit_to_user_mode_work - Handle one time work before returning to user mode
 * @regs:	Pointer to current's pt_regs
254 *
255 * Step 1 of syscall_exit_to_user_mode() with the same calling convention.
256 *
257 * The caller must invoke steps 2-3 of syscall_exit_to_user_mode() afterwards.
258 */
static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
	unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
	unsigned long nr = syscall_get_nr(current, regs);

	/* Context tracking must still be in kernel state at this point */
	CT_WARN_ON(ct_state() != CT_STATE_KERNEL);

	/* Catch syscalls which return to user space with interrupts disabled */
	if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
		if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
			local_irq_enable();
	}

	rseq_debug_syscall_return(regs);

	/*
	 * Do one-time syscall specific work. If these work items are
	 * enabled, we want to run them exactly once per syscall exit with
	 * interrupts enabled.
	 */
	if (unlikely(work & SYSCALL_WORK_EXIT))
		syscall_exit_work(regs, work);
}
281
282/**
283 * syscall_exit_to_user_mode - Handle work before returning to user mode
 * @regs:	Pointer to current's pt_regs
285 *
286 * Invoked with interrupts enabled and fully valid @regs. Returns with all
287 * work handled, interrupts disabled such that the caller can immediately
288 * switch to user mode. Called from architecture specific syscall and ret
289 * from fork code.
290 *
291 * The call order is:
292 * 1) One-time syscall exit work:
293 * - rseq syscall exit
294 * - audit
295 * - syscall tracing
296 * - ptrace (single stepping)
297 *
298 * 2) Preparatory work
299 * - Disable interrupts
300 * - Exit to user mode loop (common TIF handling). Invokes
301 * arch_exit_to_user_mode_work() for architecture specific TIF work
302 * - Architecture specific one time work arch_exit_to_user_mode_prepare()
303 * - Address limit and lockdep checks
304 *
305 * 3) Final transition (lockdep, tracing, context tracking, RCU), i.e. the
306 * functionality in exit_to_user_mode().
307 *
308 * This is a combination of syscall_exit_to_user_mode_work() (1), disabling
309 * interrupts followed by syscall_exit_to_user_mode_prepare() (2) and
310 * exit_to_user_mode() (3). This function is preferred unless there is a
311 * compelling architectural reason to invoke the functions separately.
312 */
static __always_inline void syscall_exit_to_user_mode(struct pt_regs *regs)
{
	instrumentation_begin();
	/* Step 1: One-time exit work (audit, tracepoint, ptrace) with IRQs on */
	syscall_exit_to_user_mode_work(regs);
	/* Step 2: The exit-to-user TIF work loop runs with IRQs disabled */
	local_irq_disable();
	syscall_exit_to_user_mode_prepare(regs);
	instrumentation_end();
	/* Step 3: Final non-instrumentable transition to user mode */
	exit_to_user_mode();
}
322
323#endif