Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel
os
linux
1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef __LINUX_ENTRYCOMMON_H
3#define __LINUX_ENTRYCOMMON_H
4
5#include <linux/audit.h>
6#include <linux/irq-entry-common.h>
7#include <linux/livepatch.h>
8#include <linux/ptrace.h>
9#include <linux/resume_user_mode.h>
10#include <linux/seccomp.h>
11#include <linux/sched.h>
12
13#include <asm/entry-common.h>
14#include <asm/syscall.h>
15
16#ifndef _TIF_UPROBE
17# define _TIF_UPROBE (0)
18#endif
19
20/*
21 * SYSCALL_WORK flags handled in syscall_enter_from_user_mode()
22 */
23#ifndef ARCH_SYSCALL_WORK_ENTER
24# define ARCH_SYSCALL_WORK_ENTER (0)
25#endif
26
27/*
28 * SYSCALL_WORK flags handled in syscall_exit_to_user_mode()
29 */
30#ifndef ARCH_SYSCALL_WORK_EXIT
31# define ARCH_SYSCALL_WORK_EXIT (0)
32#endif
33
34#define SYSCALL_WORK_ENTER (SYSCALL_WORK_SECCOMP | \
35 SYSCALL_WORK_SYSCALL_TRACEPOINT | \
36 SYSCALL_WORK_SYSCALL_TRACE | \
37 SYSCALL_WORK_SYSCALL_EMU | \
38 SYSCALL_WORK_SYSCALL_AUDIT | \
39 SYSCALL_WORK_SYSCALL_USER_DISPATCH | \
40 SYSCALL_WORK_SYSCALL_RSEQ_SLICE | \
41 ARCH_SYSCALL_WORK_ENTER)
42#define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT | \
43 SYSCALL_WORK_SYSCALL_TRACE | \
44 SYSCALL_WORK_SYSCALL_AUDIT | \
45 SYSCALL_WORK_SYSCALL_USER_DISPATCH | \
46 SYSCALL_WORK_SYSCALL_EXIT_TRAP | \
47 ARCH_SYSCALL_WORK_EXIT)
48
/**
 * arch_ptrace_report_syscall_entry - Architecture specific ptrace_report_syscall_entry() wrapper
 *
 * Invoked from syscall_trace_enter() to wrap ptrace_report_syscall_entry().
 *
 * This allows architecture specific ptrace_report_syscall_entry()
 * implementations. If not defined by the architecture this falls back
 * to ptrace_report_syscall_entry().
 */
static __always_inline int arch_ptrace_report_syscall_entry(struct pt_regs *regs);

#ifndef arch_ptrace_report_syscall_entry
/* Generic fallback: forward directly to the generic ptrace report. */
static __always_inline int arch_ptrace_report_syscall_entry(struct pt_regs *regs)
{
	return ptrace_report_syscall_entry(regs);
}
#endif
66
67bool syscall_user_dispatch(struct pt_regs *regs);
68long trace_syscall_enter(struct pt_regs *regs, long syscall);
69void trace_syscall_exit(struct pt_regs *regs, long ret);
70
71static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
72{
73 if (unlikely(audit_context())) {
74 unsigned long args[6];
75
76 syscall_get_arguments(current, regs, args);
77 audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
78 }
79}
80
/*
 * syscall_trace_enter - Handle SYSCALL_WORK flags on syscall entry
 *
 * Returns the (possibly modified) syscall number, or -1 when the
 * syscall should be skipped.
 */
static __always_inline long syscall_trace_enter(struct pt_regs *regs, unsigned long work)
{
	long syscall, ret = 0;

	/*
	 * Handle Syscall User Dispatch. This must come first, since
	 * the ABI here can be something that doesn't make sense for
	 * other syscall_work features.
	 */
	if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
		if (syscall_user_dispatch(regs))
			return -1L;
	}

	/*
	 * User space got a time slice extension granted and relinquishes
	 * the CPU. The work stops the slice timer to avoid an extra round
	 * through hrtimer_interrupt().
	 */
	if (work & SYSCALL_WORK_SYSCALL_RSEQ_SLICE)
		rseq_syscall_enter_work(syscall_get_nr(current, regs));

	/* Handle ptrace: a nonzero report or PTRACE_SYSEMU skips the syscall */
	if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
		ret = arch_ptrace_report_syscall_entry(regs);
		if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
			return -1L;
	}

	/* Do seccomp after ptrace, to catch any tracer changes. */
	if (work & SYSCALL_WORK_SECCOMP) {
		ret = __secure_computing();
		if (ret == -1L)
			return ret;
	}

	/* Either of the above might have changed the syscall number */
	syscall = syscall_get_nr(current, regs);

	/* The tracepoint may rewrite the syscall number as well */
	if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT))
		syscall = trace_syscall_enter(regs, syscall);

	syscall_enter_audit(regs, syscall);

	return ret ? : syscall;
}
127
128/**
129 * syscall_enter_from_user_mode_work - Check and handle work before invoking
130 * a syscall
131 * @regs: Pointer to currents pt_regs
132 * @syscall: The syscall number
133 *
134 * Invoked from architecture specific syscall entry code with interrupts
135 * enabled after invoking enter_from_user_mode(), enabling interrupts and
136 * extra architecture specific work.
137 *
138 * Returns: The original or a modified syscall number
139 *
140 * If the returned syscall number is -1 then the syscall should be
141 * skipped. In this case the caller may invoke syscall_set_error() or
142 * syscall_set_return_value() first. If neither of those are called and -1
143 * is returned, then the syscall will fail with ENOSYS.
144 *
145 * It handles the following work items:
146 *
147 * 1) syscall_work flag dependent invocations of
148 * ptrace_report_syscall_entry(), __secure_computing(), trace_sys_enter()
149 * 2) Invocation of audit_syscall_entry()
150 */
151static __always_inline long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
152{
153 unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
154
155 if (work & SYSCALL_WORK_ENTER)
156 syscall = syscall_trace_enter(regs, work);
157
158 return syscall;
159}
160
161/**
162 * syscall_enter_from_user_mode - Establish state and check and handle work
163 * before invoking a syscall
164 * @regs: Pointer to currents pt_regs
165 * @syscall: The syscall number
166 *
167 * Invoked from architecture specific syscall entry code with interrupts
168 * disabled. The calling code has to be non-instrumentable. When the
169 * function returns all state is correct, interrupts are enabled and the
170 * subsequent functions can be instrumented.
171 *
172 * This is the combination of enter_from_user_mode() and
173 * syscall_enter_from_user_mode_work() to be used when there is no
174 * architecture specific work to be done between the two.
175 *
176 * Returns: The original or a modified syscall number. See
177 * syscall_enter_from_user_mode_work() for further explanation.
178 */
179static __always_inline long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
180{
181 long ret;
182
183 enter_from_user_mode(regs);
184
185 instrumentation_begin();
186 local_irq_enable();
187 ret = syscall_enter_from_user_mode_work(regs, syscall);
188 instrumentation_end();
189
190 return ret;
191}
192
193/*
194 * If SYSCALL_EMU is set, then the only reason to report is when
195 * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
196 * instruction has been already reported in syscall_enter_from_user_mode().
197 */
198static __always_inline bool report_single_step(unsigned long work)
199{
200 if (work & SYSCALL_WORK_SYSCALL_EMU)
201 return false;
202
203 return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
204}
205
/**
 * arch_ptrace_report_syscall_exit - Architecture specific ptrace_report_syscall_exit()
 *
 * This allows architecture specific ptrace_report_syscall_exit()
 * implementations. If not defined by the architecture this falls back
 * to ptrace_report_syscall_exit().
 */
static __always_inline void arch_ptrace_report_syscall_exit(struct pt_regs *regs,
							    int step);

#ifndef arch_ptrace_report_syscall_exit
/* Generic fallback: forward directly to the generic ptrace report. */
static __always_inline void arch_ptrace_report_syscall_exit(struct pt_regs *regs,
							    int step)
{
	ptrace_report_syscall_exit(regs, step);
}
#endif
223
224/**
225 * syscall_exit_work - Handle work before returning to user mode
226 * @regs: Pointer to current pt_regs
227 * @work: Current thread syscall work
228 *
229 * Do one-time syscall specific work.
230 */
231static __always_inline void syscall_exit_work(struct pt_regs *regs, unsigned long work)
232{
233 bool step;
234
235 /*
236 * If the syscall was rolled back due to syscall user dispatching,
237 * then the tracers below are not invoked for the same reason as
238 * the entry side was not invoked in syscall_trace_enter(): The ABI
239 * of these syscalls is unknown.
240 */
241 if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
242 if (unlikely(current->syscall_dispatch.on_dispatch)) {
243 current->syscall_dispatch.on_dispatch = false;
244 return;
245 }
246 }
247
248 audit_syscall_exit(regs);
249
250 if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
251 trace_syscall_exit(regs, syscall_get_return_value(current, regs));
252
253 step = report_single_step(work);
254 if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
255 arch_ptrace_report_syscall_exit(regs, step);
256}
257
/**
 * syscall_exit_to_user_mode_work - Handle one time work before returning to user mode
 * @regs: Pointer to current's pt_regs
 *
 * Step 1 of syscall_exit_to_user_mode() with the same calling convention.
 *
 * The caller must invoke steps 2-3 of syscall_exit_to_user_mode() afterwards.
 */
static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
	unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
	unsigned long nr = syscall_get_nr(current, regs);

	/* Context tracking must still be in kernel state at this point */
	CT_WARN_ON(ct_state() != CT_STATE_KERNEL);

	/*
	 * Catch syscalls which return to user space with interrupts
	 * disabled: warn (naming the offending syscall) and re-enable so
	 * the exit path can proceed.
	 */
	if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
		if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
			local_irq_enable();
	}

	rseq_debug_syscall_return(regs);

	/*
	 * Do one-time syscall specific work. If these work items are
	 * enabled, we want to run them exactly once per syscall exit with
	 * interrupts enabled.
	 */
	if (unlikely(work & SYSCALL_WORK_EXIT))
		syscall_exit_work(regs, work);
}
288
/**
 * syscall_exit_to_user_mode - Handle work before returning to user mode
 * @regs: Pointer to current's pt_regs
 *
 * Invoked with interrupts enabled and fully valid @regs. Returns with all
 * work handled, interrupts disabled such that the caller can immediately
 * switch to user mode. Called from architecture specific syscall and ret
 * from fork code.
 *
 * The call order is:
 *  1) One-time syscall exit work:
 *	- rseq syscall exit
 *      - audit
 *	- syscall tracing
 *	- ptrace (single stepping)
 *
 *  2) Preparatory work
 *	- Disable interrupts
 *	- Exit to user mode loop (common TIF handling). Invokes
 *	  arch_exit_to_user_mode_work() for architecture specific TIF work
 *	- Architecture specific one time work arch_exit_to_user_mode_prepare()
 *	- Address limit and lockdep checks
 *
 *  3) Final transition (lockdep, tracing, context tracking, RCU), i.e. the
 *     functionality in exit_to_user_mode().
 *
 * This is a combination of syscall_exit_to_user_mode_work() (1), disabling
 * interrupts followed by syscall_exit_to_user_mode_prepare() (2) and
 * exit_to_user_mode() (3). This function is preferred unless there is a
 * compelling architectural reason to invoke the functions separately.
 */
static __always_inline void syscall_exit_to_user_mode(struct pt_regs *regs)
{
	instrumentation_begin();
	syscall_exit_to_user_mode_work(regs);
	local_irq_disable_exit_to_user();
	syscall_exit_to_user_mode_prepare(regs);
	instrumentation_end();
	/* Final noinstr transition; nothing may be instrumented after this */
	exit_to_user_mode();
}
329
330#endif