Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'core-entry-2025-07-29' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull generic entry code updates from Thomas Gleixner:

- Split the code into syscall and exception/interrupt parts to ease the
conversion of ARM[64] to the generic entry infrastructure

- Extend syscall user dispatching to support a single intercepted range
instead of the default single non-intercepted range. That allows
monitoring/analysis of a specific executable range, e.g. a library,
and also provides flexibility for sandboxing scenarios

- Clean up and extend the user dispatch selftest

* tag 'core-entry-2025-07-29' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
entry: Split generic entry into generic exception and syscall entry
selftests: Add tests for PR_SYS_DISPATCH_INCLUSIVE_ON
syscall_user_dispatch: Add PR_SYS_DISPATCH_INCLUSIVE_ON
selftests: Fix errno checking in syscall_user_dispatch test

+665 -565
+13 -8
Documentation/admin-guide/syscall-user-dispatch.rst
··· 53 53 54 54 prctl(PR_SET_SYSCALL_USER_DISPATCH, <op>, <offset>, <length>, [selector]) 55 55 56 - <op> is either PR_SYS_DISPATCH_ON or PR_SYS_DISPATCH_OFF, to enable and 57 - disable the mechanism globally for that thread. When 58 - PR_SYS_DISPATCH_OFF is used, the other fields must be zero. 56 + <op> is either PR_SYS_DISPATCH_EXCLUSIVE_ON/PR_SYS_DISPATCH_INCLUSIVE_ON 57 + or PR_SYS_DISPATCH_OFF, to enable and disable the mechanism globally for 58 + that thread. When PR_SYS_DISPATCH_OFF is used, the other fields must be zero. 59 59 60 - [<offset>, <offset>+<length>) delimit a memory region interval 61 - from which syscalls are always executed directly, regardless of the 62 - userspace selector. This provides a fast path for the C library, which 63 - includes the most common syscall dispatchers in the native code 64 - applications, and also provides a way for the signal handler to return 60 + For PR_SYS_DISPATCH_EXCLUSIVE_ON [<offset>, <offset>+<length>) delimit 61 + a memory region interval from which syscalls are always executed directly, 62 + regardless of the userspace selector. This provides a fast path for the 63 + C library, which includes the most common syscall dispatchers in the native 64 + code applications, and also provides a way for the signal handler to return 65 65 without triggering a nested SIGSYS on (rt\_)sigreturn. Users of this 66 66 interface should make sure that at least the signal trampoline code is 67 67 included in this region. In addition, for syscalls that implement the 68 68 trampoline code on the vDSO, that trampoline is never intercepted. 69 + 70 + For PR_SYS_DISPATCH_INCLUSIVE_ON [<offset>, <offset>+<length>) delimit 71 + a memory region interval from which syscalls are dispatched based on 72 + the userspace selector. Syscalls from outside of the range are always 73 + executed directly. 
69 74 70 75 [selector] is a pointer to a char-sized region in the process memory 71 76 region, that provides a quick way to enable/disable syscall redirection
+1
MAINTAINERS
··· 10126 10126 T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git core/entry 10127 10127 F: include/linux/entry-common.h 10128 10128 F: include/linux/entry-kvm.h 10129 + F: include/linux/irq-entry-common.h 10129 10130 F: kernel/entry/ 10130 10131 10131 10132 GENERIC GPIO I2C DRIVER
+9
arch/Kconfig
··· 64 64 bool 65 65 select HOTPLUG_SPLIT_STARTUP 66 66 67 + config GENERIC_IRQ_ENTRY 68 + bool 69 + 70 + config GENERIC_SYSCALL 71 + bool 72 + depends on GENERIC_IRQ_ENTRY 73 + 67 74 config GENERIC_ENTRY 68 75 bool 76 + select GENERIC_IRQ_ENTRY 77 + select GENERIC_SYSCALL 69 78 70 79 config KPROBES 71 80 bool "Kprobes"
+1 -381
include/linux/entry-common.h
··· 2 2 #ifndef __LINUX_ENTRYCOMMON_H 3 3 #define __LINUX_ENTRYCOMMON_H 4 4 5 - #include <linux/static_call_types.h> 5 + #include <linux/irq-entry-common.h> 6 6 #include <linux/ptrace.h> 7 - #include <linux/syscalls.h> 8 7 #include <linux/seccomp.h> 9 8 #include <linux/sched.h> 10 - #include <linux/context_tracking.h> 11 9 #include <linux/livepatch.h> 12 10 #include <linux/resume_user_mode.h> 13 - #include <linux/tick.h> 14 - #include <linux/kmsan.h> 15 11 16 12 #include <asm/entry-common.h> 17 13 #include <asm/syscall.h> 18 - 19 - /* 20 - * Define dummy _TIF work flags if not defined by the architecture or for 21 - * disabled functionality. 22 - */ 23 - #ifndef _TIF_PATCH_PENDING 24 - # define _TIF_PATCH_PENDING (0) 25 - #endif 26 14 27 15 #ifndef _TIF_UPROBE 28 16 # define _TIF_UPROBE (0) ··· 43 55 SYSCALL_WORK_SYSCALL_USER_DISPATCH | \ 44 56 SYSCALL_WORK_SYSCALL_EXIT_TRAP | \ 45 57 ARCH_SYSCALL_WORK_EXIT) 46 - 47 - /* 48 - * TIF flags handled in exit_to_user_mode_loop() 49 - */ 50 - #ifndef ARCH_EXIT_TO_USER_MODE_WORK 51 - # define ARCH_EXIT_TO_USER_MODE_WORK (0) 52 - #endif 53 - 54 - #define EXIT_TO_USER_MODE_WORK \ 55 - (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ 56 - _TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \ 57 - _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \ 58 - ARCH_EXIT_TO_USER_MODE_WORK) 59 - 60 - /** 61 - * arch_enter_from_user_mode - Architecture specific sanity check for user mode regs 62 - * @regs: Pointer to currents pt_regs 63 - * 64 - * Defaults to an empty implementation. Can be replaced by architecture 65 - * specific code. 66 - * 67 - * Invoked from syscall_enter_from_user_mode() in the non-instrumentable 68 - * section. Use __always_inline so the compiler cannot push it out of line 69 - * and make it instrumentable. 
70 - */ 71 - static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs); 72 - 73 - #ifndef arch_enter_from_user_mode 74 - static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs) {} 75 - #endif 76 - 77 - /** 78 - * enter_from_user_mode - Establish state when coming from user mode 79 - * 80 - * Syscall/interrupt entry disables interrupts, but user mode is traced as 81 - * interrupts enabled. Also with NO_HZ_FULL RCU might be idle. 82 - * 83 - * 1) Tell lockdep that interrupts are disabled 84 - * 2) Invoke context tracking if enabled to reactivate RCU 85 - * 3) Trace interrupts off state 86 - * 87 - * Invoked from architecture specific syscall entry code with interrupts 88 - * disabled. The calling code has to be non-instrumentable. When the 89 - * function returns all state is correct and interrupts are still 90 - * disabled. The subsequent functions can be instrumented. 91 - * 92 - * This is invoked when there is architecture specific functionality to be 93 - * done between establishing state and enabling interrupts. The caller must 94 - * enable interrupts before invoking syscall_enter_from_user_mode_work(). 95 - */ 96 - static __always_inline void enter_from_user_mode(struct pt_regs *regs) 97 - { 98 - arch_enter_from_user_mode(regs); 99 - lockdep_hardirqs_off(CALLER_ADDR0); 100 - 101 - CT_WARN_ON(__ct_state() != CT_STATE_USER); 102 - user_exit_irqoff(); 103 - 104 - instrumentation_begin(); 105 - kmsan_unpoison_entry_regs(regs); 106 - trace_hardirqs_off_finish(); 107 - instrumentation_end(); 108 - } 109 58 110 59 /** 111 60 * syscall_enter_from_user_mode_prepare - Establish state and enable interrupts ··· 129 204 } 130 205 131 206 /** 132 - * local_irq_enable_exit_to_user - Exit to user variant of local_irq_enable() 133 - * @ti_work: Cached TIF flags gathered with interrupts disabled 134 - * 135 - * Defaults to local_irq_enable(). Can be supplied by architecture specific 136 - * code. 
137 - */ 138 - static inline void local_irq_enable_exit_to_user(unsigned long ti_work); 139 - 140 - #ifndef local_irq_enable_exit_to_user 141 - static inline void local_irq_enable_exit_to_user(unsigned long ti_work) 142 - { 143 - local_irq_enable(); 144 - } 145 - #endif 146 - 147 - /** 148 - * local_irq_disable_exit_to_user - Exit to user variant of local_irq_disable() 149 - * 150 - * Defaults to local_irq_disable(). Can be supplied by architecture specific 151 - * code. 152 - */ 153 - static inline void local_irq_disable_exit_to_user(void); 154 - 155 - #ifndef local_irq_disable_exit_to_user 156 - static inline void local_irq_disable_exit_to_user(void) 157 - { 158 - local_irq_disable(); 159 - } 160 - #endif 161 - 162 - /** 163 - * arch_exit_to_user_mode_work - Architecture specific TIF work for exit 164 - * to user mode. 165 - * @regs: Pointer to currents pt_regs 166 - * @ti_work: Cached TIF flags gathered with interrupts disabled 167 - * 168 - * Invoked from exit_to_user_mode_loop() with interrupt enabled 169 - * 170 - * Defaults to NOOP. Can be supplied by architecture specific code. 171 - */ 172 - static inline void arch_exit_to_user_mode_work(struct pt_regs *regs, 173 - unsigned long ti_work); 174 - 175 - #ifndef arch_exit_to_user_mode_work 176 - static inline void arch_exit_to_user_mode_work(struct pt_regs *regs, 177 - unsigned long ti_work) 178 - { 179 - } 180 - #endif 181 - 182 - /** 183 - * arch_exit_to_user_mode_prepare - Architecture specific preparation for 184 - * exit to user mode. 185 - * @regs: Pointer to currents pt_regs 186 - * @ti_work: Cached TIF flags gathered with interrupts disabled 187 - * 188 - * Invoked from exit_to_user_mode_prepare() with interrupt disabled as the last 189 - * function before return. Defaults to NOOP. 
190 - */ 191 - static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, 192 - unsigned long ti_work); 193 - 194 - #ifndef arch_exit_to_user_mode_prepare 195 - static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, 196 - unsigned long ti_work) 197 - { 198 - } 199 - #endif 200 - 201 - /** 202 - * arch_exit_to_user_mode - Architecture specific final work before 203 - * exit to user mode. 204 - * 205 - * Invoked from exit_to_user_mode() with interrupt disabled as the last 206 - * function before return. Defaults to NOOP. 207 - * 208 - * This needs to be __always_inline because it is non-instrumentable code 209 - * invoked after context tracking switched to user mode. 210 - * 211 - * An architecture implementation must not do anything complex, no locking 212 - * etc. The main purpose is for speculation mitigations. 213 - */ 214 - static __always_inline void arch_exit_to_user_mode(void); 215 - 216 - #ifndef arch_exit_to_user_mode 217 - static __always_inline void arch_exit_to_user_mode(void) { } 218 - #endif 219 - 220 - /** 221 - * arch_do_signal_or_restart - Architecture specific signal delivery function 222 - * @regs: Pointer to currents pt_regs 223 - * 224 - * Invoked from exit_to_user_mode_loop(). 
225 - */ 226 - void arch_do_signal_or_restart(struct pt_regs *regs); 227 - 228 - /** 229 - * exit_to_user_mode_loop - do any pending work before leaving to user space 230 - */ 231 - unsigned long exit_to_user_mode_loop(struct pt_regs *regs, 232 - unsigned long ti_work); 233 - 234 - /** 235 - * exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required 236 - * @regs: Pointer to pt_regs on entry stack 237 - * 238 - * 1) check that interrupts are disabled 239 - * 2) call tick_nohz_user_enter_prepare() 240 - * 3) call exit_to_user_mode_loop() if any flags from 241 - * EXIT_TO_USER_MODE_WORK are set 242 - * 4) check that interrupts are still disabled 243 - */ 244 - static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs) 245 - { 246 - unsigned long ti_work; 247 - 248 - lockdep_assert_irqs_disabled(); 249 - 250 - /* Flush pending rcuog wakeup before the last need_resched() check */ 251 - tick_nohz_user_enter_prepare(); 252 - 253 - ti_work = read_thread_flags(); 254 - if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK)) 255 - ti_work = exit_to_user_mode_loop(regs, ti_work); 256 - 257 - arch_exit_to_user_mode_prepare(regs, ti_work); 258 - 259 - /* Ensure that kernel state is sane for a return to userspace */ 260 - kmap_assert_nomap(); 261 - lockdep_assert_irqs_disabled(); 262 - lockdep_sys_exit(); 263 - } 264 - 265 - /** 266 - * exit_to_user_mode - Fixup state when exiting to user mode 267 - * 268 - * Syscall/interrupt exit enables interrupts, but the kernel state is 269 - * interrupts disabled when this is invoked. Also tell RCU about it. 270 - * 271 - * 1) Trace interrupts on state 272 - * 2) Invoke context tracking if enabled to adjust RCU state 273 - * 3) Invoke architecture specific last minute exit code, e.g. 
speculation 274 - * mitigations, etc.: arch_exit_to_user_mode() 275 - * 4) Tell lockdep that interrupts are enabled 276 - * 277 - * Invoked from architecture specific code when syscall_exit_to_user_mode() 278 - * is not suitable as the last step before returning to userspace. Must be 279 - * invoked with interrupts disabled and the caller must be 280 - * non-instrumentable. 281 - * The caller has to invoke syscall_exit_to_user_mode_work() before this. 282 - */ 283 - static __always_inline void exit_to_user_mode(void) 284 - { 285 - instrumentation_begin(); 286 - trace_hardirqs_on_prepare(); 287 - lockdep_hardirqs_on_prepare(); 288 - instrumentation_end(); 289 - 290 - user_enter_irqoff(); 291 - arch_exit_to_user_mode(); 292 - lockdep_hardirqs_on(CALLER_ADDR0); 293 - } 294 - 295 - /** 296 207 * syscall_exit_work - Handle work before returning to user mode 297 208 * @regs: Pointer to current pt_regs 298 209 * @work: Current thread syscall work ··· 211 450 instrumentation_end(); 212 451 exit_to_user_mode(); 213 452 } 214 - 215 - /** 216 - * irqentry_enter_from_user_mode - Establish state before invoking the irq handler 217 - * @regs: Pointer to currents pt_regs 218 - * 219 - * Invoked from architecture specific entry code with interrupts disabled. 220 - * Can only be called when the interrupt entry came from user mode. The 221 - * calling code must be non-instrumentable. When the function returns all 222 - * state is correct and the subsequent functions can be instrumented. 223 - * 224 - * The function establishes state (lockdep, RCU (context tracking), tracing) 225 - */ 226 - void irqentry_enter_from_user_mode(struct pt_regs *regs); 227 - 228 - /** 229 - * irqentry_exit_to_user_mode - Interrupt exit work 230 - * @regs: Pointer to current's pt_regs 231 - * 232 - * Invoked with interrupts disabled and fully valid regs. Returns with all 233 - * work handled, interrupts disabled such that the caller can immediately 234 - * switch to user mode. 
Called from architecture specific interrupt 235 - * handling code. 236 - * 237 - * The call order is #2 and #3 as described in syscall_exit_to_user_mode(). 238 - * Interrupt exit is not invoking #1 which is the syscall specific one time 239 - * work. 240 - */ 241 - void irqentry_exit_to_user_mode(struct pt_regs *regs); 242 - 243 - #ifndef irqentry_state 244 - /** 245 - * struct irqentry_state - Opaque object for exception state storage 246 - * @exit_rcu: Used exclusively in the irqentry_*() calls; signals whether the 247 - * exit path has to invoke ct_irq_exit(). 248 - * @lockdep: Used exclusively in the irqentry_nmi_*() calls; ensures that 249 - * lockdep state is restored correctly on exit from nmi. 250 - * 251 - * This opaque object is filled in by the irqentry_*_enter() functions and 252 - * must be passed back into the corresponding irqentry_*_exit() functions 253 - * when the exception is complete. 254 - * 255 - * Callers of irqentry_*_[enter|exit]() must consider this structure opaque 256 - * and all members private. Descriptions of the members are provided to aid in 257 - * the maintenance of the irqentry_*() functions. 258 - */ 259 - typedef struct irqentry_state { 260 - union { 261 - bool exit_rcu; 262 - bool lockdep; 263 - }; 264 - } irqentry_state_t; 265 - #endif 266 - 267 - /** 268 - * irqentry_enter - Handle state tracking on ordinary interrupt entries 269 - * @regs: Pointer to pt_regs of interrupted context 270 - * 271 - * Invokes: 272 - * - lockdep irqflag state tracking as low level ASM entry disabled 273 - * interrupts. 274 - * 275 - * - Context tracking if the exception hit user mode. 276 - * 277 - * - The hardirq tracer to keep the state consistent as low level ASM 278 - * entry disabled interrupts. 279 - * 280 - * As a precondition, this requires that the entry came from user mode, 281 - * idle, or a kernel context in which RCU is watching. 282 - * 283 - * For kernel mode entries RCU handling is done conditional. 
If RCU is 284 - * watching then the only RCU requirement is to check whether the tick has 285 - * to be restarted. If RCU is not watching then ct_irq_enter() has to be 286 - * invoked on entry and ct_irq_exit() on exit. 287 - * 288 - * Avoiding the ct_irq_enter/exit() calls is an optimization but also 289 - * solves the problem of kernel mode pagefaults which can schedule, which 290 - * is not possible after invoking ct_irq_enter() without undoing it. 291 - * 292 - * For user mode entries irqentry_enter_from_user_mode() is invoked to 293 - * establish the proper context for NOHZ_FULL. Otherwise scheduling on exit 294 - * would not be possible. 295 - * 296 - * Returns: An opaque object that must be passed to idtentry_exit() 297 - */ 298 - irqentry_state_t noinstr irqentry_enter(struct pt_regs *regs); 299 - 300 - /** 301 - * irqentry_exit_cond_resched - Conditionally reschedule on return from interrupt 302 - * 303 - * Conditional reschedule with additional sanity checks. 304 - */ 305 - void raw_irqentry_exit_cond_resched(void); 306 - #ifdef CONFIG_PREEMPT_DYNAMIC 307 - #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) 308 - #define irqentry_exit_cond_resched_dynamic_enabled raw_irqentry_exit_cond_resched 309 - #define irqentry_exit_cond_resched_dynamic_disabled NULL 310 - DECLARE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched); 311 - #define irqentry_exit_cond_resched() static_call(irqentry_exit_cond_resched)() 312 - #elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) 313 - DECLARE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched); 314 - void dynamic_irqentry_exit_cond_resched(void); 315 - #define irqentry_exit_cond_resched() dynamic_irqentry_exit_cond_resched() 316 - #endif 317 - #else /* CONFIG_PREEMPT_DYNAMIC */ 318 - #define irqentry_exit_cond_resched() raw_irqentry_exit_cond_resched() 319 - #endif /* CONFIG_PREEMPT_DYNAMIC */ 320 - 321 - /** 322 - * irqentry_exit - Handle return from exception that used irqentry_enter() 323 - * @regs: Pointer 
to pt_regs (exception entry regs) 324 - * @state: Return value from matching call to irqentry_enter() 325 - * 326 - * Depending on the return target (kernel/user) this runs the necessary 327 - * preemption and work checks if possible and required and returns to 328 - * the caller with interrupts disabled and no further work pending. 329 - * 330 - * This is the last action before returning to the low level ASM code which 331 - * just needs to return to the appropriate context. 332 - * 333 - * Counterpart to irqentry_enter(). 334 - */ 335 - void noinstr irqentry_exit(struct pt_regs *regs, irqentry_state_t state); 336 - 337 - /** 338 - * irqentry_nmi_enter - Handle NMI entry 339 - * @regs: Pointer to currents pt_regs 340 - * 341 - * Similar to irqentry_enter() but taking care of the NMI constraints. 342 - */ 343 - irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs); 344 - 345 - /** 346 - * irqentry_nmi_exit - Handle return from NMI handling 347 - * @regs: Pointer to pt_regs (NMI entry regs) 348 - * @irq_state: Return value from matching call to irqentry_nmi_enter() 349 - * 350 - * Last action before returning to the low level assembly code. 351 - * 352 - * Counterpart to irqentry_nmi_enter(). 353 - */ 354 - void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state); 355 453 356 454 #endif
+389
include/linux/irq-entry-common.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef __LINUX_IRQENTRYCOMMON_H 3 + #define __LINUX_IRQENTRYCOMMON_H 4 + 5 + #include <linux/static_call_types.h> 6 + #include <linux/syscalls.h> 7 + #include <linux/context_tracking.h> 8 + #include <linux/tick.h> 9 + #include <linux/kmsan.h> 10 + 11 + #include <asm/entry-common.h> 12 + 13 + /* 14 + * Define dummy _TIF work flags if not defined by the architecture or for 15 + * disabled functionality. 16 + */ 17 + #ifndef _TIF_PATCH_PENDING 18 + # define _TIF_PATCH_PENDING (0) 19 + #endif 20 + 21 + /* 22 + * TIF flags handled in exit_to_user_mode_loop() 23 + */ 24 + #ifndef ARCH_EXIT_TO_USER_MODE_WORK 25 + # define ARCH_EXIT_TO_USER_MODE_WORK (0) 26 + #endif 27 + 28 + #define EXIT_TO_USER_MODE_WORK \ 29 + (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ 30 + _TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \ 31 + _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \ 32 + ARCH_EXIT_TO_USER_MODE_WORK) 33 + 34 + /** 35 + * arch_enter_from_user_mode - Architecture specific sanity check for user mode regs 36 + * @regs: Pointer to currents pt_regs 37 + * 38 + * Defaults to an empty implementation. Can be replaced by architecture 39 + * specific code. 40 + * 41 + * Invoked from syscall_enter_from_user_mode() in the non-instrumentable 42 + * section. Use __always_inline so the compiler cannot push it out of line 43 + * and make it instrumentable. 44 + */ 45 + static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs); 46 + 47 + #ifndef arch_enter_from_user_mode 48 + static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs) {} 49 + #endif 50 + 51 + /** 52 + * enter_from_user_mode - Establish state when coming from user mode 53 + * 54 + * Syscall/interrupt entry disables interrupts, but user mode is traced as 55 + * interrupts enabled. Also with NO_HZ_FULL RCU might be idle. 
56 + * 57 + * 1) Tell lockdep that interrupts are disabled 58 + * 2) Invoke context tracking if enabled to reactivate RCU 59 + * 3) Trace interrupts off state 60 + * 61 + * Invoked from architecture specific syscall entry code with interrupts 62 + * disabled. The calling code has to be non-instrumentable. When the 63 + * function returns all state is correct and interrupts are still 64 + * disabled. The subsequent functions can be instrumented. 65 + * 66 + * This is invoked when there is architecture specific functionality to be 67 + * done between establishing state and enabling interrupts. The caller must 68 + * enable interrupts before invoking syscall_enter_from_user_mode_work(). 69 + */ 70 + static __always_inline void enter_from_user_mode(struct pt_regs *regs) 71 + { 72 + arch_enter_from_user_mode(regs); 73 + lockdep_hardirqs_off(CALLER_ADDR0); 74 + 75 + CT_WARN_ON(__ct_state() != CT_STATE_USER); 76 + user_exit_irqoff(); 77 + 78 + instrumentation_begin(); 79 + kmsan_unpoison_entry_regs(regs); 80 + trace_hardirqs_off_finish(); 81 + instrumentation_end(); 82 + } 83 + 84 + /** 85 + * local_irq_enable_exit_to_user - Exit to user variant of local_irq_enable() 86 + * @ti_work: Cached TIF flags gathered with interrupts disabled 87 + * 88 + * Defaults to local_irq_enable(). Can be supplied by architecture specific 89 + * code. 90 + */ 91 + static inline void local_irq_enable_exit_to_user(unsigned long ti_work); 92 + 93 + #ifndef local_irq_enable_exit_to_user 94 + static inline void local_irq_enable_exit_to_user(unsigned long ti_work) 95 + { 96 + local_irq_enable(); 97 + } 98 + #endif 99 + 100 + /** 101 + * local_irq_disable_exit_to_user - Exit to user variant of local_irq_disable() 102 + * 103 + * Defaults to local_irq_disable(). Can be supplied by architecture specific 104 + * code. 
105 + */ 106 + static inline void local_irq_disable_exit_to_user(void); 107 + 108 + #ifndef local_irq_disable_exit_to_user 109 + static inline void local_irq_disable_exit_to_user(void) 110 + { 111 + local_irq_disable(); 112 + } 113 + #endif 114 + 115 + /** 116 + * arch_exit_to_user_mode_work - Architecture specific TIF work for exit 117 + * to user mode. 118 + * @regs: Pointer to currents pt_regs 119 + * @ti_work: Cached TIF flags gathered with interrupts disabled 120 + * 121 + * Invoked from exit_to_user_mode_loop() with interrupt enabled 122 + * 123 + * Defaults to NOOP. Can be supplied by architecture specific code. 124 + */ 125 + static inline void arch_exit_to_user_mode_work(struct pt_regs *regs, 126 + unsigned long ti_work); 127 + 128 + #ifndef arch_exit_to_user_mode_work 129 + static inline void arch_exit_to_user_mode_work(struct pt_regs *regs, 130 + unsigned long ti_work) 131 + { 132 + } 133 + #endif 134 + 135 + /** 136 + * arch_exit_to_user_mode_prepare - Architecture specific preparation for 137 + * exit to user mode. 138 + * @regs: Pointer to currents pt_regs 139 + * @ti_work: Cached TIF flags gathered with interrupts disabled 140 + * 141 + * Invoked from exit_to_user_mode_prepare() with interrupt disabled as the last 142 + * function before return. Defaults to NOOP. 143 + */ 144 + static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, 145 + unsigned long ti_work); 146 + 147 + #ifndef arch_exit_to_user_mode_prepare 148 + static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, 149 + unsigned long ti_work) 150 + { 151 + } 152 + #endif 153 + 154 + /** 155 + * arch_exit_to_user_mode - Architecture specific final work before 156 + * exit to user mode. 157 + * 158 + * Invoked from exit_to_user_mode() with interrupt disabled as the last 159 + * function before return. Defaults to NOOP. 
160 + * 161 + * This needs to be __always_inline because it is non-instrumentable code 162 + * invoked after context tracking switched to user mode. 163 + * 164 + * An architecture implementation must not do anything complex, no locking 165 + * etc. The main purpose is for speculation mitigations. 166 + */ 167 + static __always_inline void arch_exit_to_user_mode(void); 168 + 169 + #ifndef arch_exit_to_user_mode 170 + static __always_inline void arch_exit_to_user_mode(void) { } 171 + #endif 172 + 173 + /** 174 + * arch_do_signal_or_restart - Architecture specific signal delivery function 175 + * @regs: Pointer to currents pt_regs 176 + * 177 + * Invoked from exit_to_user_mode_loop(). 178 + */ 179 + void arch_do_signal_or_restart(struct pt_regs *regs); 180 + 181 + /** 182 + * exit_to_user_mode_loop - do any pending work before leaving to user space 183 + */ 184 + unsigned long exit_to_user_mode_loop(struct pt_regs *regs, 185 + unsigned long ti_work); 186 + 187 + /** 188 + * exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required 189 + * @regs: Pointer to pt_regs on entry stack 190 + * 191 + * 1) check that interrupts are disabled 192 + * 2) call tick_nohz_user_enter_prepare() 193 + * 3) call exit_to_user_mode_loop() if any flags from 194 + * EXIT_TO_USER_MODE_WORK are set 195 + * 4) check that interrupts are still disabled 196 + */ 197 + static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs) 198 + { 199 + unsigned long ti_work; 200 + 201 + lockdep_assert_irqs_disabled(); 202 + 203 + /* Flush pending rcuog wakeup before the last need_resched() check */ 204 + tick_nohz_user_enter_prepare(); 205 + 206 + ti_work = read_thread_flags(); 207 + if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK)) 208 + ti_work = exit_to_user_mode_loop(regs, ti_work); 209 + 210 + arch_exit_to_user_mode_prepare(regs, ti_work); 211 + 212 + /* Ensure that kernel state is sane for a return to userspace */ 213 + kmap_assert_nomap(); 214 + 
lockdep_assert_irqs_disabled(); 215 + lockdep_sys_exit(); 216 + } 217 + 218 + /** 219 + * exit_to_user_mode - Fixup state when exiting to user mode 220 + * 221 + * Syscall/interrupt exit enables interrupts, but the kernel state is 222 + * interrupts disabled when this is invoked. Also tell RCU about it. 223 + * 224 + * 1) Trace interrupts on state 225 + * 2) Invoke context tracking if enabled to adjust RCU state 226 + * 3) Invoke architecture specific last minute exit code, e.g. speculation 227 + * mitigations, etc.: arch_exit_to_user_mode() 228 + * 4) Tell lockdep that interrupts are enabled 229 + * 230 + * Invoked from architecture specific code when syscall_exit_to_user_mode() 231 + * is not suitable as the last step before returning to userspace. Must be 232 + * invoked with interrupts disabled and the caller must be 233 + * non-instrumentable. 234 + * The caller has to invoke syscall_exit_to_user_mode_work() before this. 235 + */ 236 + static __always_inline void exit_to_user_mode(void) 237 + { 238 + instrumentation_begin(); 239 + trace_hardirqs_on_prepare(); 240 + lockdep_hardirqs_on_prepare(); 241 + instrumentation_end(); 242 + 243 + user_enter_irqoff(); 244 + arch_exit_to_user_mode(); 245 + lockdep_hardirqs_on(CALLER_ADDR0); 246 + } 247 + 248 + /** 249 + * irqentry_enter_from_user_mode - Establish state before invoking the irq handler 250 + * @regs: Pointer to currents pt_regs 251 + * 252 + * Invoked from architecture specific entry code with interrupts disabled. 253 + * Can only be called when the interrupt entry came from user mode. The 254 + * calling code must be non-instrumentable. When the function returns all 255 + * state is correct and the subsequent functions can be instrumented. 
256 + * 257 + * The function establishes state (lockdep, RCU (context tracking), tracing) 258 + */ 259 + void irqentry_enter_from_user_mode(struct pt_regs *regs); 260 + 261 + /** 262 + * irqentry_exit_to_user_mode - Interrupt exit work 263 + * @regs: Pointer to current's pt_regs 264 + * 265 + * Invoked with interrupts disabled and fully valid regs. Returns with all 266 + * work handled, interrupts disabled such that the caller can immediately 267 + * switch to user mode. Called from architecture specific interrupt 268 + * handling code. 269 + * 270 + * The call order is #2 and #3 as described in syscall_exit_to_user_mode(). 271 + * Interrupt exit does not invoke #1, which is the syscall specific one time 272 + * work. 273 + */ 274 + void irqentry_exit_to_user_mode(struct pt_regs *regs); 275 + 276 + #ifndef irqentry_state 277 + /** 278 + * struct irqentry_state - Opaque object for exception state storage 279 + * @exit_rcu: Used exclusively in the irqentry_*() calls; signals whether the 280 + * exit path has to invoke ct_irq_exit(). 281 + * @lockdep: Used exclusively in the irqentry_nmi_*() calls; ensures that 282 + * lockdep state is restored correctly on exit from NMI. 283 + * 284 + * This opaque object is filled in by the irqentry_*_enter() functions and 285 + * must be passed back into the corresponding irqentry_*_exit() functions 286 + * when the exception is complete. 287 + * 288 + * Callers of irqentry_*_[enter|exit]() must consider this structure opaque 289 + * and all members private. Descriptions of the members are provided to aid in 290 + * the maintenance of the irqentry_*() functions. 
291 + */ 292 + typedef struct irqentry_state { 293 + union { 294 + bool exit_rcu; 295 + bool lockdep; 296 + }; 297 + } irqentry_state_t; 298 + #endif 299 + 300 + /** 301 + * irqentry_enter - Handle state tracking on ordinary interrupt entries 302 + * @regs: Pointer to pt_regs of interrupted context 303 + * 304 + * Invokes: 305 + * - lockdep irqflag state tracking as low level ASM entry disabled 306 + * interrupts. 307 + * 308 + * - Context tracking if the exception hit user mode. 309 + * 310 + * - The hardirq tracer to keep the state consistent as low level ASM 311 + * entry disabled interrupts. 312 + * 313 + * As a precondition, this requires that the entry came from user mode, 314 + * idle, or a kernel context in which RCU is watching. 315 + * 316 + * For kernel mode entries RCU handling is done conditionally. If RCU is 317 + * watching then the only RCU requirement is to check whether the tick has 318 + * to be restarted. If RCU is not watching then ct_irq_enter() has to be 319 + * invoked on entry and ct_irq_exit() on exit. 320 + * 321 + * Avoiding the ct_irq_enter/exit() calls is an optimization but also 322 + * solves the problem of kernel mode pagefaults which can schedule, which 323 + * is not possible after invoking ct_irq_enter() without undoing it. 324 + * 325 + * For user mode entries irqentry_enter_from_user_mode() is invoked to 326 + * establish the proper context for NOHZ_FULL. Otherwise scheduling on exit 327 + * would not be possible. 328 + * 329 + * Returns: An opaque object that must be passed to irqentry_exit() 330 + */ 331 + irqentry_state_t noinstr irqentry_enter(struct pt_regs *regs); 332 + 333 + /** 334 + * irqentry_exit_cond_resched - Conditionally reschedule on return from interrupt 335 + * 336 + * Conditional reschedule with additional sanity checks. 
337 + */ 338 + void raw_irqentry_exit_cond_resched(void); 339 + #ifdef CONFIG_PREEMPT_DYNAMIC 340 + #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) 341 + #define irqentry_exit_cond_resched_dynamic_enabled raw_irqentry_exit_cond_resched 342 + #define irqentry_exit_cond_resched_dynamic_disabled NULL 343 + DECLARE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched); 344 + #define irqentry_exit_cond_resched() static_call(irqentry_exit_cond_resched)() 345 + #elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) 346 + DECLARE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched); 347 + void dynamic_irqentry_exit_cond_resched(void); 348 + #define irqentry_exit_cond_resched() dynamic_irqentry_exit_cond_resched() 349 + #endif 350 + #else /* CONFIG_PREEMPT_DYNAMIC */ 351 + #define irqentry_exit_cond_resched() raw_irqentry_exit_cond_resched() 352 + #endif /* CONFIG_PREEMPT_DYNAMIC */ 353 + 354 + /** 355 + * irqentry_exit - Handle return from exception that used irqentry_enter() 356 + * @regs: Pointer to pt_regs (exception entry regs) 357 + * @state: Return value from matching call to irqentry_enter() 358 + * 359 + * Depending on the return target (kernel/user) this runs the necessary 360 + * preemption and work checks if possible and required and returns to 361 + * the caller with interrupts disabled and no further work pending. 362 + * 363 + * This is the last action before returning to the low level ASM code which 364 + * just needs to return to the appropriate context. 365 + * 366 + * Counterpart to irqentry_enter(). 367 + */ 368 + void noinstr irqentry_exit(struct pt_regs *regs, irqentry_state_t state); 369 + 370 + /** 371 + * irqentry_nmi_enter - Handle NMI entry 372 + * @regs: Pointer to current's pt_regs 373 + * 374 + * Similar to irqentry_enter() but taking care of the NMI constraints. 
375 + */ 376 + irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs); 377 + 378 + /** 379 + * irqentry_nmi_exit - Handle return from NMI handling 380 + * @regs: Pointer to pt_regs (NMI entry regs) 381 + * @irq_state: Return value from matching call to irqentry_nmi_enter() 382 + * 383 + * Last action before returning to the low level assembly code. 384 + * 385 + * Counterpart to irqentry_nmi_enter(). 386 + */ 387 + void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state); 388 + 389 + #endif
+6 -1
include/uapi/linux/prctl.h
··· 255 255 /* Dispatch syscalls to a userspace handler */ 256 256 #define PR_SET_SYSCALL_USER_DISPATCH 59 257 257 # define PR_SYS_DISPATCH_OFF 0 258 - # define PR_SYS_DISPATCH_ON 1 258 + /* Enable dispatch except for the specified range */ 259 + # define PR_SYS_DISPATCH_EXCLUSIVE_ON 1 260 + /* Enable dispatch for the specified range */ 261 + # define PR_SYS_DISPATCH_INCLUSIVE_ON 2 262 + /* Legacy name for backwards compatibility */ 263 + # define PR_SYS_DISPATCH_ON PR_SYS_DISPATCH_EXCLUSIVE_ON 259 264 /* The control values for the user space selector when dispatch is enabled */ 260 265 # define SYSCALL_DISPATCH_FILTER_ALLOW 0 261 266 # define SYSCALL_DISPATCH_FILTER_BLOCK 1
+2 -1
kernel/entry/Makefile
··· 12 12 CFLAGS_REMOVE_common.o = -fstack-protector -fstack-protector-strong 13 13 CFLAGS_common.o += -fno-stack-protector 14 14 15 - obj-$(CONFIG_GENERIC_ENTRY) += common.o syscall_user_dispatch.o 15 + obj-$(CONFIG_GENERIC_IRQ_ENTRY) += common.o 16 + obj-$(CONFIG_GENERIC_SYSCALL) += syscall-common.o syscall_user_dispatch.o 16 17 obj-$(CONFIG_KVM_XFER_TO_GUEST_WORK) += kvm.o
+1 -112
kernel/entry/common.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 3 - #include <linux/context_tracking.h> 4 - #include <linux/entry-common.h> 3 + #include <linux/irq-entry-common.h> 5 4 #include <linux/resume_user_mode.h> 6 5 #include <linux/highmem.h> 7 6 #include <linux/jump_label.h> 8 7 #include <linux/kmsan.h> 9 8 #include <linux/livepatch.h> 10 - #include <linux/audit.h> 11 9 #include <linux/tick.h> 12 - 13 - #include "common.h" 14 - 15 - #define CREATE_TRACE_POINTS 16 - #include <trace/events/syscalls.h> 17 - 18 - static inline void syscall_enter_audit(struct pt_regs *regs, long syscall) 19 - { 20 - if (unlikely(audit_context())) { 21 - unsigned long args[6]; 22 - 23 - syscall_get_arguments(current, regs, args); 24 - audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]); 25 - } 26 - } 27 - 28 - long syscall_trace_enter(struct pt_regs *regs, long syscall, 29 - unsigned long work) 30 - { 31 - long ret = 0; 32 - 33 - /* 34 - * Handle Syscall User Dispatch. This must comes first, since 35 - * the ABI here can be something that doesn't make sense for 36 - * other syscall_work features. 37 - */ 38 - if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) { 39 - if (syscall_user_dispatch(regs)) 40 - return -1L; 41 - } 42 - 43 - /* Handle ptrace */ 44 - if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) { 45 - ret = ptrace_report_syscall_entry(regs); 46 - if (ret || (work & SYSCALL_WORK_SYSCALL_EMU)) 47 - return -1L; 48 - } 49 - 50 - /* Do seccomp after ptrace, to catch any tracer changes. */ 51 - if (work & SYSCALL_WORK_SECCOMP) { 52 - ret = __secure_computing(); 53 - if (ret == -1L) 54 - return ret; 55 - } 56 - 57 - /* Either of the above might have changed the syscall number */ 58 - syscall = syscall_get_nr(current, regs); 59 - 60 - if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) { 61 - trace_sys_enter(regs, syscall); 62 - /* 63 - * Probes or BPF hooks in the tracepoint may have changed the 64 - * system call number as well. 
65 - */ 66 - syscall = syscall_get_nr(current, regs); 67 - } 68 - 69 - syscall_enter_audit(regs, syscall); 70 - 71 - return ret ? : syscall; 72 - } 73 - 74 - noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs) 75 - { 76 - enter_from_user_mode(regs); 77 - instrumentation_begin(); 78 - local_irq_enable(); 79 - instrumentation_end(); 80 - } 81 10 82 11 /* Workaround to allow gradual conversion of architecture code */ 83 12 void __weak arch_do_signal_or_restart(struct pt_regs *regs) { } ··· 60 131 61 132 /* Return the latest work state for arch_exit_to_user_mode() */ 62 133 return ti_work; 63 - } 64 - 65 - /* 66 - * If SYSCALL_EMU is set, then the only reason to report is when 67 - * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall 68 - * instruction has been already reported in syscall_enter_from_user_mode(). 69 - */ 70 - static inline bool report_single_step(unsigned long work) 71 - { 72 - if (work & SYSCALL_WORK_SYSCALL_EMU) 73 - return false; 74 - 75 - return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP; 76 - } 77 - 78 - void syscall_exit_work(struct pt_regs *regs, unsigned long work) 79 - { 80 - bool step; 81 - 82 - /* 83 - * If the syscall was rolled back due to syscall user dispatching, 84 - * then the tracers below are not invoked for the same reason as 85 - * the entry side was not invoked in syscall_trace_enter(): The ABI 86 - * of these syscalls is unknown. 
87 - */ 88 - if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) { 89 - if (unlikely(current->syscall_dispatch.on_dispatch)) { 90 - current->syscall_dispatch.on_dispatch = false; 91 - return; 92 - } 93 - } 94 - 95 - audit_syscall_exit(regs); 96 - 97 - if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT) 98 - trace_sys_exit(regs, syscall_get_return_value(current, regs)); 99 - 100 - step = report_single_step(work); 101 - if (step || work & SYSCALL_WORK_SYSCALL_TRACE) 102 - ptrace_report_syscall_exit(regs, step); 103 134 } 104 135 105 136 noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
+112
kernel/entry/syscall-common.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + #include <linux/audit.h> 4 + #include <linux/entry-common.h> 5 + #include "common.h" 6 + 7 + #define CREATE_TRACE_POINTS 8 + #include <trace/events/syscalls.h> 9 + 10 + static inline void syscall_enter_audit(struct pt_regs *regs, long syscall) 11 + { 12 + if (unlikely(audit_context())) { 13 + unsigned long args[6]; 14 + 15 + syscall_get_arguments(current, regs, args); 16 + audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]); 17 + } 18 + } 19 + 20 + long syscall_trace_enter(struct pt_regs *regs, long syscall, 21 + unsigned long work) 22 + { 23 + long ret = 0; 24 + 25 + /* 26 + * Handle Syscall User Dispatch. This must come first, since 27 + * the ABI here can be something that doesn't make sense for 28 + * other syscall_work features. 29 + */ 30 + if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) { 31 + if (syscall_user_dispatch(regs)) 32 + return -1L; 33 + } 34 + 35 + /* Handle ptrace */ 36 + if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) { 37 + ret = ptrace_report_syscall_entry(regs); 38 + if (ret || (work & SYSCALL_WORK_SYSCALL_EMU)) 39 + return -1L; 40 + } 41 + 42 + /* Do seccomp after ptrace, to catch any tracer changes. */ 43 + if (work & SYSCALL_WORK_SECCOMP) { 44 + ret = __secure_computing(); 45 + if (ret == -1L) 46 + return ret; 47 + } 48 + 49 + /* Either of the above might have changed the syscall number */ 50 + syscall = syscall_get_nr(current, regs); 51 + 52 + if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) { 53 + trace_sys_enter(regs, syscall); 54 + /* 55 + * Probes or BPF hooks in the tracepoint may have changed the 56 + * system call number as well. 57 + */ 58 + syscall = syscall_get_nr(current, regs); 59 + } 60 + 61 + syscall_enter_audit(regs, syscall); 62 + 63 + return ret ? 
: syscall; 64 + } 65 + 66 + noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs) 67 + { 68 + enter_from_user_mode(regs); 69 + instrumentation_begin(); 70 + local_irq_enable(); 71 + instrumentation_end(); 72 + } 73 + 74 + /* 75 + * If SYSCALL_EMU is set, then the only reason to report is when 76 + * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall 77 + * instruction has already been reported in syscall_enter_from_user_mode(). 78 + */ 79 + static inline bool report_single_step(unsigned long work) 80 + { 81 + if (work & SYSCALL_WORK_SYSCALL_EMU) 82 + return false; 83 + 84 + return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP; 85 + } 86 + 87 + void syscall_exit_work(struct pt_regs *regs, unsigned long work) 88 + { 89 + bool step; 90 + 91 + /* 92 + * If the syscall was rolled back due to syscall user dispatching, 93 + * then the tracers below are not invoked for the same reason as 94 + * the entry side was not invoked in syscall_trace_enter(): The ABI 95 + * of these syscalls is unknown. 96 + */ 97 + if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) { 98 + if (unlikely(current->syscall_dispatch.on_dispatch)) { 99 + current->syscall_dispatch.on_dispatch = false; 100 + return; 101 + } 102 + } 103 + 104 + audit_syscall_exit(regs); 105 + 106 + if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT) 107 + trace_sys_exit(regs, syscall_get_return_value(current, regs)); 108 + 109 + step = report_single_step(work); 110 + if (step || work & SYSCALL_WORK_SYSCALL_TRACE) 111 + ptrace_report_syscall_exit(regs, step); 112 + }
+23 -13
kernel/entry/syscall_user_dispatch.c
··· 78 78 if (offset || len || selector) 79 79 return -EINVAL; 80 80 break; 81 - case PR_SYS_DISPATCH_ON: 81 + case PR_SYS_DISPATCH_EXCLUSIVE_ON: 82 82 /* 83 83 * Validate the direct dispatcher region just for basic 84 84 * sanity against overflow and a 0-sized dispatcher ··· 87 87 */ 88 88 if (offset && offset + len <= offset) 89 89 return -EINVAL; 90 - 90 + break; 91 + case PR_SYS_DISPATCH_INCLUSIVE_ON: 92 + if (len == 0 || offset + len <= offset) 93 + return -EINVAL; 91 94 /* 92 - * access_ok() will clear memory tags for tagged addresses 93 - * if current has memory tagging enabled. 94 - 95 - * To enable a tracer to set a tracees selector the 96 - * selector address must be untagged for access_ok(), 97 - * otherwise an untagged tracer will always fail to set a 98 - * tagged tracees selector. 95 + * Invert the range, the check in syscall_user_dispatch() 96 + * supports wrap-around. 99 97 */ 100 - if (selector && !access_ok(untagged_addr(selector), sizeof(*selector))) 101 - return -EFAULT; 102 - 98 + offset = offset + len; 99 + len = -len; 103 100 break; 104 101 default: 105 102 return -EINVAL; 106 103 } 104 + 105 + /* 106 + * access_ok() will clear memory tags for tagged addresses 107 + * if current has memory tagging enabled. 108 + * 109 + * To enable a tracer to set a tracees selector the 110 + * selector address must be untagged for access_ok(), 111 + * otherwise an untagged tracer will always fail to set a 112 + * tagged tracees selector. 
113 + */ 114 + if (mode != PR_SYS_DISPATCH_OFF && selector && 115 + !access_ok(untagged_addr(selector), sizeof(*selector))) 116 + return -EFAULT; 107 117 108 118 task->syscall_dispatch.selector = selector; 109 119 task->syscall_dispatch.offset = offset; 110 120 task->syscall_dispatch.len = len; 111 121 task->syscall_dispatch.on_dispatch = false; 112 122 113 - if (mode == PR_SYS_DISPATCH_ON) 123 + if (mode != PR_SYS_DISPATCH_OFF) 114 124 set_task_syscall_work(task, SYSCALL_USER_DISPATCH); 115 125 else 116 126 clear_task_syscall_work(task, SYSCALL_USER_DISPATCH);
+4 -4
kernel/sched/core.c
··· 69 69 #include <linux/livepatch_sched.h> 70 70 71 71 #ifdef CONFIG_PREEMPT_DYNAMIC 72 - # ifdef CONFIG_GENERIC_ENTRY 73 - # include <linux/entry-common.h> 72 + # ifdef CONFIG_GENERIC_IRQ_ENTRY 73 + # include <linux/irq-entry-common.h> 74 74 # endif 75 75 #endif 76 76 ··· 7428 7428 7429 7429 #ifdef CONFIG_PREEMPT_DYNAMIC 7430 7430 7431 - #ifdef CONFIG_GENERIC_ENTRY 7432 - #include <linux/entry-common.h> 7431 + #ifdef CONFIG_GENERIC_IRQ_ENTRY 7432 + #include <linux/irq-entry-common.h> 7433 7433 #endif 7434 7434 7435 7435 /*
+6 -1
tools/include/uapi/linux/prctl.h
··· 255 255 /* Dispatch syscalls to a userspace handler */ 256 256 #define PR_SET_SYSCALL_USER_DISPATCH 59 257 257 # define PR_SYS_DISPATCH_OFF 0 258 - # define PR_SYS_DISPATCH_ON 1 258 + /* Enable dispatch except for the specified range */ 259 + # define PR_SYS_DISPATCH_EXCLUSIVE_ON 1 260 + /* Enable dispatch for the specified range */ 261 + # define PR_SYS_DISPATCH_INCLUSIVE_ON 2 262 + /* Legacy name for backwards compatibility */ 263 + # define PR_SYS_DISPATCH_ON PR_SYS_DISPATCH_EXCLUSIVE_ON 259 264 /* The control values for the user space selector when dispatch is enabled */ 260 265 # define SYSCALL_DISPATCH_FILTER_ALLOW 0 261 266 # define SYSCALL_DISPATCH_FILTER_BLOCK 1
+98 -44
tools/testing/selftests/syscall_user_dispatch/sud_test.c
··· 10 10 #include <sys/sysinfo.h> 11 11 #include <sys/syscall.h> 12 12 #include <signal.h> 13 + #include <stdbool.h> 14 + #include <stdlib.h> 13 15 14 16 #include <asm/unistd.h> 15 17 #include "../kselftest_harness.h" ··· 19 17 #ifndef PR_SET_SYSCALL_USER_DISPATCH 20 18 # define PR_SET_SYSCALL_USER_DISPATCH 59 21 19 # define PR_SYS_DISPATCH_OFF 0 22 - # define PR_SYS_DISPATCH_ON 1 23 20 # define SYSCALL_DISPATCH_FILTER_ALLOW 0 24 21 # define SYSCALL_DISPATCH_FILTER_BLOCK 1 22 + #endif 23 + 24 + #ifndef PR_SYS_DISPATCH_EXCLUSIVE_ON 25 + # define PR_SYS_DISPATCH_EXCLUSIVE_ON 1 26 + # define PR_SYS_DISPATCH_INCLUSIVE_ON 2 25 27 #endif 26 28 27 29 #ifndef SYS_USER_DISPATCH ··· 71 65 ret = sysinfo(&info); 72 66 ASSERT_EQ(0, ret); 73 67 74 - ret = prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON, 0, 0, &sel); 68 + ret = prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_EXCLUSIVE_ON, 0, 0, &sel); 75 69 ASSERT_EQ(0, ret) { 76 70 TH_LOG("Kernel does not support CONFIG_SYSCALL_USER_DISPATCH"); 77 71 } ··· 85 79 } 86 80 } 87 81 82 + static void prctl_valid(struct __test_metadata *_metadata, 83 + unsigned long op, unsigned long off, 84 + unsigned long size, void *sel) 85 + { 86 + EXPECT_EQ(0, prctl(PR_SET_SYSCALL_USER_DISPATCH, op, off, size, sel)); 87 + } 88 + 89 + static void prctl_invalid(struct __test_metadata *_metadata, 90 + unsigned long op, unsigned long off, 91 + unsigned long size, void *sel, int err) 92 + { 93 + EXPECT_EQ(-1, prctl(PR_SET_SYSCALL_USER_DISPATCH, op, off, size, sel)); 94 + EXPECT_EQ(err, errno); 95 + } 96 + 88 97 TEST(bad_prctl_param) 89 98 { 90 99 char sel = SYSCALL_DISPATCH_FILTER_ALLOW; ··· 107 86 108 87 /* Invalid op */ 109 88 op = -1; 110 - prctl(PR_SET_SYSCALL_USER_DISPATCH, op, 0, 0, &sel); 111 - ASSERT_EQ(EINVAL, errno); 89 + prctl_invalid(_metadata, op, 0, 0, &sel, EINVAL); 112 90 113 91 /* PR_SYS_DISPATCH_OFF */ 114 92 op = PR_SYS_DISPATCH_OFF; 115 93 116 94 /* offset != 0 */ 117 - prctl(PR_SET_SYSCALL_USER_DISPATCH, op, 0x1, 0x0, 
0); 118 - EXPECT_EQ(EINVAL, errno); 95 + prctl_invalid(_metadata, op, 0x1, 0x0, 0, EINVAL); 119 96 120 97 /* len != 0 */ 121 - prctl(PR_SET_SYSCALL_USER_DISPATCH, op, 0x0, 0xff, 0); 122 - EXPECT_EQ(EINVAL, errno); 98 + prctl_invalid(_metadata, op, 0x0, 0xff, 0, EINVAL); 123 99 124 100 /* sel != NULL */ 125 - prctl(PR_SET_SYSCALL_USER_DISPATCH, op, 0x0, 0x0, &sel); 126 - EXPECT_EQ(EINVAL, errno); 101 + prctl_invalid(_metadata, op, 0x0, 0x0, &sel, EINVAL); 127 102 128 103 /* Valid parameter */ 129 - errno = 0; 130 - prctl(PR_SET_SYSCALL_USER_DISPATCH, op, 0x0, 0x0, 0x0); 131 - EXPECT_EQ(0, errno); 104 + prctl_valid(_metadata, op, 0x0, 0x0, 0x0); 132 105 133 - /* PR_SYS_DISPATCH_ON */ 134 - op = PR_SYS_DISPATCH_ON; 106 + /* PR_SYS_DISPATCH_EXCLUSIVE_ON */ 107 + op = PR_SYS_DISPATCH_EXCLUSIVE_ON; 135 108 136 109 /* Dispatcher region is bad (offset > 0 && len == 0) */ 137 - prctl(PR_SET_SYSCALL_USER_DISPATCH, op, 0x1, 0x0, &sel); 138 - EXPECT_EQ(EINVAL, errno); 139 - prctl(PR_SET_SYSCALL_USER_DISPATCH, op, -1L, 0x0, &sel); 140 - EXPECT_EQ(EINVAL, errno); 110 + prctl_invalid(_metadata, op, 0x1, 0x0, &sel, EINVAL); 111 + prctl_invalid(_metadata, op, -1L, 0x0, &sel, EINVAL); 141 112 142 113 /* Invalid selector */ 143 - prctl(PR_SET_SYSCALL_USER_DISPATCH, op, 0x0, 0x1, (void *) -1); 144 - ASSERT_EQ(EFAULT, errno); 114 + prctl_invalid(_metadata, op, 0x0, 0x1, (void *) -1, EFAULT); 145 115 146 116 /* 147 117 * Dispatcher range overflows unsigned long 148 118 */ 149 - prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON, 1, -1L, &sel); 150 - ASSERT_EQ(EINVAL, errno) { 151 - TH_LOG("Should reject bad syscall range"); 152 - } 119 + prctl_invalid(_metadata, PR_SYS_DISPATCH_EXCLUSIVE_ON, 1, -1L, &sel, EINVAL); 153 120 154 121 /* 155 122 * Allowed range overflows usigned long 156 123 */ 157 - prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON, -1L, 0x1, &sel); 158 - ASSERT_EQ(EINVAL, errno) { 159 - TH_LOG("Should reject bad syscall range"); 160 - } 124 + 
prctl_invalid(_metadata, PR_SYS_DISPATCH_EXCLUSIVE_ON, -1L, 0x1, &sel, EINVAL); 125 + 126 + /* 0 len should fail for PR_SYS_DISPATCH_INCLUSIVE_ON */ 127 + prctl_invalid(_metadata, PR_SYS_DISPATCH_INCLUSIVE_ON, 1, 0, 0, EINVAL); 128 + 129 + /* Range wrap-around should fail */ 130 + prctl_invalid(_metadata, PR_SYS_DISPATCH_INCLUSIVE_ON, -1L, 2, 0, EINVAL); 131 + 132 + /* Normal range shouldn't fail */ 133 + prctl_valid(_metadata, PR_SYS_DISPATCH_INCLUSIVE_ON, 2, 3, 0); 134 + 135 + /* Invalid selector */ 136 + prctl_invalid(_metadata, PR_SYS_DISPATCH_INCLUSIVE_ON, 2, 3, (void *) -1, EFAULT); 161 137 } 162 138 163 139 /* ··· 165 147 int nr_syscalls_emulated; 166 148 int si_code; 167 149 int si_errno; 150 + unsigned long syscall_addr; 168 151 169 152 static void handle_sigsys(int sig, siginfo_t *info, void *ucontext) 170 153 { 171 154 si_code = info->si_code; 172 155 si_errno = info->si_errno; 156 + syscall_addr = (unsigned long)info->si_call_addr; 173 157 174 158 if (info->si_syscall == MAGIC_SYSCALL_1) 175 159 nr_syscalls_emulated++; ··· 194 174 #endif 195 175 } 196 176 177 + int setup_sigsys_handler(void) 178 + { 179 + struct sigaction act; 180 + sigset_t mask; 181 + 182 + memset(&act, 0, sizeof(act)); 183 + sigemptyset(&mask); 184 + act.sa_sigaction = handle_sigsys; 185 + act.sa_flags = SA_SIGINFO; 186 + act.sa_mask = mask; 187 + return sigaction(SIGSYS, &act, NULL); 188 + } 189 + 197 190 TEST(dispatch_and_return) 198 191 { 199 192 long ret; 200 - struct sigaction act; 201 - sigset_t mask; 202 193 203 194 glob_sel = 0; 204 195 nr_syscalls_emulated = 0; 205 196 si_code = 0; 206 197 si_errno = 0; 207 198 208 - memset(&act, 0, sizeof(act)); 209 - sigemptyset(&mask); 210 - 211 - act.sa_sigaction = handle_sigsys; 212 - act.sa_flags = SA_SIGINFO; 213 - act.sa_mask = mask; 214 - 215 - ret = sigaction(SIGSYS, &act, NULL); 216 - ASSERT_EQ(0, ret); 199 + ASSERT_EQ(0, setup_sigsys_handler()); 217 200 218 201 /* Make sure selector is good prior to prctl. 
*/ 219 202 SYSCALL_DISPATCH_OFF(glob_sel); 220 203 221 - ret = prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON, 0, 0, &glob_sel); 204 + ret = prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_EXCLUSIVE_ON, 0, 0, &glob_sel); 222 205 ASSERT_EQ(0, ret) { 223 206 TH_LOG("Kernel does not support CONFIG_SYSCALL_USER_DISPATCH"); 224 207 } ··· 277 254 /* Make sure selector is good prior to prctl. */ 278 255 SYSCALL_DISPATCH_OFF(glob_sel); 279 256 280 - ret = prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON, 0, 0, &glob_sel); 257 + ret = prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_EXCLUSIVE_ON, 0, 0, &glob_sel); 281 258 ASSERT_EQ(0, ret) { 282 259 TH_LOG("Kernel does not support CONFIG_SYSCALL_USER_DISPATCH"); 283 260 } ··· 301 278 struct sysinfo info; 302 279 char sel = 0; 303 280 304 - ret = prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON, 0, 0, &sel); 281 + ret = prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_EXCLUSIVE_ON, 0, 0, &sel); 305 282 ASSERT_EQ(0, ret) { 306 283 TH_LOG("Kernel does not support CONFIG_SYSCALL_USER_DISPATCH"); 307 284 } ··· 333 310 * Instead of calculating libc addresses; allow the entire 334 311 * memory map and lock the selector. 
335 312 */ 336 - ret = prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON, 0, -1L, &sel); 313 + ret = prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_EXCLUSIVE_ON, 0, -1L, &sel); 337 314 ASSERT_EQ(0, ret) { 338 315 TH_LOG("Kernel does not support CONFIG_SYSCALL_USER_DISPATCH"); 339 316 } ··· 344 321 ASSERT_EQ(0, ret) { 345 322 TH_LOG("Dispatch triggered unexpectedly"); 346 323 } 324 + } 325 + 326 + static void test_range(struct __test_metadata *_metadata, 327 + unsigned long op, unsigned long off, 328 + unsigned long size, bool dispatch) 329 + { 330 + nr_syscalls_emulated = 0; 331 + SYSCALL_DISPATCH_OFF(glob_sel); 332 + EXPECT_EQ(0, prctl(PR_SET_SYSCALL_USER_DISPATCH, op, off, size, &glob_sel)); 333 + SYSCALL_DISPATCH_ON(glob_sel); 334 + if (dispatch) { 335 + EXPECT_EQ(syscall(MAGIC_SYSCALL_1), MAGIC_SYSCALL_1); 336 + EXPECT_EQ(nr_syscalls_emulated, 1); 337 + } else { 338 + EXPECT_EQ(syscall(MAGIC_SYSCALL_1), -1); 339 + EXPECT_EQ(nr_syscalls_emulated, 0); 340 + } 341 + } 342 + 343 + TEST(dispatch_range) 344 + { 345 + ASSERT_EQ(0, setup_sigsys_handler()); 346 + test_range(_metadata, PR_SYS_DISPATCH_EXCLUSIVE_ON, 0, 0, true); 347 + test_range(_metadata, PR_SYS_DISPATCH_EXCLUSIVE_ON, syscall_addr, 1, false); 348 + test_range(_metadata, PR_SYS_DISPATCH_EXCLUSIVE_ON, syscall_addr-100, 200, false); 349 + test_range(_metadata, PR_SYS_DISPATCH_EXCLUSIVE_ON, syscall_addr+1, 100, true); 350 + test_range(_metadata, PR_SYS_DISPATCH_EXCLUSIVE_ON, syscall_addr-100, 100, true); 351 + test_range(_metadata, PR_SYS_DISPATCH_INCLUSIVE_ON, syscall_addr, 1, true); 352 + test_range(_metadata, PR_SYS_DISPATCH_INCLUSIVE_ON, syscall_addr-1, 1, false); 353 + test_range(_metadata, PR_SYS_DISPATCH_INCLUSIVE_ON, syscall_addr+1, 1, false); 354 + SYSCALL_DISPATCH_OFF(glob_sel); 347 355 } 348 356 349 357 TEST_HARNESS_MAIN