Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'core-entry-2020-08-04' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull generic kernel entry/exit code from Thomas Gleixner:
"Generic implementation of common syscall, interrupt and exception
entry/exit functionality based on the recent X86 effort to ensure
correctness of entry/exit vs RCU and instrumentation.

As this functionality and the required entry/exit sequences are not
architecture specific, sharing them allows other architectures to
benefit instead of copying the same code over and over again.

This branch was kept standalone to allow others to work on it. The
conversion of x86 comes in a separate pull request which obviously is
based on this branch"

* tag 'core-entry-2020-08-04' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
entry: Correct __secure_computing() stub
entry: Correct 'noinstr' attributes
entry: Provide infrastructure for work before transitioning to guest mode
entry: Provide generic interrupt entry/exit code
entry: Provide generic syscall exit function
entry: Provide generic syscall entry functionality
seccomp: Provide stub for __secure_computing()

+907
+3
arch/Kconfig
··· 27 27 config HOTPLUG_SMT 28 28 bool 29 29 30 + config GENERIC_ENTRY 31 + bool 32 + 30 33 config OPROFILE 31 34 tristate "OProfile system profiling" 32 35 depends on PROFILING
+372
include/linux/entry-common.h
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_ENTRYCOMMON_H
#define __LINUX_ENTRYCOMMON_H

#include <linux/tracehook.h>
#include <linux/syscalls.h>
#include <linux/seccomp.h>
#include <linux/sched.h>

#include <asm/entry-common.h>

/*
 * Define dummy _TIF work flags if not defined by the architecture or for
 * disabled functionality.
 */
#ifndef _TIF_SYSCALL_EMU
# define _TIF_SYSCALL_EMU		(0)
#endif

#ifndef _TIF_SYSCALL_TRACEPOINT
# define _TIF_SYSCALL_TRACEPOINT	(0)
#endif

#ifndef _TIF_SECCOMP
# define _TIF_SECCOMP			(0)
#endif

#ifndef _TIF_SYSCALL_AUDIT
# define _TIF_SYSCALL_AUDIT		(0)
#endif

#ifndef _TIF_PATCH_PENDING
# define _TIF_PATCH_PENDING		(0)
#endif

#ifndef _TIF_UPROBE
# define _TIF_UPROBE			(0)
#endif

/*
 * TIF flags handled in syscall_enter_from_user_mode()
 */
#ifndef ARCH_SYSCALL_ENTER_WORK
# define ARCH_SYSCALL_ENTER_WORK	(0)
#endif

#define SYSCALL_ENTER_WORK						\
	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SECCOMP |	\
	 _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_EMU |			\
	 ARCH_SYSCALL_ENTER_WORK)

/*
 * TIF flags handled in syscall_exit_to_user_mode()
 */
#ifndef ARCH_SYSCALL_EXIT_WORK
# define ARCH_SYSCALL_EXIT_WORK		(0)
#endif

#define SYSCALL_EXIT_WORK						\
	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT |			\
	 _TIF_SYSCALL_TRACEPOINT | ARCH_SYSCALL_EXIT_WORK)

/*
 * TIF flags handled in exit_to_user_mode_loop()
 */
#ifndef ARCH_EXIT_TO_USER_MODE_WORK
# define ARCH_EXIT_TO_USER_MODE_WORK	(0)
#endif

#define EXIT_TO_USER_MODE_WORK						\
	(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE |		\
	 _TIF_NEED_RESCHED | _TIF_PATCH_PENDING |			\
	 ARCH_EXIT_TO_USER_MODE_WORK)

/**
 * arch_check_user_regs - Architecture specific sanity check for user mode regs
 * @regs:	Pointer to current's pt_regs
 *
 * Defaults to an empty implementation. Can be replaced by architecture
 * specific code.
 *
 * Invoked from syscall_enter_from_user_mode() in the non-instrumentable
 * section. Use __always_inline so the compiler cannot push it out of line
 * and make it instrumentable.
 */
static __always_inline void arch_check_user_regs(struct pt_regs *regs);

#ifndef arch_check_user_regs
static __always_inline void arch_check_user_regs(struct pt_regs *regs) {}
#endif

/**
 * arch_syscall_enter_tracehook - Wrapper around tracehook_report_syscall_entry()
 * @regs:	Pointer to current's pt_regs
 *
 * Returns: 0 on success or an error code to skip the syscall.
 *
 * Defaults to tracehook_report_syscall_entry(). Can be replaced by
 * architecture specific code.
 *
 * Invoked from syscall_enter_from_user_mode()
 */
static inline __must_check int arch_syscall_enter_tracehook(struct pt_regs *regs);

#ifndef arch_syscall_enter_tracehook
static inline __must_check int arch_syscall_enter_tracehook(struct pt_regs *regs)
{
	return tracehook_report_syscall_entry(regs);
}
#endif

/**
 * syscall_enter_from_user_mode - Check and handle work before invoking
 *				 a syscall
 * @regs:	Pointer to current's pt_regs
 * @syscall:	The syscall number
 *
 * Invoked from architecture specific syscall entry code with interrupts
 * disabled. The calling code has to be non-instrumentable. When the
 * function returns all state is correct and the subsequent functions can be
 * instrumented.
 *
 * Returns: The original or a modified syscall number
 *
 * If the returned syscall number is -1 then the syscall should be
 * skipped. In this case the caller may invoke syscall_set_error() or
 * syscall_set_return_value() first.  If neither of those are called and -1
 * is returned, then the syscall will fail with ENOSYS.
 *
 * The following functionality is handled here:
 *
 *  1) Establish state (lockdep, RCU (context tracking), tracing)
 *  2) TIF flag dependent invocations of arch_syscall_enter_tracehook(),
 *     __secure_computing(), trace_sys_enter()
 *  3) Invocation of audit_syscall_entry()
 */
long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall);

/**
 * local_irq_enable_exit_to_user - Exit to user variant of local_irq_enable()
 * @ti_work:	Cached TIF flags gathered with interrupts disabled
 *
 * Defaults to local_irq_enable(). Can be supplied by architecture specific
 * code.
 */
static inline void local_irq_enable_exit_to_user(unsigned long ti_work);

#ifndef local_irq_enable_exit_to_user
static inline void local_irq_enable_exit_to_user(unsigned long ti_work)
{
	local_irq_enable();
}
#endif

/**
 * local_irq_disable_exit_to_user - Exit to user variant of local_irq_disable()
 *
 * Defaults to local_irq_disable(). Can be supplied by architecture specific
 * code.
 */
static inline void local_irq_disable_exit_to_user(void);

#ifndef local_irq_disable_exit_to_user
static inline void local_irq_disable_exit_to_user(void)
{
	local_irq_disable();
}
#endif

/**
 * arch_exit_to_user_mode_work - Architecture specific TIF work for exit
 *				 to user mode.
 * @regs:	Pointer to current's pt_regs
 * @ti_work:	Cached TIF flags gathered with interrupts disabled
 *
 * Invoked from exit_to_user_mode_loop() with interrupts enabled
 *
 * Defaults to NOOP. Can be supplied by architecture specific code.
 */
static inline void arch_exit_to_user_mode_work(struct pt_regs *regs,
					       unsigned long ti_work);

#ifndef arch_exit_to_user_mode_work
static inline void arch_exit_to_user_mode_work(struct pt_regs *regs,
					       unsigned long ti_work)
{
}
#endif

/**
 * arch_exit_to_user_mode_prepare - Architecture specific preparation for
 *				    exit to user mode.
 * @regs:	Pointer to current's pt_regs
 * @ti_work:	Cached TIF flags gathered with interrupts disabled
 *
 * Invoked from exit_to_user_mode_prepare() with interrupts disabled as the
 * last function before return. Defaults to NOOP.
 */
static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
						  unsigned long ti_work);

#ifndef arch_exit_to_user_mode_prepare
static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
						  unsigned long ti_work)
{
}
#endif

/**
 * arch_exit_to_user_mode - Architecture specific final work before
 *			    exit to user mode.
 *
 * Invoked from exit_to_user_mode() with interrupts disabled as the last
 * function before return. Defaults to NOOP.
 *
 * This needs to be __always_inline because it is non-instrumentable code
 * invoked after context tracking switched to user mode.
 *
 * An architecture implementation must not do anything complex, no locking
 * etc. The main purpose is for speculation mitigations.
 */
static __always_inline void arch_exit_to_user_mode(void);

#ifndef arch_exit_to_user_mode
static __always_inline void arch_exit_to_user_mode(void) { }
#endif

/**
 * arch_do_signal -  Architecture specific signal delivery function
 * @regs:	Pointer to current's pt_regs
 *
 * Invoked from exit_to_user_mode_loop().
 */
void arch_do_signal(struct pt_regs *regs);

/**
 * arch_syscall_exit_tracehook - Wrapper around tracehook_report_syscall_exit()
 * @regs:	Pointer to current's pt_regs
 * @step:	Indicator for single step
 *
 * Defaults to tracehook_report_syscall_exit(). Can be replaced by
 * architecture specific code.
 *
 * Invoked from syscall_exit_to_user_mode()
 */
static inline void arch_syscall_exit_tracehook(struct pt_regs *regs, bool step);

#ifndef arch_syscall_exit_tracehook
static inline void arch_syscall_exit_tracehook(struct pt_regs *regs, bool step)
{
	tracehook_report_syscall_exit(regs, step);
}
#endif

/**
 * syscall_exit_to_user_mode - Handle work before returning to user mode
 * @regs:	Pointer to current's pt_regs
 *
 * Invoked with interrupts enabled and fully valid regs. Returns with all
 * work handled, interrupts disabled such that the caller can immediately
 * switch to user mode. Called from architecture specific syscall and ret
 * from fork code.
 *
 * The call order is:
 *  1) One-time syscall exit work:
 *	- rseq syscall exit
 *      - audit
 *	- syscall tracing
 *	- tracehook (single stepping)
 *
 *  2) Preparatory work
 *	- Exit to user mode loop (common TIF handling). Invokes
 *	  arch_exit_to_user_mode_work() for architecture specific TIF work
 *	- Architecture specific one time work arch_exit_to_user_mode_prepare()
 *	- Address limit and lockdep checks
 *
 *  3) Final transition (lockdep, tracing, context tracking, RCU). Invokes
 *     arch_exit_to_user_mode() to handle e.g. speculation mitigations
 */
void syscall_exit_to_user_mode(struct pt_regs *regs);

/**
 * irqentry_enter_from_user_mode - Establish state before invoking the irq handler
 * @regs:	Pointer to current's pt_regs
 *
 * Invoked from architecture specific entry code with interrupts disabled.
 * Can only be called when the interrupt entry came from user mode. The
 * calling code must be non-instrumentable.  When the function returns all
 * state is correct and the subsequent functions can be instrumented.
 *
 * The function establishes state (lockdep, RCU (context tracking), tracing)
 */
void irqentry_enter_from_user_mode(struct pt_regs *regs);

/**
 * irqentry_exit_to_user_mode - Interrupt exit work
 * @regs:	Pointer to current's pt_regs
 *
 * Invoked with interrupts disabled and fully valid regs. Returns with all
 * work handled, interrupts disabled such that the caller can immediately
 * switch to user mode. Called from architecture specific interrupt
 * handling code.
 *
 * The call order is #2 and #3 as described in syscall_exit_to_user_mode().
 * Interrupt exit is not invoking #1 which is the syscall specific one time
 * work.
 */
void irqentry_exit_to_user_mode(struct pt_regs *regs);

/*
 * State handed from irqentry_enter() to irqentry_exit().
 *
 * exit_rcu is set by irqentry_enter() when it invoked rcu_irq_enter(), which
 * tells irqentry_exit() that the matching rcu_irq_exit() has to be invoked.
 * An architecture can supply its own definition by defining irqentry_state.
 */
#ifndef irqentry_state
typedef struct irqentry_state {
	bool	exit_rcu;
} irqentry_state_t;
#endif

/**
 * irqentry_enter - Handle state tracking on ordinary interrupt entries
 * @regs:	Pointer to pt_regs of interrupted context
 *
 * Invokes:
 *  - lockdep irqflag state tracking as low level ASM entry disabled
 *    interrupts.
 *
 *  - Context tracking if the exception hit user mode.
 *
 *  - The hardirq tracer to keep the state consistent as low level ASM
 *    entry disabled interrupts.
 *
 * As a precondition, this requires that the entry came from user mode,
 * idle, or a kernel context in which RCU is watching.
 *
 * For kernel mode entries RCU handling is done conditional. If RCU is
 * watching then the only RCU requirement is to check whether the tick has
 * to be restarted. If RCU is not watching then rcu_irq_enter() has to be
 * invoked on entry and rcu_irq_exit() on exit.
 *
 * Avoiding the rcu_irq_enter/exit() calls is an optimization but also
 * solves the problem of kernel mode pagefaults which can schedule, which
 * is not possible after invoking rcu_irq_enter() without undoing it.
 *
 * For user mode entries irqentry_enter_from_user_mode() is invoked to
 * establish the proper context for NOHZ_FULL. Otherwise scheduling on exit
 * would not be possible.
 *
 * Returns: An opaque object that must be passed to irqentry_exit()
 */
irqentry_state_t noinstr irqentry_enter(struct pt_regs *regs);

/**
 * irqentry_exit_cond_resched - Conditionally reschedule on return from interrupt
 *
 * Conditional reschedule with additional sanity checks.
 */
void irqentry_exit_cond_resched(void);

/**
 * irqentry_exit - Handle return from exception that used irqentry_enter()
 * @regs:	Pointer to pt_regs (exception entry regs)
 * @state:	Return value from matching call to irqentry_enter()
 *
 * Depending on the return target (kernel/user) this runs the necessary
 * preemption and work checks if possible and required and returns to
 * the caller with interrupts disabled and no further work pending.
 *
 * This is the last action before returning to the low level ASM code which
 * just needs to return to the appropriate context.
 *
 * Counterpart to irqentry_enter().
 */
void noinstr irqentry_exit(struct pt_regs *regs, irqentry_state_t state);

#endif
+80
include/linux/entry-kvm.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef __LINUX_ENTRYKVM_H 3 + #define __LINUX_ENTRYKVM_H 4 + 5 + #include <linux/entry-common.h> 6 + 7 + /* Transfer to guest mode work */ 8 + #ifdef CONFIG_KVM_XFER_TO_GUEST_WORK 9 + 10 + #ifndef ARCH_XFER_TO_GUEST_MODE_WORK 11 + # define ARCH_XFER_TO_GUEST_MODE_WORK (0) 12 + #endif 13 + 14 + #define XFER_TO_GUEST_MODE_WORK \ 15 + (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ 16 + _TIF_NOTIFY_RESUME | ARCH_XFER_TO_GUEST_MODE_WORK) 17 + 18 + struct kvm_vcpu; 19 + 20 + /** 21 + * arch_xfer_to_guest_mode_handle_work - Architecture specific xfer to guest 22 + * mode work handling function. 23 + * @vcpu: Pointer to current's VCPU data 24 + * @ti_work: Cached TIF flags gathered in xfer_to_guest_mode_handle_work() 25 + * 26 + * Invoked from xfer_to_guest_mode_handle_work(). Defaults to NOOP. Can be 27 + * replaced by architecture specific code. 28 + */ 29 + static inline int arch_xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu, 30 + unsigned long ti_work); 31 + 32 + #ifndef arch_xfer_to_guest_mode_work 33 + static inline int arch_xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu, 34 + unsigned long ti_work) 35 + { 36 + return 0; 37 + } 38 + #endif 39 + 40 + /** 41 + * xfer_to_guest_mode_handle_work - Check and handle pending work which needs 42 + * to be handled before going to guest mode 43 + * @vcpu: Pointer to current's VCPU data 44 + * 45 + * Returns: 0 or an error code 46 + */ 47 + int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu); 48 + 49 + /** 50 + * __xfer_to_guest_mode_work_pending - Check if work is pending 51 + * 52 + * Returns: True if work pending, False otherwise. 53 + * 54 + * Bare variant of xfer_to_guest_mode_work_pending(). Can be called from 55 + * interrupt enabled code for racy quick checks with care. 
56 + */ 57 + static inline bool __xfer_to_guest_mode_work_pending(void) 58 + { 59 + unsigned long ti_work = READ_ONCE(current_thread_info()->flags); 60 + 61 + return !!(ti_work & XFER_TO_GUEST_MODE_WORK); 62 + } 63 + 64 + /** 65 + * xfer_to_guest_mode_work_pending - Check if work is pending which needs to be 66 + * handled before returning to guest mode 67 + * 68 + * Returns: True if work pending, False otherwise. 69 + * 70 + * Has to be invoked with interrupts disabled before the transition to 71 + * guest mode. 72 + */ 73 + static inline bool xfer_to_guest_mode_work_pending(void) 74 + { 75 + lockdep_assert_irqs_disabled(); 76 + return __xfer_to_guest_mode_work_pending(); 77 + } 78 + #endif /* CONFIG_KVM_XFER_TO_GUEST_WORK */ 79 + 80 + #endif
+8
include/linux/kvm_host.h
··· 1439 1439 uintptr_t data, const char *name, 1440 1440 struct task_struct **thread_ptr); 1441 1441 1442 + #ifdef CONFIG_KVM_XFER_TO_GUEST_WORK 1443 + static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu) 1444 + { 1445 + vcpu->run->exit_reason = KVM_EXIT_INTR; 1446 + vcpu->stat.signal_exits++; 1447 + } 1448 + #endif /* CONFIG_KVM_XFER_TO_GUEST_WORK */ 1449 + 1442 1450 #endif
+2
include/linux/seccomp.h
··· 64 64 65 65 struct seccomp { }; 66 66 struct seccomp_filter { }; 67 + struct seccomp_data; 67 68 68 69 #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER 69 70 static inline int secure_computing(void) { return 0; } 71 + static inline int __secure_computing(const struct seccomp_data *sd) { return 0; } 70 72 #else 71 73 static inline void secure_computing_strict(int this_syscall) { return; } 72 74 #endif
+1
kernel/Makefile
··· 49 49 obj-y += rcu/ 50 50 obj-y += livepatch/ 51 51 obj-y += dma/ 52 + obj-y += entry/ 52 53 53 54 obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o 54 55 obj-$(CONFIG_FREEZER) += freezer.o
+13
kernel/entry/Makefile
# SPDX-License-Identifier: GPL-2.0

# Prevent the noinstr section from being pestered by sanitizer and other goodies
# as long as these things cannot be disabled per function.
KASAN_SANITIZE := n
UBSAN_SANITIZE := n
KCOV_INSTRUMENT := n

# Build common.o without any stack protector variant.
CFLAGS_REMOVE_common.o	 = -fstack-protector -fstack-protector-strong
CFLAGS_common.o		+= -fno-stack-protector

obj-$(CONFIG_GENERIC_ENTRY) 		+= common.o
obj-$(CONFIG_KVM_XFER_TO_GUEST_WORK)	+= kvm.o
+374
kernel/entry/common.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + #include <linux/context_tracking.h> 4 + #include <linux/entry-common.h> 5 + #include <linux/livepatch.h> 6 + #include <linux/audit.h> 7 + 8 + #define CREATE_TRACE_POINTS 9 + #include <trace/events/syscalls.h> 10 + 11 + /** 12 + * enter_from_user_mode - Establish state when coming from user mode 13 + * 14 + * Syscall/interrupt entry disables interrupts, but user mode is traced as 15 + * interrupts enabled. Also with NO_HZ_FULL RCU might be idle. 16 + * 17 + * 1) Tell lockdep that interrupts are disabled 18 + * 2) Invoke context tracking if enabled to reactivate RCU 19 + * 3) Trace interrupts off state 20 + */ 21 + static __always_inline void enter_from_user_mode(struct pt_regs *regs) 22 + { 23 + arch_check_user_regs(regs); 24 + lockdep_hardirqs_off(CALLER_ADDR0); 25 + 26 + CT_WARN_ON(ct_state() != CONTEXT_USER); 27 + user_exit_irqoff(); 28 + 29 + instrumentation_begin(); 30 + trace_hardirqs_off_finish(); 31 + instrumentation_end(); 32 + } 33 + 34 + static inline void syscall_enter_audit(struct pt_regs *regs, long syscall) 35 + { 36 + if (unlikely(audit_context())) { 37 + unsigned long args[6]; 38 + 39 + syscall_get_arguments(current, regs, args); 40 + audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]); 41 + } 42 + } 43 + 44 + static long syscall_trace_enter(struct pt_regs *regs, long syscall, 45 + unsigned long ti_work) 46 + { 47 + long ret = 0; 48 + 49 + /* Handle ptrace */ 50 + if (ti_work & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU)) { 51 + ret = arch_syscall_enter_tracehook(regs); 52 + if (ret || (ti_work & _TIF_SYSCALL_EMU)) 53 + return -1L; 54 + } 55 + 56 + /* Do seccomp after ptrace, to catch any tracer changes. */ 57 + if (ti_work & _TIF_SECCOMP) { 58 + ret = __secure_computing(NULL); 59 + if (ret == -1L) 60 + return ret; 61 + } 62 + 63 + if (unlikely(ti_work & _TIF_SYSCALL_TRACEPOINT)) 64 + trace_sys_enter(regs, syscall); 65 + 66 + syscall_enter_audit(regs, syscall); 67 + 68 + return ret ? 
: syscall; 69 + } 70 + 71 + noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall) 72 + { 73 + unsigned long ti_work; 74 + 75 + enter_from_user_mode(regs); 76 + instrumentation_begin(); 77 + 78 + local_irq_enable(); 79 + ti_work = READ_ONCE(current_thread_info()->flags); 80 + if (ti_work & SYSCALL_ENTER_WORK) 81 + syscall = syscall_trace_enter(regs, syscall, ti_work); 82 + instrumentation_end(); 83 + 84 + return syscall; 85 + } 86 + 87 + /** 88 + * exit_to_user_mode - Fixup state when exiting to user mode 89 + * 90 + * Syscall/interupt exit enables interrupts, but the kernel state is 91 + * interrupts disabled when this is invoked. Also tell RCU about it. 92 + * 93 + * 1) Trace interrupts on state 94 + * 2) Invoke context tracking if enabled to adjust RCU state 95 + * 3) Invoke architecture specific last minute exit code, e.g. speculation 96 + * mitigations, etc. 97 + * 4) Tell lockdep that interrupts are enabled 98 + */ 99 + static __always_inline void exit_to_user_mode(void) 100 + { 101 + instrumentation_begin(); 102 + trace_hardirqs_on_prepare(); 103 + lockdep_hardirqs_on_prepare(CALLER_ADDR0); 104 + instrumentation_end(); 105 + 106 + user_enter_irqoff(); 107 + arch_exit_to_user_mode(); 108 + lockdep_hardirqs_on(CALLER_ADDR0); 109 + } 110 + 111 + /* Workaround to allow gradual conversion of architecture code */ 112 + void __weak arch_do_signal(struct pt_regs *regs) { } 113 + 114 + static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, 115 + unsigned long ti_work) 116 + { 117 + /* 118 + * Before returning to user space ensure that all pending work 119 + * items have been completed. 
120 + */ 121 + while (ti_work & EXIT_TO_USER_MODE_WORK) { 122 + 123 + local_irq_enable_exit_to_user(ti_work); 124 + 125 + if (ti_work & _TIF_NEED_RESCHED) 126 + schedule(); 127 + 128 + if (ti_work & _TIF_UPROBE) 129 + uprobe_notify_resume(regs); 130 + 131 + if (ti_work & _TIF_PATCH_PENDING) 132 + klp_update_patch_state(current); 133 + 134 + if (ti_work & _TIF_SIGPENDING) 135 + arch_do_signal(regs); 136 + 137 + if (ti_work & _TIF_NOTIFY_RESUME) { 138 + clear_thread_flag(TIF_NOTIFY_RESUME); 139 + tracehook_notify_resume(regs); 140 + rseq_handle_notify_resume(NULL, regs); 141 + } 142 + 143 + /* Architecture specific TIF work */ 144 + arch_exit_to_user_mode_work(regs, ti_work); 145 + 146 + /* 147 + * Disable interrupts and reevaluate the work flags as they 148 + * might have changed while interrupts and preemption was 149 + * enabled above. 150 + */ 151 + local_irq_disable_exit_to_user(); 152 + ti_work = READ_ONCE(current_thread_info()->flags); 153 + } 154 + 155 + /* Return the latest work state for arch_exit_to_user_mode() */ 156 + return ti_work; 157 + } 158 + 159 + static void exit_to_user_mode_prepare(struct pt_regs *regs) 160 + { 161 + unsigned long ti_work = READ_ONCE(current_thread_info()->flags); 162 + 163 + lockdep_assert_irqs_disabled(); 164 + 165 + if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK)) 166 + ti_work = exit_to_user_mode_loop(regs, ti_work); 167 + 168 + arch_exit_to_user_mode_prepare(regs, ti_work); 169 + 170 + /* Ensure that the address limit is intact and no locks are held */ 171 + addr_limit_user_check(); 172 + lockdep_assert_irqs_disabled(); 173 + lockdep_sys_exit(); 174 + } 175 + 176 + #ifndef _TIF_SINGLESTEP 177 + static inline bool report_single_step(unsigned long ti_work) 178 + { 179 + return false; 180 + } 181 + #else 182 + /* 183 + * If TIF_SYSCALL_EMU is set, then the only reason to report is when 184 + * TIF_SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). 
This syscall 185 + * instruction has been already reported in syscall_enter_from_usermode(). 186 + */ 187 + #define SYSEMU_STEP (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU) 188 + 189 + static inline bool report_single_step(unsigned long ti_work) 190 + { 191 + return (ti_work & SYSEMU_STEP) == _TIF_SINGLESTEP; 192 + } 193 + #endif 194 + 195 + static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work) 196 + { 197 + bool step; 198 + 199 + audit_syscall_exit(regs); 200 + 201 + if (ti_work & _TIF_SYSCALL_TRACEPOINT) 202 + trace_sys_exit(regs, syscall_get_return_value(current, regs)); 203 + 204 + step = report_single_step(ti_work); 205 + if (step || ti_work & _TIF_SYSCALL_TRACE) 206 + arch_syscall_exit_tracehook(regs, step); 207 + } 208 + 209 + /* 210 + * Syscall specific exit to user mode preparation. Runs with interrupts 211 + * enabled. 212 + */ 213 + static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs) 214 + { 215 + u32 cached_flags = READ_ONCE(current_thread_info()->flags); 216 + unsigned long nr = syscall_get_nr(current, regs); 217 + 218 + CT_WARN_ON(ct_state() != CONTEXT_KERNEL); 219 + 220 + if (IS_ENABLED(CONFIG_PROVE_LOCKING)) { 221 + if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr)) 222 + local_irq_enable(); 223 + } 224 + 225 + rseq_syscall(regs); 226 + 227 + /* 228 + * Do one-time syscall specific work. If these work items are 229 + * enabled, we want to run them exactly once per syscall exit with 230 + * interrupts enabled. 
231 + */ 232 + if (unlikely(cached_flags & SYSCALL_EXIT_WORK)) 233 + syscall_exit_work(regs, cached_flags); 234 + } 235 + 236 + __visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs) 237 + { 238 + instrumentation_begin(); 239 + syscall_exit_to_user_mode_prepare(regs); 240 + local_irq_disable_exit_to_user(); 241 + exit_to_user_mode_prepare(regs); 242 + instrumentation_end(); 243 + exit_to_user_mode(); 244 + } 245 + 246 + noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs) 247 + { 248 + enter_from_user_mode(regs); 249 + } 250 + 251 + noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs) 252 + { 253 + instrumentation_begin(); 254 + exit_to_user_mode_prepare(regs); 255 + instrumentation_end(); 256 + exit_to_user_mode(); 257 + } 258 + 259 + noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) 260 + { 261 + irqentry_state_t ret = { 262 + .exit_rcu = false, 263 + }; 264 + 265 + if (user_mode(regs)) { 266 + irqentry_enter_from_user_mode(regs); 267 + return ret; 268 + } 269 + 270 + /* 271 + * If this entry hit the idle task invoke rcu_irq_enter() whether 272 + * RCU is watching or not. 273 + * 274 + * Interupts can nest when the first interrupt invokes softirq 275 + * processing on return which enables interrupts. 276 + * 277 + * Scheduler ticks in the idle task can mark quiescent state and 278 + * terminate a grace period, if and only if the timer interrupt is 279 + * not nested into another interrupt. 280 + * 281 + * Checking for __rcu_is_watching() here would prevent the nesting 282 + * interrupt to invoke rcu_irq_enter(). If that nested interrupt is 283 + * the tick then rcu_flavor_sched_clock_irq() would wrongfully 284 + * assume that it is the first interupt and eventually claim 285 + * quiescient state and end grace periods prematurely. 286 + * 287 + * Unconditionally invoke rcu_irq_enter() so RCU state stays 288 + * consistent. 
289 + * 290 + * TINY_RCU does not support EQS, so let the compiler eliminate 291 + * this part when enabled. 292 + */ 293 + if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) { 294 + /* 295 + * If RCU is not watching then the same careful 296 + * sequence vs. lockdep and tracing is required 297 + * as in irq_enter_from_user_mode(). 298 + */ 299 + lockdep_hardirqs_off(CALLER_ADDR0); 300 + rcu_irq_enter(); 301 + instrumentation_begin(); 302 + trace_hardirqs_off_finish(); 303 + instrumentation_end(); 304 + 305 + ret.exit_rcu = true; 306 + return ret; 307 + } 308 + 309 + /* 310 + * If RCU is watching then RCU only wants to check whether it needs 311 + * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick() 312 + * already contains a warning when RCU is not watching, so no point 313 + * in having another one here. 314 + */ 315 + instrumentation_begin(); 316 + rcu_irq_enter_check_tick(); 317 + /* Use the combo lockdep/tracing function */ 318 + trace_hardirqs_off(); 319 + instrumentation_end(); 320 + 321 + return ret; 322 + } 323 + 324 + void irqentry_exit_cond_resched(void) 325 + { 326 + if (!preempt_count()) { 327 + /* Sanity check RCU and thread stack */ 328 + rcu_irq_exit_check_preempt(); 329 + if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) 330 + WARN_ON_ONCE(!on_thread_stack()); 331 + if (need_resched()) 332 + preempt_schedule_irq(); 333 + } 334 + } 335 + 336 + noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) 337 + { 338 + lockdep_assert_irqs_disabled(); 339 + 340 + /* Check whether this returns to user mode */ 341 + if (user_mode(regs)) { 342 + irqentry_exit_to_user_mode(regs); 343 + } else if (!regs_irqs_disabled(regs)) { 344 + /* 345 + * If RCU was not watching on entry this needs to be done 346 + * carefully and needs the same ordering of lockdep/tracing 347 + * and RCU as the return to user mode path. 
348 + */ 349 + if (state.exit_rcu) { 350 + instrumentation_begin(); 351 + /* Tell the tracer that IRET will enable interrupts */ 352 + trace_hardirqs_on_prepare(); 353 + lockdep_hardirqs_on_prepare(CALLER_ADDR0); 354 + instrumentation_end(); 355 + rcu_irq_exit(); 356 + lockdep_hardirqs_on(CALLER_ADDR0); 357 + return; 358 + } 359 + 360 + instrumentation_begin(); 361 + if (IS_ENABLED(CONFIG_PREEMPTION)) 362 + irqentry_exit_cond_resched(); 363 + /* Covers both tracing and lockdep */ 364 + trace_hardirqs_on(); 365 + instrumentation_end(); 366 + } else { 367 + /* 368 + * IRQ flags state is correct already. Just tell RCU if it 369 + * was not watching on entry. 370 + */ 371 + if (state.exit_rcu) 372 + rcu_irq_exit(); 373 + } 374 + }
+51
kernel/entry/kvm.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + #include <linux/entry-kvm.h> 4 + #include <linux/kvm_host.h> 5 + 6 + static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) 7 + { 8 + do { 9 + int ret; 10 + 11 + if (ti_work & _TIF_SIGPENDING) { 12 + kvm_handle_signal_exit(vcpu); 13 + return -EINTR; 14 + } 15 + 16 + if (ti_work & _TIF_NEED_RESCHED) 17 + schedule(); 18 + 19 + if (ti_work & _TIF_NOTIFY_RESUME) { 20 + clear_thread_flag(TIF_NOTIFY_RESUME); 21 + tracehook_notify_resume(NULL); 22 + } 23 + 24 + ret = arch_xfer_to_guest_mode_handle_work(vcpu, ti_work); 25 + if (ret) 26 + return ret; 27 + 28 + ti_work = READ_ONCE(current_thread_info()->flags); 29 + } while (ti_work & XFER_TO_GUEST_MODE_WORK || need_resched()); 30 + return 0; 31 + } 32 + 33 + int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu) 34 + { 35 + unsigned long ti_work; 36 + 37 + /* 38 + * This is invoked from the outer guest loop with interrupts and 39 + * preemption enabled. 40 + * 41 + * KVM invokes xfer_to_guest_mode_work_pending() with interrupts 42 + * disabled in the inner loop before going into guest mode. No need 43 + * to disable interrupts here. 44 + */ 45 + ti_work = READ_ONCE(current_thread_info()->flags); 46 + if (!(ti_work & XFER_TO_GUEST_MODE_WORK)) 47 + return 0; 48 + 49 + return xfer_to_guest_mode_work(vcpu, ti_work); 50 + } 51 + EXPORT_SYMBOL_GPL(xfer_to_guest_mode_handle_work);
+3
virt/kvm/Kconfig
··· 60 60 61 61 config HAVE_KVM_NO_POLL 62 62 bool 63 + 64 + config KVM_XFER_TO_GUEST_WORK 65 + bool