Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
at 703ccb63ae9f7444d6ff876d024e17f628103c69 · 784 lines · 24 kB

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RSEQ_ENTRY_H
#define _LINUX_RSEQ_ENTRY_H

/* Must be outside the CONFIG_RSEQ guard to resolve the stubs */
#ifdef CONFIG_RSEQ_STATS
#include <linux/percpu.h>

struct rseq_stats {
	unsigned long	exit;
	unsigned long	signal;
	unsigned long	slowpath;
	unsigned long	fastpath;
	unsigned long	ids;
	unsigned long	cs;
	unsigned long	clear;
	unsigned long	fixup;
	unsigned long	s_granted;
	unsigned long	s_expired;
	unsigned long	s_revoked;
	unsigned long	s_yielded;
	unsigned long	s_aborted;
};

DECLARE_PER_CPU(struct rseq_stats, rseq_stats);

/*
 * Slow path has interrupts and preemption enabled, but the fast path
 * runs with interrupts disabled so there is no point in having the
 * preemption checks implied in __this_cpu_inc() for every operation.
 */
#ifdef RSEQ_BUILD_SLOW_PATH
#define rseq_stat_inc(which)	this_cpu_inc((which))
#else
#define rseq_stat_inc(which)	raw_cpu_inc((which))
#endif

#else /* CONFIG_RSEQ_STATS */
#define rseq_stat_inc(x)	do { } while (0)
#endif /* !CONFIG_RSEQ_STATS */

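/*
 * Usage sketch (illustrative): the counters are bumped from the entry/exit
 * paths below, e.g.
 *
 *	rseq_stat_inc(rseq_stats.exit);
 *
 * which expands to this_cpu_inc()/raw_cpu_inc() on CONFIG_RSEQ_STATS=y
 * kernels and compiles to nothing otherwise.
 */
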
#ifdef CONFIG_RSEQ
#include <linux/jump_label.h>
#include <linux/rseq.h>
#include <linux/sched/signal.h>
#include <linux/uaccess.h>

#include <linux/tracepoint-defs.h>

#ifdef CONFIG_TRACEPOINTS
DECLARE_TRACEPOINT(rseq_update);
DECLARE_TRACEPOINT(rseq_ip_fixup);
void __rseq_trace_update(struct task_struct *t);
void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
			   unsigned long offset, unsigned long abort_ip);

static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids)
{
	if (tracepoint_enabled(rseq_update) && ids)
		__rseq_trace_update(t);
}

static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
				       unsigned long offset, unsigned long abort_ip)
{
	if (tracepoint_enabled(rseq_ip_fixup))
		__rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
}

#else /* CONFIG_TRACEPOINTS */
static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids) { }
static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
				       unsigned long offset, unsigned long abort_ip) { }
#endif /* !CONFIG_TRACEPOINTS */

DECLARE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);

#ifdef RSEQ_BUILD_SLOW_PATH
#define rseq_inline
#else
#define rseq_inline __always_inline
#endif

#ifdef CONFIG_RSEQ_SLICE_EXTENSION
DECLARE_STATIC_KEY_TRUE(rseq_slice_extension_key);

static __always_inline bool rseq_slice_extension_enabled(void)
{
	return static_branch_likely(&rseq_slice_extension_key);
}

extern unsigned int rseq_slice_ext_nsecs;
bool __rseq_arm_slice_extension_timer(void);

static __always_inline bool rseq_arm_slice_extension_timer(void)
{
	if (!rseq_slice_extension_enabled())
		return false;

	if (likely(!current->rseq.slice.state.granted))
		return false;

	return __rseq_arm_slice_extension_timer();
}

static __always_inline void rseq_slice_clear_grant(struct task_struct *t)
{
	if (IS_ENABLED(CONFIG_RSEQ_STATS) && t->rseq.slice.state.granted)
		rseq_stat_inc(rseq_stats.s_revoked);
	t->rseq.slice.state.granted = false;
}

static __always_inline bool rseq_grant_slice_extension(bool work_pending)
{
	struct task_struct *curr = current;
	struct rseq_slice_ctrl usr_ctrl;
	union rseq_slice_state state;
	struct rseq __user *rseq;

	if (!rseq_slice_extension_enabled())
		return false;

	/* If not enabled or not a return from interrupt, nothing to do. */
	state = curr->rseq.slice.state;
	state.enabled &= curr->rseq.event.user_irq;
	if (likely(!state.state))
		return false;

	rseq = curr->rseq.usrptr;
	scoped_user_rw_access(rseq, efault) {

		/*
		 * Quick check for conditions where a grant is not possible
		 * or needs to be revoked:
		 *
		 * 1) Any TIF bit which needs to do extra work aside from
		 *    rescheduling prevents a grant.
		 *
		 * 2) A previous rescheduling request resulted in a slice
		 *    extension grant.
		 */
		if (unlikely(work_pending || state.granted)) {
			/* Clear user control unconditionally. No point in checking. */
			unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
			rseq_slice_clear_grant(curr);
			return false;
		}

		unsafe_get_user(usr_ctrl.all, &rseq->slice_ctrl.all, efault);
		if (likely(!(usr_ctrl.request)))
			return false;

		/* Grant the slice extension */
		usr_ctrl.request = 0;
		usr_ctrl.granted = 1;
		unsafe_put_user(usr_ctrl.all, &rseq->slice_ctrl.all, efault);
	}

	rseq_stat_inc(rseq_stats.s_granted);

	curr->rseq.slice.state.granted = true;
	/* Store expiry time for arming the timer on the way out */
	curr->rseq.slice.expires = data_race(rseq_slice_ext_nsecs) + ktime_get_mono_fast_ns();
	/*
	 * This is racy against a remote CPU setting TIF_NEED_RESCHED in
	 * several ways:
	 *
	 * 1)
	 *	CPU0				CPU1
	 *	clear_tsk()
	 *					set_tsk()
	 *	clear_preempt()
	 *					Raise scheduler IPI on CPU0
	 *	--> IPI
	 *	    fold_need_resched()		-> Folds correctly
	 * 2)
	 *	CPU0				CPU1
	 *					set_tsk()
	 *	clear_tsk()
	 *	clear_preempt()
	 *					Raise scheduler IPI on CPU0
	 *	--> IPI
	 *	    fold_need_resched()		<- NOOP as TIF_NEED_RESCHED is false
	 *
	 * #1 is not any different from a regular remote reschedule as it
	 * sets the previously not set bit and then raises the IPI which
	 * folds it into the preempt counter.
	 *
	 * #2 is obviously incorrect from a scheduler POV, but it's no more
	 * incorrect than the code below clearing the reschedule request
	 * with the safety net of the timer.
	 *
	 * The important part is that the clearing is protected against the
	 * scheduler IPI and also against any other interrupt which might
	 * end up waking up a task and setting the bits in the middle of
	 * the operation:
	 *
	 *	clear_tsk()
	 *	    ---> Interrupt
	 *		    wakeup_on_this_cpu()
	 *			set_tsk()
	 *			set_preempt()
	 *	clear_preempt()
	 *
	 * which would be inconsistent state.
	 */
	scoped_guard(irq) {
		clear_tsk_need_resched(curr);
		clear_preempt_need_resched();
	}
	return true;

efault:
	force_sig(SIGSEGV);
	return false;
}

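/*
 * Illustrative user space view of the grant protocol above. This is a
 * sketch based on the fields accessed in rseq_grant_slice_extension();
 * the authoritative contract is the uapi rseq definitions, and the
 * mechanism for relinquishing a grant lives outside this header:
 *
 *	rseq->slice_ctrl.request = 1;
 *	// short, preemption-sensitive work
 *	rseq->slice_ctrl.request = 0;
 *	if (rseq->slice_ctrl.granted)
 *		// give the CPU back promptly
 *
 * The kernel grants only on return from an interrupt which hit user space
 * and only when no other exit work is pending. The grant is bounded by a
 * timer (rseq_slice_ext_nsecs), and slice_ctrl plus the grant are cleared
 * again on the next exit-path update; see rseq_set_ids_get_csaddr() and
 * rseq_exit_user_update() below.
 */
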
#else /* CONFIG_RSEQ_SLICE_EXTENSION */
static __always_inline bool rseq_slice_extension_enabled(void) { return false; }
static __always_inline bool rseq_arm_slice_extension_timer(void) { return false; }
static __always_inline void rseq_slice_clear_grant(struct task_struct *t) { }
static __always_inline bool rseq_grant_slice_extension(bool work_pending) { return false; }
#endif /* !CONFIG_RSEQ_SLICE_EXTENSION */

bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr);
bool rseq_debug_validate_ids(struct task_struct *t);

static __always_inline void rseq_note_user_irq_entry(void)
{
	if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY))
		current->rseq.event.user_irq = true;
}

/*
 * Check whether there is a valid critical section and whether the
 * instruction pointer in @regs is inside the critical section.
 *
 * - If the critical section is invalid, terminate the task.
 *
 * - If valid and the instruction pointer is inside, set it to the abort IP.
 *
 * - If valid and the instruction pointer is outside, clear the critical
 *   section address.
 *
 * Returns true if the section was valid and either fixup or clear was
 * done, false otherwise.
 *
 * In the failure case task::rseq_event::fatal is set when an invalid
 * section was found. It's clear when the failure was an unresolved page
 * fault.
 *
 * If inlined into the exit to user path with interrupts disabled, the
 * caller has to protect against page faults with pagefault_disable().
 *
 * In preemptible task context this would be counterproductive as the page
 * faults could not be fully resolved. As a consequence unresolved page
 * faults in task context are fatal too.
 */

#ifdef RSEQ_BUILD_SLOW_PATH
/*
 * The debug version is put out of line, but kept here so the code stays
 * together.
 *
 * @csaddr has already been checked by the caller to be in user space.
 */
bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs,
			       unsigned long csaddr)
{
	struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr;
	u64 start_ip, abort_ip, offset, cs_end, head, tasksize = TASK_SIZE;
	unsigned long ip = instruction_pointer(regs);
	u64 __user *uc_head = (u64 __user *) ucs;
	u32 usig, __user *uc_sig;

	scoped_user_rw_access(ucs, efault) {
		/*
		 * Evaluate the user pile and exit if one of the conditions
		 * is not fulfilled.
		 */
		unsafe_get_user(start_ip, &ucs->start_ip, efault);
		if (unlikely(start_ip >= tasksize))
			goto die;
		/* If outside, just clear the critical section. */
		if (ip < start_ip)
			goto clear;

		unsafe_get_user(offset, &ucs->post_commit_offset, efault);
		cs_end = start_ip + offset;
		/* Check for overflow and wraparound */
		if (unlikely(cs_end >= tasksize || cs_end < start_ip))
			goto die;

		/* If not inside, clear it. */
		if (ip >= cs_end)
			goto clear;

		unsafe_get_user(abort_ip, &ucs->abort_ip, efault);
		/* Ensure it's "valid" */
		if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig)))
			goto die;
		/* Validate that the abort IP is not in the critical section */
		if (unlikely(abort_ip - start_ip < offset))
			goto die;

		/*
		 * Check version and flags for 0. No point in emitting
		 * deprecated warnings before dying. That could be done in
		 * the slow path eventually, but *shrug*.
		 */
		unsafe_get_user(head, uc_head, efault);
		if (unlikely(head))
			goto die;

		/* abort_ip - 4 is >= 0. See abort_ip check above */
		uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig));
		unsafe_get_user(usig, uc_sig, efault);
		if (unlikely(usig != t->rseq.sig))
			goto die;

		/* rseq_event.user_irq is only valid if CONFIG_GENERIC_IRQ_ENTRY=y */
		if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
			/* If not in interrupt from user context, let it die */
			if (unlikely(!t->rseq.event.user_irq))
				goto die;
		}
		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
		instruction_pointer_set(regs, (unsigned long)abort_ip);
		rseq_stat_inc(rseq_stats.fixup);
		break;
	clear:
		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
		rseq_stat_inc(rseq_stats.clear);
		abort_ip = 0ULL;
	}

	if (unlikely(abort_ip))
		rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
	return true;
die:
	t->rseq.event.fatal = true;
efault:
	return false;
}

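/*
 * For orientation, the user space layout which the checks above expect.
 * This is a sketch; the uapi struct rseq_cs is the authoritative
 * definition:
 *
 *	struct rseq_cs {
 *		u32 version;		// 'head' check: must be 0
 *		u32 flags;		// 'head' check: must be 0
 *		u64 start_ip;		// below TASK_SIZE
 *		u64 post_commit_offset;	// start_ip + offset below TASK_SIZE
 *		u64 abort_ip;		// outside [start_ip, start_ip + offset)
 *	};
 *
 * and in the user text segment:
 *
 *	.long	<signature>	// four bytes, must match t->rseq.sig
 * abort_ip:
 *	// abort handler
 */
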
/*
 * On debug kernels validate that user space did not mess with the IDs if
 * the debug branch is enabled.
 */
bool rseq_debug_validate_ids(struct task_struct *t)
{
	struct rseq __user *rseq = t->rseq.usrptr;
	u32 cpu_id, uval, node_id;

	/*
	 * On the first exit after registering the rseq region CPU ID is
	 * RSEQ_CPU_ID_UNINITIALIZED and node_id in user space is 0!
	 */
	node_id = t->rseq.ids.cpu_id != RSEQ_CPU_ID_UNINITIALIZED ?
		  cpu_to_node(t->rseq.ids.cpu_id) : 0;

	scoped_user_read_access(rseq, efault) {
		unsafe_get_user(cpu_id, &rseq->cpu_id_start, efault);
		if (cpu_id != t->rseq.ids.cpu_id)
			goto die;
		unsafe_get_user(uval, &rseq->cpu_id, efault);
		if (uval != cpu_id)
			goto die;
		unsafe_get_user(uval, &rseq->node_id, efault);
		if (uval != node_id)
			goto die;
		unsafe_get_user(uval, &rseq->mm_cid, efault);
		if (uval != t->rseq.ids.mm_cid)
			goto die;
	}
	return true;
die:
	t->rseq.event.fatal = true;
efault:
	return false;
}

#endif /* RSEQ_BUILD_SLOW_PATH */

/*
 * This only ensures that abort_ip is in the user address space and
 * validates that it is preceded by the signature.
 *
 * No other sanity checks are done here, that's what the debug code is for.
 */
static rseq_inline bool
rseq_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr)
{
	struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr;
	unsigned long ip = instruction_pointer(regs);
	unsigned long tasksize = TASK_SIZE;
	u64 start_ip, abort_ip, offset;
	u32 usig, __user *uc_sig;

	rseq_stat_inc(rseq_stats.cs);

	if (unlikely(csaddr >= tasksize)) {
		t->rseq.event.fatal = true;
		return false;
	}

	if (static_branch_unlikely(&rseq_debug_enabled))
		return rseq_debug_update_user_cs(t, regs, csaddr);

	scoped_user_rw_access(ucs, efault) {
		unsafe_get_user(start_ip, &ucs->start_ip, efault);
		unsafe_get_user(offset, &ucs->post_commit_offset, efault);
		unsafe_get_user(abort_ip, &ucs->abort_ip, efault);

		/*
		 * No sanity checks. If user space screwed it up, it can
		 * keep the pieces. That's what the debug code is for.
		 *
		 * If outside, just clear the critical section.
		 */
		if (ip - start_ip >= offset)
			goto clear;

		/*
		 * Two requirements for @abort_ip:
		 * - Must be in user space as x86 IRET would happily return to
		 *   the kernel.
		 * - The four bytes preceding the instruction at @abort_ip must
		 *   contain the signature.
		 *
		 * The latter protects against the following attack vector:
		 *
		 * An attacker with limited abilities to write creates a critical
		 * section descriptor, sets the abort IP to a library function or
		 * some other ROP gadget and stores the address of the descriptor
		 * in TLS::rseq::rseq_cs. An RSEQ abort would then evade ROP
		 * protection.
		 */
		if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig)))
			goto die;

		/* The address is guaranteed to be >= 0 and < TASK_SIZE */
		uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig));
		unsafe_get_user(usig, uc_sig, efault);
		if (unlikely(usig != t->rseq.sig))
			goto die;

		/* Invalidate the critical section */
		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
		/* Update the instruction pointer */
		instruction_pointer_set(regs, (unsigned long)abort_ip);
		rseq_stat_inc(rseq_stats.fixup);
		break;
	clear:
		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
		rseq_stat_inc(rseq_stats.clear);
		abort_ip = 0ULL;
	}

	if (unlikely(abort_ip))
		rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
	return true;
die:
	t->rseq.event.fatal = true;
efault:
	return false;
}

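/*
 * After a successful fixup user space resumes at the abort IP. A typical
 * abort handler (illustrative, nothing here enforces it) re-reads the IDs
 * from the rseq TLS area and retries the critical section from the start.
 */
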
/*
 * Updates CPU ID, Node ID and MM CID and reads the critical section
 * address, when @csaddr != NULL. This allows putting the ID update and the
 * read under the same uaccess region to spare a separate begin/end.
 *
 * As this is either invoked from a C wrapper with @csaddr = NULL or from
 * the fast path code with a valid pointer, a clever compiler should be
 * able to optimize the read out. Spares a duplicate implementation.
 *
 * Returns true if the operation was successful, false otherwise.
 *
 * In the failure case task::rseq_event::fatal is set when invalid data
 * was found on debug kernels. It's clear when the failure was an
 * unresolved page fault.
 *
 * If inlined into the exit to user path with interrupts disabled, the
 * caller has to protect against page faults with pagefault_disable().
 *
 * In preemptible task context this would be counterproductive as the page
 * faults could not be fully resolved. As a consequence unresolved page
 * faults in task context are fatal too.
 */
static rseq_inline
bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids,
			     u32 node_id, u64 *csaddr)
{
	struct rseq __user *rseq = t->rseq.usrptr;

	if (static_branch_unlikely(&rseq_debug_enabled)) {
		if (!rseq_debug_validate_ids(t))
			return false;
	}

	scoped_user_rw_access(rseq, efault) {
		unsafe_put_user(ids->cpu_id, &rseq->cpu_id_start, efault);
		unsafe_put_user(ids->cpu_id, &rseq->cpu_id, efault);
		unsafe_put_user(node_id, &rseq->node_id, efault);
		unsafe_put_user(ids->mm_cid, &rseq->mm_cid, efault);
		if (csaddr)
			unsafe_get_user(*csaddr, &rseq->rseq_cs, efault);

		/* Open coded, so it's in the same user access region */
		if (rseq_slice_extension_enabled()) {
			/* Unconditionally clear it, no point in conditionals */
			unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
		}
	}

	rseq_slice_clear_grant(t);
	/* Cache the new values */
	t->rseq.ids.cpu_cid = ids->cpu_cid;
	rseq_stat_inc(rseq_stats.ids);
	rseq_trace_update(t, ids);
	return true;
efault:
	return false;
}

/*
 * Update user space with new IDs and conditionally check whether the task
 * is in a critical section.
 */
static rseq_inline bool rseq_update_usr(struct task_struct *t, struct pt_regs *regs,
					struct rseq_ids *ids, u32 node_id)
{
	u64 csaddr;

	if (!rseq_set_ids_get_csaddr(t, ids, node_id, &csaddr))
		return false;

	/*
	 * On architectures which utilize the generic entry code this
	 * allows skipping the critical section check when the entry was
	 * not from a user space interrupt, unless debug mode is enabled.
	 */
	if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
		if (!static_branch_unlikely(&rseq_debug_enabled)) {
			if (likely(!t->rseq.event.user_irq))
				return true;
		}
	}
	if (likely(!csaddr))
		return true;
	/* Sigh, this really needs to do work */
	return rseq_update_user_cs(t, regs, csaddr);
}

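/*
 * Usage sketch, taken from rseq_exit_user_update() below: the caller fills
 * struct rseq_ids from the task state and derives the node ID from the
 * CPU ID:
 *
 *	struct rseq_ids ids = {
 *		.cpu_id	= task_cpu(t),
 *		.mm_cid	= task_mm_cid(t),
 *	};
 *	u32 node_id = cpu_to_node(ids.cpu_id);
 *
 *	rseq_update_usr(t, regs, &ids, node_id);
 */
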
/*
 * If you want to use this then convert your architecture to the generic
 * entry code. I'm tired of building workarounds for people who can't be
 * bothered to make the maintenance of generic infrastructure less
 * burdensome. Just sucking everything into the architecture code and
 * thereby making others chase the horrible hacks and keep them working is
 * neither acceptable nor sustainable.
 */
#ifdef CONFIG_GENERIC_ENTRY

/*
 * This is inlined into the exit path because:
 *
 * 1) It's a one-time comparison in the fast path when there is no event to
 *    handle
 *
 * 2) The access to the user space rseq memory (TLS) is unlikely to fault
 *    so the straight inline operation is:
 *
 *	- Four 32-bit stores only if CPU ID / MM CID need to be updated
 *	- One 64-bit load to retrieve the critical section address
 *
 * 3) In the unlikely case that the critical section address is != NULL:
 *
 *	- One 64-bit load to retrieve the start IP
 *	- One 64-bit load to retrieve the offset for calculating the end
 *	- One 64-bit load to retrieve the abort IP
 *	- One 64-bit load to retrieve the signature
 *	- One store to clear the critical section address
 *
 * The non-debug case implements only the minimal required checking. It
 * provides protection against a rogue abort IP in kernel space, which
 * would be exploitable at least on x86, and also against a rogue CS
 * descriptor by checking the signature at the abort IP. Any fallout from
 * invalid critical section descriptors is a user space problem. The debug
 * case provides the full set of checks and terminates the task if a
 * condition is not met.
 *
 * In case of a fault or an invalid value, this sets TIF_NOTIFY_RESUME and
 * tells the caller to loop back into exit_to_user_mode_loop(). The rseq
 * slow path there will handle the failure.
 */
static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct task_struct *t)
{
	/*
	 * Page faults need to be disabled as this is called with
	 * interrupts disabled.
	 */
	guard(pagefault)();
	if (likely(!t->rseq.event.ids_changed)) {
		struct rseq __user *rseq = t->rseq.usrptr;
		/*
		 * If IDs have not changed, rseq_event::user_irq must be true.
		 * See rseq_sched_switch_event().
		 */
		u64 csaddr;

		scoped_user_rw_access(rseq, efault) {
			unsafe_get_user(csaddr, &rseq->rseq_cs, efault);

			/* Open coded, so it's in the same user access region */
			if (rseq_slice_extension_enabled()) {
				/* Unconditionally clear it, no point in conditionals */
				unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
			}
		}

		rseq_slice_clear_grant(t);

		if (static_branch_unlikely(&rseq_debug_enabled) || unlikely(csaddr)) {
			if (unlikely(!rseq_update_user_cs(t, regs, csaddr)))
				return false;
		}
		return true;
	}

	struct rseq_ids ids = {
		.cpu_id	= task_cpu(t),
		.mm_cid	= task_mm_cid(t),
	};
	u32 node_id = cpu_to_node(ids.cpu_id);

	return rseq_update_usr(t, regs, &ids, node_id);
efault:
	return false;
}

static __always_inline bool __rseq_exit_to_user_mode_restart(struct pt_regs *regs)
{
	struct task_struct *t = current;

	/*
	 * If the task neither went through schedule nor got the flag
	 * enforced by the rseq syscall or execve, then there is nothing to
	 * do here.
	 *
	 * CPU ID and MM CID can only change when going through a context
	 * switch.
	 *
	 * rseq_sched_switch_event() sets the rseq_event::sched_switch bit
	 * only when rseq_event::has_rseq is true. That conditional is
	 * required to avoid setting the TIF bit if RSEQ is not registered
	 * for a task. rseq_event::sched_switch is cleared when RSEQ is
	 * unregistered by a task so it's sufficient to check for the
	 * sched_switch bit alone.
	 *
	 * A sane compiler requires three instructions for the nothing-to-do
	 * case including clearing the events, but your mileage might vary.
	 */
	if (unlikely((t->rseq.event.sched_switch))) {
		rseq_stat_inc(rseq_stats.fastpath);

		if (unlikely(!rseq_exit_user_update(regs, t)))
			return true;
	}
	/* Clear state so next entry starts from a clean slate */
	t->rseq.event.events = 0;
	return false;
}

/* Required to allow conversion to GENERIC_ENTRY w/o GENERIC_TIF_BITS */
#ifdef CONFIG_HAVE_GENERIC_TIF_BITS
static __always_inline bool test_tif_rseq(unsigned long ti_work)
{
	return ti_work & _TIF_RSEQ;
}

static __always_inline void clear_tif_rseq(void)
{
	static_assert(TIF_RSEQ != TIF_NOTIFY_RESUME);
	clear_thread_flag(TIF_RSEQ);
}
#else
static __always_inline bool test_tif_rseq(unsigned long ti_work) { return true; }
static __always_inline void clear_tif_rseq(void) { }
#endif

static __always_inline bool
rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
{
	if (unlikely(test_tif_rseq(ti_work))) {
		if (unlikely(__rseq_exit_to_user_mode_restart(regs))) {
			current->rseq.event.slowpath = true;
			set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
			return true;
		}
		clear_tif_rseq();
	}
	/*
	 * Arm the slice extension timer if nothing to do anymore and the
	 * task really goes out to user space.
	 */
	return rseq_arm_slice_extension_timer();
}

#else /* CONFIG_GENERIC_ENTRY */
static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
{
	return false;
}
#endif /* !CONFIG_GENERIC_ENTRY */

static __always_inline void rseq_syscall_exit_to_user_mode(void)
{
	struct rseq_event *ev = &current->rseq.event;

	rseq_stat_inc(rseq_stats.exit);

	/* Needed to remove the store for the !lockdep case */
	if (IS_ENABLED(CONFIG_LOCKDEP)) {
		WARN_ON_ONCE(ev->sched_switch);
		ev->events = 0;
	}
}

static __always_inline void rseq_irqentry_exit_to_user_mode(void)
{
	struct rseq_event *ev = &current->rseq.event;

	rseq_stat_inc(rseq_stats.exit);

	lockdep_assert_once(!ev->sched_switch);

	/*
	 * Ensure that event (especially user_irq) is cleared when the
	 * interrupt did not result in a schedule and therefore the
	 * rseq processing could not clear it.
	 */
	ev->events = 0;
}

/* Required to keep ARM64 working */
static __always_inline void rseq_exit_to_user_mode_legacy(void)
{
	struct rseq_event *ev = &current->rseq.event;

	rseq_stat_inc(rseq_stats.exit);

	if (static_branch_unlikely(&rseq_debug_enabled))
		WARN_ON_ONCE(ev->sched_switch);

	/*
	 * Ensure that event (especially user_irq) is cleared when the
	 * interrupt did not result in a schedule and therefore the
	 * rseq processing did not clear it.
	 */
	ev->events = 0;
}

void __rseq_debug_syscall_return(struct pt_regs *regs);

static __always_inline void rseq_debug_syscall_return(struct pt_regs *regs)
{
	if (static_branch_unlikely(&rseq_debug_enabled))
		__rseq_debug_syscall_return(regs);
}
#else /* CONFIG_RSEQ */
static inline void rseq_note_user_irq_entry(void) { }
static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
{
	return false;
}
static inline void rseq_syscall_exit_to_user_mode(void) { }
static inline void rseq_irqentry_exit_to_user_mode(void) { }
static inline void rseq_exit_to_user_mode_legacy(void) { }
static inline void rseq_debug_syscall_return(struct pt_regs *regs) { }
static inline bool rseq_grant_slice_extension(bool work_pending) { return false; }
#endif /* !CONFIG_RSEQ */

#endif /* _LINUX_RSEQ_ENTRY_H */