/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RSEQ_ENTRY_H
#define _LINUX_RSEQ_ENTRY_H

/* Must be outside the CONFIG_RSEQ guard to resolve the stubs */
#ifdef CONFIG_RSEQ_STATS
#include <linux/percpu.h>

struct rseq_stats {
        unsigned long exit;
        unsigned long signal;
        unsigned long slowpath;
        unsigned long fastpath;
        unsigned long ids;
        unsigned long cs;
        unsigned long clear;
        unsigned long fixup;
        unsigned long s_granted;
        unsigned long s_expired;
        unsigned long s_revoked;
        unsigned long s_yielded;
        unsigned long s_aborted;
};

DECLARE_PER_CPU(struct rseq_stats, rseq_stats);

/*
 * Slow path has interrupts and preemption enabled, but the fast path
 * runs with interrupts disabled so there is no point in having the
 * preemption checks implied in __this_cpu_inc() for every operation.
 */
#ifdef RSEQ_BUILD_SLOW_PATH
#define rseq_stat_inc(which) this_cpu_inc((which))
#else
#define rseq_stat_inc(which) raw_cpu_inc((which))
#endif

#else /* CONFIG_RSEQ_STATS */
#define rseq_stat_inc(x) do { } while (0)
#endif /* !CONFIG_RSEQ_STATS */

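/*
 * Illustrative sketch (not part of this header; the helper name is
 * hypothetical): the counters are per CPU, so a consumer such as a
 * debugfs show function would sum them across all possible CPUs:
 *
 *	static unsigned long rseq_stat_sum_exit(void)
 *	{
 *		unsigned long sum = 0;
 *		unsigned int cpu;
 *
 *		for_each_possible_cpu(cpu)
 *			sum += per_cpu(rseq_stats, cpu).exit;
 *		return sum;
 *	}
 */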
#ifdef CONFIG_RSEQ
#include <linux/hrtimer_rearm.h>
#include <linux/jump_label.h>
#include <linux/rseq.h>
#include <linux/sched/signal.h>
#include <linux/uaccess.h>

#include <linux/tracepoint-defs.h>

#ifdef CONFIG_TRACEPOINTS
DECLARE_TRACEPOINT(rseq_update);
DECLARE_TRACEPOINT(rseq_ip_fixup);
void __rseq_trace_update(struct task_struct *t);
void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
                           unsigned long offset, unsigned long abort_ip);

static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids)
{
        if (tracepoint_enabled(rseq_update) && ids)
                __rseq_trace_update(t);
}

static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
                                       unsigned long offset, unsigned long abort_ip)
{
        if (tracepoint_enabled(rseq_ip_fixup))
                __rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
}

#else /* CONFIG_TRACEPOINTS */
static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids) { }
static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
                                       unsigned long offset, unsigned long abort_ip) { }
#endif /* !CONFIG_TRACEPOINTS */

DECLARE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);

#ifdef RSEQ_BUILD_SLOW_PATH
#define rseq_inline
#else
#define rseq_inline __always_inline
#endif

#ifdef CONFIG_RSEQ_SLICE_EXTENSION
DECLARE_STATIC_KEY_TRUE(rseq_slice_extension_key);

static __always_inline bool rseq_slice_extension_enabled(void)
{
        return static_branch_likely(&rseq_slice_extension_key);
}

extern unsigned int rseq_slice_ext_nsecs;
bool __rseq_arm_slice_extension_timer(void);

static __always_inline bool rseq_arm_slice_extension_timer(void)
{
        if (!rseq_slice_extension_enabled())
                return false;

        if (likely(!current->rseq.slice.state.granted))
                return false;

        return __rseq_arm_slice_extension_timer();
}

static __always_inline void rseq_slice_clear_grant(struct task_struct *t)
{
        if (IS_ENABLED(CONFIG_RSEQ_STATS) && t->rseq.slice.state.granted)
                rseq_stat_inc(rseq_stats.s_revoked);
        t->rseq.slice.state.granted = false;
}

static __always_inline bool __rseq_grant_slice_extension(bool work_pending)
{
        struct task_struct *curr = current;
        struct rseq_slice_ctrl usr_ctrl;
        union rseq_slice_state state;
        struct rseq __user *rseq;

        if (!rseq_slice_extension_enabled())
                return false;

        /* If not enabled or not a return from interrupt, nothing to do. */
        state = curr->rseq.slice.state;
        state.enabled &= curr->rseq.event.user_irq;
        if (likely(!state.state))
                return false;

        rseq = curr->rseq.usrptr;
        scoped_user_rw_access(rseq, efault) {

                /*
                 * Quick check for conditions where a grant is not possible
                 * or needs to be revoked:
                 *
                 * 1) Any TIF bit which needs to do extra work aside from
                 *    rescheduling prevents a grant.
                 *
                 * 2) A previous rescheduling request resulted in a slice
                 *    extension grant.
                 */
                if (unlikely(work_pending || state.granted)) {
                        /* Clear user control unconditionally. No point in checking. */
                        unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
                        rseq_slice_clear_grant(curr);
                        return false;
                }

                unsafe_get_user(usr_ctrl.all, &rseq->slice_ctrl.all, efault);
                if (likely(!(usr_ctrl.request)))
                        return false;

                /* Grant the slice extension */
                usr_ctrl.request = 0;
                usr_ctrl.granted = 1;
                unsafe_put_user(usr_ctrl.all, &rseq->slice_ctrl.all, efault);
        }

        rseq_stat_inc(rseq_stats.s_granted);

        curr->rseq.slice.state.granted = true;
        /* Store the expiry time for arming the timer on the way out */
        curr->rseq.slice.expires = data_race(rseq_slice_ext_nsecs) + ktime_get_mono_fast_ns();
        /*
         * This is racy against a remote CPU setting TIF_NEED_RESCHED in
         * several ways:
         *
         * 1)
         *	CPU0				CPU1
         *	clear_tsk()
         *					set_tsk()
         *	clear_preempt()
         *					Raise scheduler IPI on CPU0
         *	--> IPI
         *	    fold_need_resched()		-> Folds correctly
         * 2)
         *	CPU0				CPU1
         *					set_tsk()
         *	clear_tsk()
         *	clear_preempt()
         *					Raise scheduler IPI on CPU0
         *	--> IPI
         *	    fold_need_resched()		<- NOOP as TIF_NEED_RESCHED is false
         *
         * #1 is no different from a regular remote reschedule as it sets
         * the previously unset bit and then raises the IPI, which folds it
         * into the preempt counter.
         *
         * #2 is obviously incorrect from a scheduler POV, but it is no
         * more incorrect than the code below clearing the reschedule
         * request with the safety net of the timer.
         *
         * The important part is that the clearing is protected against the
         * scheduler IPI and also against any other interrupt which might
         * end up waking up a task and setting the bits in the middle of
         * the operation:
         *
         *	clear_tsk()
         *	---> Interrupt
         *	     wakeup_on_this_cpu()
         *		set_tsk()
         *		set_preempt()
         *	clear_preempt()
         *
         * which would be inconsistent state.
         */
        scoped_guard(irq) {
                clear_tsk_need_resched(curr);
                clear_preempt_need_resched();
        }
        return true;

efault:
        force_sig(SIGSEGV);
        return false;
}

static __always_inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask)
{
        if (unlikely(__rseq_grant_slice_extension(ti_work & mask))) {
                hrtimer_rearm_deferred_tif(ti_work);
                return true;
        }
        return false;
}

#else /* CONFIG_RSEQ_SLICE_EXTENSION */
static __always_inline bool rseq_slice_extension_enabled(void) { return false; }
static __always_inline bool rseq_arm_slice_extension_timer(void) { return false; }
static __always_inline void rseq_slice_clear_grant(struct task_struct *t) { }
static __always_inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask) { return false; }
#endif /* !CONFIG_RSEQ_SLICE_EXTENSION */

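/*
 * Illustrative user space view of the grant protocol above (a sketch, not
 * the authoritative UAPI; rseq_slice_yield() stands in for whatever
 * mechanism the UAPI provides to relinquish a granted extension):
 *
 *	rseq->slice_ctrl.request = 1;	// before the short critical region
 *	...critical region...
 *	rseq->slice_ctrl.request = 0;
 *	if (rseq->slice_ctrl.granted)	// kernel converted request to grant
 *		rseq_slice_yield();	// give the CPU back promptly
 *
 * The kernel only grants on return from a user space interrupt when no
 * other TIF work is pending, and arms a timer (rseq_slice_ext_nsecs) as a
 * safety net so a task cannot extend its slice indefinitely.
 */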
bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr);
bool rseq_debug_validate_ids(struct task_struct *t);

static __always_inline void rseq_note_user_irq_entry(void)
{
        if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY))
                current->rseq.event.user_irq = true;
}

/*
 * Check whether there is a valid critical section and whether the
 * instruction pointer in @regs is inside the critical section.
 *
 * - If the critical section is invalid, terminate the task.
 *
 * - If valid and the instruction pointer is inside, set it to the abort IP.
 *
 * - If valid and the instruction pointer is outside, clear the critical
 *   section address.
 *
 * Returns true if the section was valid and either a fixup or a clear was
 * done, false otherwise.
 *
 * In the failure case task::rseq_event::fatal is set when an invalid
 * section was found. It's clear when the failure was an unresolved page
 * fault.
 *
 * If inlined into the exit to user path with interrupts disabled, the
 * caller has to protect against page faults with pagefault_disable().
 *
 * In preemptible task context this would be counterproductive as the page
 * faults could not be fully resolved. As a consequence unresolved page
 * faults in task context are fatal too.
 */

#ifdef RSEQ_BUILD_SLOW_PATH
/*
 * The debug version is put out of line, but kept here so the code stays
 * together.
 *
 * @csaddr has already been checked by the caller to be in user space.
 */
bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs,
                               unsigned long csaddr)
{
        struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr;
        u64 start_ip, abort_ip, offset, cs_end, head, tasksize = TASK_SIZE;
        unsigned long ip = instruction_pointer(regs);
        u64 __user *uc_head = (u64 __user *) ucs;
        u32 usig, __user *uc_sig;

        scoped_user_rw_access(ucs, efault) {
                /*
                 * Evaluate the user pile and exit if one of the conditions
                 * is not fulfilled.
                 */
                unsafe_get_user(start_ip, &ucs->start_ip, efault);
                if (unlikely(start_ip >= tasksize))
                        goto die;
                /* If outside, just clear the critical section. */
                if (ip < start_ip)
                        goto clear;

                unsafe_get_user(offset, &ucs->post_commit_offset, efault);
                cs_end = start_ip + offset;
                /* Check for overflow and wraparound */
                if (unlikely(cs_end >= tasksize || cs_end < start_ip))
                        goto die;

                /* If not inside, clear it. */
                if (ip >= cs_end)
                        goto clear;

                unsafe_get_user(abort_ip, &ucs->abort_ip, efault);
                /* Ensure it's "valid" */
                if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig)))
                        goto die;
                /* Validate that the abort IP is not in the critical section */
                if (unlikely(abort_ip - start_ip < offset))
                        goto die;

                /*
                 * Check version and flags for 0. No point in emitting
                 * deprecated warnings before dying. That could be done in
                 * the slow path eventually, but *shrug*.
                 */
                unsafe_get_user(head, uc_head, efault);
                if (unlikely(head))
                        goto die;

                /* abort_ip - 4 is >= 0. See the abort_ip check above. */
                uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig));
                unsafe_get_user(usig, uc_sig, efault);
                if (unlikely(usig != t->rseq.sig))
                        goto die;

                /* rseq_event.user_irq is only valid if CONFIG_GENERIC_IRQ_ENTRY=y */
                if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
                        /* If not in an interrupt from user context, let it die */
                        if (unlikely(!t->rseq.event.user_irq))
                                goto die;
                }
                unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
                instruction_pointer_set(regs, (unsigned long)abort_ip);
                rseq_stat_inc(rseq_stats.fixup);
                break;
        clear:
                unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
                rseq_stat_inc(rseq_stats.clear);
                abort_ip = 0ULL;
        }

        if (unlikely(abort_ip))
                rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
        return true;
die:
        t->rseq.event.fatal = true;
efault:
        return false;
}

/*
 * On debug kernels, validate that user space did not mess with the IDs if
 * the debug branch is enabled.
 */
bool rseq_debug_validate_ids(struct task_struct *t)
{
        struct rseq __user *rseq = t->rseq.usrptr;
        u32 cpu_id, uval, node_id;

        /*
         * On the first exit after registering the rseq region, the CPU ID
         * is RSEQ_CPU_ID_UNINITIALIZED and node_id in user space is 0!
         */
        node_id = t->rseq.ids.cpu_id != RSEQ_CPU_ID_UNINITIALIZED ?
                  cpu_to_node(t->rseq.ids.cpu_id) : 0;

        scoped_user_read_access(rseq, efault) {
                unsafe_get_user(cpu_id, &rseq->cpu_id_start, efault);
                if (cpu_id != t->rseq.ids.cpu_id)
                        goto die;
                unsafe_get_user(uval, &rseq->cpu_id, efault);
                if (uval != cpu_id)
                        goto die;
                unsafe_get_user(uval, &rseq->node_id, efault);
                if (uval != node_id)
                        goto die;
                unsafe_get_user(uval, &rseq->mm_cid, efault);
                if (uval != t->rseq.ids.mm_cid)
                        goto die;
        }
        return true;
die:
        t->rseq.event.fatal = true;
efault:
        return false;
}

#endif /* RSEQ_BUILD_SLOW_PATH */

/*
 * This only ensures that abort_ip is in the user address space and
 * validates that it is preceded by the signature.
 *
 * No other sanity checks are done here; that's what the debug code is for.
 */
static rseq_inline bool
rseq_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr)
{
        struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr;
        unsigned long ip = instruction_pointer(regs);
        unsigned long tasksize = TASK_SIZE;
        u64 start_ip, abort_ip, offset;
        u32 usig, __user *uc_sig;

        rseq_stat_inc(rseq_stats.cs);

        if (unlikely(csaddr >= tasksize)) {
                t->rseq.event.fatal = true;
                return false;
        }

        if (static_branch_unlikely(&rseq_debug_enabled))
                return rseq_debug_update_user_cs(t, regs, csaddr);

        scoped_user_rw_access(ucs, efault) {
                unsafe_get_user(start_ip, &ucs->start_ip, efault);
                unsafe_get_user(offset, &ucs->post_commit_offset, efault);
                unsafe_get_user(abort_ip, &ucs->abort_ip, efault);

                /*
                 * No sanity checks. If user space screwed it up, it can
                 * keep the pieces. That's what the debug code is for.
                 *
                 * If outside, just clear the critical section.
                 */
                if (ip - start_ip >= offset)
                        goto clear;

                /*
                 * Two requirements for @abort_ip:
                 *
                 * - Must be in user space as x86 IRET would happily return
                 *   to the kernel.
                 * - The four bytes preceding the instruction at @abort_ip
                 *   must contain the signature.
                 *
                 * The latter protects against the following attack vector:
                 *
                 * An attacker with limited write abilities creates a
                 * critical section descriptor, sets the abort IP to a
                 * library function or some other ROP gadget, and stores the
                 * address of the descriptor in TLS::rseq::rseq_cs. An RSEQ
                 * abort would then evade ROP protection.
                 */
                if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig)))
                        goto die;

                /* The address is guaranteed to be >= 0 and < TASK_SIZE */
                uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig));
                unsafe_get_user(usig, uc_sig, efault);
                if (unlikely(usig != t->rseq.sig))
                        goto die;

                /* Invalidate the critical section */
                unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
                /* Update the instruction pointer */
                instruction_pointer_set(regs, (unsigned long)abort_ip);
                rseq_stat_inc(rseq_stats.fixup);
                break;
        clear:
                unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
                rseq_stat_inc(rseq_stats.clear);
                abort_ip = 0ULL;
        }

        if (unlikely(abort_ip))
                rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
        return true;
die:
        t->rseq.event.fatal = true;
efault:
        return false;
}

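/*
 * Illustrative descriptor layout matching the checks above (a sketch; the
 * authoritative definition is struct rseq_cs in the rseq UAPI header).
 * The 32-bit signature passed to the rseq() registration syscall must sit
 * in the four bytes immediately preceding abort_ip, which is what the
 * usig comparison above verifies:
 *
 *	struct rseq_cs {
 *		__u32 version;			// must be 0
 *		__u32 flags;			// must be 0
 *		__u64 start_ip;			// first insn of the section
 *		__u64 post_commit_offset;	// length, end = start + offset
 *		__u64 abort_ip;			// handler, outside the section
 *	};
 *
 *	...
 *	.long	RSEQ_SIG	// e.g. 0x53053053 in the selftests
 * abort_handler:
 *	...
 *
 * User space publishes the descriptor address in TLS::rseq::rseq_cs before
 * entering the section; the kernel clears that pointer on both fixup and
 * clear.
 */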
/*
 * Updates CPU ID, node ID and MM CID, and reads the critical section
 * address when @csaddr != NULL. This allows putting the ID updates and the
 * read under the same uaccess region to spare a separate begin/end.
 *
 * As this is either invoked from a C wrapper with @csaddr = NULL or from
 * the fast path code with a valid pointer, a clever compiler should be
 * able to optimize the read out. Spares a duplicate implementation.
 *
 * Returns true if the operation was successful, false otherwise.
 *
 * In the failure case task::rseq_event::fatal is set when invalid data
 * was found on debug kernels. It's clear when the failure was an
 * unresolved page fault.
 *
 * If inlined into the exit to user path with interrupts disabled, the
 * caller has to protect against page faults with pagefault_disable().
 *
 * In preemptible task context this would be counterproductive as the page
 * faults could not be fully resolved. As a consequence unresolved page
 * faults in task context are fatal too.
 */
static rseq_inline
bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids,
                             u32 node_id, u64 *csaddr)
{
        struct rseq __user *rseq = t->rseq.usrptr;

        if (static_branch_unlikely(&rseq_debug_enabled)) {
                if (!rseq_debug_validate_ids(t))
                        return false;
        }

        scoped_user_rw_access(rseq, efault) {
                unsafe_put_user(ids->cpu_id, &rseq->cpu_id_start, efault);
                unsafe_put_user(ids->cpu_id, &rseq->cpu_id, efault);
                unsafe_put_user(node_id, &rseq->node_id, efault);
                unsafe_put_user(ids->mm_cid, &rseq->mm_cid, efault);
                if (csaddr)
                        unsafe_get_user(*csaddr, &rseq->rseq_cs, efault);

                /* Open coded, so it's in the same user access region */
                if (rseq_slice_extension_enabled()) {
                        /* Unconditionally clear it, no point in conditionals */
                        unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
                }
        }

        rseq_slice_clear_grant(t);
        /* Cache the new values */
        t->rseq.ids.cpu_cid = ids->cpu_cid;
        rseq_stat_inc(rseq_stats.ids);
        rseq_trace_update(t, ids);
        return true;
efault:
        return false;
}

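/*
 * User space consumes these fields straight from the registered TLS area.
 * Sketch (assuming the __rseq_abi TLS symbol name used by librseq; the
 * exact symbol is an assumption here):
 *
 *	cpu = READ_ONCE(__rseq_abi.cpu_id_start);	// current CPU hint
 *	cid = READ_ONCE(__rseq_abi.mm_cid);		// per-mm concurrency ID
 *
 * The values are only stable inside a critical section described by
 * rseq_cs, which is why the ID updates and the rseq_cs read above are
 * deliberately kept in one user access region.
 */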
/*
 * Update user space with new IDs and conditionally check whether the task
 * is in a critical section.
 */
static rseq_inline bool rseq_update_usr(struct task_struct *t, struct pt_regs *regs,
                                        struct rseq_ids *ids, u32 node_id)
{
        u64 csaddr;

        if (!rseq_set_ids_get_csaddr(t, ids, node_id, &csaddr))
                return false;

        /*
         * On architectures which utilize the generic entry code this
         * allows skipping the critical section check when the entry was
         * not from a user space interrupt, unless debug mode is enabled.
         */
        if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
                if (!static_branch_unlikely(&rseq_debug_enabled)) {
                        if (likely(!t->rseq.event.user_irq))
                                return true;
                }
        }
        if (likely(!csaddr))
                return true;
        /* Sigh, this really needs to do work */
        return rseq_update_user_cs(t, regs, csaddr);
}

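/*
 * Condensed decision flow of rseq_update_usr() (commentary only):
 *
 *	write IDs, read rseq_cs
 *	|- fault                     -> false, caller handles the failure
 *	|- !debug && !user_irq entry -> true (generic entry code only)
 *	|- rseq_cs == NULL           -> true
 *	`- rseq_cs != NULL           -> rseq_update_user_cs(): fixup, clear or die
 */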
/*
 * If you want to use this then convert your architecture to the generic
 * entry code. I'm tired of building workarounds for people who can't be
 * bothered to make the maintenance of generic infrastructure less
 * burdensome. Just sucking everything into the architecture code and
 * thereby making others chase the horrible hacks and keep them working is
 * neither acceptable nor sustainable.
 */
#ifdef CONFIG_GENERIC_ENTRY

/*
 * This is inlined into the exit path because:
 *
 * 1) It's a one-time comparison in the fast path when there is no event to
 *    handle
 *
 * 2) The access to the user space rseq memory (TLS) is unlikely to fault
 *    so the straight inline operation is:
 *
 *    - Four 32-bit stores only if CPU ID / MM CID need to be updated
 *    - One 64-bit load to retrieve the critical section address
 *
 * 3) In the unlikely case that the critical section address is != NULL:
 *
 *    - One 64-bit load to retrieve the start IP
 *    - One 64-bit load to retrieve the offset for calculating the end
 *    - One 64-bit load to retrieve the abort IP
 *    - One 32-bit load to retrieve the signature
 *    - One store to clear the critical section address
 *
 * The non-debug case implements only the minimal required checking. It
 * provides protection against a rogue abort IP in kernel space, which
 * would be exploitable at least on x86, and also against a rogue CS
 * descriptor by checking the signature at the abort IP. Any fallout from
 * invalid critical section descriptors is a user space problem. The debug
 * case provides the full set of checks and terminates the task if a
 * condition is not met.
 *
 * In case of a fault or an invalid value, this sets TIF_NOTIFY_RESUME and
 * tells the caller to loop back into exit_to_user_mode_loop(). The rseq
 * slow path there will handle the failure.
 */
static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct task_struct *t)
{
        /*
         * Page faults need to be disabled as this is called with
         * interrupts disabled.
         */
        guard(pagefault)();
        if (likely(!t->rseq.event.ids_changed)) {
                struct rseq __user *rseq = t->rseq.usrptr;
                /*
                 * If the IDs have not changed, rseq_event::user_irq must be
                 * true. See rseq_sched_switch_event().
                 */
                u64 csaddr;

                scoped_user_rw_access(rseq, efault) {
                        unsafe_get_user(csaddr, &rseq->rseq_cs, efault);

                        /* Open coded, so it's in the same user access region */
                        if (rseq_slice_extension_enabled()) {
                                /* Unconditionally clear it, no point in conditionals */
                                unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
                        }
                }

                rseq_slice_clear_grant(t);

                if (static_branch_unlikely(&rseq_debug_enabled) || unlikely(csaddr)) {
                        if (unlikely(!rseq_update_user_cs(t, regs, csaddr)))
                                return false;
                }
                return true;
        }

        struct rseq_ids ids = {
                .cpu_id = task_cpu(t),
                .mm_cid = task_mm_cid(t),
        };
        u32 node_id = cpu_to_node(ids.cpu_id);

        return rseq_update_usr(t, regs, &ids, node_id);
efault:
        return false;
}

static __always_inline bool __rseq_exit_to_user_mode_restart(struct pt_regs *regs)
{
        struct task_struct *t = current;

        /*
         * If the task neither went through schedule nor had the flag
         * enforced by the rseq syscall or execve, then there is nothing to
         * do here.
         *
         * CPU ID and MM CID can only change when going through a context
         * switch.
         *
         * rseq_sched_switch_event() sets the rseq_event::sched_switch bit
         * only when rseq_event::has_rseq is true. That conditional is
         * required to avoid setting the TIF bit if RSEQ is not registered
         * for a task. rseq_event::sched_switch is cleared when RSEQ is
         * unregistered by a task, so it's sufficient to check for the
         * sched_switch bit alone.
         *
         * A sane compiler requires three instructions for the nothing-to-do
         * case, including clearing the events, but your mileage might vary.
         */
        if (unlikely(t->rseq.event.sched_switch)) {
                rseq_stat_inc(rseq_stats.fastpath);

                if (unlikely(!rseq_exit_user_update(regs, t)))
                        return true;
        }
        /* Clear the state so the next entry starts from a clean slate */
        t->rseq.event.events = 0;
        return false;
}

/* Required to allow conversion to GENERIC_ENTRY w/o GENERIC_TIF_BITS */
#ifdef CONFIG_HAVE_GENERIC_TIF_BITS
static __always_inline bool test_tif_rseq(unsigned long ti_work)
{
        return ti_work & _TIF_RSEQ;
}

static __always_inline void clear_tif_rseq(void)
{
        static_assert(TIF_RSEQ != TIF_NOTIFY_RESUME);
        clear_thread_flag(TIF_RSEQ);
}
#else
static __always_inline bool test_tif_rseq(unsigned long ti_work) { return true; }
static __always_inline void clear_tif_rseq(void) { }
#endif

static __always_inline bool
rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
{
        if (unlikely(test_tif_rseq(ti_work))) {
                if (unlikely(__rseq_exit_to_user_mode_restart(regs))) {
                        current->rseq.event.slowpath = true;
                        set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
                        return true;
                }
                clear_tif_rseq();
        }
        /*
         * Arm the slice extension timer if there is nothing to do anymore
         * and the task really goes out to user space.
         */
        return rseq_arm_slice_extension_timer();
}

#else /* CONFIG_GENERIC_ENTRY */
static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
{
        return false;
}
#endif /* !CONFIG_GENERIC_ENTRY */

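/*
 * Assumed usage sketch (simplified; the actual integration lives in the
 * generic entry code's exit-to-user path and may differ):
 *
 *	ti_work = read_thread_flags();
 *	if (rseq_exit_to_user_mode_restart(regs, ti_work))
 *		// re-evaluate the work flags before returning to user space
 *
 * On failure the function sets TIF_NOTIFY_RESUME itself, so re-reading the
 * work flags routes the task into the rseq slow path.
 */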
static __always_inline void rseq_syscall_exit_to_user_mode(void)
{
        struct rseq_event *ev = &current->rseq.event;

        rseq_stat_inc(rseq_stats.exit);

        /* Needed to remove the store for the !lockdep case */
        if (IS_ENABLED(CONFIG_LOCKDEP)) {
                WARN_ON_ONCE(ev->sched_switch);
                ev->events = 0;
        }
}

static __always_inline void rseq_irqentry_exit_to_user_mode(void)
{
        struct rseq_event *ev = &current->rseq.event;

        rseq_stat_inc(rseq_stats.exit);

        lockdep_assert_once(!ev->sched_switch);

        /*
         * Ensure that event (especially user_irq) is cleared when the
         * interrupt did not result in a schedule and therefore the
         * rseq processing could not clear it.
         */
        ev->events = 0;
}

/* Required to keep ARM64 working */
static __always_inline void rseq_exit_to_user_mode_legacy(void)
{
        struct rseq_event *ev = &current->rseq.event;

        rseq_stat_inc(rseq_stats.exit);

        if (static_branch_unlikely(&rseq_debug_enabled))
                WARN_ON_ONCE(ev->sched_switch);

        /*
         * Ensure that event (especially user_irq) is cleared when the
         * interrupt did not result in a schedule and therefore the
         * rseq processing did not clear it.
         */
        ev->events = 0;
}

void __rseq_debug_syscall_return(struct pt_regs *regs);

static __always_inline void rseq_debug_syscall_return(struct pt_regs *regs)
{
        if (static_branch_unlikely(&rseq_debug_enabled))
                __rseq_debug_syscall_return(regs);
}
#else /* CONFIG_RSEQ */
static inline void rseq_note_user_irq_entry(void) { }
static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
{
        return false;
}
static inline void rseq_syscall_exit_to_user_mode(void) { }
static inline void rseq_irqentry_exit_to_user_mode(void) { }
static inline void rseq_exit_to_user_mode_legacy(void) { }
static inline void rseq_debug_syscall_return(struct pt_regs *regs) { }
static inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask) { return false; }
#endif /* !CONFIG_RSEQ */

#endif /* _LINUX_RSEQ_ENTRY_H */