Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

rseq: Provide and use rseq_update_user_cs()

Provide a straightforward implementation to check for and, if required,
clear/fixup critical sections in user space.

The non-debug version does only the minimal sanity checks and aims for
efficiency.

There are two attack vectors, which are checked for:

1) An abort IP which is in the kernel address space. That would cause at
least x86 to return to kernel space via IRET.

2) A rogue critical section descriptor with an abort IP pointing to some
arbitrary address, which is not preceded by the RSEQ signature.

If the section descriptors are invalid then the resulting misbehaviour of
the user space application is not the kernel's problem.

The kernel provides a run-time switchable debug slow path, which implements
the full zoo of checks including termination of the task when one of the
gazillion conditions is not met.

Replace the zoo in rseq.c with it and invoke it from the TIF_NOTIFY_RESUME
handler. Move the remainders into the CONFIG_DEBUG_RSEQ section, which will
be replaced and removed in a subsequent step.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084307.151465632@linutronix.de

authored by

Thomas Gleixner and committed by
Ingo Molnar
abc850e7 9c37cb6e

+290 -171
+206
include/linux/rseq_entry.h
··· 36 36 #ifdef CONFIG_RSEQ 37 37 #include <linux/jump_label.h> 38 38 #include <linux/rseq.h> 39 + #include <linux/uaccess.h> 39 40 40 41 #include <linux/tracepoint-defs.h> 41 42 ··· 68 67 69 68 DECLARE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled); 70 69 70 + #ifdef RSEQ_BUILD_SLOW_PATH 71 + #define rseq_inline 72 + #else 73 + #define rseq_inline __always_inline 74 + #endif 75 + 76 + bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr); 77 + 71 78 static __always_inline void rseq_note_user_irq_entry(void) 72 79 { 73 80 if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) 74 81 current->rseq.event.user_irq = true; 82 + } 83 + 84 + /* 85 + * Check whether there is a valid critical section and whether the 86 + * instruction pointer in @regs is inside the critical section. 87 + * 88 + * - If the critical section is invalid, terminate the task. 89 + * 90 + * - If valid and the instruction pointer is inside, set it to the abort IP. 91 + * 92 + * - If valid and the instruction pointer is outside, clear the critical 93 + * section address. 94 + * 95 + * Returns true, if the section was valid and either fixup or clear was 96 + * done, false otherwise. 97 + * 98 + * In the failure case task::rseq_event::fatal is set when a invalid 99 + * section was found. It's clear when the failure was an unresolved page 100 + * fault. 101 + * 102 + * If inlined into the exit to user path with interrupts disabled, the 103 + * caller has to protect against page faults with pagefault_disable(). 104 + * 105 + * In preemptible task context this would be counterproductive as the page 106 + * faults could not be fully resolved. As a consequence unresolved page 107 + * faults in task context are fatal too. 108 + */ 109 + 110 + #ifdef RSEQ_BUILD_SLOW_PATH 111 + /* 112 + * The debug version is put out of line, but kept here so the code stays 113 + * together. 
114 + * 115 + * @csaddr has already been checked by the caller to be in user space 116 + */ 117 + bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, 118 + unsigned long csaddr) 119 + { 120 + struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr; 121 + u64 start_ip, abort_ip, offset, cs_end, head, tasksize = TASK_SIZE; 122 + unsigned long ip = instruction_pointer(regs); 123 + u64 __user *uc_head = (u64 __user *) ucs; 124 + u32 usig, __user *uc_sig; 125 + 126 + scoped_user_rw_access(ucs, efault) { 127 + /* 128 + * Evaluate the user pile and exit if one of the conditions 129 + * is not fulfilled. 130 + */ 131 + unsafe_get_user(start_ip, &ucs->start_ip, efault); 132 + if (unlikely(start_ip >= tasksize)) 133 + goto die; 134 + /* If outside, just clear the critical section. */ 135 + if (ip < start_ip) 136 + goto clear; 137 + 138 + unsafe_get_user(offset, &ucs->post_commit_offset, efault); 139 + cs_end = start_ip + offset; 140 + /* Check for overflow and wraparound */ 141 + if (unlikely(cs_end >= tasksize || cs_end < start_ip)) 142 + goto die; 143 + 144 + /* If not inside, clear it. */ 145 + if (ip >= cs_end) 146 + goto clear; 147 + 148 + unsafe_get_user(abort_ip, &ucs->abort_ip, efault); 149 + /* Ensure it's "valid" */ 150 + if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig))) 151 + goto die; 152 + /* Validate that the abort IP is not in the critical section */ 153 + if (unlikely(abort_ip - start_ip < offset)) 154 + goto die; 155 + 156 + /* 157 + * Check version and flags for 0. No point in emitting 158 + * deprecated warnings before dying. That could be done in 159 + * the slow path eventually, but *shrug*. 160 + */ 161 + unsafe_get_user(head, uc_head, efault); 162 + if (unlikely(head)) 163 + goto die; 164 + 165 + /* abort_ip - 4 is >= 0. 
See abort_ip check above */ 166 + uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig)); 167 + unsafe_get_user(usig, uc_sig, efault); 168 + if (unlikely(usig != t->rseq.sig)) 169 + goto die; 170 + 171 + /* rseq_event.user_irq is only valid if CONFIG_GENERIC_IRQ_ENTRY=y */ 172 + if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) { 173 + /* If not in interrupt from user context, let it die */ 174 + if (unlikely(!t->rseq.event.user_irq)) 175 + goto die; 176 + } 177 + unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault); 178 + instruction_pointer_set(regs, (unsigned long)abort_ip); 179 + rseq_stat_inc(rseq_stats.fixup); 180 + break; 181 + clear: 182 + unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault); 183 + rseq_stat_inc(rseq_stats.clear); 184 + abort_ip = 0ULL; 185 + } 186 + 187 + if (unlikely(abort_ip)) 188 + rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip); 189 + return true; 190 + die: 191 + t->rseq.event.fatal = true; 192 + efault: 193 + return false; 194 + } 195 + 196 + #endif /* RSEQ_BUILD_SLOW_PATH */ 197 + 198 + /* 199 + * This only ensures that abort_ip is in the user address space and 200 + * validates that it is preceded by the signature. 201 + * 202 + * No other sanity checks are done here, that's what the debug code is for. 
203 + */ 204 + static rseq_inline bool 205 + rseq_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr) 206 + { 207 + struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr; 208 + unsigned long ip = instruction_pointer(regs); 209 + u64 start_ip, abort_ip, offset; 210 + u32 usig, __user *uc_sig; 211 + 212 + rseq_stat_inc(rseq_stats.cs); 213 + 214 + if (unlikely(csaddr >= TASK_SIZE)) { 215 + t->rseq.event.fatal = true; 216 + return false; 217 + } 218 + 219 + if (static_branch_unlikely(&rseq_debug_enabled)) 220 + return rseq_debug_update_user_cs(t, regs, csaddr); 221 + 222 + scoped_user_rw_access(ucs, efault) { 223 + unsafe_get_user(start_ip, &ucs->start_ip, efault); 224 + unsafe_get_user(offset, &ucs->post_commit_offset, efault); 225 + unsafe_get_user(abort_ip, &ucs->abort_ip, efault); 226 + 227 + /* 228 + * No sanity checks. If user space screwed it up, it can 229 + * keep the pieces. That's what debug code is for. 230 + * 231 + * If outside, just clear the critical section. 232 + */ 233 + if (ip - start_ip >= offset) 234 + goto clear; 235 + 236 + /* 237 + * Two requirements for @abort_ip: 238 + * - Must be in user space as x86 IRET would happily return to 239 + * the kernel. 240 + * - The four bytes preceding the instruction at @abort_ip must 241 + * contain the signature. 242 + * 243 + * The latter protects against the following attack vector: 244 + * 245 + * An attacker with limited abilities to write, creates a critical 246 + * section descriptor, sets the abort IP to a library function or 247 + * some other ROP gadget and stores the address of the descriptor 248 + * in TLS::rseq::rseq_cs. An RSEQ abort would then evade ROP 249 + * protection. 
250 + */ 251 + if (abort_ip >= TASK_SIZE || abort_ip < sizeof(*uc_sig)) 252 + goto die; 253 + 254 + /* The address is guaranteed to be >= 0 and < TASK_SIZE */ 255 + uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig)); 256 + unsafe_get_user(usig, uc_sig, efault); 257 + if (unlikely(usig != t->rseq.sig)) 258 + goto die; 259 + 260 + /* Invalidate the critical section */ 261 + unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault); 262 + /* Update the instruction pointer */ 263 + instruction_pointer_set(regs, (unsigned long)abort_ip); 264 + rseq_stat_inc(rseq_stats.fixup); 265 + break; 266 + clear: 267 + unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault); 268 + rseq_stat_inc(rseq_stats.clear); 269 + abort_ip = 0ULL; 270 + } 271 + 272 + if (unlikely(abort_ip)) 273 + rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip); 274 + return true; 275 + die: 276 + t->rseq.event.fatal = true; 277 + efault: 278 + return false; 75 279 } 76 280 77 281 static __always_inline void rseq_exit_to_user_mode(void)
+10 -1
include/linux/rseq_types.h
··· 14 14 * @sched_switch: True if the task was scheduled out 15 15 * @user_irq: True on interrupt entry from user mode 16 16 * @has_rseq: True if the task has a rseq pointer installed 17 + * @error: Compound error code for the slow path to analyze 18 + * @fatal: User space data corrupted or invalid 17 19 */ 18 20 struct rseq_event { 19 21 union { 20 - u32 all; 22 + u64 all; 21 23 struct { 22 24 union { 23 25 u16 events; ··· 30 28 }; 31 29 32 30 u8 has_rseq; 31 + u8 __pad; 32 + union { 33 + u16 error; 34 + struct { 35 + u8 fatal; 36 + }; 37 + }; 33 38 }; 34 39 }; 35 40 };
+74 -170
kernel/rseq.c
··· 382 382 return -EFAULT; 383 383 } 384 384 385 - /* 386 - * Get the user-space pointer value stored in the 'rseq_cs' field. 387 - */ 388 - static int rseq_get_rseq_cs_ptr_val(struct rseq __user *rseq, u64 *rseq_cs) 385 + static bool rseq_handle_cs(struct task_struct *t, struct pt_regs *regs) 389 386 { 390 - if (!rseq_cs) 391 - return -EFAULT; 387 + struct rseq __user *urseq = t->rseq.usrptr; 388 + u64 csaddr; 392 389 393 - #ifdef CONFIG_64BIT 394 - if (get_user(*rseq_cs, &rseq->rseq_cs)) 395 - return -EFAULT; 396 - #else 397 - if (copy_from_user(rseq_cs, &rseq->rseq_cs, sizeof(*rseq_cs))) 398 - return -EFAULT; 399 - #endif 400 - 401 - return 0; 402 - } 403 - 404 - /* 405 - * If the rseq_cs field of 'struct rseq' contains a valid pointer to 406 - * user-space, copy 'struct rseq_cs' from user-space and validate its fields. 407 - */ 408 - static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) 409 - { 410 - struct rseq_cs __user *urseq_cs; 411 - u64 ptr; 412 - u32 __user *usig; 413 - u32 sig; 414 - int ret; 415 - 416 - ret = rseq_get_rseq_cs_ptr_val(t->rseq.usrptr, &ptr); 417 - if (ret) 418 - return ret; 419 - 420 - /* If the rseq_cs pointer is NULL, return a cleared struct rseq_cs. */ 421 - if (!ptr) { 422 - memset(rseq_cs, 0, sizeof(*rseq_cs)); 423 - return 0; 424 - } 425 - /* Check that the pointer value fits in the user-space process space. */ 426 - if (ptr >= TASK_SIZE) 427 - return -EINVAL; 428 - urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr; 429 - if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs))) 430 - return -EFAULT; 431 - 432 - if (rseq_cs->start_ip >= TASK_SIZE || 433 - rseq_cs->start_ip + rseq_cs->post_commit_offset >= TASK_SIZE || 434 - rseq_cs->abort_ip >= TASK_SIZE || 435 - rseq_cs->version > 0) 436 - return -EINVAL; 437 - /* Check for overflow. */ 438 - if (rseq_cs->start_ip + rseq_cs->post_commit_offset < rseq_cs->start_ip) 439 - return -EINVAL; 440 - /* Ensure that abort_ip is not in the critical section. 
*/ 441 - if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset) 442 - return -EINVAL; 443 - 444 - usig = (u32 __user *)(unsigned long)(rseq_cs->abort_ip - sizeof(u32)); 445 - ret = get_user(sig, usig); 446 - if (ret) 447 - return ret; 448 - 449 - if (current->rseq.sig != sig) { 450 - printk_ratelimited(KERN_WARNING 451 - "Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n", 452 - sig, current->rseq.sig, current->pid, usig); 453 - return -EINVAL; 454 - } 455 - return 0; 456 - } 457 - 458 - static bool rseq_warn_flags(const char *str, u32 flags) 459 - { 460 - u32 test_flags; 461 - 462 - if (!flags) 463 - return false; 464 - test_flags = flags & RSEQ_CS_NO_RESTART_FLAGS; 465 - if (test_flags) 466 - pr_warn_once("Deprecated flags (%u) in %s ABI structure", test_flags, str); 467 - test_flags = flags & ~RSEQ_CS_NO_RESTART_FLAGS; 468 - if (test_flags) 469 - pr_warn_once("Unknown flags (%u) in %s ABI structure", test_flags, str); 470 - return true; 471 - } 472 - 473 - static int rseq_check_flags(struct task_struct *t, u32 cs_flags) 474 - { 475 - u32 flags; 476 - int ret; 477 - 478 - if (rseq_warn_flags("rseq_cs", cs_flags)) 479 - return -EINVAL; 480 - 481 - /* Get thread flags. */ 482 - ret = get_user(flags, &t->rseq.usrptr->flags); 483 - if (ret) 484 - return ret; 485 - 486 - if (rseq_warn_flags("rseq", flags)) 487 - return -EINVAL; 488 - return 0; 489 - } 490 - 491 - static int clear_rseq_cs(struct rseq __user *rseq) 492 - { 493 - /* 494 - * The rseq_cs field is set to NULL on preemption or signal 495 - * delivery on top of rseq assembly block, as well as on top 496 - * of code outside of the rseq assembly block. This performs 497 - * a lazy clear of the rseq_cs field. 498 - * 499 - * Set rseq_cs to NULL. 
500 - */ 501 - #ifdef CONFIG_64BIT 502 - return put_user(0UL, &rseq->rseq_cs); 503 - #else 504 - if (clear_user(&rseq->rseq_cs, sizeof(rseq->rseq_cs))) 505 - return -EFAULT; 506 - return 0; 507 - #endif 508 - } 509 - 510 - /* 511 - * Unsigned comparison will be true when ip >= start_ip, and when 512 - * ip < start_ip + post_commit_offset. 513 - */ 514 - static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs) 515 - { 516 - return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset; 517 - } 518 - 519 - static int rseq_ip_fixup(struct pt_regs *regs, bool abort) 520 - { 521 - unsigned long ip = instruction_pointer(regs); 522 - struct task_struct *t = current; 523 - struct rseq_cs rseq_cs; 524 - int ret; 525 - 526 - rseq_stat_inc(rseq_stats.cs); 527 - 528 - ret = rseq_get_rseq_cs(t, &rseq_cs); 529 - if (ret) 530 - return ret; 531 - 532 - /* 533 - * Handle potentially not being within a critical section. 534 - * If not nested over a rseq critical section, restart is useless. 535 - * Clear the rseq_cs pointer and return. 
536 - */ 537 - if (!in_rseq_cs(ip, &rseq_cs)) { 538 - rseq_stat_inc(rseq_stats.clear); 539 - return clear_rseq_cs(t->rseq.usrptr); 540 - } 541 - ret = rseq_check_flags(t, rseq_cs.flags); 542 - if (ret < 0) 543 - return ret; 544 - if (!abort) 545 - return 0; 546 - ret = clear_rseq_cs(t->rseq.usrptr); 547 - if (ret) 548 - return ret; 549 - rseq_stat_inc(rseq_stats.fixup); 550 - trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset, 551 - rseq_cs.abort_ip); 552 - instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip); 553 - return 0; 390 + scoped_user_read_access(urseq, efault) 391 + unsafe_get_user(csaddr, &urseq->rseq_cs, efault); 392 + if (likely(!csaddr)) 393 + return true; 394 + return rseq_update_user_cs(t, regs, csaddr); 395 + efault: 396 + return false; 554 397 } 555 398 556 399 /* ··· 410 567 void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) 411 568 { 412 569 struct task_struct *t = current; 413 - int ret, sig; 414 570 bool event; 571 + int sig; 415 572 416 573 /* 417 574 * If invoked from hypervisors before entering the guest via ··· 461 618 if (!IS_ENABLED(CONFIG_DEBUG_RSEQ) && !event) 462 619 return; 463 620 464 - ret = rseq_ip_fixup(regs, event); 465 - if (unlikely(ret < 0)) 621 + if (!rseq_handle_cs(t, regs)) 466 622 goto error; 467 623 468 624 if (unlikely(rseq_update_cpu_node_id(t))) ··· 474 632 } 475 633 476 634 #ifdef CONFIG_DEBUG_RSEQ 635 + /* 636 + * Unsigned comparison will be true when ip >= start_ip, and when 637 + * ip < start_ip + post_commit_offset. 638 + */ 639 + static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs) 640 + { 641 + return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset; 642 + } 643 + 644 + /* 645 + * If the rseq_cs field of 'struct rseq' contains a valid pointer to 646 + * user-space, copy 'struct rseq_cs' from user-space and validate its fields. 
647 + */ 648 + static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) 649 + { 650 + struct rseq __user *urseq = t->rseq.usrptr; 651 + struct rseq_cs __user *urseq_cs; 652 + u32 __user *usig; 653 + u64 ptr; 654 + u32 sig; 655 + int ret; 656 + 657 + if (get_user(ptr, &rseq->rseq_cs)) 658 + return -EFAULT; 659 + 660 + /* If the rseq_cs pointer is NULL, return a cleared struct rseq_cs. */ 661 + if (!ptr) { 662 + memset(rseq_cs, 0, sizeof(*rseq_cs)); 663 + return 0; 664 + } 665 + /* Check that the pointer value fits in the user-space process space. */ 666 + if (ptr >= TASK_SIZE) 667 + return -EINVAL; 668 + urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr; 669 + if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs))) 670 + return -EFAULT; 671 + 672 + if (rseq_cs->start_ip >= TASK_SIZE || 673 + rseq_cs->start_ip + rseq_cs->post_commit_offset >= TASK_SIZE || 674 + rseq_cs->abort_ip >= TASK_SIZE || 675 + rseq_cs->version > 0) 676 + return -EINVAL; 677 + /* Check for overflow. */ 678 + if (rseq_cs->start_ip + rseq_cs->post_commit_offset < rseq_cs->start_ip) 679 + return -EINVAL; 680 + /* Ensure that abort_ip is not in the critical section. */ 681 + if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset) 682 + return -EINVAL; 683 + 684 + usig = (u32 __user *)(unsigned long)(rseq_cs->abort_ip - sizeof(u32)); 685 + ret = get_user(sig, usig); 686 + if (ret) 687 + return ret; 688 + 689 + if (current->rseq.sig != sig) { 690 + printk_ratelimited(KERN_WARNING 691 + "Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n", 692 + sig, current->rseq.sig, current->pid, usig); 693 + return -EINVAL; 694 + } 695 + return 0; 696 + } 477 697 478 698 /* 479 699 * Terminate the process if a syscall is issued within a restartable