Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

rseq: Implement time slice extension enforcement timer

If a time slice extension is granted and the reschedule delayed, the kernel
has to ensure that user space cannot abuse the extension and exceed the
maximum granted time.

It was suggested to implement this via the existing hrtick() timer in the
scheduler, but that turned out to be problematic for several reasons:

1) It creates a dependency on CONFIG_SCHED_HRTICK, which can be disabled
independently of CONFIG_HIGHRES_TIMERS

2) HRTICK usage in the scheduler can be runtime disabled or is only used
for certain aspects of scheduling.

3) The function is calling into the scheduler code and that might have
unexpected consequences when this is invoked due to a time slice
enforcement expiry, especially when the task managed to clear the
grant via sched_yield(0).

It would be possible to address #2 and #3 by storing state in the
scheduler, but that is extra complexity and fragility for no value.

Implement a dedicated per CPU hrtimer instead, which is solely used for the
purpose of time slice enforcement.

If an extension was granted, the timer is armed in
rseq_exit_to_user_mode_restart() right before actually returning to user mode.

It is disarmed when the task relinquishes the CPU. This is expensive as
the timer is probably the first expiring timer on the CPU, which means the
cancel has to reprogram the hardware. But that's less expensive than going
through a full hrtimer interrupt cycle for nothing.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251215155709.068329497@linutronix.de

authored by

Thomas Gleixner and committed by
Peter Zijlstra
0ac3b5c3 dd0a0460

+170 -13
+11
Documentation/admin-guide/sysctl/kernel.rst
··· 1248 1248 ROM/Flash boot loader. Maybe to tell it what to do after 1249 1249 rebooting. ??? 1250 1250 1251 + rseq_slice_extension_nsec 1252 + ========================= 1253 + 1254 + A task can request to delay its scheduling if it is in a critical section 1255 + via the prctl(PR_RSEQ_SLICE_EXTENSION_SET) mechanism. This sets the maximum 1256 + allowed extension in nanoseconds before scheduling of the task is enforced. 1257 + Default value is 10000ns (10us). The possible range is 10000ns (10us) to 1258 + 50000ns (50us). 1259 + 1260 + This value has a direct correlation to the worst case scheduling latency; 1261 + increment at your own risk. 1251 1262 1252 1263 sched_energy_aware 1253 1264 ==================
+28 -10
include/linux/rseq_entry.h
··· 87 87 { 88 88 return static_branch_likely(&rseq_slice_extension_key); 89 89 } 90 + 91 + extern unsigned int rseq_slice_ext_nsecs; 92 + bool __rseq_arm_slice_extension_timer(void); 93 + 94 + static __always_inline bool rseq_arm_slice_extension_timer(void) 95 + { 96 + if (!rseq_slice_extension_enabled()) 97 + return false; 98 + 99 + if (likely(!current->rseq.slice.state.granted)) 100 + return false; 101 + 102 + return __rseq_arm_slice_extension_timer(); 103 + } 104 + 90 105 #else /* CONFIG_RSEQ_SLICE_EXTENSION */ 91 106 static inline bool rseq_slice_extension_enabled(void) { return false; } 107 + static inline bool rseq_arm_slice_extension_timer(void) { return false; } 92 108 #endif /* !CONFIG_RSEQ_SLICE_EXTENSION */ 93 109 94 110 bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr); ··· 559 543 static __always_inline bool 560 544 rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work) 561 545 { 562 - if (likely(!test_tif_rseq(ti_work))) 563 - return false; 564 - 565 - if (unlikely(__rseq_exit_to_user_mode_restart(regs))) { 566 - current->rseq.event.slowpath = true; 567 - set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); 568 - return true; 546 + if (unlikely(test_tif_rseq(ti_work))) { 547 + if (unlikely(__rseq_exit_to_user_mode_restart(regs))) { 548 + current->rseq.event.slowpath = true; 549 + set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); 550 + return true; 551 + } 552 + clear_tif_rseq(); 569 553 } 570 - 571 - clear_tif_rseq(); 572 - return false; 554 + /* 555 + * Arm the slice extension timer if nothing to do anymore and the 556 + * task really goes out to user space. 557 + */ 558 + return rseq_arm_slice_extension_timer(); 573 559 } 574 560 575 561 #else /* CONFIG_GENERIC_ENTRY */
+2
include/linux/rseq_types.h
··· 89 89 /** 90 90 * struct rseq_slice - Status information for rseq time slice extension 91 91 * @state: Time slice extension state 92 + * @expires: The time when a grant expires 92 93 * @yielded: Indicator for rseq_slice_yield() 93 94 */ 94 95 struct rseq_slice { 95 96 union rseq_slice_state state; 97 + u64 expires; 96 98 u8 yielded; 97 99 }; 98 100
+129 -3
kernel/rseq.c
··· 71 71 #define RSEQ_BUILD_SLOW_PATH 72 72 73 73 #include <linux/debugfs.h> 74 + #include <linux/hrtimer.h> 75 + #include <linux/percpu.h> 74 76 #include <linux/prctl.h> 75 77 #include <linux/ratelimit.h> 76 78 #include <linux/rseq_entry.h> ··· 502 500 } 503 501 504 502 #ifdef CONFIG_RSEQ_SLICE_EXTENSION 503 + struct slice_timer { 504 + struct hrtimer timer; 505 + void *cookie; 506 + }; 507 + 508 + unsigned int rseq_slice_ext_nsecs __read_mostly = 10 * NSEC_PER_USEC; 509 + static DEFINE_PER_CPU(struct slice_timer, slice_timer); 505 510 DEFINE_STATIC_KEY_TRUE(rseq_slice_extension_key); 511 + 512 + /* 513 + * When the timer expires and the task is still in user space, the return 514 + * from interrupt will revoke the grant and schedule. If the task already 515 + * entered the kernel via a syscall and the timer fires before the syscall 516 + * work was able to cancel it, then depending on the preemption model this 517 + * will either reschedule on return from interrupt or in the syscall work 518 + * below. 519 + */ 520 + static enum hrtimer_restart rseq_slice_expired(struct hrtimer *tmr) 521 + { 522 + struct slice_timer *st = container_of(tmr, struct slice_timer, timer); 523 + 524 + /* 525 + * Validate that the task which armed the timer is still on the 526 + * CPU. It could have been scheduled out without canceling the 527 + * timer. 528 + */ 529 + if (st->cookie == current && current->rseq.slice.state.granted) { 530 + rseq_stat_inc(rseq_stats.s_expired); 531 + set_need_resched_current(); 532 + } 533 + return HRTIMER_NORESTART; 534 + } 535 + 536 + bool __rseq_arm_slice_extension_timer(void) 537 + { 538 + struct slice_timer *st = this_cpu_ptr(&slice_timer); 539 + struct task_struct *curr = current; 540 + 541 + lockdep_assert_irqs_disabled(); 542 + 543 + /* 544 + * This check prevents a task, which got a time slice extension 545 + * granted, from exceeding the maximum scheduling latency when the 546 + * grant expired before going out to user space. 
Don't bother to 547 + * clear the grant here, it will be cleaned up automatically before 548 + * going out to user space after being scheduled back in. 549 + */ 550 + if ((unlikely(curr->rseq.slice.expires < ktime_get_mono_fast_ns()))) { 551 + set_need_resched_current(); 552 + return true; 553 + } 554 + 555 + /* 556 + * Store the task pointer as a cookie for comparison in the timer 557 + * function. This is safe as the timer is CPU local and cannot be 558 + * in the expiry function at this point. 559 + */ 560 + st->cookie = curr; 561 + hrtimer_start(&st->timer, curr->rseq.slice.expires, HRTIMER_MODE_ABS_PINNED_HARD); 562 + /* Arm the syscall entry work */ 563 + set_task_syscall_work(curr, SYSCALL_RSEQ_SLICE); 564 + return false; 565 + } 566 + 567 + static void rseq_cancel_slice_extension_timer(void) 568 + { 569 + struct slice_timer *st = this_cpu_ptr(&slice_timer); 570 + 571 + /* 572 + * st->cookie can be safely read as preemption is disabled and the 573 + * timer is CPU local. 574 + * 575 + * As this is most probably the first expiring timer, the cancel is 576 + * expensive as it has to reprogram the hardware, but that's less 577 + * expensive than going through a full hrtimer_interrupt() cycle 578 + * for nothing. 579 + * 580 + * hrtimer_try_to_cancel() is sufficient here as the timer is CPU 581 + * local and once the hrtimer code disabled interrupts the timer 582 + * callback cannot be running. 583 + */ 584 + if (st->cookie == current) 585 + hrtimer_try_to_cancel(&st->timer); 586 + } 506 587 507 588 static inline void rseq_slice_set_need_resched(struct task_struct *curr) 508 589 { ··· 648 563 return; 649 564 650 565 /* 651 - * Required to make set_tsk_need_resched() correct on PREEMPT[RT] 652 - * kernels. Leaving the scope will reschedule on preemption models 653 - * FULL, LAZY and RT if necessary. 566 + * Required to stabilize the per CPU timer pointer and to make 567 + * set_tsk_need_resched() correct on PREEMPT[RT] kernels. 
568 + * 569 + * Leaving the scope will reschedule on preemption models FULL, 570 + * LAZY and RT if necessary. 654 571 */ 655 572 scoped_guard(preempt) { 573 + rseq_cancel_slice_extension_timer(); 656 574 /* 657 575 * Now that preemption is disabled, quickly check whether 658 576 * the task was already rescheduled before arriving here. ··· 753 665 return yielded; 754 666 } 755 667 668 + #ifdef CONFIG_SYSCTL 669 + static const unsigned int rseq_slice_ext_nsecs_min = 10 * NSEC_PER_USEC; 670 + static const unsigned int rseq_slice_ext_nsecs_max = 50 * NSEC_PER_USEC; 671 + 672 + static const struct ctl_table rseq_slice_ext_sysctl[] = { 673 + { 674 + .procname = "rseq_slice_extension_nsec", 675 + .data = &rseq_slice_ext_nsecs, 676 + .maxlen = sizeof(unsigned int), 677 + .mode = 0644, 678 + .proc_handler = proc_douintvec_minmax, 679 + .extra1 = (unsigned int *)&rseq_slice_ext_nsecs_min, 680 + .extra2 = (unsigned int *)&rseq_slice_ext_nsecs_max, 681 + }, 682 + }; 683 + 684 + static void rseq_slice_sysctl_init(void) 685 + { 686 + if (rseq_slice_extension_enabled()) 687 + register_sysctl_init("kernel", rseq_slice_ext_sysctl); 688 + } 689 + #else /* CONFIG_SYSCTL */ 690 + static inline void rseq_slice_sysctl_init(void) { } 691 + #endif /* !CONFIG_SYSCTL */ 692 + 756 693 static int __init rseq_slice_cmdline(char *str) 757 694 { 758 695 bool on; ··· 790 677 return 1; 791 678 } 792 679 __setup("rseq_slice_ext=", rseq_slice_cmdline); 680 + 681 + static int __init rseq_slice_init(void) 682 + { 683 + unsigned int cpu; 684 + 685 + for_each_possible_cpu(cpu) { 686 + hrtimer_setup(per_cpu_ptr(&slice_timer.timer, cpu), rseq_slice_expired, 687 + CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED_HARD); 688 + } 689 + rseq_slice_sysctl_init(); 690 + return 0; 691 + } 692 + device_initcall(rseq_slice_init); 793 693 #endif /* CONFIG_RSEQ_SLICE_EXTENSION */