Merge branch 'sched/hrtick' into timers/core

+2

arch/x86/Kconfig

··· 141 141 select ARCH_USE_SYM_ANNOTATIONS 142 142 select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH 143 143 select ARCH_WANT_DEFAULT_BPF_JIT if X86_64 144 + select ARCH_WANTS_CLOCKSOURCE_READ_INLINE if X86_64 144 145 select ARCH_WANTS_DYNAMIC_TASK_STRUCT 145 146 select ARCH_WANTS_NO_INSTR 146 147 select ARCH_WANT_GENERAL_HUGETLB ··· 164 163 select EDAC_SUPPORT 165 164 select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC) 166 165 select GENERIC_CLOCKEVENTS_BROADCAST_IDLE if GENERIC_CLOCKEVENTS_BROADCAST 166 + select GENERIC_CLOCKEVENTS_COUPLED_INLINE if X86_64 167 167 select GENERIC_CLOCKEVENTS_MIN_ADJUST 168 168 select GENERIC_CMOS_UPDATE 169 169 select GENERIC_CPU_AUTOPROBE

+22

arch/x86/include/asm/clock_inlined.h

··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef _ASM_X86_CLOCK_INLINED_H 3 + #define _ASM_X86_CLOCK_INLINED_H 4 + 5 + #include <asm/tsc.h> 6 + 7 + struct clocksource; 8 + 9 + static __always_inline u64 arch_inlined_clocksource_read(struct clocksource *cs) 10 + { 11 + return (u64)rdtsc_ordered(); 12 + } 13 + 14 + struct clock_event_device; 15 + 16 + static __always_inline void 17 + arch_inlined_clockevent_set_next_coupled(u64 cycles, struct clock_event_device *evt) 18 + { 19 + native_wrmsrq(MSR_IA32_TSC_DEADLINE, cycles); 20 + } 21 + 22 + #endif

+23 -18

arch/x86/kernel/apic/apic.c

··· 412 412 /* 413 413 * Program the next event, relative to now 414 414 */ 415 - static int lapic_next_event(unsigned long delta, 416 - struct clock_event_device *evt) 415 + static int lapic_next_event(unsigned long delta, struct clock_event_device *evt) 417 416 { 418 417 apic_write(APIC_TMICT, delta); 419 418 return 0; 420 419 } 421 420 422 - static int lapic_next_deadline(unsigned long delta, 423 - struct clock_event_device *evt) 421 + static int lapic_next_deadline(unsigned long delta, struct clock_event_device *evt) 424 422 { 425 - u64 tsc; 423 + /* 424 + * There is no weak_wrmsr_fence() required here as all of this is purely 425 + * CPU local. Avoid the [ml]fence overhead. 426 + */ 427 + u64 tsc = rdtsc(); 426 428 427 - /* This MSR is special and need a special fence: */ 428 - weak_wrmsr_fence(); 429 - 430 - tsc = rdtsc(); 431 - wrmsrq(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR)); 429 + native_wrmsrq(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR)); 432 430 return 0; 433 431 } 434 432 ··· 450 452 * the timer _and_ zero the counter registers: 451 453 */ 452 454 if (v & APIC_LVT_TIMER_TSCDEADLINE) 453 - wrmsrq(MSR_IA32_TSC_DEADLINE, 0); 455 + native_wrmsrq(MSR_IA32_TSC_DEADLINE, 0); 454 456 else 455 457 apic_write(APIC_TMICT, 0); 456 458 ··· 547 549 548 550 if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) 549 551 return false; 552 + 553 + /* XEN_PV does not support it, but be paranoid about it */ 554 + if (boot_cpu_has(X86_FEATURE_XENPV)) 555 + goto clear; 556 + 550 557 if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) 551 558 return true; 552 559 ··· 564 561 if (boot_cpu_data.microcode >= rev) 565 562 return true; 566 563 567 - setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); 568 564 pr_err(FW_BUG "TSC_DEADLINE disabled due to Errata; " 569 565 "please update microcode to version: 0x%x (or later)\n", rev); 566 + 567 + clear: 568 + setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); 570 569 return false; 571 570 } 572 571 ··· 591 586 
592 587 if (this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) { 593 588 levt->name = "lapic-deadline"; 594 - levt->features &= ~(CLOCK_EVT_FEAT_PERIODIC | 595 - CLOCK_EVT_FEAT_DUMMY); 589 + levt->features &= ~(CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_DUMMY); 590 + levt->features |= CLOCK_EVT_FEAT_CLOCKSOURCE_COUPLED; 591 + levt->cs_id = CSID_X86_TSC; 596 592 levt->set_next_event = lapic_next_deadline; 597 - clockevents_config_and_register(levt, 598 - tsc_khz * (1000 / TSC_DIVISOR), 599 - 0xF, ~0UL); 600 - } else 593 + clockevents_config_and_register(levt, tsc_khz * (1000 / TSC_DIVISOR), 0xF, ~0UL); 594 + } else { 601 595 clockevents_register_device(levt); 596 + } 602 597 603 598 apic_update_vector(smp_processor_id(), LOCAL_TIMER_VECTOR, true); 604 599 }

+3 -1

arch/x86/kernel/tsc.c

··· 1201 1201 .mask = CLOCKSOURCE_MASK(64), 1202 1202 .flags = CLOCK_SOURCE_IS_CONTINUOUS | 1203 1203 CLOCK_SOURCE_VALID_FOR_HRES | 1204 + CLOCK_SOURCE_CAN_INLINE_READ | 1204 1205 CLOCK_SOURCE_MUST_VERIFY | 1205 - CLOCK_SOURCE_VERIFY_PERCPU, 1206 + CLOCK_SOURCE_VERIFY_PERCPU | 1207 + CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT, 1206 1208 .id = CSID_X86_TSC, 1207 1209 .vdso_clock_mode = VDSO_CLOCKMODE_TSC, 1208 1210 .enable = tsc_cs_enable,

+4 -1

include/asm-generic/thread_info_tif.h

··· 41 41 #define _TIF_PATCH_PENDING BIT(TIF_PATCH_PENDING) 42 42 43 43 #ifdef HAVE_TIF_RESTORE_SIGMASK 44 - # define TIF_RESTORE_SIGMASK 10 // Restore signal mask in do_signal() */ 44 + # define TIF_RESTORE_SIGMASK 10 // Restore signal mask in do_signal() 45 45 # define _TIF_RESTORE_SIGMASK BIT(TIF_RESTORE_SIGMASK) 46 46 #endif 47 47 48 48 #define TIF_RSEQ 11 // Run RSEQ fast path 49 49 #define _TIF_RSEQ BIT(TIF_RSEQ) 50 + 51 + #define TIF_HRTIMER_REARM 12 // re-arm the timer 52 + #define _TIF_HRTIMER_REARM BIT(TIF_HRTIMER_REARM) 50 53 51 54 #endif /* _ASM_GENERIC_THREAD_INFO_TIF_H_ */

+5 -3

include/linux/clockchips.h

··· 43 43 /* 44 44 * Clock event features 45 45 */ 46 - # define CLOCK_EVT_FEAT_PERIODIC 0x000001 47 - # define CLOCK_EVT_FEAT_ONESHOT 0x000002 48 - # define CLOCK_EVT_FEAT_KTIME 0x000004 46 + # define CLOCK_EVT_FEAT_PERIODIC 0x000001 47 + # define CLOCK_EVT_FEAT_ONESHOT 0x000002 48 + # define CLOCK_EVT_FEAT_CLOCKSOURCE_COUPLED 0x000004 49 49 50 50 /* 51 51 * x86(64) specific (mis)features: ··· 101 101 void (*event_handler)(struct clock_event_device *); 102 102 int (*set_next_event)(unsigned long evt, struct clock_event_device *); 103 103 int (*set_next_ktime)(ktime_t expires, struct clock_event_device *); 104 + void (*set_next_coupled)(u64 cycles, struct clock_event_device *); 104 105 ktime_t next_event; 105 106 u64 max_delta_ns; 106 107 u64 min_delta_ns; ··· 109 108 u32 shift; 110 109 enum clock_event_state state_use_accessors; 111 110 unsigned int features; 111 + enum clocksource_ids cs_id; 112 112 unsigned long retries; 113 113 114 114 int (*set_state_periodic)(struct clock_event_device *);

+3

include/linux/clocksource.h

··· 149 149 #define CLOCK_SOURCE_SUSPEND_NONSTOP 0x80 150 150 #define CLOCK_SOURCE_RESELECT 0x100 151 151 #define CLOCK_SOURCE_VERIFY_PERCPU 0x200 152 + #define CLOCK_SOURCE_CAN_INLINE_READ 0x400 153 + #define CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT 0x800 154 + 152 155 /* simplify initialization of mask field */ 153 156 #define CLOCKSOURCE_MASK(bits) GENMASK_ULL((bits) - 1, 0) 154 157

+20 -39

include/linux/hrtimer.h

··· 13 13 #define _LINUX_HRTIMER_H 14 14 15 15 #include <linux/hrtimer_defs.h> 16 + #include <linux/hrtimer_rearm.h> 16 17 #include <linux/hrtimer_types.h> 17 18 #include <linux/init.h> 18 19 #include <linux/list.h> ··· 32 31 * soft irq context 33 32 * HRTIMER_MODE_HARD - Timer callback function will be executed in 34 33 * hard irq context even on PREEMPT_RT. 34 + * HRTIMER_MODE_LAZY_REARM - Avoid reprogramming if the timer was the 35 + * first expiring timer and is moved into the 36 + * future. Special mode for the HRTICK timer to 37 + * avoid extensive reprogramming of the hardware, 38 + * which is expensive in virtual machines. Risks 39 + * a pointless expiry, but that's better than 40 + * reprogramming on every context switch, 35 41 */ 36 42 enum hrtimer_mode { 37 43 HRTIMER_MODE_ABS = 0x00, ··· 46 38 HRTIMER_MODE_PINNED = 0x02, 47 39 HRTIMER_MODE_SOFT = 0x04, 48 40 HRTIMER_MODE_HARD = 0x08, 41 + HRTIMER_MODE_LAZY_REARM = 0x10, 49 42 50 43 HRTIMER_MODE_ABS_PINNED = HRTIMER_MODE_ABS | HRTIMER_MODE_PINNED, 51 44 HRTIMER_MODE_REL_PINNED = HRTIMER_MODE_REL | HRTIMER_MODE_PINNED, ··· 63 54 HRTIMER_MODE_ABS_PINNED_HARD = HRTIMER_MODE_ABS_PINNED | HRTIMER_MODE_HARD, 64 55 HRTIMER_MODE_REL_PINNED_HARD = HRTIMER_MODE_REL_PINNED | HRTIMER_MODE_HARD, 65 56 }; 66 - 67 - /* 68 - * Values to track state of the timer 69 - * 70 - * Possible states: 71 - * 72 - * 0x00 inactive 73 - * 0x01 enqueued into rbtree 74 - * 75 - * The callback state is not part of the timer->state because clearing it would 76 - * mean touching the timer after the callback, this makes it impossible to free 77 - * the timer from the callback function. 78 - * 79 - * Therefore we track the callback state in: 80 - * 81 - * timer->base->cpu_base->running == timer 82 - * 83 - * On SMP it is possible to have a "callback function running and enqueued" 84 - * status. It happens for example when a posix timer expired and the callback 85 - * queued a signal. 
Between dropping the lock which protects the posix timer 86 - * and reacquiring the base lock of the hrtimer, another CPU can deliver the 87 - * signal and rearm the timer. 88 - * 89 - * All state transitions are protected by cpu_base->lock. 90 - */ 91 - #define HRTIMER_STATE_INACTIVE 0x00 92 - #define HRTIMER_STATE_ENQUEUED 0x01 93 57 94 58 /** 95 59 * struct hrtimer_sleeper - simple sleeper structure ··· 128 146 return ktime_sub(timer->node.expires, hrtimer_cb_get_time(timer)); 129 147 } 130 148 131 - static inline int hrtimer_is_hres_active(struct hrtimer *timer) 132 - { 133 - return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ? 134 - timer->base->cpu_base->hres_active : 0; 135 - } 136 - 137 149 #ifdef CONFIG_HIGH_RES_TIMERS 150 + extern unsigned int hrtimer_resolution; 138 151 struct clock_event_device; 139 152 140 153 extern void hrtimer_interrupt(struct clock_event_device *dev); 141 154 142 - extern unsigned int hrtimer_resolution; 155 + extern struct static_key_false hrtimer_highres_enabled_key; 143 156 144 - #else 157 + static inline bool hrtimer_highres_enabled(void) 158 + { 159 + return static_branch_likely(&hrtimer_highres_enabled_key); 160 + } 145 161 162 + #else /* CONFIG_HIGH_RES_TIMERS */ 146 163 #define hrtimer_resolution (unsigned int)LOW_RES_NSEC 147 - 148 - #endif 164 + static inline bool hrtimer_highres_enabled(void) { return false; } 165 + #endif /* !CONFIG_HIGH_RES_TIMERS */ 149 166 150 167 static inline ktime_t 151 168 __hrtimer_expires_remaining_adjusted(const struct hrtimer *timer, ktime_t now) ··· 274 293 */ 275 294 static inline bool hrtimer_is_queued(struct hrtimer *timer) 276 295 { 277 - /* The READ_ONCE pairs with the update functions of timer->state */ 278 - return !!(READ_ONCE(timer->state) & HRTIMER_STATE_ENQUEUED); 296 + /* The READ_ONCE pairs with the update functions of timer->is_queued */ 297 + return READ_ONCE(timer->is_queued); 279 298 } 280 299 281 300 /*

+41 -38

include/linux/hrtimer_defs.h

··· 19 19 * timer to a base on another cpu. 20 20 * @clockid: clock id for per_cpu support 21 21 * @seq: seqcount around __run_hrtimer 22 + * @expires_next: Absolute time of the next event in this clock base 22 23 * @running: pointer to the currently running hrtimer 23 24 * @active: red black tree root node for the active timers 24 25 * @offset: offset of this clock to the monotonic base 25 26 */ 26 27 struct hrtimer_clock_base { 27 - struct hrtimer_cpu_base *cpu_base; 28 - unsigned int index; 29 - clockid_t clockid; 30 - seqcount_raw_spinlock_t seq; 31 - struct hrtimer *running; 32 - struct timerqueue_head active; 33 - ktime_t offset; 28 + struct hrtimer_cpu_base *cpu_base; 29 + unsigned int index; 30 + clockid_t clockid; 31 + seqcount_raw_spinlock_t seq; 32 + ktime_t expires_next; 33 + struct hrtimer *running; 34 + struct timerqueue_linked_head active; 35 + ktime_t offset; 34 36 } __hrtimer_clock_base_align; 35 37 36 38 enum hrtimer_base_type { ··· 49 47 50 48 /** 51 49 * struct hrtimer_cpu_base - the per cpu clock bases 52 - * @lock: lock protecting the base and associated clock bases 53 - * and timers 54 - * @cpu: cpu number 55 - * @active_bases: Bitfield to mark bases with active timers 56 - * @clock_was_set_seq: Sequence counter of clock was set events 57 - * @hres_active: State of high resolution mode 58 - * @in_hrtirq: hrtimer_interrupt() is currently executing 59 - * @hang_detected: The last hrtimer interrupt detected a hang 60 - * @softirq_activated: displays, if the softirq is raised - update of softirq 61 - * related settings is not required then. 
62 - * @nr_events: Total number of hrtimer interrupt events 63 - * @nr_retries: Total number of hrtimer interrupt retries 64 - * @nr_hangs: Total number of hrtimer interrupt hangs 65 - * @max_hang_time: Maximum time spent in hrtimer_interrupt 66 - * @softirq_expiry_lock: Lock which is taken while softirq based hrtimer are 67 - * expired 68 - * @online: CPU is online from an hrtimers point of view 69 - * @timer_waiters: A hrtimer_cancel() invocation waits for the timer 70 - * callback to finish. 71 - * @expires_next: absolute time of the next event, is required for remote 72 - * hrtimer enqueue; it is the total first expiry time (hard 73 - * and soft hrtimer are taken into account) 74 - * @next_timer: Pointer to the first expiring timer 75 - * @softirq_expires_next: Time to check, if soft queues needs also to be expired 76 - * @softirq_next_timer: Pointer to the first expiring softirq based timer 77 - * @clock_base: array of clock bases for this cpu 50 + * @lock: lock protecting the base and associated clock bases and timers 51 + * @cpu: cpu number 52 + * @active_bases: Bitfield to mark bases with active timers 53 + * @clock_was_set_seq: Sequence counter of clock was set events 54 + * @hres_active: State of high resolution mode 55 + * @deferred_rearm: A deferred rearm is pending 56 + * @deferred_needs_update: The deferred rearm must re-evaluate the first timer 57 + * @hang_detected: The last hrtimer interrupt detected a hang 58 + * @softirq_activated: displays, if the softirq is raised - update of softirq 59 + * related settings is not required then. 
60 + * @nr_events: Total number of hrtimer interrupt events 61 + * @nr_retries: Total number of hrtimer interrupt retries 62 + * @nr_hangs: Total number of hrtimer interrupt hangs 63 + * @max_hang_time: Maximum time spent in hrtimer_interrupt 64 + * @softirq_expiry_lock: Lock which is taken while softirq based hrtimer are expired 65 + * @online: CPU is online from an hrtimers point of view 66 + * @timer_waiters: A hrtimer_cancel() invocation waits for the timer callback to finish. 67 + * @expires_next: Absolute time of the next event, is required for remote 68 + * hrtimer enqueue; it is the total first expiry time (hard 69 + * and soft hrtimer are taken into account) 70 + * @next_timer: Pointer to the first expiring timer 71 + * @softirq_expires_next: Time to check, if soft queues needs also to be expired 72 + * @softirq_next_timer: Pointer to the first expiring softirq based timer 73 + * @deferred_expires_next: Cached expires next value for deferred rearm 74 + * @clock_base: Array of clock bases for this cpu 78 75 * 79 76 * Note: next_timer is just an optimization for __remove_hrtimer(). 80 77 * Do not dereference the pointer because it is not reliable on ··· 84 83 unsigned int cpu; 85 84 unsigned int active_bases; 86 85 unsigned int clock_was_set_seq; 87 - unsigned int hres_active : 1, 88 - in_hrtirq : 1, 89 - hang_detected : 1, 90 - softirq_activated : 1, 91 - online : 1; 86 + bool hres_active; 87 + bool deferred_rearm; 88 + bool deferred_needs_update; 89 + bool hang_detected; 90 + bool softirq_activated; 91 + bool online; 92 92 #ifdef CONFIG_HIGH_RES_TIMERS 93 93 unsigned int nr_events; 94 94 unsigned short nr_retries; ··· 104 102 struct hrtimer *next_timer; 105 103 ktime_t softirq_expires_next; 106 104 struct hrtimer *softirq_next_timer; 105 + ktime_t deferred_expires_next; 107 106 struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; 108 107 call_single_data_t csd; 109 108 } ____cacheline_aligned;

+83

include/linux/hrtimer_rearm.h

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #ifndef _LINUX_HRTIMER_REARM_H 3 + #define _LINUX_HRTIMER_REARM_H 4 + 5 + #ifdef CONFIG_HRTIMER_REARM_DEFERRED 6 + #include <linux/thread_info.h> 7 + 8 + void __hrtimer_rearm_deferred(void); 9 + 10 + /* 11 + * This is purely CPU local, so check the TIF bit first to avoid the overhead of 12 + * the atomic test_and_clear_bit() operation for the common case where the bit 13 + * is not set. 14 + */ 15 + static __always_inline bool hrtimer_test_and_clear_rearm_deferred_tif(unsigned long tif_work) 16 + { 17 + lockdep_assert_irqs_disabled(); 18 + 19 + if (unlikely(tif_work & _TIF_HRTIMER_REARM)) { 20 + clear_thread_flag(TIF_HRTIMER_REARM); 21 + return true; 22 + } 23 + return false; 24 + } 25 + 26 + #define TIF_REARM_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | _TIF_HRTIMER_REARM) 27 + 28 + /* Invoked from the exit to user before invoking exit_to_user_mode_loop() */ 29 + static __always_inline bool 30 + hrtimer_rearm_deferred_user_irq(unsigned long *tif_work, const unsigned long tif_mask) 31 + { 32 + /* Help the compiler to optimize the function out for syscall returns */ 33 + if (!(tif_mask & _TIF_HRTIMER_REARM)) 34 + return false; 35 + /* 36 + * Rearm the timer if none of the resched flags is set before going into 37 + * the loop which re-enables interrupts. 
38 + */ 39 + if (unlikely((*tif_work & TIF_REARM_MASK) == _TIF_HRTIMER_REARM)) { 40 + clear_thread_flag(TIF_HRTIMER_REARM); 41 + __hrtimer_rearm_deferred(); 42 + /* Don't go into the loop if HRTIMER_REARM was the only flag */ 43 + *tif_work &= ~TIF_HRTIMER_REARM; 44 + return !*tif_work; 45 + } 46 + return false; 47 + } 48 + 49 + /* Invoked from the time slice extension decision function */ 50 + static __always_inline void hrtimer_rearm_deferred_tif(unsigned long tif_work) 51 + { 52 + if (hrtimer_test_and_clear_rearm_deferred_tif(tif_work)) 53 + __hrtimer_rearm_deferred(); 54 + } 55 + 56 + /* 57 + * This is to be called on all irqentry_exit() paths that will enable 58 + * interrupts. 59 + */ 60 + static __always_inline void hrtimer_rearm_deferred(void) 61 + { 62 + hrtimer_rearm_deferred_tif(read_thread_flags()); 63 + } 64 + 65 + /* 66 + * Invoked from the scheduler on entry to __schedule() so it can defer 67 + * rearming after the load balancing callbacks which might change hrtick. 68 + */ 69 + static __always_inline bool hrtimer_test_and_clear_rearm_deferred(void) 70 + { 71 + return hrtimer_test_and_clear_rearm_deferred_tif(read_thread_flags()); 72 + } 73 + 74 + #else /* CONFIG_HRTIMER_REARM_DEFERRED */ 75 + static __always_inline void __hrtimer_rearm_deferred(void) { } 76 + static __always_inline void hrtimer_rearm_deferred(void) { } 77 + static __always_inline void hrtimer_rearm_deferred_tif(unsigned long tif_work) { } 78 + static __always_inline bool 79 + hrtimer_rearm_deferred_user_irq(unsigned long *tif_work, const unsigned long tif_mask) { return false; } 80 + static __always_inline bool hrtimer_test_and_clear_rearm_deferred(void) { return false; } 81 + #endif /* !CONFIG_HRTIMER_REARM_DEFERRED */ 82 + 83 + #endif

+11 -8

include/linux/hrtimer_types.h

··· 17 17 18 18 /** 19 19 * struct hrtimer - the basic hrtimer structure 20 - * @node: timerqueue node, which also manages node.expires, 20 + * @node: Linked timerqueue node, which also manages node.expires, 21 21 * the absolute expiry time in the hrtimers internal 22 22 * representation. The time is related to the clock on 23 23 * which the timer is based. Is setup by adding ··· 28 28 * was armed. 29 29 * @function: timer expiry callback function 30 30 * @base: pointer to the timer base (per cpu and per clock) 31 - * @state: state information (See bit values above) 31 + * @is_queued: Indicates whether a timer is enqueued or not 32 32 * @is_rel: Set if the timer was armed relative 33 33 * @is_soft: Set if hrtimer will be expired in soft interrupt context. 34 34 * @is_hard: Set if hrtimer will be expired in hard interrupt context 35 35 * even on RT. 36 + * @is_lazy: Set if the timer is frequently rearmed to avoid updates 37 + * of the clock event device 36 38 * 37 39 * The hrtimer structure must be initialized by hrtimer_setup() 38 40 */ 39 41 struct hrtimer { 40 - struct timerqueue_node node; 42 + struct timerqueue_linked_node node; 43 + struct hrtimer_clock_base *base; 44 + bool is_queued; 45 + bool is_rel; 46 + bool is_soft; 47 + bool is_hard; 48 + bool is_lazy; 41 49 ktime_t _softexpires; 42 50 enum hrtimer_restart (*__private function)(struct hrtimer *); 43 - struct hrtimer_clock_base *base; 44 - u8 state; 45 - u8 is_rel; 46 - u8 is_soft; 47 - u8 is_hard; 48 51 }; 49 52 50 53 #endif /* _LINUX_HRTIMER_TYPES_H */

+19 -6

include/linux/irq-entry-common.h

··· 3 3 #define __LINUX_IRQENTRYCOMMON_H 4 4 5 5 #include <linux/context_tracking.h> 6 + #include <linux/hrtimer_rearm.h> 6 7 #include <linux/kmsan.h> 7 8 #include <linux/rseq_entry.h> 8 9 #include <linux/static_call_types.h> ··· 33 32 _TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \ 34 33 _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | _TIF_RSEQ | \ 35 34 ARCH_EXIT_TO_USER_MODE_WORK) 35 + 36 + #ifdef CONFIG_HRTIMER_REARM_DEFERRED 37 + # define EXIT_TO_USER_MODE_WORK_SYSCALL (EXIT_TO_USER_MODE_WORK) 38 + # define EXIT_TO_USER_MODE_WORK_IRQ (EXIT_TO_USER_MODE_WORK | _TIF_HRTIMER_REARM) 39 + #else 40 + # define EXIT_TO_USER_MODE_WORK_SYSCALL (EXIT_TO_USER_MODE_WORK) 41 + # define EXIT_TO_USER_MODE_WORK_IRQ (EXIT_TO_USER_MODE_WORK) 42 + #endif 36 43 37 44 /** 38 45 * arch_enter_from_user_mode - Architecture specific sanity check for user mode regs ··· 212 203 /** 213 204 * __exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required 214 205 * @regs: Pointer to pt_regs on entry stack 206 + * @work_mask: Which TIF bits need to be evaluated 215 207 * 216 208 * 1) check that interrupts are disabled 217 209 * 2) call tick_nohz_user_enter_prepare() ··· 222 212 * 223 213 * Don't invoke directly, use the syscall/irqentry_ prefixed variants below 224 214 */ 225 - static __always_inline void __exit_to_user_mode_prepare(struct pt_regs *regs) 215 + static __always_inline void __exit_to_user_mode_prepare(struct pt_regs *regs, 216 + const unsigned long work_mask) 226 217 { 227 218 unsigned long ti_work; 228 219 ··· 233 222 tick_nohz_user_enter_prepare(); 234 223 235 224 ti_work = read_thread_flags(); 236 - if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK)) 237 - ti_work = exit_to_user_mode_loop(regs, ti_work); 225 + if (unlikely(ti_work & work_mask)) { 226 + if (!hrtimer_rearm_deferred_user_irq(&ti_work, work_mask)) 227 + ti_work = exit_to_user_mode_loop(regs, ti_work); 228 + } 238 229 239 230 arch_exit_to_user_mode_prepare(regs, ti_work); 240 231 } ··· 252 239 /* Temporary 
workaround to keep ARM64 alive */ 253 240 static __always_inline void exit_to_user_mode_prepare_legacy(struct pt_regs *regs) 254 241 { 255 - __exit_to_user_mode_prepare(regs); 242 + __exit_to_user_mode_prepare(regs, EXIT_TO_USER_MODE_WORK); 256 243 rseq_exit_to_user_mode_legacy(); 257 244 __exit_to_user_mode_validate(); 258 245 } ··· 266 253 */ 267 254 static __always_inline void syscall_exit_to_user_mode_prepare(struct pt_regs *regs) 268 255 { 269 - __exit_to_user_mode_prepare(regs); 256 + __exit_to_user_mode_prepare(regs, EXIT_TO_USER_MODE_WORK_SYSCALL); 270 257 rseq_syscall_exit_to_user_mode(); 271 258 __exit_to_user_mode_validate(); 272 259 } ··· 280 267 */ 281 268 static __always_inline void irqentry_exit_to_user_mode_prepare(struct pt_regs *regs) 282 269 { 283 - __exit_to_user_mode_prepare(regs); 270 + __exit_to_user_mode_prepare(regs, EXIT_TO_USER_MODE_WORK_IRQ); 284 271 rseq_irqentry_exit_to_user_mode(); 285 272 __exit_to_user_mode_validate(); 286 273 }

+72 -9

include/linux/rbtree.h

··· 35 35 #define RB_CLEAR_NODE(node) \ 36 36 ((node)->__rb_parent_color = (unsigned long)(node)) 37 37 38 + #define RB_EMPTY_LINKED_NODE(lnode) RB_EMPTY_NODE(&(lnode)->node) 39 + #define RB_CLEAR_LINKED_NODE(lnode) ({ \ 40 + RB_CLEAR_NODE(&(lnode)->node); \ 41 + (lnode)->prev = (lnode)->next = NULL; \ 42 + }) 38 43 39 44 extern void rb_insert_color(struct rb_node *, struct rb_root *); 40 45 extern void rb_erase(struct rb_node *, struct rb_root *); 41 - 46 + extern bool rb_erase_linked(struct rb_node_linked *, struct rb_root_linked *); 42 47 43 48 /* Find logical next and previous nodes in a tree */ 44 49 extern struct rb_node *rb_next(const struct rb_node *); ··· 218 213 return leftmost ? node : NULL; 219 214 } 220 215 221 - /** 222 - * rb_add() - insert @node into @tree 223 - * @node: node to insert 224 - * @tree: tree to insert @node into 225 - * @less: operator defining the (partial) node order 226 - */ 227 216 static __always_inline void 228 - rb_add(struct rb_node *node, struct rb_root *tree, 229 - bool (*less)(struct rb_node *, const struct rb_node *)) 217 + __rb_add(struct rb_node *node, struct rb_root *tree, 218 + bool (*less)(struct rb_node *, const struct rb_node *), 219 + void (*linkop)(struct rb_node *, struct rb_node *, struct rb_node **)) 230 220 { 231 221 struct rb_node **link = &tree->rb_node; 232 222 struct rb_node *parent = NULL; ··· 234 234 link = &parent->rb_right; 235 235 } 236 236 237 + linkop(node, parent, link); 237 238 rb_link_node(node, parent, link); 238 239 rb_insert_color(node, tree); 240 + } 241 + 242 + #define __node_2_linked_node(_n) \ 243 + rb_entry((_n), struct rb_node_linked, node) 244 + 245 + static inline void 246 + rb_link_linked_node(struct rb_node *node, struct rb_node *parent, struct rb_node **link) 247 + { 248 + if (!parent) 249 + return; 250 + 251 + struct rb_node_linked *nnew = __node_2_linked_node(node); 252 + struct rb_node_linked *npar = __node_2_linked_node(parent); 253 + 254 + if (link == &parent->rb_left) { 255 + 
nnew->prev = npar->prev; 256 + nnew->next = npar; 257 + npar->prev = nnew; 258 + if (nnew->prev) 259 + nnew->prev->next = nnew; 260 + } else { 261 + nnew->next = npar->next; 262 + nnew->prev = npar; 263 + npar->next = nnew; 264 + if (nnew->next) 265 + nnew->next->prev = nnew; 266 + } 267 + } 268 + 269 + /** 270 + * rb_add_linked() - insert @node into the leftmost linked tree @tree 271 + * @node: node to insert 272 + * @tree: linked tree to insert @node into 273 + * @less: operator defining the (partial) node order 274 + * 275 + * Returns @true when @node is the new leftmost, @false otherwise. 276 + */ 277 + static __always_inline bool 278 + rb_add_linked(struct rb_node_linked *node, struct rb_root_linked *tree, 279 + bool (*less)(struct rb_node *, const struct rb_node *)) 280 + { 281 + __rb_add(&node->node, &tree->rb_root, less, rb_link_linked_node); 282 + if (!node->prev) 283 + tree->rb_leftmost = node; 284 + return !node->prev; 285 + } 286 + 287 + /* Empty linkop function which is optimized away by the compiler */ 288 + static __always_inline void 289 + rb_link_noop(struct rb_node *n, struct rb_node *p, struct rb_node **l) { } 290 + 291 + /** 292 + * rb_add() - insert @node into @tree 293 + * @node: node to insert 294 + * @tree: tree to insert @node into 295 + * @less: operator defining the (partial) node order 296 + */ 297 + static __always_inline void 298 + rb_add(struct rb_node *node, struct rb_root *tree, 299 + bool (*less)(struct rb_node *, const struct rb_node *)) 300 + { 301 + __rb_add(node, tree, less, rb_link_noop); 239 302 } 240 303 241 304 /**

+16

include/linux/rbtree_types.h

··· 9 9 } __attribute__((aligned(sizeof(long)))); 10 10 /* The alignment might seem pointless, but allegedly CRIS needs it */ 11 11 12 + struct rb_node_linked { 13 + struct rb_node node; 14 + struct rb_node_linked *prev; 15 + struct rb_node_linked *next; 16 + }; 17 + 12 18 struct rb_root { 13 19 struct rb_node *rb_node; 14 20 }; ··· 34 28 struct rb_node *rb_leftmost; 35 29 }; 36 30 31 + /* 32 + * Leftmost tree with links. This would allow a trivial rb_rightmost update, 33 + * but that has been omitted due to the lack of users. 34 + */ 35 + struct rb_root_linked { 36 + struct rb_root rb_root; 37 + struct rb_node_linked *rb_leftmost; 38 + }; 39 + 37 40 #define RB_ROOT (struct rb_root) { NULL, } 38 41 #define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL } 42 + #define RB_ROOT_LINKED (struct rb_root_linked) { {NULL, }, NULL } 39 43 40 44 #endif

+13 -3

include/linux/rseq_entry.h

··· 40 40 #endif /* !CONFIG_RSEQ_STATS */ 41 41 42 42 #ifdef CONFIG_RSEQ 43 + #include <linux/hrtimer_rearm.h> 43 44 #include <linux/jump_label.h> 44 45 #include <linux/rseq.h> 45 46 #include <linux/sched/signal.h> ··· 111 110 t->rseq.slice.state.granted = false; 112 111 } 113 112 114 - static __always_inline bool rseq_grant_slice_extension(bool work_pending) 113 + static __always_inline bool __rseq_grant_slice_extension(bool work_pending) 115 114 { 116 115 struct task_struct *curr = current; 117 116 struct rseq_slice_ctrl usr_ctrl; ··· 216 215 return false; 217 216 } 218 217 218 + static __always_inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask) 219 + { 220 + if (unlikely(__rseq_grant_slice_extension(ti_work & mask))) { 221 + hrtimer_rearm_deferred_tif(ti_work); 222 + return true; 223 + } 224 + return false; 225 + } 226 + 219 227 #else /* CONFIG_RSEQ_SLICE_EXTENSION */ 220 228 static inline bool rseq_slice_extension_enabled(void) { return false; } 221 229 static inline bool rseq_arm_slice_extension_timer(void) { return false; } 222 230 static inline void rseq_slice_clear_grant(struct task_struct *t) { } 223 - static inline bool rseq_grant_slice_extension(bool work_pending) { return false; } 231 + static inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask) { return false; } 224 232 #endif /* !CONFIG_RSEQ_SLICE_EXTENSION */ 225 233 226 234 bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr); ··· 788 778 static inline void rseq_irqentry_exit_to_user_mode(void) { } 789 779 static inline void rseq_exit_to_user_mode_legacy(void) { } 790 780 static inline void rseq_debug_syscall_return(struct pt_regs *regs) { } 791 - static inline bool rseq_grant_slice_extension(bool work_pending) { return false; } 781 + static inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask) { return false; } 792 782 #endif /* !CONFIG_RSEQ */ 793 783 794 784 #endif 
/* _LINUX_RSEQ_ENTRY_H */

+8

include/linux/timekeeper_internal.h

··· 72 72 * @id: The timekeeper ID 73 73 * @tkr_raw: The readout base structure for CLOCK_MONOTONIC_RAW 74 74 * @raw_sec: CLOCK_MONOTONIC_RAW time in seconds 75 + * @cs_id: The ID of the current clocksource 76 + * @cs_ns_to_cyc_mult: Multiplier for nanoseconds to cycles conversion 77 + * @cs_ns_to_cyc_shift: Shift value for nanoseconds to cycles conversion 78 + * @cs_ns_to_cyc_maxns: Maximum nanoseconds to cycles conversion range 75 79 * @clock_was_set_seq: The sequence number of clock was set events 76 80 * @cs_was_changed_seq: The sequence number of clocksource change events 77 81 * @clock_valid: Indicator for valid clock ··· 163 159 u64 raw_sec; 164 160 165 161 /* Cacheline 3 and 4 (timekeeping internal variables): */ 162 + enum clocksource_ids cs_id; 163 + u32 cs_ns_to_cyc_mult; 164 + u32 cs_ns_to_cyc_shift; 165 + u64 cs_ns_to_cyc_maxns; 166 166 unsigned int clock_was_set_seq; 167 167 u8 cs_was_changed_seq; 168 168 u8 clock_valid;

+48 -8

include/linux/timerqueue.h

··· 5 5 #include <linux/rbtree.h> 6 6 #include <linux/timerqueue_types.h> 7 7 8 - extern bool timerqueue_add(struct timerqueue_head *head, 9 - struct timerqueue_node *node); 10 - extern bool timerqueue_del(struct timerqueue_head *head, 11 - struct timerqueue_node *node); 12 - extern struct timerqueue_node *timerqueue_iterate_next( 13 - struct timerqueue_node *node); 8 + bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node); 9 + bool timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node); 10 + struct timerqueue_node *timerqueue_iterate_next(struct timerqueue_node *node); 11 + 12 + bool timerqueue_linked_add(struct timerqueue_linked_head *head, struct timerqueue_linked_node *node); 14 13 15 14 /** 16 15 * timerqueue_getnext - Returns the timer with the earliest expiration time ··· 18 19 * 19 20 * Returns a pointer to the timer node that has the earliest expiration time. 20 21 */ 21 - static inline 22 - struct timerqueue_node *timerqueue_getnext(struct timerqueue_head *head) 22 + static inline struct timerqueue_node *timerqueue_getnext(struct timerqueue_head *head) 23 23 { 24 24 struct rb_node *leftmost = rb_first_cached(&head->rb_root); 25 25 ··· 39 41 { 40 42 head->rb_root = RB_ROOT_CACHED; 41 43 } 44 + 45 + /* Timer queues with linked nodes */ 46 + 47 + static __always_inline 48 + struct timerqueue_linked_node *timerqueue_linked_first(struct timerqueue_linked_head *head) 49 + { 50 + return rb_entry_safe(head->rb_root.rb_leftmost, struct timerqueue_linked_node, node); 51 + } 52 + 53 + static __always_inline 54 + struct timerqueue_linked_node *timerqueue_linked_next(struct timerqueue_linked_node *node) 55 + { 56 + return rb_entry_safe(node->node.next, struct timerqueue_linked_node, node); 57 + } 58 + 59 + static __always_inline 60 + struct timerqueue_linked_node *timerqueue_linked_prev(struct timerqueue_linked_node *node) 61 + { 62 + return rb_entry_safe(node->node.prev, struct timerqueue_linked_node, node); 63 + } 64 + 
65 + static __always_inline 66 + bool timerqueue_linked_del(struct timerqueue_linked_head *head, struct timerqueue_linked_node *node) 67 + { 68 + return rb_erase_linked(&node->node, &head->rb_root); 69 + } 70 + 71 + static __always_inline void timerqueue_linked_init(struct timerqueue_linked_node *node) 72 + { 73 + RB_CLEAR_LINKED_NODE(&node->node); 74 + } 75 + 76 + static __always_inline bool timerqueue_linked_node_queued(struct timerqueue_linked_node *node) 77 + { 78 + return !RB_EMPTY_LINKED_NODE(&node->node); 79 + } 80 + 81 + static __always_inline void timerqueue_linked_init_head(struct timerqueue_linked_head *head) 82 + { 83 + head->rb_root = RB_ROOT_LINKED; 84 + } 85 + 42 86 #endif /* _LINUX_TIMERQUEUE_H */

+12 -3

include/linux/timerqueue_types.h

··· 6 6 #include <linux/types.h> 7 7 8 8 struct timerqueue_node { 9 - struct rb_node node; 10 - ktime_t expires; 9 + struct rb_node node; 10 + ktime_t expires; 11 11 }; 12 12 13 13 struct timerqueue_head { 14 - struct rb_root_cached rb_root; 14 + struct rb_root_cached rb_root; 15 + }; 16 + 17 + struct timerqueue_linked_node { 18 + struct rb_node_linked node; 19 + ktime_t expires; 20 + }; 21 + 22 + struct timerqueue_linked_head { 23 + struct rb_root_linked rb_root; 15 24 }; 16 25 17 26 #endif /* _LINUX_TIMERQUEUE_TYPES_H */

+31 -4

include/trace/events/timer.h

··· 218 218 * hrtimer_start - called when the hrtimer is started 219 219 * @hrtimer: pointer to struct hrtimer 220 220 * @mode: the hrtimers mode 221 + * @was_armed: Was armed when hrtimer_start*() was invoked 221 222 */ 222 223 TRACE_EVENT(hrtimer_start, 223 224 224 - TP_PROTO(struct hrtimer *hrtimer, enum hrtimer_mode mode), 225 + TP_PROTO(struct hrtimer *hrtimer, enum hrtimer_mode mode, bool was_armed), 225 226 226 - TP_ARGS(hrtimer, mode), 227 + TP_ARGS(hrtimer, mode, was_armed), 227 228 228 229 TP_STRUCT__entry( 229 230 __field( void *, hrtimer ) ··· 232 231 __field( s64, expires ) 233 232 __field( s64, softexpires ) 234 233 __field( enum hrtimer_mode, mode ) 234 + __field( bool, was_armed ) 235 235 ), 236 236 237 237 TP_fast_assign( ··· 241 239 __entry->expires = hrtimer_get_expires(hrtimer); 242 240 __entry->softexpires = hrtimer_get_softexpires(hrtimer); 243 241 __entry->mode = mode; 242 + __entry->was_armed = was_armed; 244 243 ), 245 244 246 245 TP_printk("hrtimer=%p function=%ps expires=%llu softexpires=%llu " 247 - "mode=%s", __entry->hrtimer, __entry->function, 246 + "mode=%s was_armed=%d", __entry->hrtimer, __entry->function, 248 247 (unsigned long long) __entry->expires, 249 248 (unsigned long long) __entry->softexpires, 250 - decode_hrtimer_mode(__entry->mode)) 249 + decode_hrtimer_mode(__entry->mode), __entry->was_armed) 251 250 ); 252 251 253 252 /** ··· 322 319 TP_PROTO(struct hrtimer *hrtimer), 323 320 324 321 TP_ARGS(hrtimer) 322 + ); 323 + 324 + /** 325 + * hrtimer_rearm - Invoked when the clockevent device is rearmed 326 + * @next_event: The next expiry time (CLOCK_MONOTONIC) 327 + */ 328 + TRACE_EVENT(hrtimer_rearm, 329 + 330 + TP_PROTO(ktime_t next_event, bool deferred), 331 + 332 + TP_ARGS(next_event, deferred), 333 + 334 + TP_STRUCT__entry( 335 + __field( s64, next_event ) 336 + __field( bool, deferred ) 337 + ), 338 + 339 + TP_fast_assign( 340 + __entry->next_event = next_event; 341 + __entry->deferred = deferred; 342 + ), 343 + 344 + 
TP_printk("next_event=%llu deferred=%d", 345 + (unsigned long long) __entry->next_event, __entry->deferred) 325 346 ); 326 347 327 348 /**

+3 -1

kernel/entry/common.c

··· 50 50 local_irq_enable_exit_to_user(ti_work); 51 51 52 52 if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) { 53 - if (!rseq_grant_slice_extension(ti_work & TIF_SLICE_EXT_DENY)) 53 + if (!rseq_grant_slice_extension(ti_work, TIF_SLICE_EXT_DENY)) 54 54 schedule(); 55 55 } 56 56 ··· 225 225 */ 226 226 if (state.exit_rcu) { 227 227 instrumentation_begin(); 228 + hrtimer_rearm_deferred(); 228 229 /* Tell the tracer that IRET will enable interrupts */ 229 230 trace_hardirqs_on_prepare(); 230 231 lockdep_hardirqs_on_prepare(); ··· 239 238 if (IS_ENABLED(CONFIG_PREEMPTION)) 240 239 irqentry_exit_cond_resched(); 241 240 241 + hrtimer_rearm_deferred(); 242 242 /* Covers both tracing and lockdep */ 243 243 trace_hardirqs_on(); 244 244 instrumentation_end();

+77 -18

kernel/sched/core.c

··· 872 872 * Use HR-timers to deliver accurate preemption points. 873 873 */ 874 874 875 - static void hrtick_clear(struct rq *rq) 875 + enum { 876 + HRTICK_SCHED_NONE = 0, 877 + HRTICK_SCHED_DEFER = BIT(1), 878 + HRTICK_SCHED_START = BIT(2), 879 + HRTICK_SCHED_REARM_HRTIMER = BIT(3) 880 + }; 881 + 882 + static void __used hrtick_clear(struct rq *rq) 876 883 { 877 884 if (hrtimer_active(&rq->hrtick_timer)) 878 885 hrtimer_cancel(&rq->hrtick_timer); ··· 904 897 return HRTIMER_NORESTART; 905 898 } 906 899 907 - static void __hrtick_restart(struct rq *rq) 900 + static inline bool hrtick_needs_rearm(struct hrtimer *timer, ktime_t expires) 901 + { 902 + /* 903 + * Queued is false when the timer is not started or currently 904 + * running the callback. In both cases, restart. If queued check 905 + * whether the expiry time actually changes substantially. 906 + */ 907 + return !hrtimer_is_queued(timer) || 908 + abs(expires - hrtimer_get_expires(timer)) > 5000; 909 + } 910 + 911 + static void hrtick_cond_restart(struct rq *rq) 908 912 { 909 913 struct hrtimer *timer = &rq->hrtick_timer; 910 914 ktime_t time = rq->hrtick_time; 911 915 912 - hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD); 916 + if (hrtick_needs_rearm(timer, time)) 917 + hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD); 913 918 } 914 919 915 920 /* ··· 933 914 struct rq_flags rf; 934 915 935 916 rq_lock(rq, &rf); 936 - __hrtick_restart(rq); 917 + hrtick_cond_restart(rq); 937 918 rq_unlock(rq, &rf); 938 919 } 939 920 ··· 944 925 */ 945 926 void hrtick_start(struct rq *rq, u64 delay) 946 927 { 947 - struct hrtimer *timer = &rq->hrtick_timer; 948 928 s64 delta; 949 929 950 930 /* ··· 951 933 * doesn't make sense and can cause timer DoS. 
952 934 */ 953 935 delta = max_t(s64, delay, 10000LL); 954 - rq->hrtick_time = ktime_add_ns(hrtimer_cb_get_time(timer), delta); 936 + 937 + /* 938 + * If this is in the middle of schedule() only note the delay 939 + * and let hrtick_schedule_exit() deal with it. 940 + */ 941 + if (rq->hrtick_sched) { 942 + rq->hrtick_sched |= HRTICK_SCHED_START; 943 + rq->hrtick_delay = delta; 944 + return; 945 + } 946 + 947 + rq->hrtick_time = ktime_add_ns(ktime_get(), delta); 948 + if (!hrtick_needs_rearm(&rq->hrtick_timer, rq->hrtick_time)) 949 + return; 955 950 956 951 if (rq == this_rq()) 957 - __hrtick_restart(rq); 952 + hrtimer_start(&rq->hrtick_timer, rq->hrtick_time, HRTIMER_MODE_ABS_PINNED_HARD); 958 953 else 959 954 smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); 955 + } 956 + 957 + static inline void hrtick_schedule_enter(struct rq *rq) 958 + { 959 + rq->hrtick_sched = HRTICK_SCHED_DEFER; 960 + if (hrtimer_test_and_clear_rearm_deferred()) 961 + rq->hrtick_sched |= HRTICK_SCHED_REARM_HRTIMER; 962 + } 963 + 964 + static inline void hrtick_schedule_exit(struct rq *rq) 965 + { 966 + if (rq->hrtick_sched & HRTICK_SCHED_START) { 967 + rq->hrtick_time = ktime_add_ns(ktime_get(), rq->hrtick_delay); 968 + hrtick_cond_restart(rq); 969 + } else if (idle_rq(rq)) { 970 + /* 971 + * No need for using hrtimer_active(). The timer is CPU local 972 + * and interrupts are disabled, so the callback cannot be 973 + * running and the queued state is valid. 
974 + */ 975 + if (hrtimer_is_queued(&rq->hrtick_timer)) 976 + hrtimer_cancel(&rq->hrtick_timer); 977 + } 978 + 979 + if (rq->hrtick_sched & HRTICK_SCHED_REARM_HRTIMER) 980 + __hrtimer_rearm_deferred(); 981 + 982 + rq->hrtick_sched = HRTICK_SCHED_NONE; 960 983 } 961 984 962 985 static void hrtick_rq_init(struct rq *rq) 963 986 { 964 987 INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq); 965 - hrtimer_setup(&rq->hrtick_timer, hrtick, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); 988 + rq->hrtick_sched = HRTICK_SCHED_NONE; 989 + hrtimer_setup(&rq->hrtick_timer, hrtick, CLOCK_MONOTONIC, 990 + HRTIMER_MODE_REL_HARD | HRTIMER_MODE_LAZY_REARM); 966 991 } 967 992 #else /* !CONFIG_SCHED_HRTICK: */ 968 - static inline void hrtick_clear(struct rq *rq) 969 - { 970 - } 971 - 972 - static inline void hrtick_rq_init(struct rq *rq) 973 - { 974 - } 993 + static inline void hrtick_clear(struct rq *rq) { } 994 + static inline void hrtick_rq_init(struct rq *rq) { } 995 + static inline void hrtick_schedule_enter(struct rq *rq) { } 996 + static inline void hrtick_schedule_exit(struct rq *rq) { } 975 997 #endif /* !CONFIG_SCHED_HRTICK */ 976 998 977 999 /* ··· 5087 5029 */ 5088 5030 spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_); 5089 5031 __balance_callbacks(rq, NULL); 5032 + hrtick_schedule_exit(rq); 5090 5033 raw_spin_rq_unlock_irq(rq); 5091 5034 } 5092 5035 ··· 6841 6782 6842 6783 schedule_debug(prev, preempt); 6843 6784 6844 - if (sched_feat(HRTICK) || sched_feat(HRTICK_DL)) 6845 - hrtick_clear(rq); 6846 - 6847 6785 klp_sched_try_switch(prev); 6848 6786 6849 6787 local_irq_disable(); ··· 6866 6810 */ 6867 6811 rq_lock(rq, &rf); 6868 6812 smp_mb__after_spinlock(); 6813 + 6814 + hrtick_schedule_enter(rq); 6869 6815 6870 6816 /* Promote REQ to ACT */ 6871 6817 rq->clock_update_flags <<= 1; ··· 6970 6912 6971 6913 rq_unpin_lock(rq, &rf); 6972 6914 __balance_callbacks(rq, NULL); 6915 + hrtick_schedule_exit(rq); 6973 6916 raw_spin_rq_unlock_irq(rq); 6974 6917 } 6975 6918 
trace_sched_exit_tp(is_switch);

+1 -1

kernel/sched/deadline.c

··· 1097 1097 act = ns_to_ktime(dl_next_period(dl_se)); 1098 1098 } 1099 1099 1100 - now = hrtimer_cb_get_time(timer); 1100 + now = ktime_get(); 1101 1101 delta = ktime_to_ns(now) - rq_clock(rq); 1102 1102 act = ktime_add_ns(act, delta); 1103 1103

+32 -23

kernel/sched/fair.c

··· 5530 5530 * validating it and just reschedule. 5531 5531 */ 5532 5532 if (queued) { 5533 - resched_curr_lazy(rq_of(cfs_rq)); 5533 + resched_curr(rq_of(cfs_rq)); 5534 5534 return; 5535 5535 } 5536 5536 #endif ··· 6735 6735 static void hrtick_start_fair(struct rq *rq, struct task_struct *p) 6736 6736 { 6737 6737 struct sched_entity *se = &p->se; 6738 + unsigned long scale = 1024; 6739 + unsigned long util = 0; 6740 + u64 vdelta; 6741 + u64 delta; 6738 6742 6739 6743 WARN_ON_ONCE(task_rq(p) != rq); 6740 6744 6741 - if (rq->cfs.h_nr_queued > 1) { 6742 - u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; 6743 - u64 slice = se->slice; 6744 - s64 delta = slice - ran; 6745 + if (rq->cfs.h_nr_queued <= 1) 6746 + return; 6745 6747 6746 - if (delta < 0) { 6747 - if (task_current_donor(rq, p)) 6748 - resched_curr(rq); 6749 - return; 6750 - } 6751 - hrtick_start(rq, delta); 6748 + /* 6749 + * Compute time until virtual deadline 6750 + */ 6751 + vdelta = se->deadline - se->vruntime; 6752 + if ((s64)vdelta < 0) { 6753 + if (task_current_donor(rq, p)) 6754 + resched_curr(rq); 6755 + return; 6752 6756 } 6757 + delta = (se->load.weight * vdelta) / NICE_0_LOAD; 6758 + 6759 + /* 6760 + * Correct for instantaneous load of other classes. 6761 + */ 6762 + util += cpu_util_irq(rq); 6763 + if (util && util < 1024) { 6764 + scale *= 1024; 6765 + scale /= (1024 - util); 6766 + } 6767 + 6768 + hrtick_start(rq, (scale * delta) / 1024); 6753 6769 } 6754 6770 6755 6771 /* 6756 - * called from enqueue/dequeue and updates the hrtick when the 6757 - * current task is from our class and nr_running is low enough 6758 - * to matter. 6772 + * Called on enqueue to start the hrtick when h_nr_queued becomes more than 1. 
6759 6773 */ 6760 6774 static void hrtick_update(struct rq *rq) 6761 6775 { 6762 6776 struct task_struct *donor = rq->donor; 6763 6777 6764 6778 if (!hrtick_enabled_fair(rq) || donor->sched_class != &fair_sched_class) 6779 + return; 6780 + 6781 + if (hrtick_active(rq)) 6765 6782 return; 6766 6783 6767 6784 hrtick_start_fair(rq, donor); ··· 7103 7086 WARN_ON_ONCE(!task_sleep); 7104 7087 WARN_ON_ONCE(p->on_rq != 1); 7105 7088 7106 - /* Fix-up what dequeue_task_fair() skipped */ 7107 - hrtick_update(rq); 7108 - 7109 7089 /* 7110 7090 * Fix-up what block_task() skipped. 7111 7091 * ··· 7136 7122 /* 7137 7123 * Must not reference @p after dequeue_entities(DEQUEUE_DELAYED). 7138 7124 */ 7139 - 7140 - hrtick_update(rq); 7141 7125 return true; 7142 7126 } 7143 7127 ··· 13377 13365 entity_tick(cfs_rq, se, queued); 13378 13366 } 13379 13367 13380 - if (queued) { 13381 - if (!need_resched()) 13382 - hrtick_start_fair(rq, curr); 13368 + if (queued) 13383 13369 return; 13384 - } 13385 13370 13386 13371 if (static_branch_unlikely(&sched_numa_balancing)) 13387 13372 task_tick_numa(rq, curr);

+5

kernel/sched/features.h

··· 63 63 */ 64 64 SCHED_FEAT(WAKEUP_PREEMPTION, true) 65 65 66 + #ifdef CONFIG_HRTIMER_REARM_DEFERRED 67 + SCHED_FEAT(HRTICK, true) 68 + SCHED_FEAT(HRTICK_DL, true) 69 + #else 66 70 SCHED_FEAT(HRTICK, false) 67 71 SCHED_FEAT(HRTICK_DL, false) 72 + #endif 68 73 69 74 /* 70 75 * Decrement CPU capacity based on time not spent running tasks

+15 -28

kernel/sched/sched.h

··· 1285 1285 call_single_data_t hrtick_csd; 1286 1286 struct hrtimer hrtick_timer; 1287 1287 ktime_t hrtick_time; 1288 + ktime_t hrtick_delay; 1289 + unsigned int hrtick_sched; 1288 1290 #endif 1289 1291 1290 1292 #ifdef CONFIG_SCHEDSTATS ··· 3021 3019 * - enabled by features 3022 3020 * - hrtimer is actually high res 3023 3021 */ 3024 - static inline int hrtick_enabled(struct rq *rq) 3022 + static inline bool hrtick_enabled(struct rq *rq) 3025 3023 { 3026 - if (!cpu_active(cpu_of(rq))) 3027 - return 0; 3028 - return hrtimer_is_hres_active(&rq->hrtick_timer); 3024 + return cpu_active(cpu_of(rq)) && hrtimer_highres_enabled(); 3029 3025 } 3030 3026 3031 - static inline int hrtick_enabled_fair(struct rq *rq) 3027 + static inline bool hrtick_enabled_fair(struct rq *rq) 3032 3028 { 3033 - if (!sched_feat(HRTICK)) 3034 - return 0; 3035 - return hrtick_enabled(rq); 3029 + return sched_feat(HRTICK) && hrtick_enabled(rq); 3036 3030 } 3037 3031 3038 - static inline int hrtick_enabled_dl(struct rq *rq) 3032 + static inline bool hrtick_enabled_dl(struct rq *rq) 3039 3033 { 3040 - if (!sched_feat(HRTICK_DL)) 3041 - return 0; 3042 - return hrtick_enabled(rq); 3034 + return sched_feat(HRTICK_DL) && hrtick_enabled(rq); 3043 3035 } 3044 3036 3045 3037 extern void hrtick_start(struct rq *rq, u64 delay); 3038 + static inline bool hrtick_active(struct rq *rq) 3039 + { 3040 + return hrtimer_active(&rq->hrtick_timer); 3041 + } 3046 3042 3047 3043 #else /* !CONFIG_SCHED_HRTICK: */ 3048 - 3049 - static inline int hrtick_enabled_fair(struct rq *rq) 3050 - { 3051 - return 0; 3052 - } 3053 - 3054 - static inline int hrtick_enabled_dl(struct rq *rq) 3055 - { 3056 - return 0; 3057 - } 3058 - 3059 - static inline int hrtick_enabled(struct rq *rq) 3060 - { 3061 - return 0; 3062 - } 3063 - 3044 + static inline bool hrtick_enabled_fair(struct rq *rq) { return false; } 3045 + static inline bool hrtick_enabled_dl(struct rq *rq) { return false; } 3046 + static inline bool hrtick_enabled(struct rq 
*rq) { return false; } 3064 3047 #endif /* !CONFIG_SCHED_HRTICK */ 3065 3048 3066 3049 #ifndef arch_scale_freq_tick

+14 -1

kernel/softirq.c

··· 663 663 { 664 664 __irq_enter_raw(); 665 665 666 + /* 667 + * If this is a nested interrupt that hits the exit_to_user_mode_loop 668 + * where it has enabled interrupts but before it has hit schedule() we 669 + * could have hrtimers in an undefined state. Fix it up here. 670 + */ 671 + hrtimer_rearm_deferred(); 672 + 666 673 if (tick_nohz_full_cpu(smp_processor_id()) || 667 674 (is_idle_task(current) && (irq_count() == HARDIRQ_OFFSET))) 668 675 tick_irq_enter(); ··· 726 719 #endif 727 720 account_hardirq_exit(current); 728 721 preempt_count_sub(HARDIRQ_OFFSET); 729 - if (!in_interrupt() && local_softirq_pending()) 722 + if (!in_interrupt() && local_softirq_pending()) { 723 + /* 724 + * If we left hrtimers unarmed, make sure to arm them now, 725 + * before enabling interrupts to run SoftIRQ. 726 + */ 727 + hrtimer_rearm_deferred(); 730 728 invoke_softirq(); 729 + } 731 730 732 731 if (IS_ENABLED(CONFIG_IRQ_FORCED_THREADING) && force_irqthreads() && 733 732 local_timers_pending_force_th() && !(in_nmi() | in_hardirq()))

+16

kernel/time/Kconfig

··· 17 17 config ARCH_CLOCKSOURCE_INIT 18 18 bool 19 19 20 + config ARCH_WANTS_CLOCKSOURCE_READ_INLINE 21 + bool 22 + 20 23 # Timekeeping vsyscall support 21 24 config GENERIC_TIME_VSYSCALL 22 25 bool ··· 47 44 config GENERIC_CLOCKEVENTS_MIN_ADJUST 48 45 bool 49 46 47 + config GENERIC_CLOCKEVENTS_COUPLED 48 + bool 49 + 50 + config GENERIC_CLOCKEVENTS_COUPLED_INLINE 51 + select GENERIC_CLOCKEVENTS_COUPLED 52 + bool 53 + 50 54 # Generic update of CMOS clock 51 55 config GENERIC_CMOS_UPDATE 52 56 bool 57 + 58 + # Deferred rearming of the hrtimer interrupt 59 + config HRTIMER_REARM_DEFERRED 60 + def_bool y 61 + depends on GENERIC_ENTRY && HAVE_GENERIC_TIF_BITS 62 + depends on HIGH_RES_TIMERS && SCHED_HRTICK 53 63 54 64 # Select to handle posix CPU timers from task_work 55 65 # and not from the timer interrupt context

+41 -7

kernel/time/clockevents.c

··· 292 292 293 293 #endif /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */ 294 294 295 + #ifdef CONFIG_GENERIC_CLOCKEVENTS_COUPLED 296 + #ifdef CONFIG_GENERIC_CLOCKEVENTS_COUPLED_INLINE 297 + #include <asm/clock_inlined.h> 298 + #else 299 + static __always_inline void 300 + arch_inlined_clockevent_set_next_coupled(u64 u64 cycles, struct clock_event_device *dev) { } 301 + #endif 302 + 303 + static inline bool clockevent_set_next_coupled(struct clock_event_device *dev, ktime_t expires) 304 + { 305 + u64 cycles; 306 + 307 + if (unlikely(!(dev->features & CLOCK_EVT_FEAT_CLOCKSOURCE_COUPLED))) 308 + return false; 309 + 310 + if (unlikely(!ktime_expiry_to_cycles(dev->cs_id, expires, &cycles))) 311 + return false; 312 + 313 + if (IS_ENABLED(CONFIG_GENERIC_CLOCKEVENTS_COUPLED_INLINE)) 314 + arch_inlined_clockevent_set_next_coupled(cycles, dev); 315 + else 316 + dev->set_next_coupled(cycles, dev); 317 + return true; 318 + } 319 + 320 + #else 321 + static inline bool clockevent_set_next_coupled(struct clock_event_device *dev, ktime_t expires) 322 + { 323 + return false; 324 + } 325 + #endif 326 + 295 327 /** 296 328 * clockevents_program_event - Reprogram the clock event device. 297 329 * @dev: device to program ··· 332 300 * 333 301 * Returns 0 on success, -ETIME when the event is in the past. 334 302 */ 335 - int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, 336 - bool force) 303 + int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, bool force) 337 304 { 338 - unsigned long long clc; 339 305 int64_t delta; 306 + u64 cycles; 340 307 int rc; 341 308 342 309 if (WARN_ON_ONCE(expires < 0)) ··· 350 319 WARN_ONCE(!clockevent_state_oneshot(dev), "Current state: %d\n", 351 320 clockevent_get_state(dev)); 352 321 353 - /* Shortcut for clockevent devices that can deal with ktime. 
*/ 354 - if (dev->features & CLOCK_EVT_FEAT_KTIME) 322 + /* ktime_t based reprogramming for the broadcast hrtimer device */ 323 + if (unlikely(dev->features & CLOCK_EVT_FEAT_HRTIMER)) 355 324 return dev->set_next_ktime(expires, dev); 325 + 326 + if (likely(clockevent_set_next_coupled(dev, expires))) 327 + return 0; 356 328 357 329 delta = ktime_to_ns(ktime_sub(expires, ktime_get())); 358 330 if (delta <= 0) ··· 364 330 delta = min(delta, (int64_t) dev->max_delta_ns); 365 331 delta = max(delta, (int64_t) dev->min_delta_ns); 366 332 367 - clc = ((unsigned long long) delta * dev->mult) >> dev->shift; 368 - rc = dev->set_next_event((unsigned long) clc, dev); 333 + cycles = ((u64)delta * dev->mult) >> dev->shift; 334 + rc = dev->set_next_event((unsigned long) cycles, dev); 369 335 370 336 return (rc && force) ? clockevents_program_min_delta(dev) : rc; 371 337 }

+7

kernel/time/clocksource.c

··· 1169 1169 1170 1170 clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, 1171 1171 NSEC_PER_SEC / scale, sec * scale); 1172 + 1173 + /* Update cs::freq_khz */ 1174 + cs->freq_khz = div_u64((u64)freq * scale, 1000); 1172 1175 } 1173 1176 1174 1177 /* ··· 1244 1241 1245 1242 if (WARN_ON_ONCE((unsigned int)cs->id >= CSID_MAX)) 1246 1243 cs->id = CSID_GENERIC; 1244 + 1245 + if (WARN_ON_ONCE(!freq && cs->flags & CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT)) 1246 + cs->flags &= ~CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT; 1247 + 1247 1248 if (cs->vdso_clock_mode < 0 || 1248 1249 cs->vdso_clock_mode >= VDSO_CLOCKMODE_MAX) { 1249 1250 pr_warn("clocksource %s registered with invalid VDSO mode %d. Disabling VDSO support.\n",

+646 -485

kernel/time/hrtimer.c

··· 50 50 #include "tick-internal.h" 51 51 52 52 /* 53 + * Constants to set the queued state of the timer (INACTIVE, ENQUEUED) 54 + * 55 + * The callback state is kept separate in the CPU base because having it in 56 + * the timer would require touching the timer after the callback, which 57 + * makes it impossible to free the timer from the callback function. 58 + * 59 + * Therefore we track the callback state in: 60 + * 61 + * timer->base->cpu_base->running == timer 62 + * 63 + * On SMP it is possible to have a "callback function running and enqueued" 64 + * status. It happens for example when a posix timer expired and the callback 65 + * queued a signal. Between dropping the lock which protects the posix timer 66 + * and reacquiring the base lock of the hrtimer, another CPU can deliver the 67 + * signal and rearm the timer. 68 + * 69 + * All state transitions are protected by cpu_base->lock. 70 + */ 71 + #define HRTIMER_STATE_INACTIVE false 72 + #define HRTIMER_STATE_ENQUEUED true 73 + 74 + /* 53 75 * The resolution of the clocks. The resolution value is returned in 54 76 * the clock_getres() system call to give application programmers an 55 77 * idea of the (in)accuracy of timers. Timer values are rounded up to ··· 99 77 * to reach a base using a clockid, hrtimer_clockid_to_base() 100 78 * is used to convert from clockid to the proper hrtimer_base_type. 
101 79 */ 80 + 81 + #define BASE_INIT(idx, cid) \ 82 + [idx] = { .index = idx, .clockid = cid } 83 + 102 84 DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = 103 85 { 104 86 .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock), 105 - .clock_base = 106 - { 107 - { 108 - .index = HRTIMER_BASE_MONOTONIC, 109 - .clockid = CLOCK_MONOTONIC, 110 - }, 111 - { 112 - .index = HRTIMER_BASE_REALTIME, 113 - .clockid = CLOCK_REALTIME, 114 - }, 115 - { 116 - .index = HRTIMER_BASE_BOOTTIME, 117 - .clockid = CLOCK_BOOTTIME, 118 - }, 119 - { 120 - .index = HRTIMER_BASE_TAI, 121 - .clockid = CLOCK_TAI, 122 - }, 123 - { 124 - .index = HRTIMER_BASE_MONOTONIC_SOFT, 125 - .clockid = CLOCK_MONOTONIC, 126 - }, 127 - { 128 - .index = HRTIMER_BASE_REALTIME_SOFT, 129 - .clockid = CLOCK_REALTIME, 130 - }, 131 - { 132 - .index = HRTIMER_BASE_BOOTTIME_SOFT, 133 - .clockid = CLOCK_BOOTTIME, 134 - }, 135 - { 136 - .index = HRTIMER_BASE_TAI_SOFT, 137 - .clockid = CLOCK_TAI, 138 - }, 87 + .clock_base = { 88 + BASE_INIT(HRTIMER_BASE_MONOTONIC, CLOCK_MONOTONIC), 89 + BASE_INIT(HRTIMER_BASE_REALTIME, CLOCK_REALTIME), 90 + BASE_INIT(HRTIMER_BASE_BOOTTIME, CLOCK_BOOTTIME), 91 + BASE_INIT(HRTIMER_BASE_TAI, CLOCK_TAI), 92 + BASE_INIT(HRTIMER_BASE_MONOTONIC_SOFT, CLOCK_MONOTONIC), 93 + BASE_INIT(HRTIMER_BASE_REALTIME_SOFT, CLOCK_REALTIME), 94 + BASE_INIT(HRTIMER_BASE_BOOTTIME_SOFT, CLOCK_BOOTTIME), 95 + BASE_INIT(HRTIMER_BASE_TAI_SOFT, CLOCK_TAI), 139 96 }, 140 97 .csd = CSD_INIT(retrigger_next_event, NULL) 141 98 }; ··· 127 126 return likely(base->online); 128 127 } 129 128 129 + #ifdef CONFIG_HIGH_RES_TIMERS 130 + DEFINE_STATIC_KEY_FALSE(hrtimer_highres_enabled_key); 131 + 132 + static void hrtimer_hres_workfn(struct work_struct *work) 133 + { 134 + static_branch_enable(&hrtimer_highres_enabled_key); 135 + } 136 + 137 + static DECLARE_WORK(hrtimer_hres_work, hrtimer_hres_workfn); 138 + 139 + static inline void hrtimer_schedule_hres_work(void) 140 + { 141 + if (!hrtimer_highres_enabled()) 142 + 
schedule_work(&hrtimer_hres_work); 143 + } 144 + #else 145 + static inline void hrtimer_schedule_hres_work(void) { } 146 + #endif 147 + 130 148 /* 131 149 * Functions and macros which are different for UP/SMP systems are kept in a 132 150 * single place 133 151 */ 134 152 #ifdef CONFIG_SMP 135 - 136 153 /* 137 154 * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base() 138 155 * such that hrtimer_callback_running() can unconditionally dereference 139 156 * timer->base->cpu_base 140 157 */ 141 158 static struct hrtimer_cpu_base migration_cpu_base = { 142 - .clock_base = { { 143 - .cpu_base = &migration_cpu_base, 144 - .seq = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq, 145 - &migration_cpu_base.lock), 146 - }, }, 159 + .clock_base = { 160 + [0] = { 161 + .cpu_base = &migration_cpu_base, 162 + .seq = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq, 163 + &migration_cpu_base.lock), 164 + }, 165 + }, 147 166 }; 148 167 149 168 #define migration_base migration_cpu_base.clock_base[0] ··· 180 159 * possible to set timer->base = &migration_base and drop the lock: the timer 181 160 * remains locked. 
182 161 */ 183 - static 184 - struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, 185 - unsigned long *flags) 162 + static struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, 163 + unsigned long *flags) 186 164 __acquires(&timer->base->lock) 187 165 { 188 - struct hrtimer_clock_base *base; 189 - 190 166 for (;;) { 191 - base = READ_ONCE(timer->base); 167 + struct hrtimer_clock_base *base = READ_ONCE(timer->base); 168 + 192 169 if (likely(base != &migration_base)) { 193 170 raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); 194 171 if (likely(base == timer->base)) ··· 239 220 return expires >= new_base->cpu_base->expires_next; 240 221 } 241 222 242 - static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned) 223 + static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, bool pinned) 243 224 { 244 225 if (!hrtimer_base_is_online(base)) { 245 226 int cpu = cpumask_any_and(cpu_online_mask, housekeeping_cpumask(HK_TYPE_TIMER)); ··· 267 248 * the timer callback is currently running. 268 249 */ 269 250 static inline struct hrtimer_clock_base * 270 - switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, 271 - int pinned) 251 + switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, bool pinned) 272 252 { 273 253 struct hrtimer_cpu_base *new_cpu_base, *this_cpu_base; 274 254 struct hrtimer_clock_base *new_base; ··· 280 262 281 263 if (base != new_base) { 282 264 /* 283 - * We are trying to move timer to new_base. 284 - * However we can't change timer's base while it is running, 285 - * so we keep it on the same CPU. No hassle vs. reprogramming 286 - * the event source in the high resolution case. The softirq 287 - * code will take care of this when the timer function has 288 - * completed. There is no conflict as we hold the lock until 289 - * the timer is enqueued. 265 + * We are trying to move timer to new_base. 
However we can't 266 + * change timer's base while it is running, so we keep it on 267 + * the same CPU. No hassle vs. reprogramming the event source 268 + * in the high resolution case. The remote CPU will take care 269 + * of this when the timer function has completed. There is no 270 + * conflict as we hold the lock until the timer is enqueued. 290 271 */ 291 272 if (unlikely(hrtimer_callback_running(timer))) 292 273 return base; ··· 295 278 raw_spin_unlock(&base->cpu_base->lock); 296 279 raw_spin_lock(&new_base->cpu_base->lock); 297 280 298 - if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, 299 - this_cpu_base)) { 281 + if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, this_cpu_base)) { 300 282 raw_spin_unlock(&new_base->cpu_base->lock); 301 283 raw_spin_lock(&base->cpu_base->lock); 302 284 new_cpu_base = this_cpu_base; ··· 314 298 315 299 #else /* CONFIG_SMP */ 316 300 317 - static inline struct hrtimer_clock_base * 318 - lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) 301 + static inline struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, 302 + unsigned long *flags) 319 303 __acquires(&timer->base->cpu_base->lock) 320 304 { 321 305 struct hrtimer_clock_base *base = timer->base; 322 306 323 307 raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); 324 - 325 308 return base; 326 309 } 327 310 ··· 437 422 } 438 423 } 439 424 425 + /* Stub timer callback for improperly used timers. 
*/ 426 + static enum hrtimer_restart stub_timer(struct hrtimer *unused) 427 + { 428 + WARN_ON_ONCE(1); 429 + return HRTIMER_NORESTART; 430 + } 431 + 432 + /* 433 + * hrtimer_fixup_assert_init is called when: 434 + * - an untracked/uninit-ed object is found 435 + */ 436 + static bool hrtimer_fixup_assert_init(void *addr, enum debug_obj_state state) 437 + { 438 + struct hrtimer *timer = addr; 439 + 440 + switch (state) { 441 + case ODEBUG_STATE_NOTAVAILABLE: 442 + hrtimer_setup(timer, stub_timer, CLOCK_MONOTONIC, 0); 443 + return true; 444 + default: 445 + return false; 446 + } 447 + } 448 + 440 449 static const struct debug_obj_descr hrtimer_debug_descr = { 441 - .name = "hrtimer", 442 - .debug_hint = hrtimer_debug_hint, 443 - .fixup_init = hrtimer_fixup_init, 444 - .fixup_activate = hrtimer_fixup_activate, 445 - .fixup_free = hrtimer_fixup_free, 450 + .name = "hrtimer", 451 + .debug_hint = hrtimer_debug_hint, 452 + .fixup_init = hrtimer_fixup_init, 453 + .fixup_activate = hrtimer_fixup_activate, 454 + .fixup_free = hrtimer_fixup_free, 455 + .fixup_assert_init = hrtimer_fixup_assert_init, 446 456 }; 447 457 448 458 static inline void debug_hrtimer_init(struct hrtimer *timer) ··· 480 440 debug_object_init_on_stack(timer, &hrtimer_debug_descr); 481 441 } 482 442 483 - static inline void debug_hrtimer_activate(struct hrtimer *timer, 484 - enum hrtimer_mode mode) 443 + static inline void debug_hrtimer_activate(struct hrtimer *timer, enum hrtimer_mode mode) 485 444 { 486 445 debug_object_activate(timer, &hrtimer_debug_descr); 487 446 } ··· 488 449 static inline void debug_hrtimer_deactivate(struct hrtimer *timer) 489 450 { 490 451 debug_object_deactivate(timer, &hrtimer_debug_descr); 452 + } 453 + 454 + static inline void debug_hrtimer_assert_init(struct hrtimer *timer) 455 + { 456 + debug_object_assert_init(timer, &hrtimer_debug_descr); 491 457 } 492 458 493 459 void destroy_hrtimer_on_stack(struct hrtimer *timer) ··· 505 461 506 462 static inline void 
debug_hrtimer_init(struct hrtimer *timer) { } 507 463 static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer) { } 508 - static inline void debug_hrtimer_activate(struct hrtimer *timer, 509 - enum hrtimer_mode mode) { } 464 + static inline void debug_hrtimer_activate(struct hrtimer *timer, enum hrtimer_mode mode) { } 510 465 static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } 466 + static inline void debug_hrtimer_assert_init(struct hrtimer *timer) { } 511 467 #endif 512 468 513 469 static inline void debug_setup(struct hrtimer *timer, clockid_t clockid, enum hrtimer_mode mode) ··· 523 479 trace_hrtimer_setup(timer, clockid, mode); 524 480 } 525 481 526 - static inline void debug_activate(struct hrtimer *timer, 527 - enum hrtimer_mode mode) 482 + static inline void debug_activate(struct hrtimer *timer, enum hrtimer_mode mode, bool was_armed) 528 483 { 529 484 debug_hrtimer_activate(timer, mode); 530 - trace_hrtimer_start(timer, mode); 485 + trace_hrtimer_start(timer, mode, was_armed); 531 486 } 532 487 533 - static inline void debug_deactivate(struct hrtimer *timer) 488 + #define for_each_active_base(base, cpu_base, active) \ 489 + for (unsigned int idx = ffs(active); idx--; idx = ffs((active))) \ 490 + for (bool done = false; !done; active &= ~(1U << idx)) \ 491 + for (base = &cpu_base->clock_base[idx]; !done; done = true) 492 + 493 + #if defined(CONFIG_NO_HZ_COMMON) 494 + /* 495 + * Same as hrtimer_bases_next_event() below, but skips the excluded timer and 496 + * does not update cpu_base->next_timer/expires. 
497 + */ 498 + static ktime_t hrtimer_bases_next_event_without(struct hrtimer_cpu_base *cpu_base, 499 + const struct hrtimer *exclude, 500 + unsigned int active, ktime_t expires_next) 534 501 { 535 - debug_hrtimer_deactivate(timer); 536 - trace_hrtimer_cancel(timer); 537 - } 502 + struct hrtimer_clock_base *base; 503 + ktime_t expires; 538 504 539 - static struct hrtimer_clock_base * 540 - __next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active) 505 + lockdep_assert_held(&cpu_base->lock); 506 + 507 + for_each_active_base(base, cpu_base, active) { 508 + expires = ktime_sub(base->expires_next, base->offset); 509 + if (expires >= expires_next) 510 + continue; 511 + 512 + /* 513 + * If the excluded timer is the first on this base evaluate the 514 + * next timer. 515 + */ 516 + struct timerqueue_linked_node *node = timerqueue_linked_first(&base->active); 517 + 518 + if (unlikely(&exclude->node == node)) { 519 + node = timerqueue_linked_next(node); 520 + if (!node) 521 + continue; 522 + expires = ktime_sub(node->expires, base->offset); 523 + if (expires >= expires_next) 524 + continue; 525 + } 526 + expires_next = expires; 527 + } 528 + /* If base->offset changed, the result might be negative */ 529 + return max(expires_next, 0); 530 + } 531 + #endif 532 + 533 + static __always_inline struct hrtimer *clock_base_next_timer(struct hrtimer_clock_base *base) 541 534 { 542 - unsigned int idx; 535 + struct timerqueue_linked_node *next = timerqueue_linked_first(&base->active); 543 536 544 - if (!*active) 545 - return NULL; 546 - 547 - idx = __ffs(*active); 548 - *active &= ~(1U << idx); 549 - 550 - return &cpu_base->clock_base[idx]; 537 + return container_of(next, struct hrtimer, node); 551 538 } 552 539 553 - #define for_each_active_base(base, cpu_base, active) \ 554 - while ((base = __next_base((cpu_base), &(active)))) 555 - 556 - static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, 557 - const struct hrtimer *exclude, 558 - unsigned int 
active, 559 - ktime_t expires_next) 540 + /* Find the base with the earliest expiry */ 541 + static void hrtimer_bases_first(struct hrtimer_cpu_base *cpu_base,unsigned int active, 542 + ktime_t *expires_next, struct hrtimer **next_timer) 560 543 { 561 544 struct hrtimer_clock_base *base; 562 545 ktime_t expires; 563 546 564 547 for_each_active_base(base, cpu_base, active) { 565 - struct timerqueue_node *next; 566 - struct hrtimer *timer; 567 - 568 - next = timerqueue_getnext(&base->active); 569 - timer = container_of(next, struct hrtimer, node); 570 - if (timer == exclude) { 571 - /* Get to the next timer in the queue. */ 572 - next = timerqueue_iterate_next(next); 573 - if (!next) 574 - continue; 575 - 576 - timer = container_of(next, struct hrtimer, node); 577 - } 578 - expires = ktime_sub(hrtimer_get_expires(timer), base->offset); 579 - if (expires < expires_next) { 580 - expires_next = expires; 581 - 582 - /* Skip cpu_base update if a timer is being excluded. */ 583 - if (exclude) 584 - continue; 585 - 586 - if (timer->is_soft) 587 - cpu_base->softirq_next_timer = timer; 588 - else 589 - cpu_base->next_timer = timer; 548 + expires = ktime_sub(base->expires_next, base->offset); 549 + if (expires < *expires_next) { 550 + *expires_next = expires; 551 + *next_timer = clock_base_next_timer(base); 590 552 } 591 553 } 592 - /* 593 - * clock_was_set() might have changed base->offset of any of 594 - * the clock bases so the result might be negative. Fix it up 595 - * to prevent a false positive in clockevents_program_event(). 596 - */ 597 - if (expires_next < 0) 598 - expires_next = 0; 599 - return expires_next; 600 554 } 601 555 602 556 /* ··· 617 575 * - HRTIMER_ACTIVE_SOFT, or 618 576 * - HRTIMER_ACTIVE_HARD. 
619 577 */ 620 - static ktime_t 621 - __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask) 578 + static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask) 622 579 { 623 - unsigned int active; 624 580 struct hrtimer *next_timer = NULL; 625 581 ktime_t expires_next = KTIME_MAX; 582 + unsigned int active; 583 + 584 + lockdep_assert_held(&cpu_base->lock); 626 585 627 586 if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) { 628 587 active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; 629 - cpu_base->softirq_next_timer = NULL; 630 - expires_next = __hrtimer_next_event_base(cpu_base, NULL, 631 - active, KTIME_MAX); 632 - 633 - next_timer = cpu_base->softirq_next_timer; 588 + if (active) 589 + hrtimer_bases_first(cpu_base, active, &expires_next, &next_timer); 590 + cpu_base->softirq_next_timer = next_timer; 634 591 } 635 592 636 593 if (active_mask & HRTIMER_ACTIVE_HARD) { 637 594 active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; 595 + if (active) 596 + hrtimer_bases_first(cpu_base, active, &expires_next, &next_timer); 638 597 cpu_base->next_timer = next_timer; 639 - expires_next = __hrtimer_next_event_base(cpu_base, NULL, active, 640 - expires_next); 641 598 } 642 - 643 - return expires_next; 599 + return max(expires_next, 0); 644 600 } 645 601 646 602 static ktime_t hrtimer_update_next_event(struct hrtimer_cpu_base *cpu_base) ··· 678 638 ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; 679 639 ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; 680 640 681 - ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq, 682 - offs_real, offs_boot, offs_tai); 641 + ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq, offs_real, 642 + offs_boot, offs_tai); 683 643 684 644 base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real; 685 645 base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot; ··· 
689 649 } 690 650 691 651 /* 692 - * Is the high resolution mode active ? 652 + * Is the high resolution mode active in the CPU base. This cannot use the 653 + * static key as the CPUs are switched to high resolution mode 654 + * asynchronously. 693 655 */ 694 656 static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) 695 657 { ··· 699 657 cpu_base->hres_active : 0; 700 658 } 701 659 702 - static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base, 703 - struct hrtimer *next_timer, 660 + static inline void hrtimer_rearm_event(ktime_t expires_next, bool deferred) 661 + { 662 + trace_hrtimer_rearm(expires_next, deferred); 663 + tick_program_event(expires_next, 1); 664 + } 665 + 666 + static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base, struct hrtimer *next_timer, 704 667 ktime_t expires_next) 705 668 { 706 669 cpu_base->expires_next = expires_next; ··· 730 683 if (!hrtimer_hres_active(cpu_base) || cpu_base->hang_detected) 731 684 return; 732 685 733 - tick_program_event(expires_next, 1); 686 + hrtimer_rearm_event(expires_next, false); 734 687 } 735 688 736 - /* 737 - * Reprogram the event source with checking both queues for the 738 - * next event 739 - * Called with interrupts disabled and base->lock held 740 - */ 741 - static void 742 - hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) 689 + /* Reprogram the event source with a evaluation of all clock bases */ 690 + static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, bool skip_equal) 743 691 { 744 - ktime_t expires_next; 745 - 746 - expires_next = hrtimer_update_next_event(cpu_base); 692 + ktime_t expires_next = hrtimer_update_next_event(cpu_base); 747 693 748 694 if (skip_equal && expires_next == cpu_base->expires_next) 749 695 return; ··· 747 707 /* High resolution timer related functions */ 748 708 #ifdef CONFIG_HIGH_RES_TIMERS 749 709 750 - /* 751 - * High resolution timer enabled ? 
752 - */ 710 + /* High resolution timer enabled ? */ 753 711 static bool hrtimer_hres_enabled __read_mostly = true; 754 712 unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC; 755 713 EXPORT_SYMBOL_GPL(hrtimer_resolution); 756 714 757 - /* 758 - * Enable / Disable high resolution mode 759 - */ 715 + /* Enable / Disable high resolution mode */ 760 716 static int __init setup_hrtimer_hres(char *str) 761 717 { 762 718 return (kstrtobool(str, &hrtimer_hres_enabled) == 0); 763 719 } 764 - 765 720 __setup("highres=", setup_hrtimer_hres); 766 721 767 - /* 768 - * hrtimer_high_res_enabled - query, if the highres mode is enabled 769 - */ 770 - static inline int hrtimer_is_hres_enabled(void) 722 + /* hrtimer_high_res_enabled - query, if the highres mode is enabled */ 723 + static inline bool hrtimer_is_hres_enabled(void) 771 724 { 772 725 return hrtimer_hres_enabled; 773 726 } 774 727 775 - /* 776 - * Switch to high resolution mode 777 - */ 728 + /* Switch to high resolution mode */ 778 729 static void hrtimer_switch_to_hres(void) 779 730 { 780 731 struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); 781 732 782 733 if (tick_init_highres()) { 783 - pr_warn("Could not switch to high resolution mode on CPU %u\n", 784 - base->cpu); 734 + pr_warn("Could not switch to high resolution mode on CPU %u\n", base->cpu); 785 735 return; 786 736 } 787 - base->hres_active = 1; 737 + base->hres_active = true; 788 738 hrtimer_resolution = HIGH_RES_NSEC; 789 739 790 740 tick_setup_sched_timer(true); 791 741 /* "Retrigger" the interrupt to get things going */ 792 742 retrigger_next_event(NULL); 743 + hrtimer_schedule_hres_work(); 793 744 } 794 745 795 746 #else 796 747 797 - static inline int hrtimer_is_hres_enabled(void) { return 0; } 748 + static inline bool hrtimer_is_hres_enabled(void) { return 0; } 798 749 static inline void hrtimer_switch_to_hres(void) { } 799 750 800 751 #endif /* CONFIG_HIGH_RES_TIMERS */ 752 + 801 753 /* 802 754 * Retrigger next event is called 
after clock was set with interrupts 803 755 * disabled through an SMP function call or directly from low level ··· 824 792 * In periodic low resolution mode, the next softirq expiration 825 793 * must also be updated. 826 794 */ 827 - raw_spin_lock(&base->lock); 795 + guard(raw_spinlock)(&base->lock); 828 796 hrtimer_update_base(base); 829 797 if (hrtimer_hres_active(base)) 830 - hrtimer_force_reprogram(base, 0); 798 + hrtimer_force_reprogram(base, /* skip_equal */ false); 831 799 else 832 800 hrtimer_update_next_event(base); 833 - raw_spin_unlock(&base->lock); 834 801 } 835 802 836 803 /* ··· 843 812 { 844 813 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); 845 814 struct hrtimer_clock_base *base = timer->base; 846 - ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); 815 + ktime_t expires = hrtimer_get_expires(timer); 847 816 848 - WARN_ON_ONCE(hrtimer_get_expires(timer) < 0); 817 + WARN_ON_ONCE(expires < 0); 849 818 819 + expires = ktime_sub(expires, base->offset); 850 820 /* 851 821 * CLOCK_REALTIME timer might be requested with an absolute 852 822 * expiry time which is less than base->offset. Set it to 0. ··· 874 842 timer_cpu_base->softirq_next_timer = timer; 875 843 timer_cpu_base->softirq_expires_next = expires; 876 844 877 - if (!ktime_before(expires, timer_cpu_base->expires_next) || 878 - !reprogram) 845 + if (!ktime_before(expires, timer_cpu_base->expires_next) || !reprogram) 879 846 return; 880 847 } 881 848 ··· 888 857 if (expires >= cpu_base->expires_next) 889 858 return; 890 859 891 - /* 892 - * If the hrtimer interrupt is running, then it will reevaluate the 893 - * clock bases and reprogram the clock event device. 
894 - */ 895 - if (cpu_base->in_hrtirq) 860 + /* If a deferred rearm is pending skip reprogramming the device */ 861 + if (cpu_base->deferred_rearm) 896 862 return; 897 863 898 864 cpu_base->next_timer = timer; ··· 897 869 __hrtimer_reprogram(cpu_base, timer, expires); 898 870 } 899 871 900 - static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, 901 - unsigned int active) 872 + static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, unsigned int active) 902 873 { 903 874 struct hrtimer_clock_base *base; 904 875 unsigned int seq; ··· 923 896 if (seq == cpu_base->clock_was_set_seq) 924 897 return false; 925 898 926 - /* 927 - * If the remote CPU is currently handling an hrtimer interrupt, it 928 - * will reevaluate the first expiring timer of all clock bases 929 - * before reprogramming. Nothing to do here. 930 - */ 931 - if (cpu_base->in_hrtirq) 899 + /* If a deferred rearm is pending the remote CPU will take care of it */ 900 + if (cpu_base->deferred_rearm) { 901 + cpu_base->deferred_needs_update = true; 932 902 return false; 903 + } 933 904 934 905 /* 935 906 * Walk the affected clock bases and check whether the first expiring ··· 938 913 active &= cpu_base->active_bases; 939 914 940 915 for_each_active_base(base, cpu_base, active) { 941 - struct timerqueue_node *next; 916 + struct timerqueue_linked_node *next; 942 917 943 - next = timerqueue_getnext(&base->active); 918 + next = timerqueue_linked_first(&base->active); 944 919 expires = ktime_sub(next->expires, base->offset); 945 920 if (expires < cpu_base->expires_next) 946 921 return true; ··· 972 947 */ 973 948 void clock_was_set(unsigned int bases) 974 949 { 975 - struct hrtimer_cpu_base *cpu_base = raw_cpu_ptr(&hrtimer_bases); 976 950 cpumask_var_t mask; 977 - int cpu; 978 951 979 - if (!hrtimer_hres_active(cpu_base) && !tick_nohz_is_active()) 952 + if (!hrtimer_highres_enabled() && !tick_nohz_is_active()) 980 953 goto out_timerfd; 981 954 982 955 if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) { 
··· 983 960 } 984 961 985 962 /* Avoid interrupting CPUs if possible */ 986 - cpus_read_lock(); 987 - for_each_online_cpu(cpu) { 988 - unsigned long flags; 963 + scoped_guard(cpus_read_lock) { 964 + int cpu; 989 965 990 - cpu_base = &per_cpu(hrtimer_bases, cpu); 991 - raw_spin_lock_irqsave(&cpu_base->lock, flags); 966 + for_each_online_cpu(cpu) { 967 + struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); 992 968 993 - if (update_needs_ipi(cpu_base, bases)) 994 - cpumask_set_cpu(cpu, mask); 995 - 996 - raw_spin_unlock_irqrestore(&cpu_base->lock, flags); 969 + guard(raw_spinlock_irqsave)(&cpu_base->lock); 970 + if (update_needs_ipi(cpu_base, bases)) 971 + cpumask_set_cpu(cpu, mask); 972 + } 973 + scoped_guard(preempt) 974 + smp_call_function_many(mask, retrigger_next_event, NULL, 1); 997 975 } 998 - 999 - preempt_disable(); 1000 - smp_call_function_many(mask, retrigger_next_event, NULL, 1); 1001 - preempt_enable(); 1002 - cpus_read_unlock(); 1003 976 free_cpumask_var(mask); 1004 977 1005 978 out_timerfd: ··· 1030 1011 retrigger_next_event(NULL); 1031 1012 } 1032 1013 1033 - /* 1034 - * Counterpart to lock_hrtimer_base above: 1035 - */ 1036 - static inline 1037 - void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) 1014 + /* Counterpart to lock_hrtimer_base above */ 1015 + static inline void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) 1038 1016 __releases(&timer->base->cpu_base->lock) 1039 1017 { 1040 1018 raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); ··· 1048 1032 * .. note:: 1049 1033 * This only updates the timer expiry value and does not requeue the timer. 1050 1034 * 1051 - * There is also a variant of the function hrtimer_forward_now(). 1035 + * There is also a variant of this function: hrtimer_forward_now(). 1052 1036 * 1053 1037 * Context: Can be safely called from the callback function of @timer. 
If called 1054 1038 * from other contexts @timer must neither be enqueued nor running the ··· 1058 1042 */ 1059 1043 u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) 1060 1044 { 1061 - u64 orun = 1; 1062 1045 ktime_t delta; 1046 + u64 orun = 1; 1063 1047 1064 1048 delta = ktime_sub(now, hrtimer_get_expires(timer)); 1065 1049 1066 1050 if (delta < 0) 1067 1051 return 0; 1068 1052 1069 - if (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED)) 1053 + if (WARN_ON(timer->is_queued)) 1070 1054 return 0; 1071 1055 1072 1056 if (interval < hrtimer_resolution) ··· 1095 1079 * enqueue_hrtimer - internal function to (re)start a timer 1096 1080 * 1097 1081 * The timer is inserted in expiry order. Insertion into the 1098 - * red black tree is O(log(n)). Must hold the base lock. 1082 + * red black tree is O(log(n)). 1099 1083 * 1100 1084 * Returns true when the new timer is the leftmost timer in the tree. 1101 1085 */ 1102 1086 static bool enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, 1103 - enum hrtimer_mode mode) 1087 + enum hrtimer_mode mode, bool was_armed) 1104 1088 { 1105 - debug_activate(timer, mode); 1089 + lockdep_assert_held(&base->cpu_base->lock); 1090 + 1091 + debug_activate(timer, mode, was_armed); 1106 1092 WARN_ON_ONCE(!base->cpu_base->online); 1107 1093 1108 1094 base->cpu_base->active_bases |= 1 << base->index; 1109 1095 1110 1096 /* Pairs with the lockless read in hrtimer_is_queued() */ 1111 - WRITE_ONCE(timer->state, HRTIMER_STATE_ENQUEUED); 1097 + WRITE_ONCE(timer->is_queued, HRTIMER_STATE_ENQUEUED); 1112 1098 1113 - return timerqueue_add(&base->active, &timer->node); 1099 + if (!timerqueue_linked_add(&base->active, &timer->node)) 1100 + return false; 1101 + 1102 + base->expires_next = hrtimer_get_expires(timer); 1103 + return true; 1104 + } 1105 + 1106 + static inline void base_update_next_timer(struct hrtimer_clock_base *base) 1107 + { 1108 + struct timerqueue_linked_node *next = 
timerqueue_linked_first(&base->active); 1109 + 1110 + base->expires_next = next ? next->expires : KTIME_MAX; 1114 1111 } 1115 1112 1116 1113 /* 1117 1114 * __remove_hrtimer - internal function to remove a timer 1118 - * 1119 - * Caller must hold the base lock. 1120 1115 * 1121 1116 * High resolution timer mode reprograms the clock event device when the 1122 1117 * timer is the one which expires next. The caller can disable this by setting 1123 1118 * reprogram to zero. This is useful, when the context does a reprogramming 1124 1119 * anyway (e.g. timer interrupt) 1125 1120 */ 1126 - static void __remove_hrtimer(struct hrtimer *timer, 1127 - struct hrtimer_clock_base *base, 1128 - u8 newstate, int reprogram) 1121 + static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, 1122 + bool newstate, bool reprogram) 1129 1123 { 1130 1124 struct hrtimer_cpu_base *cpu_base = base->cpu_base; 1131 - u8 state = timer->state; 1125 + bool was_first; 1132 1126 1133 - /* Pairs with the lockless read in hrtimer_is_queued() */ 1134 - WRITE_ONCE(timer->state, newstate); 1135 - if (!(state & HRTIMER_STATE_ENQUEUED)) 1127 + lockdep_assert_held(&cpu_base->lock); 1128 + 1129 + if (!timer->is_queued) 1136 1130 return; 1137 1131 1138 - if (!timerqueue_del(&base->active, &timer->node)) 1132 + /* Pairs with the lockless read in hrtimer_is_queued() */ 1133 + WRITE_ONCE(timer->is_queued, newstate); 1134 + 1135 + was_first = !timerqueue_linked_prev(&timer->node); 1136 + 1137 + if (!timerqueue_linked_del(&base->active, &timer->node)) 1139 1138 cpu_base->active_bases &= ~(1 << base->index); 1140 1139 1140 + /* Nothing to update if this was not the first timer in the base */ 1141 + if (!was_first) 1142 + return; 1143 + 1144 + base_update_next_timer(base); 1145 + 1141 1146 /* 1142 - * Note: If reprogram is false we do not update 1143 - * cpu_base->next_timer. This happens when we remove the first 1144 - * timer on a remote cpu. 
No harm as we never dereference 1145 - * cpu_base->next_timer. So the worst thing what can happen is 1146 - * an superfluous call to hrtimer_force_reprogram() on the 1147 - * remote cpu later on if the same timer gets enqueued again. 1147 + * If reprogram is false don't update cpu_base->next_timer and do not 1148 + * touch the clock event device. 1149 + * 1150 + * This happens when removing the first timer on a remote CPU, which 1151 + * will be handled by the remote CPU's interrupt. It also happens when 1152 + * a local timer is removed to be immediately restarted. That's handled 1153 + * at the call site. 1148 1154 */ 1149 - if (reprogram && timer == cpu_base->next_timer) 1150 - hrtimer_force_reprogram(cpu_base, 1); 1155 + if (!reprogram || timer != cpu_base->next_timer || timer->is_lazy) 1156 + return; 1157 + 1158 + if (cpu_base->deferred_rearm) 1159 + cpu_base->deferred_needs_update = true; 1160 + else 1161 + hrtimer_force_reprogram(cpu_base, /* skip_equal */ true); 1151 1162 } 1152 1163 1153 - /* 1154 - * remove hrtimer, called with base lock held 1155 - */ 1156 - static inline int 1157 - remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, 1158 - bool restart, bool keep_local) 1164 + static inline bool remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, 1165 + bool newstate) 1159 1166 { 1160 - u8 state = timer->state; 1167 + lockdep_assert_held(&base->cpu_base->lock); 1161 1168 1162 - if (state & HRTIMER_STATE_ENQUEUED) { 1169 + if (timer->is_queued) { 1163 1170 bool reprogram; 1171 + 1172 + debug_hrtimer_deactivate(timer); 1164 1173 1165 1174 /* 1166 1175 * Remove the timer and force reprogramming when high ··· 1195 1154 * reprogramming happens in the interrupt handler. This is a 1196 1155 * rare case and less expensive than a smp call. 
1197 1156 */ 1198 - debug_deactivate(timer); 1199 1157 reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases); 1200 1158 1201 - /* 1202 - * If the timer is not restarted then reprogramming is 1203 - * required if the timer is local. If it is local and about 1204 - * to be restarted, avoid programming it twice (on removal 1205 - * and a moment later when it's requeued). 1206 - */ 1207 - if (!restart) 1208 - state = HRTIMER_STATE_INACTIVE; 1209 - else 1210 - reprogram &= !keep_local; 1211 - 1212 - __remove_hrtimer(timer, base, state, reprogram); 1213 - return 1; 1159 + __remove_hrtimer(timer, base, newstate, reprogram); 1160 + return true; 1214 1161 } 1215 - return 0; 1162 + return false; 1163 + } 1164 + 1165 + /* 1166 + * Update in place has to retrieve the expiry times of the neighbour nodes 1167 + * if they exist. That is cache line neutral because the dequeue/enqueue 1168 + * operation is going to need the same cache lines. But there is a big win 1169 + * when the dequeue/enqueue can be avoided because the RB tree does not 1170 + * have to be rebalanced twice. 
1171 + */ 1172 + static inline bool 1173 + hrtimer_can_update_in_place(struct hrtimer *timer, struct hrtimer_clock_base *base, ktime_t expires) 1174 + { 1175 + struct timerqueue_linked_node *next = timerqueue_linked_next(&timer->node); 1176 + struct timerqueue_linked_node *prev = timerqueue_linked_prev(&timer->node); 1177 + 1178 + /* If the new expiry goes behind the next timer, requeue is required */ 1179 + if (next && expires > next->expires) 1180 + return false; 1181 + 1182 + /* If this is the first timer, update in place */ 1183 + if (!prev) 1184 + return true; 1185 + 1186 + /* Update in place when it does not go ahead of the previous one */ 1187 + return expires >= prev->expires; 1188 + } 1189 + 1190 + static inline bool 1191 + remove_and_enqueue_same_base(struct hrtimer *timer, struct hrtimer_clock_base *base, 1192 + const enum hrtimer_mode mode, ktime_t expires, u64 delta_ns) 1193 + { 1194 + bool was_first = false; 1195 + 1196 + /* Remove it from the timer queue if active */ 1197 + if (timer->is_queued) { 1198 + was_first = !timerqueue_linked_prev(&timer->node); 1199 + 1200 + /* Try to update in place to avoid the de/enqueue dance */ 1201 + if (hrtimer_can_update_in_place(timer, base, expires)) { 1202 + hrtimer_set_expires_range_ns(timer, expires, delta_ns); 1203 + trace_hrtimer_start(timer, mode, true); 1204 + if (was_first) 1205 + base->expires_next = expires; 1206 + return was_first; 1207 + } 1208 + 1209 + debug_hrtimer_deactivate(timer); 1210 + timerqueue_linked_del(&base->active, &timer->node); 1211 + } 1212 + 1213 + /* Set the new expiry time */ 1214 + hrtimer_set_expires_range_ns(timer, expires, delta_ns); 1215 + 1216 + debug_activate(timer, mode, timer->is_queued); 1217 + base->cpu_base->active_bases |= 1 << base->index; 1218 + 1219 + /* Pairs with the lockless read in hrtimer_is_queued() */ 1220 + WRITE_ONCE(timer->is_queued, HRTIMER_STATE_ENQUEUED); 1221 + 1222 + /* If it's the first expiring timer now or again, update base */ 1223 + if 
(timerqueue_linked_add(&base->active, &timer->node)) { 1224 + base->expires_next = expires; 1225 + return true; 1226 + } 1227 + 1228 + if (was_first) 1229 + base_update_next_timer(base); 1230 + 1231 + return false; 1216 1232 } 1217 1233 1218 1234 static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, ··· 1288 1190 return tim; 1289 1191 } 1290 1192 1291 - static void 1292 - hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram) 1193 + static void hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram) 1293 1194 { 1294 - ktime_t expires; 1195 + ktime_t expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT); 1295 1196 1296 1197 /* 1297 - * Find the next SOFT expiration. 1298 - */ 1299 - expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT); 1300 - 1301 - /* 1302 - * reprogramming needs to be triggered, even if the next soft 1303 - * hrtimer expires at the same time than the next hard 1198 + * Reprogramming needs to be triggered, even if the next soft 1199 + * hrtimer expires at the same time as the next hard 1304 1200 * hrtimer. cpu_base->softirq_expires_next needs to be updated! 
1305 1201 */ 1306 1202 if (expires == KTIME_MAX) 1307 1203 return; 1308 1204 1309 1205 /* 1310 - * cpu_base->*next_timer is recomputed by __hrtimer_get_next_event() 1311 - * cpu_base->*expires_next is only set by hrtimer_reprogram() 1206 + * cpu_base->next_timer is recomputed by __hrtimer_get_next_event() 1207 + * cpu_base->expires_next is only set by hrtimer_reprogram() 1312 1208 */ 1313 1209 hrtimer_reprogram(cpu_base->softirq_next_timer, reprogram); 1314 1210 } 1315 1211 1316 - static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, 1317 - u64 delta_ns, const enum hrtimer_mode mode, 1318 - struct hrtimer_clock_base *base) 1212 + #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) 1213 + static __always_inline bool hrtimer_prefer_local(bool is_local, bool is_first, bool is_pinned) 1214 + { 1215 + if (static_branch_likely(&timers_migration_enabled)) { 1216 + /* 1217 + * If it is local and the first expiring timer keep it on the local 1218 + * CPU to optimize reprogramming of the clockevent device. Also 1219 + * avoid switch_hrtimer_base() overhead when local and pinned. 1220 + */ 1221 + if (!is_local) 1222 + return false; 1223 + if (is_first || is_pinned) 1224 + return true; 1225 + 1226 + /* Honour the NOHZ full restrictions */ 1227 + if (!housekeeping_cpu(smp_processor_id(), HK_TYPE_KERNEL_NOISE)) 1228 + return false; 1229 + 1230 + /* 1231 + * If the tick is not stopped or need_resched() is set, then 1232 + * there is no point in moving the timer somewhere else. 
1233 + */ 1234 + return !tick_nohz_tick_stopped() || need_resched(); 1235 + } 1236 + return is_local; 1237 + } 1238 + #else 1239 + static __always_inline bool hrtimer_prefer_local(bool is_local, bool is_first, bool is_pinned) 1240 + { 1241 + return is_local; 1242 + } 1243 + #endif 1244 + 1245 + static inline bool hrtimer_keep_base(struct hrtimer *timer, bool is_local, bool is_first, 1246 + bool is_pinned) 1247 + { 1248 + /* If the timer is running the callback it has to stay on its CPU base. */ 1249 + if (unlikely(timer->base->running == timer)) 1250 + return true; 1251 + 1252 + return hrtimer_prefer_local(is_local, is_first, is_pinned); 1253 + } 1254 + 1255 + static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, 1256 + const enum hrtimer_mode mode, struct hrtimer_clock_base *base) 1319 1257 { 1320 1258 struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases); 1321 - struct hrtimer_clock_base *new_base; 1322 - bool force_local, first; 1259 + bool is_pinned, first, was_first, keep_base = false; 1260 + struct hrtimer_cpu_base *cpu_base = base->cpu_base; 1261 + 1262 + was_first = cpu_base->next_timer == timer; 1263 + is_pinned = !!(mode & HRTIMER_MODE_PINNED); 1323 1264 1324 1265 /* 1325 - * If the timer is on the local cpu base and is the first expiring 1326 - * timer then this might end up reprogramming the hardware twice 1327 - * (on removal and on enqueue). To avoid that by prevent the 1328 - * reprogram on removal, keep the timer local to the current CPU 1329 - * and enforce reprogramming after it is queued no matter whether 1330 - * it is the new first expiring timer again or not. 1266 + * Don't keep it local if this enqueue happens on a unplugged CPU 1267 + * after hrtimer_cpu_dying() has been invoked. 
1331 1268 */ 1332 - force_local = base->cpu_base == this_cpu_base; 1333 - force_local &= base->cpu_base->next_timer == timer; 1269 + if (likely(this_cpu_base->online)) { 1270 + bool is_local = cpu_base == this_cpu_base; 1334 1271 1335 - /* 1336 - * Don't force local queuing if this enqueue happens on a unplugged 1337 - * CPU after hrtimer_cpu_dying() has been invoked. 1338 - */ 1339 - force_local &= this_cpu_base->online; 1272 + keep_base = hrtimer_keep_base(timer, is_local, was_first, is_pinned); 1273 + } 1274 + 1275 + /* Calculate absolute expiry time for relative timers */ 1276 + if (mode & HRTIMER_MODE_REL) 1277 + tim = ktime_add_safe(tim, __hrtimer_cb_get_time(base->clockid)); 1278 + /* Compensate for low resolution granularity */ 1279 + tim = hrtimer_update_lowres(timer, tim, mode); 1340 1280 1341 1281 /* 1342 1282 * Remove an active timer from the queue. In case it is not queued ··· 1386 1250 * reprogramming later if it was the first expiring timer. This 1387 1251 * avoids programming the underlying clock event twice (once at 1388 1252 * removal and once after enqueue). 1253 + * 1254 + * @keep_base is also true if the timer callback is running on a 1255 + * remote CPU and for local pinned timers. 
1389 1256 */ 1390 - remove_hrtimer(timer, base, true, force_local); 1391 - 1392 - if (mode & HRTIMER_MODE_REL) 1393 - tim = ktime_add_safe(tim, __hrtimer_cb_get_time(base->clockid)); 1394 - 1395 - tim = hrtimer_update_lowres(timer, tim, mode); 1396 - 1397 - hrtimer_set_expires_range_ns(timer, tim, delta_ns); 1398 - 1399 - /* Switch the timer base, if necessary: */ 1400 - if (!force_local) { 1401 - new_base = switch_hrtimer_base(timer, base, 1402 - mode & HRTIMER_MODE_PINNED); 1257 + if (likely(keep_base)) { 1258 + first = remove_and_enqueue_same_base(timer, base, mode, tim, delta_ns); 1403 1259 } else { 1404 - new_base = base; 1260 + /* Keep the ENQUEUED state in case it is queued */ 1261 + bool was_armed = remove_hrtimer(timer, base, HRTIMER_STATE_ENQUEUED); 1262 + 1263 + hrtimer_set_expires_range_ns(timer, tim, delta_ns); 1264 + 1265 + /* Switch the timer base, if necessary: */ 1266 + base = switch_hrtimer_base(timer, base, is_pinned); 1267 + cpu_base = base->cpu_base; 1268 + 1269 + first = enqueue_hrtimer(timer, base, mode, was_armed); 1405 1270 } 1406 1271 1407 - first = enqueue_hrtimer(timer, new_base, mode); 1408 - if (!force_local) { 1272 + /* If a deferred rearm is pending skip reprogramming the device */ 1273 + if (cpu_base->deferred_rearm) { 1274 + cpu_base->deferred_needs_update = true; 1275 + return false; 1276 + } 1277 + 1278 + if (!was_first || cpu_base != this_cpu_base) { 1409 1279 /* 1410 - * If the current CPU base is online, then the timer is 1411 - * never queued on a remote CPU if it would be the first 1412 - * expiring timer there. 1280 + * If the current CPU base is online, then the timer is never 1281 + * queued on a remote CPU if it would be the first expiring 1282 + * timer there unless the timer callback is currently executed 1283 + * on the remote CPU. In the latter case the remote CPU will 1284 + * re-evaluate the first expiring timer after completing the 1285 + * callbacks. 
1413 1286 */ 1414 - if (hrtimer_base_is_online(this_cpu_base)) 1287 + if (likely(hrtimer_base_is_online(this_cpu_base))) 1415 1288 return first; 1416 1289 1417 1290 /* ··· 1428 1283 * already offline. If the timer is the first to expire, 1429 1284 * kick the remote CPU to reprogram the clock event. 1430 1285 */ 1431 - if (first) { 1432 - struct hrtimer_cpu_base *new_cpu_base = new_base->cpu_base; 1433 - 1434 - smp_call_function_single_async(new_cpu_base->cpu, &new_cpu_base->csd); 1435 - } 1436 - return 0; 1286 + if (first) 1287 + smp_call_function_single_async(cpu_base->cpu, &cpu_base->csd); 1288 + return false; 1437 1289 } 1438 1290 1439 1291 /* 1440 - * Timer was forced to stay on the current CPU to avoid 1441 - * reprogramming on removal and enqueue. Force reprogram the 1442 - * hardware by evaluating the new first expiring timer. 1292 + * Special case for the HRTICK timer. It is frequently rearmed and most 1293 + * of the time moves the expiry into the future. That's expensive in 1294 + * virtual machines and it's better to take the pointless already armed 1295 + * interrupt than reprogramming the hardware on every context switch. 1296 + * 1297 + * If the new expiry is before the armed time, then reprogramming is 1298 + * required. 1443 1299 */ 1444 - hrtimer_force_reprogram(new_base->cpu_base, 1); 1445 - return 0; 1300 + if (timer->is_lazy) { 1301 + if (cpu_base->expires_next <= hrtimer_get_expires(timer)) 1302 + return false; 1303 + } 1304 + 1305 + /* 1306 + * Timer was the first expiring timer and forced to stay on the 1307 + * current CPU to avoid reprogramming on removal and enqueue. Force 1308 + * reprogram the hardware by evaluating the new first expiring 1309 + * timer. 1310 + */ 1311 + hrtimer_force_reprogram(cpu_base, /* skip_equal */ true); 1312 + return false; 1446 1313 } 1447 1314 1448 1315 /** ··· 1466 1309 * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED); 1467 1310 * softirq based mode is considered for debug purpose only! 
1468 1311 */ 1469 - void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, 1470 - u64 delta_ns, const enum hrtimer_mode mode) 1312 + void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, 1313 + const enum hrtimer_mode mode) 1471 1314 { 1472 1315 struct hrtimer_clock_base *base; 1473 1316 unsigned long flags; 1317 + 1318 + debug_hrtimer_assert_init(timer); 1474 1319 1475 1320 /* 1476 1321 * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft ··· 1521 1362 1522 1363 base = lock_hrtimer_base(timer, &flags); 1523 1364 1524 - if (!hrtimer_callback_running(timer)) 1525 - ret = remove_hrtimer(timer, base, false, false); 1365 + if (!hrtimer_callback_running(timer)) { 1366 + ret = remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE); 1367 + if (ret) 1368 + trace_hrtimer_cancel(timer); 1369 + } 1526 1370 1527 1371 unlock_hrtimer_base(timer, &flags); 1528 1372 ··· 1559 1397 * the timer callback to finish. Drop expiry_lock and reacquire it. That 1560 1398 * allows the waiter to acquire the lock and make progress. 
1561 1399 */ 1562 - static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base, 1563 - unsigned long flags) 1400 + static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base, unsigned long flags) 1564 1401 { 1565 1402 if (atomic_read(&cpu_base->timer_waiters)) { 1566 1403 raw_spin_unlock_irqrestore(&cpu_base->lock, flags); ··· 1624 1463 spin_unlock_bh(&base->cpu_base->softirq_expiry_lock); 1625 1464 } 1626 1465 #else 1627 - static inline void 1628 - hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { } 1629 - static inline void 1630 - hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { } 1631 - static inline void 1632 - hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { } 1633 - static inline void hrtimer_sync_wait_running(struct hrtimer_cpu_base *base, 1634 - unsigned long flags) { } 1466 + static inline void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { } 1467 + static inline void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { } 1468 + static inline void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { } 1469 + static inline void hrtimer_sync_wait_running(struct hrtimer_cpu_base *base, unsigned long fl) { } 1635 1470 #endif 1636 1471 1637 1472 /** ··· 1683 1526 { 1684 1527 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); 1685 1528 u64 expires = KTIME_MAX; 1686 - unsigned long flags; 1687 1529 1688 - raw_spin_lock_irqsave(&cpu_base->lock, flags); 1689 - 1530 + guard(raw_spinlock_irqsave)(&cpu_base->lock); 1690 1531 if (!hrtimer_hres_active(cpu_base)) 1691 1532 expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL); 1692 - 1693 - raw_spin_unlock_irqrestore(&cpu_base->lock, flags); 1694 1533 1695 1534 return expires; 1696 1535 } ··· 1702 1549 { 1703 1550 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); 1704 1551 u64 expires = KTIME_MAX; 1705 - unsigned long flags; 1552 + unsigned int active; 1706 1553 1707 - 
raw_spin_lock_irqsave(&cpu_base->lock, flags); 1554 + guard(raw_spinlock_irqsave)(&cpu_base->lock); 1555 + if (!hrtimer_hres_active(cpu_base)) 1556 + return expires; 1708 1557 1709 - if (hrtimer_hres_active(cpu_base)) { 1710 - unsigned int active; 1558 + active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; 1559 + if (active && !cpu_base->softirq_activated) 1560 + expires = hrtimer_bases_next_event_without(cpu_base, exclude, active, KTIME_MAX); 1711 1561 1712 - if (!cpu_base->softirq_activated) { 1713 - active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; 1714 - expires = __hrtimer_next_event_base(cpu_base, exclude, 1715 - active, KTIME_MAX); 1716 - } 1717 - active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; 1718 - expires = __hrtimer_next_event_base(cpu_base, exclude, active, 1719 - expires); 1720 - } 1721 - 1722 - raw_spin_unlock_irqrestore(&cpu_base->lock, flags); 1723 - 1724 - return expires; 1562 + active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; 1563 + if (!active) 1564 + return expires; 1565 + return hrtimer_bases_next_event_without(cpu_base, exclude, active, expires); 1725 1566 } 1726 1567 #endif 1727 1568 ··· 1759 1612 } 1760 1613 EXPORT_SYMBOL_GPL(hrtimer_cb_get_time); 1761 1614 1762 - static void __hrtimer_setup(struct hrtimer *timer, 1763 - enum hrtimer_restart (*function)(struct hrtimer *), 1615 + static void __hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*fn)(struct hrtimer *), 1764 1616 clockid_t clock_id, enum hrtimer_mode mode) 1765 1617 { 1766 1618 bool softtimer = !!(mode & HRTIMER_MODE_SOFT); ··· 1791 1645 base += hrtimer_clockid_to_base(clock_id); 1792 1646 timer->is_soft = softtimer; 1793 1647 timer->is_hard = !!(mode & HRTIMER_MODE_HARD); 1648 + timer->is_lazy = !!(mode & HRTIMER_MODE_LAZY_REARM); 1794 1649 timer->base = &cpu_base->clock_base[base]; 1795 - timerqueue_init(&timer->node); 1650 + timerqueue_linked_init(&timer->node); 1796 1651 1797 - if (WARN_ON_ONCE(!function)) 1652 + if (WARN_ON_ONCE(!fn)) 1798 1653 
ACCESS_PRIVATE(timer, function) = hrtimer_dummy_timeout; 1799 1654 else 1800 - ACCESS_PRIVATE(timer, function) = function; 1655 + ACCESS_PRIVATE(timer, function) = fn; 1801 1656 } 1802 1657 1803 1658 /** ··· 1857 1710 base = READ_ONCE(timer->base); 1858 1711 seq = raw_read_seqcount_begin(&base->seq); 1859 1712 1860 - if (timer->state != HRTIMER_STATE_INACTIVE || 1861 - base->running == timer) 1713 + if (timer->is_queued || base->running == timer) 1862 1714 return true; 1863 1715 1864 - } while (read_seqcount_retry(&base->seq, seq) || 1865 - base != READ_ONCE(timer->base)); 1716 + } while (read_seqcount_retry(&base->seq, seq) || base != READ_ONCE(timer->base)); 1866 1717 1867 1718 return false; 1868 1719 } ··· 1874 1729 * - callback: the timer is being ran 1875 1730 * - post: the timer is inactive or (re)queued 1876 1731 * 1877 - * On the read side we ensure we observe timer->state and cpu_base->running 1732 + * On the read side we ensure we observe timer->is_queued and cpu_base->running 1878 1733 * from the same section, if anything changed while we looked at it, we retry. 1879 1734 * This includes timer->base changing because sequence numbers alone are 1880 1735 * insufficient for that. ··· 1883 1738 * a false negative if the read side got smeared over multiple consecutive 1884 1739 * __run_hrtimer() invocations. 
1885 1740 */ 1886 - 1887 - static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, 1888 - struct hrtimer_clock_base *base, 1889 - struct hrtimer *timer, ktime_t *now, 1890 - unsigned long flags) __must_hold(&cpu_base->lock) 1741 + static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, struct hrtimer_clock_base *base, 1742 + struct hrtimer *timer, ktime_t *now, unsigned long flags) 1743 + __must_hold(&cpu_base->lock) 1891 1744 { 1892 1745 enum hrtimer_restart (*fn)(struct hrtimer *); 1893 1746 bool expires_in_hardirq; ··· 1897 1754 base->running = timer; 1898 1755 1899 1756 /* 1900 - * Separate the ->running assignment from the ->state assignment. 1757 + * Separate the ->running assignment from the ->is_queued assignment. 1901 1758 * 1902 1759 * As with a regular write barrier, this ensures the read side in 1903 1760 * hrtimer_active() cannot observe base->running == NULL && 1904 - * timer->state == INACTIVE. 1761 + * timer->is_queued == INACTIVE. 1905 1762 */ 1906 1763 raw_write_seqcount_barrier(&base->seq); 1907 1764 1908 - __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0); 1765 + __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, false); 1909 1766 fn = ACCESS_PRIVATE(timer, function); 1910 1767 1911 1768 /* ··· 1940 1797 * hrtimer_start_range_ns() can have popped in and enqueued the timer 1941 1798 * for us already. 1942 1799 */ 1943 - if (restart != HRTIMER_NORESTART && 1944 - !(timer->state & HRTIMER_STATE_ENQUEUED)) 1945 - enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS); 1800 + if (restart == HRTIMER_RESTART && !timer->is_queued) 1801 + enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS, false); 1946 1802 1947 1803 /* 1948 - * Separate the ->running assignment from the ->state assignment. 1804 + * Separate the ->running assignment from the ->is_queued assignment. 
1949 1805 * 1950 1806 * As with a regular write barrier, this ensures the read side in 1951 1807 * hrtimer_active() cannot observe base->running.timer == NULL && 1952 - * timer->state == INACTIVE. 1808 + * timer->is_queued == INACTIVE. 1953 1809 */ 1954 1810 raw_write_seqcount_barrier(&base->seq); 1955 1811 ··· 1956 1814 base->running = NULL; 1957 1815 } 1958 1816 1817 + static __always_inline struct hrtimer *clock_base_next_timer_safe(struct hrtimer_clock_base *base) 1818 + { 1819 + struct timerqueue_linked_node *next = timerqueue_linked_first(&base->active); 1820 + 1821 + return next ? container_of(next, struct hrtimer, node) : NULL; 1822 + } 1823 + 1959 1824 static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, 1960 1825 unsigned long flags, unsigned int active_mask) 1961 1826 { 1962 - struct hrtimer_clock_base *base; 1963 1827 unsigned int active = cpu_base->active_bases & active_mask; 1828 + struct hrtimer_clock_base *base; 1964 1829 1965 1830 for_each_active_base(base, cpu_base, active) { 1966 - struct timerqueue_node *node; 1967 - ktime_t basenow; 1831 + ktime_t basenow = ktime_add(now, base->offset); 1832 + struct hrtimer *timer; 1968 1833 1969 - basenow = ktime_add(now, base->offset); 1970 - 1971 - while ((node = timerqueue_getnext(&base->active))) { 1972 - struct hrtimer *timer; 1973 - 1974 - timer = container_of(node, struct hrtimer, node); 1975 - 1834 + while ((timer = clock_base_next_timer(base))) { 1976 1835 /* 1977 1836 * The immediate goal for using the softexpires is 1978 1837 * minimizing wakeups, not running timers at the ··· 2008 1865 now = hrtimer_update_base(cpu_base); 2009 1866 __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT); 2010 1867 2011 - cpu_base->softirq_activated = 0; 1868 + cpu_base->softirq_activated = false; 2012 1869 hrtimer_update_softirq_timer(cpu_base, true); 2013 1870 2014 1871 raw_spin_unlock_irqrestore(&cpu_base->lock, flags); ··· 2016 1873 } 2017 1874 2018 1875 #ifdef 
CONFIG_HIGH_RES_TIMERS 1876 + 1877 + /* 1878 + * Very similar to hrtimer_force_reprogram(), except it deals with 1879 + * deferred_rearm and hang_detected. 1880 + */ 1881 + static void hrtimer_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t expires_next, bool deferred) 1882 + { 1883 + cpu_base->expires_next = expires_next; 1884 + cpu_base->deferred_rearm = false; 1885 + 1886 + if (unlikely(cpu_base->hang_detected)) { 1887 + /* 1888 + * Give the system a chance to do something else than looping 1889 + * on hrtimer interrupts. 1890 + */ 1891 + expires_next = ktime_add_ns(ktime_get(), 1892 + min(100 * NSEC_PER_MSEC, cpu_base->max_hang_time)); 1893 + } 1894 + hrtimer_rearm_event(expires_next, deferred); 1895 + } 1896 + 1897 + #ifdef CONFIG_HRTIMER_REARM_DEFERRED 1898 + void __hrtimer_rearm_deferred(void) 1899 + { 1900 + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); 1901 + ktime_t expires_next; 1902 + 1903 + if (!cpu_base->deferred_rearm) 1904 + return; 1905 + 1906 + guard(raw_spinlock)(&cpu_base->lock); 1907 + if (cpu_base->deferred_needs_update) { 1908 + hrtimer_update_base(cpu_base); 1909 + expires_next = hrtimer_update_next_event(cpu_base); 1910 + } else { 1911 + /* No timer added/removed. 
Use the cached value */ 1912 + expires_next = cpu_base->deferred_expires_next; 1913 + } 1914 + hrtimer_rearm(cpu_base, expires_next, true); 1915 + } 1916 + 1917 + static __always_inline void 1918 + hrtimer_interrupt_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t expires_next) 1919 + { 1920 + /* hrtimer_interrupt() just re-evaluated the first expiring timer */ 1921 + cpu_base->deferred_needs_update = false; 1922 + /* Cache the expiry time */ 1923 + cpu_base->deferred_expires_next = expires_next; 1924 + set_thread_flag(TIF_HRTIMER_REARM); 1925 + } 1926 + #else /* CONFIG_HRTIMER_REARM_DEFERRED */ 1927 + static __always_inline void 1928 + hrtimer_interrupt_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t expires_next) 1929 + { 1930 + hrtimer_rearm(cpu_base, expires_next, false); 1931 + } 1932 + #endif /* !CONFIG_HRTIMER_REARM_DEFERRED */ 2019 1933 2020 1934 /* 2021 1935 * High resolution timer interrupt ··· 2092 1892 raw_spin_lock_irqsave(&cpu_base->lock, flags); 2093 1893 entry_time = now = hrtimer_update_base(cpu_base); 2094 1894 retry: 2095 - cpu_base->in_hrtirq = 1; 1895 + cpu_base->deferred_rearm = true; 2096 1896 /* 2097 - * We set expires_next to KTIME_MAX here with cpu_base->lock 2098 - * held to prevent that a timer is enqueued in our queue via 2099 - * the migration code. This does not affect enqueueing of 2100 - * timers which run their callback and need to be requeued on 2101 - * this CPU. 1897 + * Set expires_next to KTIME_MAX, which prevents that remote CPUs queue 1898 + * timers while __hrtimer_run_queues() is expiring the clock bases. 1899 + * Timers which are re/enqueued on the local CPU are not affected by 1900 + * this. 
2102 1901 */ 2103 1902 cpu_base->expires_next = KTIME_MAX; 2104 1903 2105 1904 if (!ktime_before(now, cpu_base->softirq_expires_next)) { 2106 1905 cpu_base->softirq_expires_next = KTIME_MAX; 2107 - cpu_base->softirq_activated = 1; 1906 + cpu_base->softirq_activated = true; 2108 1907 raise_timer_softirq(HRTIMER_SOFTIRQ); 2109 1908 } 2110 1909 2111 1910 __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); 2112 - 2113 - /* Reevaluate the clock bases for the [soft] next expiry */ 2114 - expires_next = hrtimer_update_next_event(cpu_base); 2115 - /* 2116 - * Store the new expiry value so the migration code can verify 2117 - * against it. 2118 - */ 2119 - cpu_base->expires_next = expires_next; 2120 - cpu_base->in_hrtirq = 0; 2121 - raw_spin_unlock_irqrestore(&cpu_base->lock, flags); 2122 - 2123 - /* Reprogramming necessary ? */ 2124 - if (!tick_program_event(expires_next, 0)) { 2125 - cpu_base->hang_detected = 0; 2126 - return; 2127 - } 2128 1911 2129 1912 /* 2130 1913 * The next timer was already expired due to: ··· 2115 1932 * - long lasting callbacks 2116 1933 * - being scheduled away when running in a VM 2117 1934 * 2118 - * We need to prevent that we loop forever in the hrtimer 2119 - * interrupt routine. We give it 3 attempts to avoid 2120 - * overreacting on some spurious event. 2121 - * 2122 - * Acquire base lock for updating the offsets and retrieving 2123 - * the current time. 1935 + * We need to prevent that we loop forever in the hrtimer interrupt 1936 + * routine. We give it 3 attempts to avoid overreacting on some 1937 + * spurious event. 2124 1938 */ 2125 - raw_spin_lock_irqsave(&cpu_base->lock, flags); 2126 1939 now = hrtimer_update_base(cpu_base); 2127 - cpu_base->nr_retries++; 2128 - if (++retries < 3) 2129 - goto retry; 2130 - /* 2131 - * Give the system a chance to do something else than looping 2132 - * here. We stored the entry time, so we know exactly how long 2133 - * we spent here.
We schedule the next event this amount of 2134 - * time away. 2135 - */ 2136 - cpu_base->nr_hangs++; 2137 - cpu_base->hang_detected = 1; 2138 - raw_spin_unlock_irqrestore(&cpu_base->lock, flags); 1940 + expires_next = hrtimer_update_next_event(cpu_base); 1941 + cpu_base->hang_detected = false; 1942 + if (expires_next < now) { 1943 + if (++retries < 3) 1944 + goto retry; 2139 1945 2140 - delta = ktime_sub(now, entry_time); 2141 - if ((unsigned int)delta > cpu_base->max_hang_time) 2142 - cpu_base->max_hang_time = (unsigned int) delta; 2143 - /* 2144 - * Limit it to a sensible value as we enforce a longer 2145 - * delay. Give the CPU at least 100ms to catch up. 2146 - */ 2147 - if (delta > 100 * NSEC_PER_MSEC) 2148 - expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC); 2149 - else 2150 - expires_next = ktime_add(now, delta); 2151 - tick_program_event(expires_next, 1); 2152 - pr_warn_once("hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta)); 1946 + delta = ktime_sub(now, entry_time); 1947 + cpu_base->max_hang_time = max_t(unsigned int, cpu_base->max_hang_time, delta); 1948 + cpu_base->nr_hangs++; 1949 + cpu_base->hang_detected = true; 1950 + } 1951 + 1952 + hrtimer_interrupt_rearm(cpu_base, expires_next); 1953 + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); 2153 1954 } 1955 + 2154 1956 #endif /* !CONFIG_HIGH_RES_TIMERS */ 2155 1957 2156 1958 /* ··· 2167 1999 2168 2000 if (!ktime_before(now, cpu_base->softirq_expires_next)) { 2169 2001 cpu_base->softirq_expires_next = KTIME_MAX; 2170 - cpu_base->softirq_activated = 1; 2002 + cpu_base->softirq_activated = true; 2171 2003 raise_timer_softirq(HRTIMER_SOFTIRQ); 2172 2004 } 2173 2005 ··· 2180 2012 */ 2181 2013 static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) 2182 2014 { 2183 - struct hrtimer_sleeper *t = 2184 - container_of(timer, struct hrtimer_sleeper, timer); 2015 + struct hrtimer_sleeper *t = container_of(timer, struct hrtimer_sleeper, timer); 2185 2016 struct task_struct *task = t->task; 
2186 2017 2187 2018 t->task = NULL; ··· 2198 2031 * Wrapper around hrtimer_start_expires() for hrtimer_sleeper based timers 2199 2032 * to allow PREEMPT_RT to tweak the delivery mode (soft/hardirq context) 2200 2033 */ 2201 - void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, 2202 - enum hrtimer_mode mode) 2034 + void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, enum hrtimer_mode mode) 2203 2035 { 2204 2036 /* 2205 2037 * Make the enqueue delivery mode check work on RT. If the sleeper ··· 2214 2048 } 2215 2049 EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires); 2216 2050 2217 - static void __hrtimer_setup_sleeper(struct hrtimer_sleeper *sl, 2218 - clockid_t clock_id, enum hrtimer_mode mode) 2051 + static void __hrtimer_setup_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id, 2052 + enum hrtimer_mode mode) 2219 2053 { 2220 2054 /* 2221 2055 * On PREEMPT_RT enabled kernels hrtimers which are not explicitly ··· 2251 2085 * @clock_id: the clock to be used 2252 2086 * @mode: timer mode abs/rel 2253 2087 */ 2254 - void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl, 2255 - clockid_t clock_id, enum hrtimer_mode mode) 2088 + void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl, clockid_t clock_id, 2089 + enum hrtimer_mode mode) 2256 2090 { 2257 2091 debug_setup_on_stack(&sl->timer, clock_id, mode); 2258 2092 __hrtimer_setup_sleeper(sl, clock_id, mode); ··· 2325 2159 return ret; 2326 2160 } 2327 2161 2328 - long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, 2329 - const clockid_t clockid) 2162 + long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, const clockid_t clockid) 2330 2163 { 2331 2164 struct restart_block *restart; 2332 2165 struct hrtimer_sleeper t; ··· 2368 2203 current->restart_block.fn = do_no_restart_syscall; 2369 2204 current->restart_block.nanosleep.type = rmtp ? 
TT_NATIVE : TT_NONE; 2370 2205 current->restart_block.nanosleep.rmtp = rmtp; 2371 - return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, 2372 - CLOCK_MONOTONIC); 2206 + return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, CLOCK_MONOTONIC); 2373 2207 } 2374 2208 2375 2209 #endif ··· 2376 2212 #ifdef CONFIG_COMPAT_32BIT_TIME 2377 2213 2378 2214 SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp, 2379 - struct old_timespec32 __user *, rmtp) 2215 + struct old_timespec32 __user *, rmtp) 2380 2216 { 2381 2217 struct timespec64 tu; 2382 2218 ··· 2389 2225 current->restart_block.fn = do_no_restart_syscall; 2390 2226 current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; 2391 2227 current->restart_block.nanosleep.compat_rmtp = rmtp; 2392 - return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, 2393 - CLOCK_MONOTONIC); 2228 + return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, CLOCK_MONOTONIC); 2394 2229 } 2395 2230 #endif 2396 2231 ··· 2399 2236 int hrtimers_prepare_cpu(unsigned int cpu) 2400 2237 { 2401 2238 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); 2402 - int i; 2403 2239 2404 - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 2240 + for (int i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 2405 2241 struct hrtimer_clock_base *clock_b = &cpu_base->clock_base[i]; 2406 2242 2407 2243 clock_b->cpu_base = cpu_base; 2408 2244 seqcount_raw_spinlock_init(&clock_b->seq, &cpu_base->lock); 2409 - timerqueue_init_head(&clock_b->active); 2245 + timerqueue_linked_init_head(&clock_b->active); 2410 2246 } 2411 2247 2412 2248 cpu_base->cpu = cpu; ··· 2419 2257 2420 2258 /* Clear out any left over state from a CPU down operation */ 2421 2259 cpu_base->active_bases = 0; 2422 - cpu_base->hres_active = 0; 2423 - cpu_base->hang_detected = 0; 2260 + cpu_base->hres_active = false; 2261 + cpu_base->hang_detected = false; 2424 2262 cpu_base->next_timer = NULL; 2425 2263 
cpu_base->softirq_next_timer = NULL; 2426 2264 cpu_base->expires_next = KTIME_MAX; 2427 2265 cpu_base->softirq_expires_next = KTIME_MAX; 2428 - cpu_base->online = 1; 2266 + cpu_base->softirq_activated = false; 2267 + cpu_base->online = true; 2429 2268 return 0; 2430 2269 } 2431 2270 ··· 2435 2272 static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, 2436 2273 struct hrtimer_clock_base *new_base) 2437 2274 { 2275 + struct timerqueue_linked_node *node; 2438 2276 struct hrtimer *timer; 2439 - struct timerqueue_node *node; 2440 2277 2441 - while ((node = timerqueue_getnext(&old_base->active))) { 2278 + while ((node = timerqueue_linked_first(&old_base->active))) { 2442 2279 timer = container_of(node, struct hrtimer, node); 2443 2280 BUG_ON(hrtimer_callback_running(timer)); 2444 - debug_deactivate(timer); 2281 + debug_hrtimer_deactivate(timer); 2445 2282 2446 2283 /* 2447 2284 * Mark it as ENQUEUED not INACTIVE otherwise the 2448 2285 * timer could be seen as !active and just vanish away 2449 2286 * under us on another CPU 2450 2287 */ 2451 - __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0); 2288 + __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, false); 2452 2289 timer->base = new_base; 2453 2290 /* 2454 2291 * Enqueue the timers on the new cpu. This does not ··· 2458 2295 * sort out already expired timers and reprogram the 2459 2296 * event device. 
2460 2297 */ 2461 - enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS); 2298 + enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS, true); 2462 2299 } 2463 2300 } 2464 2301 2465 2302 int hrtimers_cpu_dying(unsigned int dying_cpu) 2466 2303 { 2467 - int i, ncpu = cpumask_any_and(cpu_active_mask, housekeeping_cpumask(HK_TYPE_TIMER)); 2304 + int ncpu = cpumask_any_and(cpu_active_mask, housekeeping_cpumask(HK_TYPE_TIMER)); 2468 2305 struct hrtimer_cpu_base *old_base, *new_base; 2469 2306 2470 2307 old_base = this_cpu_ptr(&hrtimer_bases); ··· 2477 2314 raw_spin_lock(&old_base->lock); 2478 2315 raw_spin_lock_nested(&new_base->lock, SINGLE_DEPTH_NESTING); 2479 2316 2480 - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 2481 - migrate_hrtimer_list(&old_base->clock_base[i], 2482 - &new_base->clock_base[i]); 2483 - } 2317 + for (int i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) 2318 + migrate_hrtimer_list(&old_base->clock_base[i], &new_base->clock_base[i]); 2484 2319 2485 2320 /* Tell the other CPU to retrigger the next event */ 2486 2321 smp_call_function_single(ncpu, retrigger_next_event, NULL, 0); 2487 2322 2488 2323 raw_spin_unlock(&new_base->lock); 2489 - old_base->online = 0; 2324 + old_base->online = false; 2490 2325 raw_spin_unlock(&old_base->lock); 2491 2326 2492 2327 return 0;

-1

kernel/time/tick-broadcast-hrtimer.c

··· 78 78 .set_state_shutdown = bc_shutdown, 79 79 .set_next_ktime = bc_set_next, 80 80 .features = CLOCK_EVT_FEAT_ONESHOT | 81 - CLOCK_EVT_FEAT_KTIME | 82 81 CLOCK_EVT_FEAT_HRTIMER, 83 82 .rating = 0, 84 83 .bound_on = -1,

+20 -7

kernel/time/tick-sched.c

··· 864 864 } 865 865 EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); 866 866 867 + /* Simplified variant of hrtimer_forward_now() */ 868 + static ktime_t tick_forward_now(ktime_t expires, ktime_t now) 869 + { 870 + ktime_t delta = now - expires; 871 + 872 + if (likely(delta < TICK_NSEC)) 873 + return expires + TICK_NSEC; 874 + 875 + expires += TICK_NSEC * ktime_divns(delta, TICK_NSEC); 876 + if (expires > now) 877 + return expires; 878 + return expires + TICK_NSEC; 879 + } 880 + 867 881 static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) 868 882 { 869 - hrtimer_cancel(&ts->sched_timer); 870 - hrtimer_set_expires(&ts->sched_timer, ts->last_tick); 883 + ktime_t expires = ts->last_tick; 871 884 872 - /* Forward the time to expire in the future */ 873 - hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); 885 + if (now >= expires) 886 + expires = tick_forward_now(expires, now); 874 887 875 888 if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) { 876 - hrtimer_start_expires(&ts->sched_timer, 877 - HRTIMER_MODE_ABS_PINNED_HARD); 889 + hrtimer_start(&ts->sched_timer, expires, HRTIMER_MODE_ABS_PINNED_HARD); 878 890 } else { 879 - tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); 891 + hrtimer_set_expires(&ts->sched_timer, expires); 892 + tick_program_event(expires, 1); 880 893 } 881 894 882 895 /*

+174 -21

kernel/time/timekeeping.c

··· 3 3 * Kernel timekeeping code and accessor functions. Based on code from 4 4 * timer.c, moved in commit 8524070b7982. 5 5 */ 6 - #include <linux/timekeeper_internal.h> 7 - #include <linux/module.h> 8 - #include <linux/interrupt.h> 9 - #include <linux/kobject.h> 10 - #include <linux/percpu.h> 11 - #include <linux/init.h> 12 - #include <linux/mm.h> 13 - #include <linux/nmi.h> 14 - #include <linux/sched.h> 15 - #include <linux/sched/loadavg.h> 16 - #include <linux/sched/clock.h> 17 - #include <linux/syscore_ops.h> 6 + #include <linux/audit.h> 18 7 #include <linux/clocksource.h> 8 + #include <linux/compiler.h> 19 9 #include <linux/jiffies.h> 10 + #include <linux/kobject.h> 11 + #include <linux/module.h> 12 + #include <linux/nmi.h> 13 + #include <linux/pvclock_gtod.h> 14 + #include <linux/random.h> 15 + #include <linux/sched/clock.h> 16 + #include <linux/sched/loadavg.h> 17 + #include <linux/static_key.h> 18 + #include <linux/stop_machine.h> 19 + #include <linux/syscore_ops.h> 20 + #include <linux/tick.h> 20 21 #include <linux/time.h> 21 22 #include <linux/timex.h> 22 - #include <linux/tick.h> 23 - #include <linux/stop_machine.h> 24 - #include <linux/pvclock_gtod.h> 25 - #include <linux/compiler.h> 26 - #include <linux/audit.h> 27 - #include <linux/random.h> 23 + #include <linux/timekeeper_internal.h> 28 24 29 25 #include <vdso/auxclock.h> 30 26 31 27 #include "tick-internal.h" 32 - #include "ntp_internal.h" 33 28 #include "timekeeping_internal.h" 29 + #include "ntp_internal.h" 34 30 35 31 #define TK_CLEAR_NTP (1 << 0) 36 32 #define TK_CLOCK_WAS_SET (1 << 1) ··· 271 275 tk->monotonic_to_boot = ktime_to_timespec64(tk->offs_boot); 272 276 } 273 277 278 + #ifdef CONFIG_ARCH_WANTS_CLOCKSOURCE_READ_INLINE 279 + #include <asm/clock_inlined.h> 280 + 281 + static DEFINE_STATIC_KEY_FALSE(clocksource_read_inlined); 282 + 274 283 /* 275 284 * tk_clock_read - atomic clocksource read() helper 276 285 * ··· 289 288 * a read of the fast-timekeeper tkrs (which is protected by its 
own locking 290 289 * and update logic). 291 290 */ 292 - static inline u64 tk_clock_read(const struct tk_read_base *tkr) 291 + static __always_inline u64 tk_clock_read(const struct tk_read_base *tkr) 292 + { 293 + struct clocksource *clock = READ_ONCE(tkr->clock); 294 + 295 + if (static_branch_likely(&clocksource_read_inlined)) 296 + return arch_inlined_clocksource_read(clock); 297 + 298 + return clock->read(clock); 299 + } 300 + 301 + static inline void clocksource_disable_inline_read(void) 302 + { 303 + static_branch_disable(&clocksource_read_inlined); 304 + } 305 + 306 + static inline void clocksource_enable_inline_read(void) 307 + { 308 + static_branch_enable(&clocksource_read_inlined); 309 + } 310 + #else 311 + static __always_inline u64 tk_clock_read(const struct tk_read_base *tkr) 293 312 { 294 313 struct clocksource *clock = READ_ONCE(tkr->clock); 295 314 296 315 return clock->read(clock); 297 316 } 317 + static inline void clocksource_disable_inline_read(void) { } 318 + static inline void clocksource_enable_inline_read(void) { } 319 + #endif 298 320 299 321 /** 300 322 * tk_setup_internals - Set up internals to use clocksource clock. ··· 391 367 tk->tkr_raw.mult = clock->mult; 392 368 tk->ntp_err_mult = 0; 393 369 tk->skip_second_overflow = 0; 370 + 371 + tk->cs_id = clock->id; 372 + 373 + /* Coupled clockevent data */ 374 + if (IS_ENABLED(CONFIG_GENERIC_CLOCKEVENTS_COUPLED) && 375 + clock->flags & CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT) { 376 + /* 377 + * Aim for a one hour maximum delta and use KHz to handle 378 + * clocksources with a frequency above 4GHz correctly as 379 + * the frequency argument of clocks_calc_mult_shift() is u32.
380 + */ 381 + clocks_calc_mult_shift(&tk->cs_ns_to_cyc_mult, &tk->cs_ns_to_cyc_shift, 382 + NSEC_PER_MSEC, clock->freq_khz, 3600 * 1000); 383 + /* 384 + * Initialize the conversion limit as the previous clocksource 385 + * might have the same shift/mult pair so the quick check in 386 + * tk_update_ns_to_cyc() fails to update it after a clocksource 387 + * change leaving it effectively zero. 388 + */ 389 + tk->cs_ns_to_cyc_maxns = div_u64(clock->mask, tk->cs_ns_to_cyc_mult); 390 + } 394 391 } 395 392 396 393 /* Timekeeper helper functions. */ ··· 420 375 return mul_u64_u32_add_u64_shr(delta, tkr->mult, tkr->xtime_nsec, tkr->shift); 421 376 } 422 377 423 - static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles) 378 + static __always_inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles) 424 379 { 425 380 /* Calculate the delta since the last update_wall_time() */ 426 381 u64 mask = tkr->mask, delta = (cycles - tkr->cycle_last) & mask; ··· 741 696 tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC); 742 697 } 743 698 699 + static inline void tk_update_ns_to_cyc(struct timekeeper *tks, struct timekeeper *tkc) 700 + { 701 + struct tk_read_base *tkrs = &tks->tkr_mono; 702 + struct tk_read_base *tkrc = &tkc->tkr_mono; 703 + unsigned int shift; 704 + 705 + if (!IS_ENABLED(CONFIG_GENERIC_CLOCKEVENTS_COUPLED) || 706 + !(tkrs->clock->flags & CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT)) 707 + return; 708 + 709 + if (tkrs->mult == tkrc->mult && tkrs->shift == tkrc->shift) 710 + return; 711 + /* 712 + * The conversion math is simple: 713 + * 714 + * CS::MULT (1 << NS_TO_CYC_SHIFT) 715 + * --------------- = ---------------------- 716 + * (1 << CS::SHIFT) NS_TO_CYC_MULT 717 + * 718 + * Ergo: 719 + * 720 + * NS_TO_CYC_MULT = (1 << (CS::SHIFT + NS_TO_CYC_SHIFT)) / CS::MULT 721 + * 722 + * NS_TO_CYC_SHIFT has been set up in tk_setup_internals() 723 + */ 724 + shift = tkrs->shift + tks->cs_ns_to_cyc_shift; 725 +
tks->cs_ns_to_cyc_mult = (u32)div_u64(1ULL << shift, tkrs->mult); 726 + tks->cs_ns_to_cyc_maxns = div_u64(tkrs->clock->mask, tks->cs_ns_to_cyc_mult); 727 + } 728 + 744 729 /* 745 730 * Restore the shadow timekeeper from the real timekeeper. 746 731 */ ··· 805 730 tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real; 806 731 807 732 if (tk->id == TIMEKEEPER_CORE) { 733 + tk_update_ns_to_cyc(tk, &tkd->timekeeper); 808 734 update_vsyscall(tk); 809 735 update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); 810 736 ··· 858 782 delta -= incr; 859 783 } 860 784 tk_update_coarse_nsecs(tk); 785 + } 786 + 787 + /* 788 + * ktime_expiry_to_cycles - Convert an expiry time to clocksource cycles 789 + * @id: Clocksource ID which is required for validity 790 + * @expires_ns: Absolute CLOCK_MONOTONIC expiry time (nsecs) to be converted 791 + * @cycles: Pointer to storage for corresponding absolute cycles value 792 + * 793 + * Convert a CLOCK_MONOTONIC based absolute expiry time to a cycles value 794 + * based on the correlated clocksource of the clockevent device by using 795 + * the base nanoseconds and cycles values of the last timekeeper update and 796 + * converting the delta between @expires_ns and base nanoseconds to cycles. 797 + * 798 + * This only works for clockevent devices which are using a less than or 799 + * equal comparator against the clocksource. 800 + * 801 + * Utilizing this avoids two clocksource reads for such devices, the 802 + * ktime_get() in clockevents_program_event() to calculate the delta expiry 803 + * value and the readout in the device::set_next_event() callback to 804 + * convert the delta back to an absolute comparator value.
805 + * 806 + * Returns: True if @id matches the current clocksource ID, false otherwise 807 + */ 808 + bool ktime_expiry_to_cycles(enum clocksource_ids id, ktime_t expires_ns, u64 *cycles) 809 + { 810 + struct timekeeper *tk = &tk_core.timekeeper; 811 + struct tk_read_base *tkrm = &tk->tkr_mono; 812 + ktime_t base_ns, delta_ns, max_ns; 813 + u64 base_cycles, delta_cycles; 814 + unsigned int seq; 815 + u32 mult, shift; 816 + 817 + /* 818 + * Racy check to avoid the seqcount overhead when ID does not match. If 819 + * the relevant clocksource is installed concurrently, then this will 820 + * just delay the switch over to this mechanism until the next event is 821 + * programmed. If the ID is not matching, the clock events code will use 822 + * the regular relative set_next_event() callback as before. 823 + */ 824 + if (data_race(tk->cs_id) != id) 825 + return false; 826 + 827 + do { 828 + seq = read_seqcount_begin(&tk_core.seq); 829 + 830 + if (tk->cs_id != id) 831 + return false; 832 + 833 + base_cycles = tkrm->cycle_last; 834 + base_ns = tkrm->base + (tkrm->xtime_nsec >> tkrm->shift); 835 + 836 + mult = tk->cs_ns_to_cyc_mult; 837 + shift = tk->cs_ns_to_cyc_shift; 838 + max_ns = tk->cs_ns_to_cyc_maxns; 839 + 840 + } while (read_seqcount_retry(&tk_core.seq, seq)); 841 + 842 + /* Prevent negative deltas and multiplication overflows */ 843 + delta_ns = min(expires_ns - base_ns, max_ns); 844 + delta_ns = max(delta_ns, 0); 845 + 846 + /* Convert to cycles */ 847 + delta_cycles = ((u64)delta_ns * mult) >> shift; 848 + *cycles = base_cycles + delta_cycles; 849 + return true; 861 850 } 862 851 863 852 /** ··· 1772 1631 1773 1632 if (tk->tkr_mono.clock == clock) 1774 1633 return 0; 1634 + 1635 + /* Disable inlined reads across the clocksource switch */ 1636 + clocksource_disable_inline_read(); 1637 + 1775 1638 stop_machine(change_clocksource, clock, NULL); 1639 + 1640 + /* 1641 + * If the clocksource has been selected and supports inlined reads 1642 + * enable the branch. 
1643 + */ 1644 + if (tk->tkr_mono.clock == clock && clock->flags & CLOCK_SOURCE_CAN_INLINE_READ) 1645 + clocksource_enable_inline_read(); 1646 + 1776 1647 tick_clock_notify(); 1777 1648 return tk->tkr_mono.clock == clock ? 0 : -1; 1778 1649 }

+2

kernel/time/timekeeping.h

··· 9 9 ktime_t *offs_boot, 10 10 ktime_t *offs_tai); 11 11 12 + bool ktime_expiry_to_cycles(enum clocksource_ids id, ktime_t expires_ns, u64 *cycles); 13 + 12 14 extern int timekeeping_valid_for_hres(void); 13 15 extern u64 timekeeping_max_deferment(void); 14 16 extern void timekeeping_warp_clock(void);

+5 -7

kernel/time/timer_list.c

··· 47 47 int idx, u64 now) 48 48 { 49 49 SEQ_printf(m, " #%d: <%p>, %ps", idx, taddr, ACCESS_PRIVATE(timer, function)); 50 - SEQ_printf(m, ", S:%02x", timer->state); 50 + SEQ_printf(m, ", S:%02x", timer->is_queued); 51 51 SEQ_printf(m, "\n"); 52 52 SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n", 53 53 (unsigned long long)ktime_to_ns(hrtimer_get_softexpires(timer)), ··· 56 56 (long long)(ktime_to_ns(hrtimer_get_expires(timer)) - now)); 57 57 } 58 58 59 - static void 60 - print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, 61 - u64 now) 59 + static void print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) 62 60 { 61 + struct timerqueue_linked_node *curr; 63 62 struct hrtimer *timer, tmp; 64 63 unsigned long next = 0, i; 65 - struct timerqueue_node *curr; 66 64 unsigned long flags; 67 65 68 66 next_one: ··· 70 72 71 73 raw_spin_lock_irqsave(&base->cpu_base->lock, flags); 72 74 73 - curr = timerqueue_getnext(&base->active); 75 + curr = timerqueue_linked_first(&base->active); 74 76 /* 75 77 * Crude but we have to do this O(N*N) thing, because 76 78 * we have to unlock the base when printing: 77 79 */ 78 80 while (curr && i < next) { 79 - curr = timerqueue_iterate_next(curr); 81 + curr = timerqueue_linked_next(curr); 80 82 i++; 81 83 } 82 84

+17

lib/rbtree.c

··· 446 446 } 447 447 EXPORT_SYMBOL(rb_erase); 448 448 449 + bool rb_erase_linked(struct rb_node_linked *node, struct rb_root_linked *root) 450 + { 451 + if (node->prev) 452 + node->prev->next = node->next; 453 + else 454 + root->rb_leftmost = node->next; 455 + 456 + if (node->next) 457 + node->next->prev = node->prev; 458 + 459 + rb_erase(&node->node, &root->rb_root); 460 + RB_CLEAR_LINKED_NODE(node); 461 + 462 + return !!root->rb_leftmost; 463 + } 464 + EXPORT_SYMBOL_GPL(rb_erase_linked); 465 + 449 466 /* 450 467 * Augmented rbtree manipulation functions. 451 468 *

+14

lib/timerqueue.c

··· 82 82 return container_of(next, struct timerqueue_node, node); 83 83 } 84 84 EXPORT_SYMBOL_GPL(timerqueue_iterate_next); 85 + 86 + #define __node_2_tq_linked(_n) \ 87 + container_of(rb_entry((_n), struct rb_node_linked, node), struct timerqueue_linked_node, node) 88 + 89 + static __always_inline bool __tq_linked_less(struct rb_node *a, const struct rb_node *b) 90 + { 91 + return __node_2_tq_linked(a)->expires < __node_2_tq_linked(b)->expires; 92 + } 93 + 94 + bool timerqueue_linked_add(struct timerqueue_linked_head *head, struct timerqueue_linked_node *node) 95 + { 96 + return rb_add_linked(&node->node, &head->rb_root, __tq_linked_less); 97 + } 98 + EXPORT_SYMBOL_GPL(timerqueue_linked_add);

Configure Feed

Configure Feed