Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

rseq: Expose lightweight statistics in debugfs

Being able to observe the call frequency without actually using tracing is
helpful for analyzing this infrastructure. The overhead is minimal, as it just
increments a per-CPU counter associated with each operation.

The debugfs readout provides a racy sum of all counters.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084307.027916598@linutronix.de

authored by

Thomas Gleixner and committed by
Ingo Molnar
54129104 dab34475

+135 -25
-16
include/linux/rseq.h
··· 29 29 } 30 30 } 31 31 32 - static __always_inline void rseq_exit_to_user_mode(void) 33 - { 34 - struct rseq_event *ev = &current->rseq.event; 35 - 36 - if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) 37 - WARN_ON_ONCE(ev->sched_switch); 38 - 39 - /* 40 - * Ensure that event (especially user_irq) is cleared when the 41 - * interrupt did not result in a schedule and therefore the 42 - * rseq processing did not clear it. 43 - */ 44 - ev->events = 0; 45 - } 46 - 47 32 /* 48 33 * KVM/HYPERV invoke resume_user_mode_work() before entering guest mode, 49 34 * which clears TIF_NOTIFY_RESUME. To avoid updating user space RSEQ in ··· 77 92 static inline void rseq_virt_userspace_exit(void) { } 78 93 static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { } 79 94 static inline void rseq_execve(struct task_struct *t) { } 80 - static inline void rseq_exit_to_user_mode(void) { } 81 95 #endif /* !CONFIG_RSEQ */ 82 96 83 97 #ifdef CONFIG_DEBUG_RSEQ
+49
include/linux/rseq_entry.h
··· 2 2 #ifndef _LINUX_RSEQ_ENTRY_H 3 3 #define _LINUX_RSEQ_ENTRY_H 4 4 5 + /* Must be outside the CONFIG_RSEQ guard to resolve the stubs */ 6 + #ifdef CONFIG_RSEQ_STATS 7 + #include <linux/percpu.h> 8 + 9 + struct rseq_stats { 10 + unsigned long exit; 11 + unsigned long signal; 12 + unsigned long slowpath; 13 + unsigned long ids; 14 + unsigned long cs; 15 + unsigned long clear; 16 + unsigned long fixup; 17 + }; 18 + 19 + DECLARE_PER_CPU(struct rseq_stats, rseq_stats); 20 + 21 + /* 22 + * Slow path has interrupts and preemption enabled, but the fast path 23 + * runs with interrupts disabled so there is no point in having the 24 + * preemption checks implied in __this_cpu_inc() for every operation. 25 + */ 26 + #ifdef RSEQ_BUILD_SLOW_PATH 27 + #define rseq_stat_inc(which) this_cpu_inc((which)) 28 + #else 29 + #define rseq_stat_inc(which) raw_cpu_inc((which)) 30 + #endif 31 + 32 + #else /* CONFIG_RSEQ_STATS */ 33 + #define rseq_stat_inc(x) do { } while (0) 34 + #endif /* !CONFIG_RSEQ_STATS */ 35 + 5 36 #ifdef CONFIG_RSEQ 6 37 #include <linux/rseq.h> 7 38 ··· 70 39 current->rseq.event.user_irq = true; 71 40 } 72 41 42 + static __always_inline void rseq_exit_to_user_mode(void) 43 + { 44 + struct rseq_event *ev = &current->rseq.event; 45 + 46 + rseq_stat_inc(rseq_stats.exit); 47 + 48 + if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) 49 + WARN_ON_ONCE(ev->sched_switch); 50 + 51 + /* 52 + * Ensure that event (especially user_irq) is cleared when the 53 + * interrupt did not result in a schedule and therefore the 54 + * rseq processing did not clear it. 55 + */ 56 + ev->events = 0; 57 + } 58 + 73 59 #else /* CONFIG_RSEQ */ 74 60 static inline void rseq_note_user_irq_entry(void) { } 61 + static inline void rseq_exit_to_user_mode(void) { } 75 62 #endif /* !CONFIG_RSEQ */ 76 63 77 64 #endif /* _LINUX_RSEQ_ENTRY_H */
+12
init/Kconfig
··· 1913 1913 1914 1914 If unsure, say Y. 1915 1915 1916 + config RSEQ_STATS 1917 + default n 1918 + bool "Enable lightweight statistics of restartable sequences" if EXPERT 1919 + depends on RSEQ && DEBUG_FS 1920 + help 1921 + Enable lightweight counters which expose information about the 1922 + frequency of RSEQ operations via debugfs. Mostly interesting for 1923 + kernel debugging or performance analysis. While lightweight it's 1924 + still adding code into the user/kernel mode transitions. 1925 + 1926 + If unsure, say N. 1927 + 1916 1928 config DEBUG_RSEQ 1917 1929 default n 1918 1930 bool "Enable debugging of rseq() system call" if EXPERT
+74 -9
kernel/rseq.c
··· 67 67 * F1. <failure> 68 68 */ 69 69 70 - #include <linux/sched.h> 71 - #include <linux/uaccess.h> 72 - #include <linux/syscalls.h> 73 - #include <linux/rseq_entry.h> 74 - #include <linux/types.h> 70 + /* Required to select the proper per_cpu ops for rseq_stats_inc() */ 71 + #define RSEQ_BUILD_SLOW_PATH 72 + 73 + #include <linux/debugfs.h> 75 74 #include <linux/ratelimit.h> 75 + #include <linux/rseq_entry.h> 76 + #include <linux/sched.h> 77 + #include <linux/syscalls.h> 78 + #include <linux/uaccess.h> 79 + #include <linux/types.h> 76 80 #include <asm/ptrace.h> 77 81 78 82 #define CREATE_TRACE_POINTS ··· 111 107 trace_rseq_ip_fixup(ip, start_ip, offset, abort_ip); 112 108 } 113 109 #endif /* CONFIG_TRACEPOINTS */ 110 + 111 + #ifdef CONFIG_RSEQ_STATS 112 + DEFINE_PER_CPU(struct rseq_stats, rseq_stats); 113 + 114 + static int rseq_debug_show(struct seq_file *m, void *p) 115 + { 116 + struct rseq_stats stats = { }; 117 + unsigned int cpu; 118 + 119 + for_each_possible_cpu(cpu) { 120 + stats.exit += data_race(per_cpu(rseq_stats.exit, cpu)); 121 + stats.signal += data_race(per_cpu(rseq_stats.signal, cpu)); 122 + stats.slowpath += data_race(per_cpu(rseq_stats.slowpath, cpu)); 123 + stats.ids += data_race(per_cpu(rseq_stats.ids, cpu)); 124 + stats.cs += data_race(per_cpu(rseq_stats.cs, cpu)); 125 + stats.clear += data_race(per_cpu(rseq_stats.clear, cpu)); 126 + stats.fixup += data_race(per_cpu(rseq_stats.fixup, cpu)); 127 + } 128 + 129 + seq_printf(m, "exit: %16lu\n", stats.exit); 130 + seq_printf(m, "signal: %16lu\n", stats.signal); 131 + seq_printf(m, "slowp: %16lu\n", stats.slowpath); 132 + seq_printf(m, "ids: %16lu\n", stats.ids); 133 + seq_printf(m, "cs: %16lu\n", stats.cs); 134 + seq_printf(m, "clear: %16lu\n", stats.clear); 135 + seq_printf(m, "fixup: %16lu\n", stats.fixup); 136 + return 0; 137 + } 138 + 139 + static int rseq_debug_open(struct inode *inode, struct file *file) 140 + { 141 + return single_open(file, rseq_debug_show, inode->i_private); 142 + } 143 
+ 144 + static const struct file_operations dfs_ops = { 145 + .open = rseq_debug_open, 146 + .read = seq_read, 147 + .llseek = seq_lseek, 148 + .release = single_release, 149 + }; 150 + 151 + static int __init rseq_debugfs_init(void) 152 + { 153 + struct dentry *root_dir = debugfs_create_dir("rseq", NULL); 154 + 155 + debugfs_create_file("stats", 0444, root_dir, NULL, &dfs_ops); 156 + return 0; 157 + } 158 + __initcall(rseq_debugfs_init); 159 + #endif /* CONFIG_RSEQ_STATS */ 114 160 115 161 #ifdef CONFIG_DEBUG_RSEQ 116 162 static struct rseq *rseq_kernel_fields(struct task_struct *t) ··· 241 187 u32 node_id = cpu_to_node(cpu_id); 242 188 u32 mm_cid = task_mm_cid(t); 243 189 244 - /* 245 - * Validate read-only rseq fields. 246 - */ 190 + rseq_stat_inc(rseq_stats.ids); 191 + 192 + /* Validate read-only rseq fields on debug kernels */ 247 193 if (rseq_validate_ro_fields(t)) 248 194 goto efault; 249 195 WARN_ON_ONCE((int) mm_cid < 0); 196 + 250 197 if (!user_write_access_begin(rseq, t->rseq.len)) 251 198 goto efault; 252 199 ··· 458 403 struct rseq_cs rseq_cs; 459 404 int ret; 460 405 406 + rseq_stat_inc(rseq_stats.cs); 407 + 461 408 ret = rseq_get_rseq_cs(t, &rseq_cs); 462 409 if (ret) 463 410 return ret; ··· 469 412 * If not nested over a rseq critical section, restart is useless. 470 413 * Clear the rseq_cs pointer and return. 
471 414 */ 472 - if (!in_rseq_cs(ip, &rseq_cs)) 415 + if (!in_rseq_cs(ip, &rseq_cs)) { 416 + rseq_stat_inc(rseq_stats.clear); 473 417 return clear_rseq_cs(t->rseq.usrptr); 418 + } 474 419 ret = rseq_check_flags(t, rseq_cs.flags); 475 420 if (ret < 0) 476 421 return ret; ··· 481 422 ret = clear_rseq_cs(t->rseq.usrptr); 482 423 if (ret) 483 424 return ret; 425 + rseq_stat_inc(rseq_stats.fixup); 484 426 trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset, 485 427 rseq_cs.abort_ip); 486 428 instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip); ··· 521 461 522 462 if (unlikely(t->flags & PF_EXITING)) 523 463 return; 464 + 465 + if (ksig) 466 + rseq_stat_inc(rseq_stats.signal); 467 + else 468 + rseq_stat_inc(rseq_stats.slowpath); 524 469 525 470 /* 526 471 * Read and clear the event pending bit first. If the task