perf lock contention: Add -J/--inject-delay option

This is to slow down lock acquisition (on contended locks) deliberately.

A possible use case is to estimate impact on application performance by
optimization of kernel locking behavior. By delaying the lock it can
simulate a worse condition as a control group, and then compare with
the current behavior as an optimized condition.

The syntax is 'time@function' and the time can have unit suffix like
"us" and "ms". For example, I ran a simple test like below.

$ sudo perf lock con -abl -L tasklist_lock -- \
sh -c 'for i in $(seq 1000); do sleep 1 & done; wait'
contended total wait max wait avg wait address symbol

92 1.18 ms 199.54 us 12.79 us ffffffff8a806080 tasklist_lock (rwlock)

The contention count was 92 and the average wait time was around 10 us.
But if I add 100 usec of delay to the tasklist_lock,

$ sudo perf lock con -abl -L tasklist_lock -J 100us@tasklist_lock -- \
sh -c 'for i in $(seq 1000); do sleep 1 & done; wait'
contended total wait max wait avg wait address symbol

190 15.67 ms 230.10 us 82.46 us ffffffff8a806080 tasklist_lock (rwlock)

The contention count increased and the average wait time went up close
to 100 usec. If I increase the delay even more,

$ sudo perf lock con -abl -L tasklist_lock -J 1ms@tasklist_lock -- \
sh -c 'for i in $(seq 1000); do sleep 1 & done; wait'
contended total wait max wait avg wait address symbol

1002 2.80 s 3.01 ms 2.80 ms ffffffff8a806080 tasklist_lock (rwlock)

Now every sleep process had contention and the wait time was more than 1
msec. This is on my 4 CPU laptop so I guess one CPU has the lock while
the other 3 are mostly waiting for it.

For simplicity, it only supports global locks for now.

Committer testing:

root@number:~# grep -m1 'model name' /proc/cpuinfo
model name : AMD Ryzen 9 9950X3D 16-Core Processor
root@number:~# perf lock con -abl -L tasklist_lock -- sh -c 'for i in $(seq 1000); do sleep 1 & done; wait'
contended total wait max wait avg wait address symbol

142 453.85 us 25.39 us 3.20 us ffffffffae808080 tasklist_lock (rwlock)
root@number:~# perf lock con -abl -L tasklist_lock -J 100us@tasklist_lock -- sh -c 'for i in $(seq 1000); do sleep 1 & done; wait'
contended total wait max wait avg wait address symbol

1040 2.39 s 3.11 ms 2.30 ms ffffffffae808080 tasklist_lock (rwlock)
root@number:~# perf lock con -abl -L tasklist_lock -J 1ms@tasklist_lock -- sh -c 'for i in $(seq 1000); do sleep 1 & done; wait'
contended total wait max wait avg wait address symbol

1025 24.72 s 31.01 ms 24.12 ms ffffffffae808080 tasklist_lock (rwlock)
root@number:~#

Suggested-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20250509171950.183591-1-namhyung@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

authored by

Namhyung Kim and committed by

Arnaldo Carvalho de Melo 1 year ago c42e2199 4bfe2714

+164

5 changed files

expand all

tools

perf

Documentation

perf-lock.txt

builtin-lock.c

util

bpf_lock_contention.c

bpf_skel

lock_contention.bpf.c

lock-contention.h

+11

tools/perf/Documentation/perf-lock.txt

··· 216 216 --cgroup-filter=<value>:: 217 217 Show lock contention only in the given cgroups (comma separated list). 218 218 219 + -J:: 220 + --inject-delay=<time@function>:: 221 + Add delays to the given lock. It's added to the contention-end part so 222 + that the (new) owner of the lock will be delayed. But by slowing down 223 + the owner, the waiters will also be delayed as well. This is working 224 + only with -b/--use-bpf. 225 + 226 + The 'time' is specified in nsec but it can have a unit suffix. Available 227 + units are "ms" and "us". Note that it will busy-wait after it gets the 228 + lock. Please use it at your own risk. 229 + 219 230 220 231 SEE ALSO 221 232 --------

+74

tools/perf/builtin-lock.c

··· 62 62 static FILE *lock_output; 63 63 64 64 static struct lock_filter filters; 65 + static struct lock_delay *delays; 66 + static int nr_delays; 65 67 66 68 static enum lock_aggr_mode aggr_mode = LOCK_AGGR_ADDR; 67 69 ··· 2003 2001 .max_stack = max_stack_depth, 2004 2002 .stack_skip = stack_skip, 2005 2003 .filters = &filters, 2004 + .delays = delays, 2005 + .nr_delays = nr_delays, 2006 2006 .save_callstack = needs_callstack(), 2007 2007 .owner = show_lock_owner, 2008 2008 .cgroups = RB_ROOT, ··· 2508 2504 return ret; 2509 2505 } 2510 2506 2507 + static bool add_lock_delay(char *spec) 2508 + { 2509 + char *at, *pos; 2510 + struct lock_delay *tmp; 2511 + unsigned long duration; 2512 + 2513 + at = strchr(spec, '@'); 2514 + if (at == NULL) { 2515 + pr_err("lock delay should have '@' sign: %s\n", spec); 2516 + return false; 2517 + } 2518 + if (at == spec) { 2519 + pr_err("lock delay should have time before '@': %s\n", spec); 2520 + return false; 2521 + } 2522 + 2523 + *at = '\0'; 2524 + duration = strtoul(spec, &pos, 0); 2525 + if (!strcmp(pos, "ns")) 2526 + duration *= 1; 2527 + else if (!strcmp(pos, "us")) 2528 + duration *= 1000; 2529 + else if (!strcmp(pos, "ms")) 2530 + duration *= 1000 * 1000; 2531 + else if (*pos) { 2532 + pr_err("invalid delay time: %s@%s\n", spec, at + 1); 2533 + return false; 2534 + } 2535 + 2536 + tmp = realloc(delays, (nr_delays + 1) * sizeof(*delays)); 2537 + if (tmp == NULL) { 2538 + pr_err("Memory allocation failure\n"); 2539 + return false; 2540 + } 2541 + delays = tmp; 2542 + 2543 + delays[nr_delays].sym = strdup(at + 1); 2544 + if (delays[nr_delays].sym == NULL) { 2545 + pr_err("Memory allocation failure\n"); 2546 + return false; 2547 + } 2548 + delays[nr_delays].time = duration; 2549 + 2550 + nr_delays++; 2551 + return true; 2552 + } 2553 + 2554 + static int parse_lock_delay(const struct option *opt __maybe_unused, const char *str, 2555 + int unset __maybe_unused) 2556 + { 2557 + char *s, *tmp, *tok; 2558 + int ret = 0; 2559 + 
2560 + s = strdup(str); 2561 + if (s == NULL) 2562 + return -1; 2563 + 2564 + for (tok = strtok_r(s, ", ", &tmp); tok; tok = strtok_r(NULL, ", ", &tmp)) { 2565 + if (!add_lock_delay(tok)) { 2566 + ret = -1; 2567 + break; 2568 + } 2569 + } 2570 + 2571 + free(s); 2572 + return ret; 2573 + } 2574 + 2511 2575 int cmd_lock(int argc, const char **argv) 2512 2576 { 2513 2577 const struct option lock_options[] = { ··· 2652 2580 OPT_BOOLEAN(0, "lock-cgroup", &show_lock_cgroups, "show lock stats by cgroup"), 2653 2581 OPT_CALLBACK('G', "cgroup-filter", NULL, "CGROUPS", 2654 2582 "Filter specific cgroups", parse_cgroup_filter), 2583 + OPT_CALLBACK('J', "inject-delay", NULL, "TIME@FUNC", 2584 + "Inject delays to specific locks", parse_lock_delay), 2655 2585 OPT_PARENT(lock_options) 2656 2586 }; 2657 2587

+28

tools/perf/util/bpf_lock_contention.c

··· 261 261 skel->rodata->has_addr = 1; 262 262 } 263 263 264 + /* resolve lock name in delays */ 265 + if (con->nr_delays) { 266 + struct symbol *sym; 267 + struct map *kmap; 268 + 269 + for (i = 0; i < con->nr_delays; i++) { 270 + sym = machine__find_kernel_symbol_by_name(con->machine, 271 + con->delays[i].sym, 272 + &kmap); 273 + if (sym == NULL) { 274 + pr_warning("ignore unknown symbol: %s\n", 275 + con->delays[i].sym); 276 + continue; 277 + } 278 + 279 + con->delays[i].addr = map__unmap_ip(kmap, sym->start); 280 + } 281 + skel->rodata->lock_delay = 1; 282 + bpf_map__set_max_entries(skel->maps.lock_delays, con->nr_delays); 283 + } 284 + 264 285 bpf_map__set_max_entries(skel->maps.cpu_filter, ncpus); 265 286 bpf_map__set_max_entries(skel->maps.task_filter, ntasks); 266 287 bpf_map__set_max_entries(skel->maps.type_filter, ntypes); ··· 371 350 372 351 for (i = 0; i < con->filters->nr_cgrps; i++) 373 352 bpf_map_update_elem(fd, &con->filters->cgrps[i], &val, BPF_ANY); 353 + } 354 + 355 + if (con->nr_delays) { 356 + fd = bpf_map__fd(skel->maps.lock_delays); 357 + 358 + for (i = 0; i < con->nr_delays; i++) 359 + bpf_map_update_elem(fd, &con->delays[i].addr, &con->delays[i].time, BPF_ANY); 374 360 } 375 361 376 362 if (con->aggr_mode == LOCK_AGGR_CGROUP)

+43

tools/perf/util/bpf_skel/lock_contention.bpf.c

··· 14 14 /* for collect_zone_lock(). It should be more than the actual zones. */ 15 15 #define MAX_ZONES 10 16 16 17 + /* for do_lock_delay(). Arbitrarily set to 1 million. */ 18 + #define MAX_LOOP (1U << 20) 19 + 17 20 /* lock contention flags from include/trace/events/lock.h */ 18 21 #define LCB_F_SPIN (1U << 0) 19 22 #define LCB_F_READ (1U << 1) ··· 152 149 __uint(max_entries, 1); 153 150 } slab_caches SEC(".maps"); 154 151 152 + struct { 153 + __uint(type, BPF_MAP_TYPE_HASH); 154 + __uint(key_size, sizeof(__u64)); 155 + __uint(value_size, sizeof(__u64)); 156 + __uint(max_entries, 1); 157 + } lock_delays SEC(".maps"); 158 + 155 159 struct rw_semaphore___old { 156 160 struct task_struct *owner; 157 161 } __attribute__((preserve_access_index)); ··· 189 179 const volatile int lock_owner; 190 180 const volatile int use_cgroup_v2; 191 181 const volatile int max_stack; 182 + const volatile int lock_delay; 192 183 193 184 /* determine the key of lock stat */ 194 185 const volatile int aggr_mode; ··· 396 385 break; 397 386 } 398 387 return 0; 388 + } 389 + 390 + static inline long delay_callback(__u64 idx, void *arg) 391 + { 392 + __u64 target = *(__u64 *)arg; 393 + 394 + if (target <= bpf_ktime_get_ns()) 395 + return 1; 396 + 397 + /* just to kill time */ 398 + (void)bpf_get_prandom_u32(); 399 + 400 + return 0; 401 + } 402 + 403 + static inline void do_lock_delay(__u64 duration) 404 + { 405 + __u64 target = bpf_ktime_get_ns() + duration; 406 + 407 + bpf_loop(MAX_LOOP, delay_callback, &target, /*flags=*/0); 408 + } 409 + 410 + static inline void check_lock_delay(__u64 lock) 411 + { 412 + __u64 *delay; 413 + 414 + delay = bpf_map_lookup_elem(&lock_delays, &lock); 415 + if (delay) 416 + do_lock_delay(*delay); 399 417 } 400 418 401 419 static inline struct tstamp_data *get_tstamp_elem(__u32 flags) ··· 836 796 update_contention_data(data, duration, 1); 837 797 838 798 out: 799 + if (lock_delay) 800 + check_lock_delay(pelem->lock); 801 + 839 802 pelem->lock = 0; 840 803 if 
(need_delete) 841 804 bpf_map_delete_elem(&tstamp, &pid);

tools/perf/util/lock-contention.h

··· 18 18 char **slabs; 19 19 }; 20 20 21 + struct lock_delay { 22 + char *sym; 23 + unsigned long addr; 24 + unsigned long time; 25 + }; 26 + 21 27 struct lock_stat { 22 28 struct hlist_node hash_entry; 23 29 struct rb_node rb; /* used for sorting */ ··· 146 140 struct machine *machine; 147 141 struct hlist_head *result; 148 142 struct lock_filter *filters; 143 + struct lock_delay *delays; 149 144 struct lock_contention_fails fails; 150 145 struct rb_root cgroups; 151 146 void *btf; ··· 156 149 int aggr_mode; 157 150 int owner; 158 151 int nr_filtered; 152 + int nr_delays; 159 153 bool save_callstack; 160 154 }; 161 155

Configure Feed

Configure Feed