Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git


Merge tag 'perf_urgent_for_v6.1_rc2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull perf fixes from Borislav Petkov:

- Fix raw data handling when perf events are used in bpf

- Rework how SIGTRAPs get delivered to events to address a bunch of
  problems with it. Add a selftest for that too

* tag 'perf_urgent_for_v6.1_rc2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
bpf: Fix sample_flags for bpf_perf_event_output
selftests/perf_events: Add a SIGTRAP stress test with disables
perf: Fix missing SIGTRAPs
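
Not part of the series itself, but for orientation: the SIGTRAP rework above concerns events created with perf_event_attr.sigtrap = 1, which ask the kernel to deliver a synchronous SIGTRAP to the monitored task when the event overflows. A minimal userspace sketch of that mechanism follows; the hardware-breakpoint target, the sig_data cookie, and the error handling are illustrative assumptions, not code from this pull.

#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/hw_breakpoint.h>
#include <linux/perf_event.h>

static volatile int watched;            /* breakpoint target (illustrative) */
static volatile sig_atomic_t traps;     /* counted by the signal handler */

static void on_sigtrap(int sig, siginfo_t *info, void *uc)
{
        /* info->si_addr, si_perf_type and si_perf_data describe the event. */
        traps++;
}

int main(void)
{
        struct sigaction sa = { .sa_sigaction = on_sigtrap, .sa_flags = SA_SIGINFO };
        struct perf_event_attr attr = {
                .type           = PERF_TYPE_BREAKPOINT,
                .size           = sizeof(attr),
                .bp_type        = HW_BREAKPOINT_RW,
                .bp_addr        = (unsigned long)&watched,
                .bp_len         = HW_BREAKPOINT_LEN_4,
                .sample_period  = 1,
                .sigtrap        = 1,      /* synchronous SIGTRAP on overflow */
                .remove_on_exec = 1,      /* required when .sigtrap is set */
                .sig_data       = 0x1234, /* arbitrary cookie, shows up in si_perf_data */
                .exclude_kernel = 1,      /* allow running without privileges */
                .exclude_hv     = 1,
        };
        int fd;

        if (sigaction(SIGTRAP, &sa, NULL))
                return 1;

        /* No glibc wrapper exists for perf_event_open; go through syscall(). */
        fd = syscall(SYS_perf_event_open, &attr, 0 /* this task */, -1 /* any CPU */,
                     -1 /* no group */, 0);
        if (fd < 0) {
                perror("perf_event_open");
                return 1;
        }

        watched = 1;    /* hits the breakpoint; the kernel queues a SIGTRAP for us */

        ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
        printf("SIGTRAPs delivered: %d\n", (int)traps);
        close(fd);
        return 0;
}

The new signal_stress_with_disable selftest further down exercises this same delivery path from several threads while repeatedly toggling PERF_EVENT_IOC_DISABLE/ENABLE on the event.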

5 files changed: +163 -46

include/linux/perf_event.h  +15 -4
···
         struct fasync_struct *fasync;

         /* delayed work for NMIs and such */
-        int pending_wakeup;
-        int pending_kill;
-        int pending_disable;
+        unsigned int pending_wakeup;
+        unsigned int pending_kill;
+        unsigned int pending_disable;
+        unsigned int pending_sigtrap;
         unsigned long pending_addr; /* SIGTRAP */
-        struct irq_work pending;
+        struct irq_work pending_irq;
+        struct callback_head pending_task;
+        unsigned int pending_work;

         atomic_t event_limit;

···
 #endif
         void *task_ctx_data; /* pmu specific data */
         struct rcu_head rcu_head;
+
+        /*
+         * Sum (event->pending_sigtrap + event->pending_work)
+         *
+         * The SIGTRAP is targeted at ctx->task, as such it won't do changing
+         * that until the signal is delivered.
+         */
+        local_t nr_pending;
 };

 /*

kernel/events/core.c  +113 -38
···
 #include <linux/highmem.h>
 #include <linux/pgtable.h>
 #include <linux/buildid.h>
+#include <linux/task_work.h>

 #include "internal.h"

···
         event->pmu->del(event, 0);
         event->oncpu = -1;

-        if (READ_ONCE(event->pending_disable) >= 0) {
-                WRITE_ONCE(event->pending_disable, -1);
+        if (event->pending_disable) {
+                event->pending_disable = 0;
                 perf_cgroup_event_disable(event, ctx);
                 state = PERF_EVENT_STATE_OFF;
         }
+
+        if (event->pending_sigtrap) {
+                bool dec = true;
+
+                event->pending_sigtrap = 0;
+                if (state != PERF_EVENT_STATE_OFF &&
+                    !event->pending_work) {
+                        event->pending_work = 1;
+                        dec = false;
+                        task_work_add(current, &event->pending_task, TWA_RESUME);
+                }
+                if (dec)
+                        local_dec(&event->ctx->nr_pending);
+        }
+
         perf_event_set_state(event, state);

         if (!is_software_event(event))
···
          * hold the top-level event's child_mutex, so any descendant that
          * goes to exit will block in perf_event_exit_event().
          *
-         * When called from perf_pending_event it's OK because event->ctx
+         * When called from perf_pending_irq it's OK because event->ctx
          * is the current context on this CPU and preemption is disabled,
          * hence we can't get into perf_event_task_sched_out for this context.
          */
···

 void perf_event_disable_inatomic(struct perf_event *event)
 {
-        WRITE_ONCE(event->pending_disable, smp_processor_id());
-        /* can fail, see perf_pending_event_disable() */
-        irq_work_queue(&event->pending);
+        event->pending_disable = 1;
+        irq_work_queue(&event->pending_irq);
 }

 #define MAX_INTERRUPTS (~0ULL)
···
                 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
                 if (context_equiv(ctx, next_ctx)) {

+                        perf_pmu_disable(pmu);
+
+                        /* PMIs are disabled; ctx->nr_pending is stable. */
+                        if (local_read(&ctx->nr_pending) ||
+                            local_read(&next_ctx->nr_pending)) {
+                                /*
+                                 * Must not swap out ctx when there's pending
+                                 * events that rely on the ctx->task relation.
+                                 */
+                                raw_spin_unlock(&next_ctx->lock);
+                                rcu_read_unlock();
+                                goto inside_switch;
+                        }
+
                         WRITE_ONCE(ctx->task, next);
                         WRITE_ONCE(next_ctx->task, task);
-
-                        perf_pmu_disable(pmu);

                         if (cpuctx->sched_cb_usage && pmu->sched_task)
                                 pmu->sched_task(ctx, false);
···
         raw_spin_lock(&ctx->lock);
         perf_pmu_disable(pmu);

+inside_switch:
         if (cpuctx->sched_cb_usage && pmu->sched_task)
                 pmu->sched_task(ctx, false);
         task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
···

 static void _free_event(struct perf_event *event)
 {
-        irq_work_sync(&event->pending);
+        irq_work_sync(&event->pending_irq);

         unaccount_event(event);

···
                 return;

         /*
-         * perf_pending_event() can race with the task exiting.
+         * Both perf_pending_task() and perf_pending_irq() can race with the
+         * task exiting.
          */
         if (current->flags & PF_EXITING)
                 return;
···
                       event->attr.type, event->attr.sig_data);
 }

-static void perf_pending_event_disable(struct perf_event *event)
+/*
+ * Deliver the pending work in-event-context or follow the context.
+ */
+static void __perf_pending_irq(struct perf_event *event)
 {
-        int cpu = READ_ONCE(event->pending_disable);
+        int cpu = READ_ONCE(event->oncpu);

+        /*
+         * If the event isn't running; we done. event_sched_out() will have
+         * taken care of things.
+         */
         if (cpu < 0)
                 return;

+        /*
+         * Yay, we hit home and are in the context of the event.
+         */
         if (cpu == smp_processor_id()) {
-                WRITE_ONCE(event->pending_disable, -1);
-
-                if (event->attr.sigtrap) {
+                if (event->pending_sigtrap) {
+                        event->pending_sigtrap = 0;
                         perf_sigtrap(event);
-                        atomic_set_release(&event->event_limit, 1); /* rearm event */
-                        return;
+                        local_dec(&event->ctx->nr_pending);
                 }
-
-                perf_event_disable_local(event);
+                if (event->pending_disable) {
+                        event->pending_disable = 0;
+                        perf_event_disable_local(event);
+                }
                 return;
         }
···
          *                                irq_work_queue(); // FAILS
          *
          *  irq_work_run()
-         *    perf_pending_event()
+         *    perf_pending_irq()
          *
          * But the event runs on CPU-B and wants disabling there.
          */
-        irq_work_queue_on(&event->pending, cpu);
+        irq_work_queue_on(&event->pending_irq, cpu);
 }

-static void perf_pending_event(struct irq_work *entry)
+static void perf_pending_irq(struct irq_work *entry)
 {
-        struct perf_event *event = container_of(entry, struct perf_event, pending);
+        struct perf_event *event = container_of(entry, struct perf_event, pending_irq);
         int rctx;

-        rctx = perf_swevent_get_recursion_context();
         /*
          * If we 'fail' here, that's OK, it means recursion is already disabled
          * and we won't recurse 'further'.
          */
+        rctx = perf_swevent_get_recursion_context();

-        perf_pending_event_disable(event);
-
+        /*
+         * The wakeup isn't bound to the context of the event -- it can happen
+         * irrespective of where the event is.
+         */
         if (event->pending_wakeup) {
                 event->pending_wakeup = 0;
                 perf_event_wakeup(event);
         }

+        __perf_pending_irq(event);
+
         if (rctx >= 0)
                 perf_swevent_put_recursion_context(rctx);
+}
+
+static void perf_pending_task(struct callback_head *head)
+{
+        struct perf_event *event = container_of(head, struct perf_event, pending_task);
+        int rctx;
+
+        /*
+         * If we 'fail' here, that's OK, it means recursion is already disabled
+         * and we won't recurse 'further'.
+         */
+        preempt_disable_notrace();
+        rctx = perf_swevent_get_recursion_context();
+
+        if (event->pending_work) {
+                event->pending_work = 0;
+                perf_sigtrap(event);
+                local_dec(&event->ctx->nr_pending);
+        }
+
+        if (rctx >= 0)
+                perf_swevent_put_recursion_context(rctx);
+        preempt_enable_notrace();
 }

 #ifdef CONFIG_GUEST_PERF_EVENTS
···
  */

 static int __perf_event_overflow(struct perf_event *event,
-                                int throttle, struct perf_sample_data *data,
-                                struct pt_regs *regs)
+                                 int throttle, struct perf_sample_data *data,
+                                 struct pt_regs *regs)
 {
         int events = atomic_read(&event->event_limit);
         int ret = 0;
···
         if (events && atomic_dec_and_test(&event->event_limit)) {
                 ret = 1;
                 event->pending_kill = POLL_HUP;
-                event->pending_addr = data->addr;
-
                 perf_event_disable_inatomic(event);
+        }
+
+        if (event->attr.sigtrap) {
+                /*
+                 * Should not be able to return to user space without processing
+                 * pending_sigtrap (kernel events can overflow multiple times).
+                 */
+                WARN_ON_ONCE(event->pending_sigtrap && event->attr.exclude_kernel);
+                if (!event->pending_sigtrap) {
+                        event->pending_sigtrap = 1;
+                        local_inc(&event->ctx->nr_pending);
+                }
+                event->pending_addr = data->addr;
+                irq_work_queue(&event->pending_irq);
         }

         READ_ONCE(event->overflow_handler)(event, data, regs);

         if (*perf_event_fasync(event) && event->pending_kill) {
                 event->pending_wakeup = 1;
-                irq_work_queue(&event->pending);
+                irq_work_queue(&event->pending_irq);
         }

         return ret;
 }

 int perf_event_overflow(struct perf_event *event,
-                        struct perf_sample_data *data,
-                        struct pt_regs *regs)
+                       struct perf_sample_data *data,
+                       struct pt_regs *regs)
 {
         return __perf_event_overflow(event, 1, data, regs);
 }
···


         init_waitqueue_head(&event->waitq);
-        event->pending_disable = -1;
-        init_irq_work(&event->pending, perf_pending_event);
+        init_irq_work(&event->pending_irq, perf_pending_irq);
+        init_task_work(&event->pending_task, perf_pending_task);

         mutex_init(&event->mmap_mutex);
         raw_spin_lock_init(&event->addr_filters.lock);
···

         if (parent_event)
                 event->event_caps = parent_event->event_caps;
-
-        if (event->attr.sigtrap)
-                atomic_set(&event->event_limit, 1);

         if (task) {
                 event->attach_state = PERF_ATTACH_TASK;

kernel/events/ring_buffer.c  +1 -1
···
         atomic_set(&handle->rb->poll, EPOLLIN);

         handle->event->pending_wakeup = 1;
-        irq_work_queue(&handle->event->pending);
+        irq_work_queue(&handle->event->pending_irq);
 }

 /*

kernel/trace/bpf_trace.c  +2
···

         perf_sample_data_init(sd, 0, 0);
         sd->raw = &raw;
+        sd->sample_flags |= PERF_SAMPLE_RAW;

         err = __bpf_perf_event_output(regs, map, flags, sd);

···
         perf_fetch_caller_regs(regs);
         perf_sample_data_init(sd, 0, 0);
         sd->raw = &raw;
+        sd->sample_flags |= PERF_SAMPLE_RAW;

         ret = __bpf_perf_event_output(regs, map, flags, sd);
 out:
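
For context, the two hunks above affect BPF programs that push records to user space with bpf_perf_event_output(): the perf core now relies on perf_sample_data.sample_flags to tell which sample fields the caller populated, so the raw payload must be flagged with PERF_SAMPLE_RAW in addition to setting sd->raw. Below is a minimal sketch of such a producer, assuming a clang/libbpf build environment; the map name, attach point, and record layout are illustrative, not taken from this pull.

// SPDX-License-Identifier: GPL-2.0
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
        __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
        __uint(key_size, sizeof(int));
        __uint(value_size, sizeof(__u32));
} events SEC(".maps");

struct event_rec {
        __u64 ts;
        __u32 pid;
};

SEC("kprobe/do_sys_openat2")
int trace_openat(struct pt_regs *ctx)
{
        struct event_rec rec = {
                .ts  = bpf_ktime_get_ns(),
                .pid = bpf_get_current_pid_tgid() >> 32,
        };

        /* The record reaches the perf ring buffer as PERF_SAMPLE_RAW data;
         * the kernel-side fix above is what flags it as such. */
        bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &rec, sizeof(rec));
        return 0;
}

char LICENSE[] SEC("license") = "GPL";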

tools/testing/selftests/perf_events/sigtrap_threads.c  +32 -3
···
                 .remove_on_exec = 1, /* Required by sigtrap. */
                 .sigtrap = 1, /* Request synchronous SIGTRAP on event. */
                 .sig_data = TEST_SIG_DATA(addr, id),
+                .exclude_kernel = 1, /* To allow */
+                .exclude_hv = 1, /* running as !root */
         };
         return attr;
 }
···

         __atomic_fetch_add(&ctx.tids_want_signal, tid, __ATOMIC_RELAXED);
         iter = ctx.iterate_on; /* read */
-        for (i = 0; i < iter - 1; i++) {
-                __atomic_fetch_add(&ctx.tids_want_signal, tid, __ATOMIC_RELAXED);
-                ctx.iterate_on = iter; /* idempotent write */
+        if (iter >= 0) {
+                for (i = 0; i < iter - 1; i++) {
+                        __atomic_fetch_add(&ctx.tids_want_signal, tid, __ATOMIC_RELAXED);
+                        ctx.iterate_on = iter; /* idempotent write */
+                }
+        } else {
+                while (ctx.iterate_on);
         }

         return NULL;
···

         EXPECT_EQ(ctx.signal_count, NUM_THREADS * ctx.iterate_on);
         EXPECT_EQ(ctx.tids_want_signal, 0);
+        EXPECT_EQ(ctx.first_siginfo.si_addr, &ctx.iterate_on);
+        EXPECT_EQ(ctx.first_siginfo.si_perf_type, PERF_TYPE_BREAKPOINT);
+        EXPECT_EQ(ctx.first_siginfo.si_perf_data, TEST_SIG_DATA(&ctx.iterate_on, 0));
+}
+
+TEST_F(sigtrap_threads, signal_stress_with_disable)
+{
+        const int target_count = NUM_THREADS * 3000;
+        int i;
+
+        ctx.iterate_on = -1;
+
+        EXPECT_EQ(ioctl(self->fd, PERF_EVENT_IOC_ENABLE, 0), 0);
+        pthread_barrier_wait(&self->barrier);
+        while (__atomic_load_n(&ctx.signal_count, __ATOMIC_RELAXED) < target_count) {
+                EXPECT_EQ(ioctl(self->fd, PERF_EVENT_IOC_DISABLE, 0), 0);
+                EXPECT_EQ(ioctl(self->fd, PERF_EVENT_IOC_ENABLE, 0), 0);
+        }
+        ctx.iterate_on = 0;
+        for (i = 0; i < NUM_THREADS; i++)
+                ASSERT_EQ(pthread_join(self->threads[i], NULL), 0);
+        EXPECT_EQ(ioctl(self->fd, PERF_EVENT_IOC_DISABLE, 0), 0);
+
         EXPECT_EQ(ctx.first_siginfo.si_addr, &ctx.iterate_on);
         EXPECT_EQ(ctx.first_siginfo.si_perf_type, PERF_TYPE_BREAKPOINT);
         EXPECT_EQ(ctx.first_siginfo.si_perf_data, TEST_SIG_DATA(&ctx.iterate_on, 0));