Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull perf fixes from Ingo Molnar:
"Four fixes. The mmap ones are unfortunately larger than desired -
fuzzing uncovered bugs that needed perf context lifetime management
changes to fix properly"

* 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
perf/x86: Fix broken PEBS-LL support on SNB-EP/IVB-EP
perf: Fix mmap() accounting hole
perf: Fix perf mmap bugs
kprobes: Fix to free gone and unused optprobes

+188 -86
+1 -1
arch/x86/kernel/cpu/perf_event_intel.c
··· 165 165 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0), 166 166 INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0x3f807f8fffull, RSP_1), 167 167 INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), 168 - INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), 169 168 EVENT_EXTRA_END 170 169 }; 171 170 172 171 static struct extra_reg intel_snbep_extra_regs[] __read_mostly = { 173 172 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0), 174 173 INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1), 174 + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), 175 175 EVENT_EXTRA_END 176 176 }; 177 177
+1 -2
include/linux/perf_event.h
··· 389 389 /* mmap bits */ 390 390 struct mutex mmap_mutex; 391 391 atomic_t mmap_count; 392 - int mmap_locked; 393 - struct user_struct *mmap_user; 392 + 394 393 struct ring_buffer *rb; 395 394 struct list_head rb_entry; 396 395
+162 -73
kernel/events/core.c
··· 196 196 static void update_context_time(struct perf_event_context *ctx); 197 197 static u64 perf_event_time(struct perf_event *event); 198 198 199 - static void ring_buffer_attach(struct perf_event *event, 200 - struct ring_buffer *rb); 201 - 202 199 void __weak perf_event_print_debug(void) { } 203 200 204 201 extern __weak const char *perf_pmu_name(void) ··· 2915 2918 } 2916 2919 2917 2920 static void ring_buffer_put(struct ring_buffer *rb); 2921 + static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb); 2918 2922 2919 2923 static void free_event(struct perf_event *event) 2920 2924 { ··· 2940 2942 if (has_branch_stack(event)) { 2941 2943 static_key_slow_dec_deferred(&perf_sched_events); 2942 2944 /* is system-wide event */ 2943 - if (!(event->attach_state & PERF_ATTACH_TASK)) 2945 + if (!(event->attach_state & PERF_ATTACH_TASK)) { 2944 2946 atomic_dec(&per_cpu(perf_branch_stack_events, 2945 2947 event->cpu)); 2948 + } 2946 2949 } 2947 2950 } 2948 2951 2949 2952 if (event->rb) { 2950 - ring_buffer_put(event->rb); 2951 - event->rb = NULL; 2953 + struct ring_buffer *rb; 2954 + 2955 + /* 2956 + * Can happen when we close an event with re-directed output. 2957 + * 2958 + * Since we have a 0 refcount, perf_mmap_close() will skip 2959 + * over us; possibly making our ring_buffer_put() the last. 2960 + */ 2961 + mutex_lock(&event->mmap_mutex); 2962 + rb = event->rb; 2963 + if (rb) { 2964 + rcu_assign_pointer(event->rb, NULL); 2965 + ring_buffer_detach(event, rb); 2966 + ring_buffer_put(rb); /* could be last */ 2967 + } 2968 + mutex_unlock(&event->mmap_mutex); 2952 2969 } 2953 2970 2954 2971 if (is_cgroup_event(event)) ··· 3201 3188 unsigned int events = POLL_HUP; 3202 3189 3203 3190 /* 3204 - * Race between perf_event_set_output() and perf_poll(): perf_poll() 3205 - * grabs the rb reference but perf_event_set_output() overrides it. 
3206 - * Here is the timeline for two threads T1, T2: 3207 - * t0: T1, rb = rcu_dereference(event->rb) 3208 - * t1: T2, old_rb = event->rb 3209 - * t2: T2, event->rb = new rb 3210 - * t3: T2, ring_buffer_detach(old_rb) 3211 - * t4: T1, ring_buffer_attach(rb1) 3212 - * t5: T1, poll_wait(event->waitq) 3213 - * 3214 - * To avoid this problem, we grab mmap_mutex in perf_poll() 3215 - * thereby ensuring that the assignment of the new ring buffer 3216 - * and the detachment of the old buffer appear atomic to perf_poll() 3191 + * Pin the event->rb by taking event->mmap_mutex; otherwise 3192 + * perf_event_set_output() can swizzle our rb and make us miss wakeups. 3217 3193 */ 3218 3194 mutex_lock(&event->mmap_mutex); 3219 - 3220 - rcu_read_lock(); 3221 - rb = rcu_dereference(event->rb); 3222 - if (rb) { 3223 - ring_buffer_attach(event, rb); 3195 + rb = event->rb; 3196 + if (rb) 3224 3197 events = atomic_xchg(&rb->poll, 0); 3225 - } 3226 - rcu_read_unlock(); 3227 - 3228 3198 mutex_unlock(&event->mmap_mutex); 3229 3199 3230 3200 poll_wait(file, &event->waitq, wait); ··· 3517 3521 return; 3518 3522 3519 3523 spin_lock_irqsave(&rb->event_lock, flags); 3520 - if (!list_empty(&event->rb_entry)) 3521 - goto unlock; 3522 - 3523 - list_add(&event->rb_entry, &rb->event_list); 3524 - unlock: 3524 + if (list_empty(&event->rb_entry)) 3525 + list_add(&event->rb_entry, &rb->event_list); 3525 3526 spin_unlock_irqrestore(&rb->event_lock, flags); 3526 3527 } 3527 3528 3528 - static void ring_buffer_detach(struct perf_event *event, 3529 - struct ring_buffer *rb) 3529 + static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb) 3530 3530 { 3531 3531 unsigned long flags; 3532 3532 ··· 3541 3549 3542 3550 rcu_read_lock(); 3543 3551 rb = rcu_dereference(event->rb); 3544 - if (!rb) 3545 - goto unlock; 3546 - 3547 - list_for_each_entry_rcu(event, &rb->event_list, rb_entry) 3548 - wake_up_all(&event->waitq); 3549 - 3550 - unlock: 3552 + if (rb) { 3553 + 
list_for_each_entry_rcu(event, &rb->event_list, rb_entry) 3554 + wake_up_all(&event->waitq); 3555 + } 3551 3556 rcu_read_unlock(); 3552 3557 } 3553 3558 ··· 3573 3584 3574 3585 static void ring_buffer_put(struct ring_buffer *rb) 3575 3586 { 3576 - struct perf_event *event, *n; 3577 - unsigned long flags; 3578 - 3579 3587 if (!atomic_dec_and_test(&rb->refcount)) 3580 3588 return; 3581 3589 3582 - spin_lock_irqsave(&rb->event_lock, flags); 3583 - list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) { 3584 - list_del_init(&event->rb_entry); 3585 - wake_up_all(&event->waitq); 3586 - } 3587 - spin_unlock_irqrestore(&rb->event_lock, flags); 3590 + WARN_ON_ONCE(!list_empty(&rb->event_list)); 3588 3591 3589 3592 call_rcu(&rb->rcu_head, rb_free_rcu); 3590 3593 } ··· 3586 3605 struct perf_event *event = vma->vm_file->private_data; 3587 3606 3588 3607 atomic_inc(&event->mmap_count); 3608 + atomic_inc(&event->rb->mmap_count); 3589 3609 } 3590 3610 3611 + /* 3612 + * A buffer can be mmap()ed multiple times; either directly through the same 3613 + * event, or through other events by use of perf_event_set_output(). 3614 + * 3615 + * In order to undo the VM accounting done by perf_mmap() we need to destroy 3616 + * the buffer here, where we still have a VM context. This means we need 3617 + * to detach all events redirecting to us. 
3618 + */ 3591 3619 static void perf_mmap_close(struct vm_area_struct *vma) 3592 3620 { 3593 3621 struct perf_event *event = vma->vm_file->private_data; 3594 3622 3595 - if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { 3596 - unsigned long size = perf_data_size(event->rb); 3597 - struct user_struct *user = event->mmap_user; 3598 - struct ring_buffer *rb = event->rb; 3623 + struct ring_buffer *rb = event->rb; 3624 + struct user_struct *mmap_user = rb->mmap_user; 3625 + int mmap_locked = rb->mmap_locked; 3626 + unsigned long size = perf_data_size(rb); 3599 3627 3600 - atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); 3601 - vma->vm_mm->pinned_vm -= event->mmap_locked; 3602 - rcu_assign_pointer(event->rb, NULL); 3603 - ring_buffer_detach(event, rb); 3604 - mutex_unlock(&event->mmap_mutex); 3628 + atomic_dec(&rb->mmap_count); 3605 3629 3606 - ring_buffer_put(rb); 3607 - free_uid(user); 3630 + if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) 3631 + return; 3632 + 3633 + /* Detach current event from the buffer. */ 3634 + rcu_assign_pointer(event->rb, NULL); 3635 + ring_buffer_detach(event, rb); 3636 + mutex_unlock(&event->mmap_mutex); 3637 + 3638 + /* If there's still other mmap()s of this buffer, we're done. */ 3639 + if (atomic_read(&rb->mmap_count)) { 3640 + ring_buffer_put(rb); /* can't be last */ 3641 + return; 3608 3642 } 3643 + 3644 + /* 3645 + * No other mmap()s, detach from all other events that might redirect 3646 + * into the now unreachable buffer. Somewhat complicated by the 3647 + * fact that rb::event_lock otherwise nests inside mmap_mutex. 3648 + */ 3649 + again: 3650 + rcu_read_lock(); 3651 + list_for_each_entry_rcu(event, &rb->event_list, rb_entry) { 3652 + if (!atomic_long_inc_not_zero(&event->refcount)) { 3653 + /* 3654 + * This event is en-route to free_event() which will 3655 + * detach it and remove it from the list. 
3656 + */ 3657 + continue; 3658 + } 3659 + rcu_read_unlock(); 3660 + 3661 + mutex_lock(&event->mmap_mutex); 3662 + /* 3663 + * Check we didn't race with perf_event_set_output() which can 3664 + * swizzle the rb from under us while we were waiting to 3665 + * acquire mmap_mutex. 3666 + * 3667 + * If we find a different rb; ignore this event, a next 3668 + * iteration will no longer find it on the list. We have to 3669 + * still restart the iteration to make sure we're not now 3670 + * iterating the wrong list. 3671 + */ 3672 + if (event->rb == rb) { 3673 + rcu_assign_pointer(event->rb, NULL); 3674 + ring_buffer_detach(event, rb); 3675 + ring_buffer_put(rb); /* can't be last, we still have one */ 3676 + } 3677 + mutex_unlock(&event->mmap_mutex); 3678 + put_event(event); 3679 + 3680 + /* 3681 + * Restart the iteration; either we're on the wrong list or 3682 + * destroyed its integrity by doing a deletion. 3683 + */ 3684 + goto again; 3685 + } 3686 + rcu_read_unlock(); 3687 + 3688 + /* 3689 + * It could be there's still a few 0-ref events on the list; they'll 3690 + * get cleaned up by free_event() -- they'll also still have their 3691 + * ref on the rb and will free it whenever they are done with it. 3692 + * 3693 + * Aside from that, this buffer is 'fully' detached and unmapped, 3694 + * undo the VM accounting. 
3695 + */ 3696 + 3697 + atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm); 3698 + vma->vm_mm->pinned_vm -= mmap_locked; 3699 + free_uid(mmap_user); 3700 + 3701 + ring_buffer_put(rb); /* could be last */ 3609 3702 } 3610 3703 3611 3704 static const struct vm_operations_struct perf_mmap_vmops = { ··· 3729 3674 return -EINVAL; 3730 3675 3731 3676 WARN_ON_ONCE(event->ctx->parent_ctx); 3677 + again: 3732 3678 mutex_lock(&event->mmap_mutex); 3733 3679 if (event->rb) { 3734 - if (event->rb->nr_pages == nr_pages) 3735 - atomic_inc(&event->rb->refcount); 3736 - else 3680 + if (event->rb->nr_pages != nr_pages) { 3737 3681 ret = -EINVAL; 3682 + goto unlock; 3683 + } 3684 + 3685 + if (!atomic_inc_not_zero(&event->rb->mmap_count)) { 3686 + /* 3687 + * Raced against perf_mmap_close() through 3688 + * perf_event_set_output(). Try again, hope for better 3689 + * luck. 3690 + */ 3691 + mutex_unlock(&event->mmap_mutex); 3692 + goto again; 3693 + } 3694 + 3738 3695 goto unlock; 3739 3696 } 3740 3697 ··· 3787 3720 ret = -ENOMEM; 3788 3721 goto unlock; 3789 3722 } 3790 - rcu_assign_pointer(event->rb, rb); 3723 + 3724 + atomic_set(&rb->mmap_count, 1); 3725 + rb->mmap_locked = extra; 3726 + rb->mmap_user = get_current_user(); 3791 3727 3792 3728 atomic_long_add(user_extra, &user->locked_vm); 3793 - event->mmap_locked = extra; 3794 - event->mmap_user = get_current_user(); 3795 - vma->vm_mm->pinned_vm += event->mmap_locked; 3729 + vma->vm_mm->pinned_vm += extra; 3730 + 3731 + ring_buffer_attach(event, rb); 3732 + rcu_assign_pointer(event->rb, rb); 3796 3733 3797 3734 perf_event_update_userpage(event); 3798 3735 ··· 3805 3734 atomic_inc(&event->mmap_count); 3806 3735 mutex_unlock(&event->mmap_mutex); 3807 3736 3808 - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; 3737 + /* 3738 + * Since pinned accounting is per vm we cannot allow fork() to copy our 3739 + * vma. 
3740 + */ 3741 + vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP; 3809 3742 vma->vm_ops = &perf_mmap_vmops; 3810 3743 3811 3744 return ret; ··· 6487 6412 if (atomic_read(&event->mmap_count)) 6488 6413 goto unlock; 6489 6414 6415 + old_rb = event->rb; 6416 + 6490 6417 if (output_event) { 6491 6418 /* get the rb we want to redirect to */ 6492 6419 rb = ring_buffer_get(output_event); ··· 6496 6419 goto unlock; 6497 6420 } 6498 6421 6499 - old_rb = event->rb; 6500 - rcu_assign_pointer(event->rb, rb); 6501 6422 if (old_rb) 6502 6423 ring_buffer_detach(event, old_rb); 6424 + 6425 + if (rb) 6426 + ring_buffer_attach(event, rb); 6427 + 6428 + rcu_assign_pointer(event->rb, rb); 6429 + 6430 + if (old_rb) { 6431 + ring_buffer_put(old_rb); 6432 + /* 6433 + * Since we detached before setting the new rb, so that we 6434 + * could attach the new rb, we could have missed a wakeup. 6435 + * Provide it now. 6436 + */ 6437 + wake_up_all(&event->waitq); 6438 + } 6439 + 6503 6440 ret = 0; 6504 6441 unlock: 6505 6442 mutex_unlock(&event->mmap_mutex); 6506 6443 6507 - if (old_rb) 6508 - ring_buffer_put(old_rb); 6509 6444 out: 6510 6445 return ret; 6511 6446 }
+4
kernel/events/internal.h
··· 31 31 spinlock_t event_lock; 32 32 struct list_head event_list; 33 33 34 + atomic_t mmap_count; 35 + unsigned long mmap_locked; 36 + struct user_struct *mmap_user; 37 + 34 38 struct perf_event_mmap_page *user_page; 35 39 void *data_pages[0]; 36 40 };
+20 -10
kernel/kprobes.c
··· 467 467 /* Optimization staging list, protected by kprobe_mutex */ 468 468 static LIST_HEAD(optimizing_list); 469 469 static LIST_HEAD(unoptimizing_list); 470 + static LIST_HEAD(freeing_list); 470 471 471 472 static void kprobe_optimizer(struct work_struct *work); 472 473 static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); ··· 505 504 * Unoptimize (replace a jump with a breakpoint and remove the breakpoint 506 505 * if need) kprobes listed on unoptimizing_list. 507 506 */ 508 - static __kprobes void do_unoptimize_kprobes(struct list_head *free_list) 507 + static __kprobes void do_unoptimize_kprobes(void) 509 508 { 510 509 struct optimized_kprobe *op, *tmp; 511 510 ··· 516 515 /* Ditto to do_optimize_kprobes */ 517 516 get_online_cpus(); 518 517 mutex_lock(&text_mutex); 519 - arch_unoptimize_kprobes(&unoptimizing_list, free_list); 518 + arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list); 520 519 /* Loop free_list for disarming */ 521 - list_for_each_entry_safe(op, tmp, free_list, list) { 520 + list_for_each_entry_safe(op, tmp, &freeing_list, list) { 522 521 /* Disarm probes if marked disabled */ 523 522 if (kprobe_disabled(&op->kp)) 524 523 arch_disarm_kprobe(&op->kp); ··· 537 536 } 538 537 539 538 /* Reclaim all kprobes on the free_list */ 540 - static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list) 539 + static __kprobes void do_free_cleaned_kprobes(void) 541 540 { 542 541 struct optimized_kprobe *op, *tmp; 543 542 544 - list_for_each_entry_safe(op, tmp, free_list, list) { 543 + list_for_each_entry_safe(op, tmp, &freeing_list, list) { 545 544 BUG_ON(!kprobe_unused(&op->kp)); 546 545 list_del_init(&op->list); 547 546 free_aggr_kprobe(&op->kp); ··· 557 556 /* Kprobe jump optimizer */ 558 557 static __kprobes void kprobe_optimizer(struct work_struct *work) 559 558 { 560 - LIST_HEAD(free_list); 561 - 562 559 mutex_lock(&kprobe_mutex); 563 560 /* Lock modules while optimizing kprobes */ 564 561 mutex_lock(&module_mutex); ··· 
565 566 * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed) 566 567 * kprobes before waiting for quiesence period. 567 568 */ 568 - do_unoptimize_kprobes(&free_list); 569 + do_unoptimize_kprobes(); 569 570 570 571 /* 571 572 * Step 2: Wait for quiesence period to ensure all running interrupts ··· 580 581 do_optimize_kprobes(); 581 582 582 583 /* Step 4: Free cleaned kprobes after quiesence period */ 583 - do_free_cleaned_kprobes(&free_list); 584 + do_free_cleaned_kprobes(); 584 585 585 586 mutex_unlock(&module_mutex); 586 587 mutex_unlock(&kprobe_mutex); ··· 722 723 if (!list_empty(&op->list)) 723 724 /* Dequeue from the (un)optimization queue */ 724 725 list_del_init(&op->list); 725 - 726 726 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; 727 + 728 + if (kprobe_unused(p)) { 729 + /* Enqueue if it is unused */ 730 + list_add(&op->list, &freeing_list); 731 + /* 732 + * Remove unused probes from the hash list. After waiting 733 + * for synchronization, this probe is reclaimed. 734 + * (reclaiming is done by do_free_cleaned_kprobes().) 735 + */ 736 + hlist_del_rcu(&op->kp.hlist); 737 + } 738 + 727 739 /* Don't touch the code, because it is already freed. */ 728 740 arch_remove_optimized_kprobe(op); 729 741 }