Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge branch 'perfcounters-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'perfcounters-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
perf_counter: Report the cloning task as parent on perf_counter_fork()
perf_counter: Fix an ipi-deadlock
perf: Rework/fix the whole read vs group stuff
perf_counter: Fix swcounter context invariance
perf report: Don't show unresolved DSOs and symbols when -S/-d is used
perf tools: Add a general option to enable raw sample records
perf tools: Add a per tracepoint counter attribute to get raw sample
perf_counter: Provide hw_perf_counter_setup_online() APIs
perf list: Fix large list output by using the pager
perf_counter, x86: Fix/improve apic fallback
perf record: Add missing -C option support for specifying profile cpu
perf tools: Fix dso__new handle() to handle deleted DSOs
perf tools: Fix fallback to cplus_demangle() when bfd_demangle() is not available
perf report: Show the tid too in -D
perf record: Fix .tid and .pid fill-in when synthesizing events
perf_counter, x86: Fix generic cache events on P6-mobile CPUs
perf_counter, x86: Fix lapic printk message

+429 -190
+1 -1
arch/x86/Kconfig
···
 	select HAVE_UNSTABLE_SCHED_CLOCK
 	select HAVE_IDE
 	select HAVE_OPROFILE
+	select HAVE_PERF_COUNTERS if (!M386 && !M486)
 	select HAVE_IOREMAP_PROT
 	select HAVE_KPROBES
 	select ARCH_WANT_OPTIONAL_GPIOLIB
···
 config X86_LOCAL_APIC
 	def_bool y
 	depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC
-	select HAVE_PERF_COUNTERS if (!M386 && !M486)
 
 config X86_IO_APIC
 	def_bool y
+34 -8
arch/x86/kernel/cpu/perf_counter.c
···
 	int		num_counters_fixed;
 	int		counter_bits;
 	u64		counter_mask;
+	int		apic;
 	u64		max_period;
 	u64		intel_ctrl;
 };
···
 {
 	[PERF_COUNT_HW_CPU_CYCLES]		= 0x0079,
 	[PERF_COUNT_HW_INSTRUCTIONS]		= 0x00c0,
-	[PERF_COUNT_HW_CACHE_REFERENCES]	= 0x0000,
-	[PERF_COUNT_HW_CACHE_MISSES]		= 0x0000,
+	[PERF_COUNT_HW_CACHE_REFERENCES]	= 0x0f2e,
+	[PERF_COUNT_HW_CACHE_MISSES]		= 0x012e,
 	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x00c4,
 	[PERF_COUNT_HW_BRANCH_MISSES]		= 0x00c5,
 	[PERF_COUNT_HW_BUS_CYCLES]		= 0x0062,
···
 static bool reserve_pmc_hardware(void)
 {
+#ifdef CONFIG_X86_LOCAL_APIC
 	int i;
 
 	if (nmi_watchdog == NMI_LOCAL_APIC)
···
 		if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
 			goto eventsel_fail;
 	}
+#endif
 
 	return true;
 
+#ifdef CONFIG_X86_LOCAL_APIC
 eventsel_fail:
 	for (i--; i >= 0; i--)
 		release_evntsel_nmi(x86_pmu.eventsel + i);
···
 	enable_lapic_nmi_watchdog();
 
 	return false;
+#endif
 }
 
 static void release_pmc_hardware(void)
 {
+#ifdef CONFIG_X86_LOCAL_APIC
 	int i;
 
 	for (i = 0; i < x86_pmu.num_counters; i++) {
···
 	if (nmi_watchdog == NMI_LOCAL_APIC)
 		enable_lapic_nmi_watchdog();
+#endif
 }
 
 static void hw_perf_counter_destroy(struct perf_counter *counter)
···
 		hwc->sample_period = x86_pmu.max_period;
 		hwc->last_period = hwc->sample_period;
 		atomic64_set(&hwc->period_left, hwc->sample_period);
+	} else {
+		/*
+		 * If we have a PMU initialized but no APIC
+		 * interrupts, we cannot sample hardware
+		 * counters (user-space has to fall back and
+		 * sample via a hrtimer based software counter):
+		 */
+		if (!x86_pmu.apic)
+			return -EOPNOTSUPP;
 	}
 
 	counter->destroy = hw_perf_counter_destroy;
···
 void set_perf_counter_pending(void)
 {
+#ifdef CONFIG_X86_LOCAL_APIC
 	apic->send_IPI_self(LOCAL_PENDING_VECTOR);
+#endif
 }
 
 void perf_counters_lapic_init(void)
 {
-	if (!x86_pmu_initialized())
+#ifdef CONFIG_X86_LOCAL_APIC
+	if (!x86_pmu.apic || !x86_pmu_initialized())
 		return;
 
 	/*
 	 * Always use NMI for PMU
 	 */
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
+#endif
 }
 
 static int __kprobes
···
 	regs = args->regs;
 
+#ifdef CONFIG_X86_LOCAL_APIC
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
+#endif
 	/*
 	 * Can't rely on the handled return value to say it was our NMI, two
 	 * counters could trigger 'simultaneously' raising two back-to-back NMIs.
···
 	.event_map		= p6_pmu_event_map,
 	.raw_event		= p6_pmu_raw_event,
 	.max_events		= ARRAY_SIZE(p6_perfmon_event_map),
+	.apic			= 1,
 	.max_period		= (1ULL << 31) - 1,
 	.version		= 0,
 	.num_counters		= 2,
···
 	.event_map		= intel_pmu_event_map,
 	.raw_event		= intel_pmu_raw_event,
 	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
+	.apic			= 1,
 	/*
 	 * Intel PMCs cannot be accessed sanely above 32 bit width,
 	 * so we install an artificial 1<<31 period regardless of
···
 	.num_counters		= 4,
 	.counter_bits		= 48,
 	.counter_mask		= (1ULL << 48) - 1,
+	.apic			= 1,
 	/* use highest bit to detect overflow */
 	.max_period		= (1ULL << 47) - 1,
 };
···
 		return -ENODEV;
 	}
 
-	if (!cpu_has_apic) {
-		pr_info("no Local APIC, try rebooting with lapic");
-		return -ENODEV;
-	}
+	x86_pmu = p6_pmu;
 
-	x86_pmu = p6_pmu;
+	if (!cpu_has_apic) {
+		pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
+		pr_info("no hardware sampling interrupt available.\n");
+		x86_pmu.apic = 0;
+	}
 
 	return 0;
 }
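The x86_pmu.apic flag above converts a missing local APIC from a hard -ENODEV into a counting-only mode: hw_perf_counter_init() now rejects only sampling counters, with -EOPNOTSUPP. A sketch of the user-space fallback that the comment in the hunk describes (an assumption for illustration, not part of the patch; sys_perf_counter_open() is the syscall wrapper from tools/perf/perf.h, #includes and further error handling elided):

	/*
	 * Sketch: open a sampled cycles counter; when the kernel reports
	 * EOPNOTSUPP (PMU present but no sampling interrupt), retry with
	 * the hrtimer based software clock, much as builtin-record.c
	 * already does for its cycles fallback.
	 */
	static int open_cycles_counter(pid_t pid)
	{
		struct perf_counter_attr attr = {
			.type		= PERF_TYPE_HARDWARE,
			.config		= PERF_COUNT_HW_CPU_CYCLES,
			.sample_period	= 100000,
		};
		int fd = sys_perf_counter_open(&attr, pid, -1, -1, 0);

		if (fd < 0 && errno == EOPNOTSUPP) {
			attr.type	= PERF_TYPE_SOFTWARE;
			attr.config	= PERF_COUNT_SW_CPU_CLOCK;
			fd = sys_perf_counter_open(&attr, pid, -1, -1, 0);
		}
		return fd;
	}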
+38 -11
include/linux/perf_counter.h
···
 	PERF_SAMPLE_TID				= 1U << 1,
 	PERF_SAMPLE_TIME			= 1U << 2,
 	PERF_SAMPLE_ADDR			= 1U << 3,
-	PERF_SAMPLE_GROUP			= 1U << 4,
+	PERF_SAMPLE_READ			= 1U << 4,
 	PERF_SAMPLE_CALLCHAIN			= 1U << 5,
 	PERF_SAMPLE_ID				= 1U << 6,
 	PERF_SAMPLE_CPU				= 1U << 7,
···
 };
 
 /*
- * Bits that can be set in attr.read_format to request that
- * reads on the counter should return the indicated quantities,
- * in increasing order of bit value, after the counter value.
+ * The format of the data returned by read() on a perf counter fd,
+ * as specified by attr.read_format:
+ *
+ * struct read_format {
+ *	{ u64		value;
+ *	  { u64		time_enabled; } && PERF_FORMAT_ENABLED
+ *	  { u64		time_running; } && PERF_FORMAT_RUNNING
+ *	  { u64		id;           } && PERF_FORMAT_ID
+ *	} && !PERF_FORMAT_GROUP
+ *
+ *	{ u64		nr;
+ *	  { u64		time_enabled; } && PERF_FORMAT_ENABLED
+ *	  { u64		time_running; } && PERF_FORMAT_RUNNING
+ *	  { u64		value;
+ *	    { u64	id;           } && PERF_FORMAT_ID
+ *	  }		cntr[nr];
+ *	} && PERF_FORMAT_GROUP
+ * };
 */
 enum perf_counter_read_format {
 	PERF_FORMAT_TOTAL_TIME_ENABLED		= 1U << 0,
 	PERF_FORMAT_TOTAL_TIME_RUNNING		= 1U << 1,
 	PERF_FORMAT_ID				= 1U << 2,
+	PERF_FORMAT_GROUP			= 1U << 3,
 
-	PERF_FORMAT_MAX = 1U << 3,		/* non-ABI */
+	PERF_FORMAT_MAX = 1U << 4,		/* non-ABI */
 };
 
 #define PERF_ATTR_SIZE_VER0	64	/* sizeof first published struct */
···
 	 * struct {
 	 *	struct perf_event_header	header;
 	 *	u32				pid, tid;
-	 *	u64				value;
-	 *	{ u64		time_enabled; }	&& PERF_FORMAT_ENABLED
-	 *	{ u64		time_running; }	&& PERF_FORMAT_RUNNING
-	 *	{ u64		parent_id;    }	&& PERF_FORMAT_ID
+	 *
+	 *	struct read_format		values;
 	 * };
 	 */
 	PERF_EVENT_READ			= 8,
···
 	 *	{ u32			cpu, res; } && PERF_SAMPLE_CPU
 	 *	{ u64			period; } && PERF_SAMPLE_PERIOD
 	 *
-	 *	{ u64			nr;
-	 *	  { u64 id, val; }	cnt[nr];  } && PERF_SAMPLE_GROUP
+	 *	{ struct read_format	values;	  } && PERF_SAMPLE_READ
 	 *
 	 *	{ u64			nr,
 	 *	  u64			ips[nr];  } && PERF_SAMPLE_CALLCHAIN
+	 *
+	 *	#
+	 *	# The RAW record below is opaque data wrt the ABI
+	 *	#
+	 *	# That is, the ABI doesn't make any promises wrt to
+	 *	# the stability of its content, it may vary depending
+	 *	# on event, hardware, kernel version and phase of
+	 *	# the moon.
+	 *	#
+	 *	# In other words, PERF_SAMPLE_RAW contents are not an ABI.
+	 *	#
+	 *
 	 *	{ u32			size;
 	 *	  char			data[size];}&& PERF_SAMPLE_RAW
 	 * };
···
 
 extern int perf_counter_overflow(struct perf_counter *counter, int nmi,
				 struct perf_sample_data *data);
+extern void perf_counter_output(struct perf_counter *counter, int nmi,
+				struct perf_sample_data *data);
 
 /*
  * Return 1 for a software counter, 0 for a hardware counter
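To make the new layout concrete, here is a minimal user-space sketch (not part of the patch) that walks a group read() buffer in the format documented above; it assumes the counter was opened with PERF_FORMAT_GROUP | PERF_FORMAT_ID plus both time bits, and that the read() succeeded:

	#include <stdio.h>

	typedef unsigned long long u64;	/* stand-in for the tools' u64 typedef */

	static void dump_group_read(const u64 *buf)
	{
		u64 nr		= *buf++;
		u64 enabled	= *buf++;	/* PERF_FORMAT_TOTAL_TIME_ENABLED */
		u64 running	= *buf++;	/* PERF_FORMAT_TOTAL_TIME_RUNNING */
		u64 i;

		printf("nr %llu, enabled %llu, running %llu\n",
			nr, enabled, running);

		for (i = 0; i < nr; i++) {	/* cntr[nr]: { value, id } */
			u64 value = *buf++;
			u64 id    = *buf++;	/* PERF_FORMAT_ID */

			printf("  id %llu: value %llu\n", id, value);
		}
	}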
+236 -102
kernel/perf_counter.c
···
 void __weak hw_perf_enable(void)		{ barrier(); }
 
 void __weak hw_perf_counter_setup(int cpu)	{ barrier(); }
+void __weak hw_perf_counter_setup_online(int cpu)	{ barrier(); }
 
 int __weak
 hw_perf_group_sched_in(struct perf_counter *group_leader,
···
 		return;
 
 	counter->state = PERF_COUNTER_STATE_INACTIVE;
+	if (counter->pending_disable) {
+		counter->pending_disable = 0;
+		counter->state = PERF_COUNTER_STATE_OFF;
+	}
 	counter->tstamp_stopped = ctx->time;
 	counter->pmu->disable(counter);
 	counter->oncpu = -1;
···
 	return 0;
 }
 
-static u64 perf_counter_read_tree(struct perf_counter *counter)
+static int perf_counter_read_size(struct perf_counter *counter)
+{
+	int entry = sizeof(u64); /* value */
+	int size = 0;
+	int nr = 1;
+
+	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+		size += sizeof(u64);
+
+	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+		size += sizeof(u64);
+
+	if (counter->attr.read_format & PERF_FORMAT_ID)
+		entry += sizeof(u64);
+
+	if (counter->attr.read_format & PERF_FORMAT_GROUP) {
+		nr += counter->group_leader->nr_siblings;
+		size += sizeof(u64);
+	}
+
+	size += entry * nr;
+
+	return size;
+}
+
+static u64 perf_counter_read_value(struct perf_counter *counter)
 {
 	struct perf_counter *child;
 	u64 total = 0;
···
 	return total;
 }
 
+static int perf_counter_read_entry(struct perf_counter *counter,
+				   u64 read_format, char __user *buf)
+{
+	int n = 0, count = 0;
+	u64 values[2];
+
+	values[n++] = perf_counter_read_value(counter);
+	if (read_format & PERF_FORMAT_ID)
+		values[n++] = primary_counter_id(counter);
+
+	count = n * sizeof(u64);
+
+	if (copy_to_user(buf, values, count))
+		return -EFAULT;
+
+	return count;
+}
+
+static int perf_counter_read_group(struct perf_counter *counter,
+				   u64 read_format, char __user *buf)
+{
+	struct perf_counter *leader = counter->group_leader, *sub;
+	int n = 0, size = 0, err = -EFAULT;
+	u64 values[3];
+
+	values[n++] = 1 + leader->nr_siblings;
+	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+		values[n++] = leader->total_time_enabled +
+			atomic64_read(&leader->child_total_time_enabled);
+	}
+	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+		values[n++] = leader->total_time_running +
+			atomic64_read(&leader->child_total_time_running);
+	}
+
+	size = n * sizeof(u64);
+
+	if (copy_to_user(buf, values, size))
+		return -EFAULT;
+
+	err = perf_counter_read_entry(leader, read_format, buf + size);
+	if (err < 0)
+		return err;
+
+	size += err;
+
+	list_for_each_entry(sub, &leader->sibling_list, list_entry) {
+		err = perf_counter_read_entry(sub, read_format,
+					      buf + size);
+		if (err < 0)
+			return err;
+
+		size += err;
+	}
+
+	return size;
+}
+
+static int perf_counter_read_one(struct perf_counter *counter,
+				 u64 read_format, char __user *buf)
+{
+	u64 values[4];
+	int n = 0;
+
+	values[n++] = perf_counter_read_value(counter);
+	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+		values[n++] = counter->total_time_enabled +
+			atomic64_read(&counter->child_total_time_enabled);
+	}
+	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+		values[n++] = counter->total_time_running +
+			atomic64_read(&counter->child_total_time_running);
+	}
+	if (read_format & PERF_FORMAT_ID)
+		values[n++] = primary_counter_id(counter);
+
+	if (copy_to_user(buf, values, n * sizeof(u64)))
+		return -EFAULT;
+
+	return n * sizeof(u64);
+}
+
 /*
  * Read the performance counter - simple non blocking version for now
  */
 static ssize_t
 perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 {
-	u64 values[4];
-	int n;
+	u64 read_format = counter->attr.read_format;
+	int ret;
 
 	/*
 	 * Return end-of-file for a read on a counter that is in
···
 	if (counter->state == PERF_COUNTER_STATE_ERROR)
 		return 0;
 
+	if (count < perf_counter_read_size(counter))
+		return -ENOSPC;
+
 	WARN_ON_ONCE(counter->ctx->parent_ctx);
 	mutex_lock(&counter->child_mutex);
-	values[0] = perf_counter_read_tree(counter);
-	n = 1;
-	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
-		values[n++] = counter->total_time_enabled +
-			atomic64_read(&counter->child_total_time_enabled);
-	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
-		values[n++] = counter->total_time_running +
-			atomic64_read(&counter->child_total_time_running);
-	if (counter->attr.read_format & PERF_FORMAT_ID)
-		values[n++] = primary_counter_id(counter);
+	if (read_format & PERF_FORMAT_GROUP)
+		ret = perf_counter_read_group(counter, read_format, buf);
+	else
+		ret = perf_counter_read_one(counter, read_format, buf);
 	mutex_unlock(&counter->child_mutex);
 
-	if (count < n * sizeof(u64))
-		return -EINVAL;
-	count = n * sizeof(u64);
-
-	if (copy_to_user(buf, values, count))
-		return -EFAULT;
-
-	return count;
+	return ret;
 }
 
 static ssize_t
···
 
 	if (counter->pending_disable) {
 		counter->pending_disable = 0;
-		perf_counter_disable(counter);
+		__perf_counter_disable(counter);
 	}
 
 	if (counter->pending_wakeup) {
···
 	return task_pid_nr_ns(p, counter->ns);
 }
 
-static void perf_counter_output(struct perf_counter *counter, int nmi,
+static void perf_output_read_one(struct perf_output_handle *handle,
+				 struct perf_counter *counter)
+{
+	u64 read_format = counter->attr.read_format;
+	u64 values[4];
+	int n = 0;
+
+	values[n++] = atomic64_read(&counter->count);
+	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+		values[n++] = counter->total_time_enabled +
+			atomic64_read(&counter->child_total_time_enabled);
+	}
+	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+		values[n++] = counter->total_time_running +
+			atomic64_read(&counter->child_total_time_running);
+	}
+	if (read_format & PERF_FORMAT_ID)
+		values[n++] = primary_counter_id(counter);
+
+	perf_output_copy(handle, values, n * sizeof(u64));
+}
+
+/*
+ * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult.
+ */
+static void perf_output_read_group(struct perf_output_handle *handle,
+				   struct perf_counter *counter)
+{
+	struct perf_counter *leader = counter->group_leader, *sub;
+	u64 read_format = counter->attr.read_format;
+	u64 values[5];
+	int n = 0;
+
+	values[n++] = 1 + leader->nr_siblings;
+
+	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+		values[n++] = leader->total_time_enabled;
+
+	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+		values[n++] = leader->total_time_running;
+
+	if (leader != counter)
+		leader->pmu->read(leader);
+
+	values[n++] = atomic64_read(&leader->count);
+	if (read_format & PERF_FORMAT_ID)
+		values[n++] = primary_counter_id(leader);
+
+	perf_output_copy(handle, values, n * sizeof(u64));
+
+	list_for_each_entry(sub, &leader->sibling_list, list_entry) {
+		n = 0;
+
+		if (sub != counter)
+			sub->pmu->read(sub);
+
+		values[n++] = atomic64_read(&sub->count);
+		if (read_format & PERF_FORMAT_ID)
+			values[n++] = primary_counter_id(sub);
+
+		perf_output_copy(handle, values, n * sizeof(u64));
+	}
+}
+
+static void perf_output_read(struct perf_output_handle *handle,
+			     struct perf_counter *counter)
+{
+	if (counter->attr.read_format & PERF_FORMAT_GROUP)
+		perf_output_read_group(handle, counter);
+	else
+		perf_output_read_one(handle, counter);
+}
+
+void perf_counter_output(struct perf_counter *counter, int nmi,
 			 struct perf_sample_data *data)
 {
 	int ret;
···
 	struct {
 		u32 pid, tid;
 	} tid_entry;
-	struct {
-		u64 id;
-		u64 counter;
-	} group_entry;
 	struct perf_callchain_entry *callchain = NULL;
 	int callchain_size = 0;
 	u64 time;
···
 	if (sample_type & PERF_SAMPLE_PERIOD)
 		header.size += sizeof(u64);
 
-	if (sample_type & PERF_SAMPLE_GROUP) {
-		header.size += sizeof(u64) +
-			counter->nr_siblings * sizeof(group_entry);
-	}
+	if (sample_type & PERF_SAMPLE_READ)
+		header.size += perf_counter_read_size(counter);
 
 	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
 		callchain = perf_callchain(data->regs);
···
 	if (sample_type & PERF_SAMPLE_PERIOD)
 		perf_output_put(&handle, data->period);
 
-	/*
-	 * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult.
-	 */
-	if (sample_type & PERF_SAMPLE_GROUP) {
-		struct perf_counter *leader, *sub;
-		u64 nr = counter->nr_siblings;
-
-		perf_output_put(&handle, nr);
-
-		leader = counter->group_leader;
-		list_for_each_entry(sub, &leader->sibling_list, list_entry) {
-			if (sub != counter)
-				sub->pmu->read(sub);
-
-			group_entry.id = primary_counter_id(sub);
-			group_entry.counter = atomic64_read(&sub->count);
-
-			perf_output_put(&handle, group_entry);
-		}
-	}
+	if (sample_type & PERF_SAMPLE_READ)
+		perf_output_read(&handle, counter);
 
 	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
 		if (callchain)
···
 
 	u32 pid;
 	u32 tid;
-	u64 value;
-	u64 format[3];
 };
 
 static void
···
 		.header = {
 			.type = PERF_EVENT_READ,
 			.misc = 0,
-			.size = sizeof(event) - sizeof(event.format),
+			.size = sizeof(event) + perf_counter_read_size(counter),
 		},
 		.pid = perf_counter_pid(counter, task),
 		.tid = perf_counter_tid(counter, task),
-		.value = atomic64_read(&counter->count),
 	};
-	int ret, i = 0;
-
-	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
-		event.header.size += sizeof(u64);
-		event.format[i++] = counter->total_time_enabled;
-	}
-
-	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
-		event.header.size += sizeof(u64);
-		event.format[i++] = counter->total_time_running;
-	}
-
-	if (counter->attr.read_format & PERF_FORMAT_ID) {
-		event.header.size += sizeof(u64);
-		event.format[i++] = primary_counter_id(counter);
-	}
+	int ret;
 
 	ret = perf_output_begin(&handle, counter, event.header.size, 0, 0);
 	if (ret)
 		return;
 
-	perf_output_copy(&handle, &event, event.header.size);
+	perf_output_put(&handle, event);
+	perf_output_read(&handle, counter);
+
 	perf_output_end(&handle);
 }
 
···
 		return;
 
 	task_event->event.pid = perf_counter_pid(counter, task);
-	task_event->event.ppid = perf_counter_pid(counter, task->real_parent);
+	task_event->event.ppid = perf_counter_pid(counter, current);
 
 	task_event->event.tid = perf_counter_tid(counter, task);
-	task_event->event.ptid = perf_counter_tid(counter, task->real_parent);
+	task_event->event.ptid = perf_counter_tid(counter, current);
 
 	perf_output_put(&handle, task_event->event);
 	perf_output_end(&handle);
···
 
 static int perf_swcounter_is_counting(struct perf_counter *counter)
 {
-	struct perf_counter_context *ctx;
-	unsigned long flags;
-	int count;
-
+	/*
+	 * The counter is active, we're good!
+	 */
 	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
 		return 1;
 
+	/*
+	 * The counter is off/error, not counting.
+	 */
 	if (counter->state != PERF_COUNTER_STATE_INACTIVE)
 		return 0;
 
 	/*
-	 * If the counter is inactive, it could be just because
-	 * its task is scheduled out, or because it's in a group
-	 * which could not go on the PMU. We want to count in
-	 * the first case but not the second.  If the context is
-	 * currently active then an inactive software counter must
-	 * be the second case.  If it's not currently active then
-	 * we need to know whether the counter was active when the
-	 * context was last active, which we can determine by
-	 * comparing counter->tstamp_stopped with ctx->time.
-	 *
-	 * We are within an RCU read-side critical section,
-	 * which protects the existence of *ctx.
+	 * The counter is inactive, if the context is active
+	 * we're part of a group that didn't make it on the 'pmu',
+	 * not counting.
 	 */
-	ctx = counter->ctx;
-	spin_lock_irqsave(&ctx->lock, flags);
-	count = 1;
-	/* Re-check state now we have the lock */
-	if (counter->state < PERF_COUNTER_STATE_INACTIVE ||
-	    counter->ctx->is_active ||
-	    counter->tstamp_stopped < ctx->time)
-		count = 0;
-	spin_unlock_irqrestore(&ctx->lock, flags);
-	return count;
+	if (counter->ctx->is_active)
+		return 0;
+
+	/*
+	 * We're inactive and the context is too, this means the
+	 * task is scheduled out, we're counting events that happen
+	 * to us, like migration events.
+	 */
+	return 1;
 }
 
 static int perf_swcounter_match(struct perf_counter *counter,
···
 	atomic64_set(&hwc->period_left, hwc->sample_period);
 
 	/*
-	 * we currently do not support PERF_SAMPLE_GROUP on inherited counters
+	 * we currently do not support PERF_FORMAT_GROUP on inherited counters
 	 */
-	if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP))
+	if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
 		goto done;
 
 	switch (attr->type) {
···
 		perf_counter_init_cpu(cpu);
 		break;
 
+	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
+		hw_perf_counter_setup_online(cpu);
+		break;
+
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
 		perf_counter_exit_cpu(cpu);
···
 void __init perf_counter_init(void)
 {
 	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
+			(void *)(long)smp_processor_id());
+	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
 			(void *)(long)smp_processor_id());
 	register_cpu_notifier(&perf_cpu_nb);
 }
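A worked example of the new sizing rule: with PERF_FORMAT_GROUP | PERF_FORMAT_ID and both time fields, a leader plus two siblings costs 8 bytes for nr, 2 × 8 for the time fields, and 3 × 16 for the {value, id} entries, 72 bytes total; perf_read_hw() now rejects a smaller buffer up front with -ENOSPC instead of discovering the truncation after formatting and returning -EINVAL.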
+18 -11
tools/perf/Makefile
···
 ifdef NO_DEMANGLE
 	BASIC_CFLAGS += -DNO_DEMANGLE
 else
-
 	has_bfd := $(shell sh -c "(echo '\#include <bfd.h>'; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) -lbfd > /dev/null 2>&1 && echo y")
-
-	has_bfd_iberty := $(shell sh -c "(echo '\#include <bfd.h>'; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) -lbfd -liberty > /dev/null 2>&1 && echo y")
-
-	has_bfd_iberty_z := $(shell sh -c "(echo '\#include <bfd.h>'; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) -lbfd -liberty -lz > /dev/null 2>&1 && echo y")
 
 	ifeq ($(has_bfd),y)
 		EXTLIBS += -lbfd
-	else ifeq ($(has_bfd_iberty),y)
-		EXTLIBS += -lbfd -liberty
-	else ifeq ($(has_bfd_iberty_z),y)
-		EXTLIBS += -lbfd -liberty -lz
 	else
-		msg := $(warning No bfd.h/libbfd found, install binutils-dev[el] to gain symbol demangling)
-		BASIC_CFLAGS += -DNO_DEMANGLE
+		has_bfd_iberty := $(shell sh -c "(echo '\#include <bfd.h>'; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) -lbfd -liberty > /dev/null 2>&1 && echo y")
+		ifeq ($(has_bfd_iberty),y)
+			EXTLIBS += -lbfd -liberty
+		else
+			has_bfd_iberty_z := $(shell sh -c "(echo '\#include <bfd.h>'; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) -lbfd -liberty -lz > /dev/null 2>&1 && echo y")
+			ifeq ($(has_bfd_iberty_z),y)
+				EXTLIBS += -lbfd -liberty -lz
+			else
+				has_cplus_demangle := $(shell sh -c "(echo 'extern char *cplus_demangle(const char *, int);'; echo 'int main(void) { cplus_demangle(0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) -liberty > /dev/null 2>&1 && echo y")
+				ifeq ($(has_cplus_demangle),y)
+					EXTLIBS += -liberty
+					BASIC_CFLAGS += -DHAVE_CPLUS_DEMANGLE
+				else
+					msg := $(warning No bfd.h/libbfd found, install binutils-dev[el] to gain symbol demangling)
+					BASIC_CFLAGS += -DNO_DEMANGLE
+				endif
+			endif
+		endif
 	endif
 endif
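The cascade now runs each probe only when the lighter link lines have failed, and falls back to -DNO_DEMANGLE only when nothing works. Each probe can be replayed by hand to see which demangler a given box ends up with; for the new last-resort test, for example (compiler name illustrative, prints y when -liberty alone provides cplus_demangle):

	(echo 'extern char *cplus_demangle(const char *, int);'; \
	 echo 'int main(void) { cplus_demangle(0, 0); return 0; }') | \
		gcc -x c - -o /dev/null -liberty > /dev/null 2>&1 && echo y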
+2 -1
tools/perf/builtin-list.c
···
 
 #include "perf.h"
 
-#include "util/parse-options.h"
 #include "util/parse-events.h"
+#include "util/cache.h"
 
 int cmd_list(int argc __used, const char **argv __used, const char *prefix __used)
 {
+	setup_pager();
 	print_events();
 	return 0;
 }
+57 -36
tools/perf/builtin-record.c
···
 static const char		*output_name			= "perf.data";
 static int			group				= 0;
 static unsigned int		realtime_prio			= 0;
+static int			raw_samples			= 0;
 static int			system_wide			= 0;
+static int			profile_cpu			= -1;
 static pid_t			target_pid			= -1;
 static int			inherit				= 1;
 static int			force				= 0;
···
 	kill(getpid(), signr);
 }
 
-static void pid_synthesize_comm_event(pid_t pid, int full)
+static pid_t pid_synthesize_comm_event(pid_t pid, int full)
 {
 	struct comm_event comm_ev;
 	char filename[PATH_MAX];
 	char bf[BUFSIZ];
-	int fd;
-	size_t size;
-	char *field, *sep;
+	FILE *fp;
+	size_t size = 0;
 	DIR *tasks;
 	struct dirent dirent, *next;
+	pid_t tgid = 0;
 
-	snprintf(filename, sizeof(filename), "/proc/%d/stat", pid);
+	snprintf(filename, sizeof(filename), "/proc/%d/status", pid);
 
-	fd = open(filename, O_RDONLY);
-	if (fd < 0) {
+	fp = fopen(filename, "r");
+	if (fp == NULL) {
 		/*
 		 * We raced with a task exiting - just return:
 		 */
 		if (verbose)
 			fprintf(stderr, "couldn't open %s\n", filename);
-		return;
+		return 0;
 	}
-	if (read(fd, bf, sizeof(bf)) < 0) {
-		fprintf(stderr, "couldn't read %s\n", filename);
-		exit(EXIT_FAILURE);
-	}
-	close(fd);
 
-	/* 9027 (cat) R 6747 9027 6747 34816 9027 ... */
 	memset(&comm_ev, 0, sizeof(comm_ev));
-	field = strchr(bf, '(');
-	if (field == NULL)
-		goto out_failure;
-	sep = strchr(++field, ')');
-	if (sep == NULL)
-		goto out_failure;
-	size = sep - field;
-	memcpy(comm_ev.comm, field, size++);
+	while (!comm_ev.comm[0] || !comm_ev.pid) {
+		if (fgets(bf, sizeof(bf), fp) == NULL)
+			goto out_failure;
 
-	comm_ev.pid = pid;
+		if (memcmp(bf, "Name:", 5) == 0) {
+			char *name = bf + 5;
+			while (*name && isspace(*name))
+				++name;
+			size = strlen(name) - 1;
+			memcpy(comm_ev.comm, name, size++);
+		} else if (memcmp(bf, "Tgid:", 5) == 0) {
+			char *tgids = bf + 5;
+			while (*tgids && isspace(*tgids))
+				++tgids;
+			tgid = comm_ev.pid = atoi(tgids);
+		}
+	}
+
 	comm_ev.header.type = PERF_EVENT_COMM;
 	size = ALIGN(size, sizeof(u64));
 	comm_ev.header.size = sizeof(comm_ev) - (sizeof(comm_ev.comm) - size);
···
 		comm_ev.tid = pid;
 
 		write_output(&comm_ev, comm_ev.header.size);
-		return;
+		goto out_fclose;
 	}
 
 	snprintf(filename, sizeof(filename), "/proc/%d/task", pid);
···
 		write_output(&comm_ev, comm_ev.header.size);
 	}
 	closedir(tasks);
-	return;
+
+out_fclose:
+	fclose(fp);
+	return tgid;
 
 out_failure:
 	fprintf(stderr, "couldn't get COMM and pgid, malformed %s\n",
···
 	exit(EXIT_FAILURE);
 }
 
-static void pid_synthesize_mmap_samples(pid_t pid)
+static void pid_synthesize_mmap_samples(pid_t pid, pid_t tgid)
 {
 	char filename[PATH_MAX];
 	FILE *fp;
···
 		mmap_ev.len -= mmap_ev.start;
 		mmap_ev.header.size = (sizeof(mmap_ev) -
				       (sizeof(mmap_ev.filename) - size));
-		mmap_ev.pid = pid;
+		mmap_ev.pid = tgid;
 		mmap_ev.tid = pid;
 
 		write_output(&mmap_ev, mmap_ev.header.size);
···
 
 	while (!readdir_r(proc, &dirent, &next) && next) {
 		char *end;
-		pid_t pid;
+		pid_t pid, tgid;
 
 		pid = strtol(dirent.d_name, &end, 10);
 		if (*end) /* only interested in proper numerical dirents */
 			continue;
 
-		pid_synthesize_comm_event(pid, 1);
-		pid_synthesize_mmap_samples(pid);
+		tgid = pid_synthesize_comm_event(pid, 1);
+		pid_synthesize_mmap_samples(pid, tgid);
 	}
 
 	closedir(proc);
···
 			  PERF_FORMAT_TOTAL_TIME_RUNNING |
 			  PERF_FORMAT_ID;
 
-	attr->sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
+	attr->sample_type	|= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
 
 	if (freq) {
 		attr->sample_type	|= PERF_SAMPLE_PERIOD;
···
 	if (call_graph)
 		attr->sample_type	|= PERF_SAMPLE_CALLCHAIN;
 
+	if (raw_samples)
+		attr->sample_type	|= PERF_SAMPLE_RAW;
 
 	attr->mmap		= track;
 	attr->comm		= track;
···
 
 		if (err == EPERM)
 			die("Permission error - are you root?\n");
+		else if (err == ENODEV && profile_cpu != -1)
+			die("No such device - did you specify an out-of-range profile CPU?\n");
 
 		/*
 		 * If it's cycles then fall back to hrtimer
···
 		if (pid == -1)
 			pid = getpid();
 
-		open_counters(-1, pid);
-	} else for (i = 0; i < nr_cpus; i++)
-		open_counters(i, target_pid);
+		open_counters(profile_cpu, pid);
+	} else {
+		if (profile_cpu != -1) {
+			open_counters(profile_cpu, target_pid);
+		} else {
+			for (i = 0; i < nr_cpus; i++)
+				open_counters(i, target_pid);
+		}
+	}
 
 	if (file_new)
 		perf_header__write(header, output);
 
 	if (!system_wide) {
-		pid_synthesize_comm_event(pid, 0);
-		pid_synthesize_mmap_samples(pid);
+		pid_t tgid = pid_synthesize_comm_event(pid, 0);
+		pid_synthesize_mmap_samples(pid, tgid);
 	} else
 		synthesize_all();
 
···
 		    "record events on existing pid"),
 	OPT_INTEGER('r', "realtime", &realtime_prio,
		    "collect data with this RT SCHED_FIFO priority"),
+	OPT_BOOLEAN('R', "raw-samples", &raw_samples,
+		    "collect raw sample records from all opened counters"),
 	OPT_BOOLEAN('a', "all-cpus", &system_wide,
		    "system-wide collection from all CPUs"),
 	OPT_BOOLEAN('A', "append", &append_file,
		    "append to the output file to do incremental profiling"),
+	OPT_INTEGER('C', "profile_cpu", &profile_cpu,
+		    "CPU to profile on"),
 	OPT_BOOLEAN('f', "force", &force,
		    "overwrite existing data file"),
 	OPT_LONG('c', "count", &default_interval,
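The switch from /proc/<pid>/stat to /proc/<pid>/status is what makes the tgid available at all: stat only exposes the "(comm)" field, which is awkward to parse since the name may itself contain parentheses, while status carries labelled Name: and Tgid: lines, so the synthesized events can put the thread-group id in .pid and the task id in .tid. An illustrative excerpt (values made up):

	Name:	cat
	Tgid:	9027
	Pid:	9027
	PPid:	6747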
+7 -5
tools/perf/builtin-report.c
···
 		more_data += sizeof(u64);
 	}
 
-	dprintf("%p [%p]: PERF_EVENT_SAMPLE (IP, %d): %d: %p period: %Ld\n",
+	dprintf("%p [%p]: PERF_EVENT_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n",
 		(void *)(offset + head),
 		(void *)(long)(event->header.size),
 		event->header.misc,
-		event->ip.pid,
+		event->ip.pid, event->ip.tid,
 		(void *)(long)ip,
 		(long long)period);
 
···
 	if (show & show_mask) {
 		struct symbol *sym = resolve_symbol(thread, &map, &dso, &ip);
 
-		if (dso_list && dso && dso->name && !strlist__has_entry(dso_list, dso->name))
+		if (dso_list && (!dso || !dso->name ||
+				 !strlist__has_entry(dso_list, dso->name)))
 			return 0;
 
-		if (sym_list && sym && !strlist__has_entry(sym_list, sym->name))
+		if (sym_list && (!sym || !strlist__has_entry(sym_list, sym->name)))
 			return 0;
 
 		if (hist_entry__add(thread, map, dso, sym, ip, chain, level, period)) {
···
 	struct thread *thread = threads__findnew(event->mmap.pid);
 	struct map *map = map__new(&event->mmap);
 
-	dprintf("%p [%p]: PERF_EVENT_MMAP %d: [%p(%p) @ %p]: %s\n",
+	dprintf("%p [%p]: PERF_EVENT_MMAP %d/%d: [%p(%p) @ %p]: %s\n",
 		(void *)(offset + head),
 		(void *)(long)(event->header.size),
 		event->mmap.pid,
+		event->mmap.tid,
 		(void *)(long)event->mmap.start,
 		(void *)(long)event->mmap.len,
 		(void *)(long)event->mmap.pgoff,
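This is the -S/-d fix from the merge log: an unresolved DSO or symbol used to sail past the filters because the old "dso &&"/"sym &&" guards skipped the check entirely; now a sample that cannot be resolved is rejected whenever a filter list is given. For instance (DSO path and symbol name illustrative):

	perf report -d /lib64/libc-2.9.so -S malloc

only shows samples that actually resolved to that DSO and symbol.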
+10
tools/perf/util/parse-events.c
···
 				    struct perf_counter_attr *attr)
 {
 	const char *evt_name;
+	char *flags;
 	char sys_name[MAX_EVENT_LENGTH];
 	char id_buf[4];
 	int fd;
···
 	strncpy(sys_name, *strp, sys_length);
 	sys_name[sys_length] = '\0';
 	evt_name = evt_name + 1;
+
+	flags = strchr(evt_name, ':');
+	if (flags) {
+		*flags = '\0';
+		flags++;
+		if (!strncmp(flags, "record", strlen(flags)))
+			attr->sample_type |= PERF_SAMPLE_RAW;
+	}
+
 	evt_length = strlen(evt_name);
 	if (evt_length >= MAX_EVENT_LENGTH)
 		return 0;
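Everything after a second ':' in a tracepoint specification is now treated as a flag, and any prefix of "record" enables PERF_SAMPLE_RAW for that one counter, rather than for every counter as perf record's new -R option does. A hypothetical invocation (tracepoint name illustrative):

	perf record -e irq:irq_handler_entry:record -a sleep 1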
+2 -15
tools/perf/util/symbol.c
···
 #include <gelf.h>
 #include <elf.h>
 
-#ifndef NO_DEMANGLE
-#include <bfd.h>
-#else
-static inline
-char *bfd_demangle(void __used *v, const char __used *c, int __used i)
-{
-	return NULL;
-}
-#endif
-
 const char *sym_hist_filter;
-
-#ifndef DMGL_PARAMS
-#define DMGL_PARAMS	(1 << 0)	/* Include function args */
-#define DMGL_ANSI	(1 << 1)	/* Include const, volatile, etc */
-#endif
 
 enum dso_origin {
 	DSO__ORIG_KERNEL = 0,
···
 	}
 out:
 	free(name);
+	if (ret < 0 && strstr(self->name, " (deleted)") != NULL)
+		return 0;
 	return ret;
 }
+24
tools/perf/util/symbol.h
···
 #include <linux/rbtree.h>
 #include "module.h"
 
+#ifdef HAVE_CPLUS_DEMANGLE
+extern char *cplus_demangle(const char *, int);
+
+static inline char *bfd_demangle(void __used *v, const char *c, int i)
+{
+	return cplus_demangle(c, i);
+}
+#else
+#ifdef NO_DEMANGLE
+static inline char *bfd_demangle(void __used *v, const char __used *c,
+				 int __used i)
+{
+	return NULL;
+}
+#else
+#include <bfd.h>
+#endif
+#endif
+
+#ifndef DMGL_PARAMS
+#define DMGL_PARAMS	(1 << 0)	/* Include function args */
+#define DMGL_ANSI	(1 << 1)	/* Include const, volatile, etc */
+#endif
+
 struct symbol {
 	struct rb_node	rb_node;
 	u64		start;
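Whichever branch is compiled in, callers keep the single bfd_demangle() entry point that symbol.c already uses. A usage sketch (mangled name illustrative; the demangler returns a malloc()ed string, or NULL when demangling is disabled or fails):

	char *s = bfd_demangle(NULL, "_ZNSt6vectorIiSaIiEE9push_backERKi",
			       DMGL_PARAMS | DMGL_ANSI);

	if (s) {
		puts(s);
		free(s);
	}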