Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

perf cs-etm: Add separate decode paths for timeless and per-thread modes

Timeless and per-thread are orthogonal concepts that are currently
treated as if they are the same (per-thread == timeless). This breaks
when you modify the command line or itrace options to something that the
current logic doesn't expect.

For example:

# Force timeless with Z
--itrace=Zi10i

# Or inconsistent record options
-e cs_etm/timestamp=1/ --per-thread

Forcing timeless decoding with Z on a per-cpu recording is particularly
bad: the current code treats timeless as per-thread, and in per-thread
mode trace channel IDs are discarded and all assumed to be 0. Applied to
per-cpu data, that mixes together trace from different CPUs.

Although the results might not be perfect in all scenarios, if the user
requests no timestamps, it should still be possible to decode in either
mode. Especially if the relative times of samples in different processes
aren't interesting, quite a bit of space can be saved by turning off
timestamps in per-cpu mode.

Signed-off-by: James Clark <james.clark@arm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Denis Nikitin <denik@google.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Suzuki Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Yang Shi <shy828301@gmail.com>
Cc: coresight@lists.linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Link: https://lore.kernel.org/r/20230424134748.228137-8-james.clark@arm.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

authored by

James Clark and committed by
Arnaldo Carvalho de Melo
d1efa4a0 1764ce06

+148 -38
+24
tools/perf/tests/shell/test_arm_coresight.sh
··· 150 150 echo "Recording trace with system wide mode" 151 151 perf record -o ${perfdata} -e cs_etm// -a -- ls > /dev/null 2>&1 152 152 153 + # System-wide mode should include perf samples so test for that 154 + # instead of ls 153 155 perf_script_branch_samples perf && 154 156 perf_report_branch_samples perf && 155 157 perf_report_instruction_samples perf ··· 184 182 arm_cs_report "CoreSight snapshot testing" $err 185 183 } 186 184 185 + arm_cs_etm_basic_test() { 186 + echo "Recording trace with '$*'" 187 + perf record -o ${perfdata} "$@" -- ls > /dev/null 2>&1 188 + 189 + perf_script_branch_samples ls && 190 + perf_report_branch_samples ls && 191 + perf_report_instruction_samples ls 192 + 193 + err=$? 194 + arm_cs_report "CoreSight basic testing with '$*'" $err 195 + } 196 + 187 197 arm_cs_etm_traverse_path_test 188 198 arm_cs_etm_system_wide_test 189 199 arm_cs_etm_snapshot_test 200 + 201 + # Test all combinations of per-thread, system-wide and normal mode with 202 + # and without timestamps 203 + arm_cs_etm_basic_test -e cs_etm/timestamp=0/ --per-thread 204 + arm_cs_etm_basic_test -e cs_etm/timestamp=1/ --per-thread 205 + arm_cs_etm_basic_test -e cs_etm/timestamp=0/ -a 206 + arm_cs_etm_basic_test -e cs_etm/timestamp=1/ -a 207 + arm_cs_etm_basic_test -e cs_etm/timestamp=0/ 208 + arm_cs_etm_basic_test -e cs_etm/timestamp=1/ 209 + 190 210 exit $glb_err
+124 -38
tools/perf/util/cs-etm.c
··· 50 50 struct thread *unknown_thread; 51 51 struct perf_tsc_conversion tc; 52 52 53 + /* 54 + * Timeless has no timestamps in the trace so overlapping mmap lookups 55 + * are less accurate but produces smaller trace data. We use context IDs 56 + * in the trace instead of matching timestamps with fork records so 57 + * they're not really needed in the general case. Overlapping mmaps 58 + * happen in cases like between a fork and an exec. 59 + */ 53 60 bool timeless_decoding; 61 + 62 + /* 63 + * Per-thread ignores the trace channel ID and instead assumes that 64 + * everything in a buffer comes from the same process regardless of 65 + * which CPU it ran on. It also implies no context IDs so the TID is 66 + * taken from the auxtrace buffer. 67 + */ 68 + bool per_thread_decoding; 54 69 bool snapshot_mode; 55 70 bool data_queued; 56 71 bool has_virtual_ts; /* Virtual/Kernel timestamps in the trace. */ ··· 113 98 /* RB tree for quick conversion between traceID and metadata pointers */ 114 99 static struct intlist *traceid_list; 115 100 116 - static int cs_etm__process_queues(struct cs_etm_auxtrace *etm); 101 + static int cs_etm__process_timestamped_queues(struct cs_etm_auxtrace *etm); 117 102 static int cs_etm__process_timeless_queues(struct cs_etm_auxtrace *etm, 118 103 pid_t tid); 119 104 static int cs_etm__get_data_block(struct cs_etm_queue *etmq); ··· 507 492 struct cs_etm_traceid_queue *tidq, **traceid_queues; 508 493 struct cs_etm_auxtrace *etm = etmq->etm; 509 494 510 - if (etm->timeless_decoding) 495 + if (etm->per_thread_decoding) 511 496 trace_chan_id = CS_ETM_PER_THREAD_TRACEID; 512 497 513 498 traceid_queues_list = etmq->traceid_queues_list; ··· 746 731 if (!tool->ordered_events) 747 732 return -EINVAL; 748 733 749 - if (etm->timeless_decoding) 734 + if (etm->timeless_decoding) { 735 + /* 736 + * Pass tid = -1 to process all queues. But likely they will have 737 + * already been processed on PERF_RECORD_EXIT anyway. 
738 + */ 750 739 return cs_etm__process_timeless_queues(etm, -1); 740 + } 751 741 752 - return cs_etm__process_queues(etm); 742 + return cs_etm__process_timestamped_queues(etm); 753 743 } 754 744 755 745 static void cs_etm__free_traceid_queues(struct cs_etm_queue *etmq) ··· 1086 1066 * chronological order. 1087 1067 * 1088 1068 * Note that packets decoded above are still in the traceID's packet 1089 - * queue and will be processed in cs_etm__process_queues(). 1069 + * queue and will be processed in cs_etm__process_timestamped_queues(). 1090 1070 */ 1091 1071 cs_queue_nr = TO_CS_QUEUE_NR(queue_nr, trace_chan_id); 1092 1072 ret = auxtrace_heap__add(&etm->heap, cs_queue_nr, cs_timestamp); ··· 1367 1347 struct cs_etm_auxtrace *etm = etmq->etm; 1368 1348 struct cs_etm_packet_queue *packet_queue = &tidq->packet_queue; 1369 1349 1370 - if (etm->timeless_decoding) 1371 - return 0; 1372 - else if (etm->has_virtual_ts) 1350 + if (!etm->timeless_decoding && etm->has_virtual_ts) 1373 1351 return packet_queue->cs_timestamp; 1374 1352 else 1375 1353 return etm->latest_kernel_timestamp; ··· 2347 2329 } 2348 2330 } 2349 2331 2350 - static int cs_etm__run_decoder(struct cs_etm_queue *etmq) 2332 + static int cs_etm__run_per_thread_timeless_decoder(struct cs_etm_queue *etmq) 2351 2333 { 2352 2334 int err = 0; 2353 2335 struct cs_etm_traceid_queue *tidq; ··· 2385 2367 return err; 2386 2368 } 2387 2369 2370 + static int cs_etm__run_per_cpu_timeless_decoder(struct cs_etm_queue *etmq) 2371 + { 2372 + int idx, err = 0; 2373 + struct cs_etm_traceid_queue *tidq; 2374 + struct int_node *inode; 2375 + 2376 + /* Go through each buffer in the queue and decode them one by one */ 2377 + while (1) { 2378 + err = cs_etm__get_data_block(etmq); 2379 + if (err <= 0) 2380 + return err; 2381 + 2382 + /* Run trace decoder until buffer consumed or end of trace */ 2383 + do { 2384 + err = cs_etm__decode_data_block(etmq); 2385 + if (err) 2386 + return err; 2387 + 2388 + /* 2389 + * 
cs_etm__run_per_thread_timeless_decoder() runs on a 2390 + * single traceID queue because each TID has a separate 2391 + * buffer. But here in per-cpu mode we need to iterate 2392 + * over each channel instead. 2393 + */ 2394 + intlist__for_each_entry(inode, 2395 + etmq->traceid_queues_list) { 2396 + idx = (int)(intptr_t)inode->priv; 2397 + tidq = etmq->traceid_queues[idx]; 2398 + cs_etm__process_traceid_queue(etmq, tidq); 2399 + } 2400 + } while (etmq->buf_len); 2401 + 2402 + intlist__for_each_entry(inode, etmq->traceid_queues_list) { 2403 + idx = (int)(intptr_t)inode->priv; 2404 + tidq = etmq->traceid_queues[idx]; 2405 + /* Flush any remaining branch stack entries */ 2406 + err = cs_etm__end_block(etmq, tidq); 2407 + if (err) 2408 + return err; 2409 + } 2410 + } 2411 + 2412 + return err; 2413 + } 2414 + 2388 2415 static int cs_etm__process_timeless_queues(struct cs_etm_auxtrace *etm, 2389 2416 pid_t tid) 2390 2417 { ··· 2444 2381 if (!etmq) 2445 2382 continue; 2446 2383 2447 - tidq = cs_etm__etmq_get_traceid_queue(etmq, 2448 - CS_ETM_PER_THREAD_TRACEID); 2384 + /* 2385 + * Per-cpu mode has contextIDs in the trace and the decoder 2386 + * calls cs_etm__set_pid_tid_cpu() automatically so no need 2387 + * to do this here 2388 + */ 2389 + if (etm->per_thread_decoding) { 2390 + tidq = cs_etm__etmq_get_traceid_queue( 2391 + etmq, CS_ETM_PER_THREAD_TRACEID); 2449 2392 2450 - if (!tidq) 2451 - continue; 2393 + if (!tidq) 2394 + continue; 2452 2395 2453 - if ((tid == -1) || (tidq->tid == tid)) { 2454 - cs_etm__set_pid_tid_cpu(etm, tidq); 2455 - cs_etm__run_decoder(etmq); 2456 - } 2396 + if ((tid == -1) || (tidq->tid == tid)) { 2397 + cs_etm__set_pid_tid_cpu(etm, tidq); 2398 + cs_etm__run_per_thread_timeless_decoder(etmq); 2399 + } 2400 + } else 2401 + cs_etm__run_per_cpu_timeless_decoder(etmq); 2457 2402 } 2458 2403 2459 2404 return 0; 2460 2405 } 2461 2406 2462 - static int cs_etm__process_queues(struct cs_etm_auxtrace *etm) 2407 + static int 
cs_etm__process_timestamped_queues(struct cs_etm_auxtrace *etm) 2463 2408 { 2464 2409 int ret = 0; 2465 2410 unsigned int cs_queue_nr, queue_nr, i; ··· 2644 2573 struct perf_sample *sample, 2645 2574 struct perf_tool *tool) 2646 2575 { 2647 - u64 sample_kernel_timestamp; 2648 2576 struct cs_etm_auxtrace *etm = container_of(session->auxtrace, 2649 2577 struct cs_etm_auxtrace, 2650 2578 auxtrace); ··· 2656 2586 return -EINVAL; 2657 2587 } 2658 2588 2659 - if (sample->time && (sample->time != (u64) -1)) 2660 - sample_kernel_timestamp = sample->time; 2661 - else 2662 - sample_kernel_timestamp = 0; 2589 + switch (event->header.type) { 2590 + case PERF_RECORD_EXIT: 2591 + /* 2592 + * Don't need to wait for cs_etm__flush_events() in per-thread mode to 2593 + * start the decode because we know there will be no more trace from 2594 + * this thread. All this does is emit samples earlier than waiting for 2595 + * the flush in other modes, but with timestamps it makes sense to wait 2596 + * for flush so that events from different threads are interleaved 2597 + * properly. 2598 + */ 2599 + if (etm->per_thread_decoding && etm->timeless_decoding) 2600 + return cs_etm__process_timeless_queues(etm, 2601 + event->fork.tid); 2602 + break; 2663 2603 2664 - /* 2665 - * Don't wait for cs_etm__flush_events() in per-thread/timeless mode to start the decode. We 2666 - * need the tid of the PERF_RECORD_EXIT event to assign to the synthesised samples because 2667 - * ETM_OPT_CTXTID is not enabled. 
2668 - */ 2669 - if (etm->timeless_decoding && 2670 - event->header.type == PERF_RECORD_EXIT) 2671 - return cs_etm__process_timeless_queues(etm, 2672 - event->fork.tid); 2673 - 2674 - if (event->header.type == PERF_RECORD_ITRACE_START) 2604 + case PERF_RECORD_ITRACE_START: 2675 2605 return cs_etm__process_itrace_start(etm, event); 2676 - else if (event->header.type == PERF_RECORD_SWITCH_CPU_WIDE) 2606 + 2607 + case PERF_RECORD_SWITCH_CPU_WIDE: 2677 2608 return cs_etm__process_switch_cpu_wide(etm, event); 2678 2609 2679 - if (!etm->timeless_decoding && event->header.type == PERF_RECORD_AUX) { 2610 + case PERF_RECORD_AUX: 2680 2611 /* 2681 2612 * Record the latest kernel timestamp available in the header 2682 2613 * for samples so that synthesised samples occur from this point 2683 2614 * onwards. 2684 2615 */ 2685 - etm->latest_kernel_timestamp = sample_kernel_timestamp; 2616 + if (sample->time && (sample->time != (u64)-1)) 2617 + etm->latest_kernel_timestamp = sample->time; 2618 + break; 2619 + 2620 + default: 2621 + break; 2686 2622 } 2687 2623 2688 2624 return 0; ··· 2897 2821 * Return 'not found' if mismatch. 2898 2822 */ 2899 2823 if (auxtrace_event->cpu == (__u32) -1) { 2824 + etm->per_thread_decoding = true; 2900 2825 if (auxtrace_event->tid != sample->tid) 2901 2826 return 1; 2902 - } else if (auxtrace_event->cpu != sample->cpu) 2827 + } else if (auxtrace_event->cpu != sample->cpu) { 2828 + if (etm->per_thread_decoding) { 2829 + /* 2830 + * Found a per-cpu buffer after a per-thread one was 2831 + * already found 2832 + */ 2833 + pr_err("CS ETM: Inconsistent per-thread/per-cpu mode.\n"); 2834 + return -EINVAL; 2835 + } 2903 2836 return 1; 2837 + } 2904 2838 2905 2839 if (aux_event->flags & PERF_AUX_FLAG_OVERWRITE) { 2906 2840 /*