page_pool: import Jesper's page_pool benchmark

+7

tools/testing/selftests/net/bench/Makefile

# SPDX-License-Identifier: GPL-2.0

# Shell script that the selftest harness runs for this directory.
TEST_PROGS += test_bench_page_pool.sh

# Sub-directory holding the kernel module sources built for the test.
TEST_GEN_MODS_DIR := page_pool

include ../../lib.mk

+17

tools/testing/selftests/net/bench/page_pool/Makefile

# Out-of-tree build of the bench_page_pool module against the kernel in KDIR.
BENCH_PAGE_POOL_SIMPLE_TEST_DIR := $(realpath $(dir $(abspath $(lastword $(MAKEFILE_LIST)))))
KDIR ?= /lib/modules/$(shell uname -r)/build

# V=1 shows the commands being run; otherwise they are silenced with '@'.
ifeq ($(V),1)
Q =
else
Q = @
endif

obj-m += bench_page_pool.o
bench_page_pool-y += bench_page_pool_simple.o time_bench.o

# 'all' and 'clean' never produce files of those names.
.PHONY: all clean

# Use $(MAKE) rather than a literal 'make' so the sub-make is the same make
# binary and inherits the parent's flags and jobserver.
all:
	+$(Q)$(MAKE) -C $(KDIR) M=$(BENCH_PAGE_POOL_SIMPLE_TEST_DIR) modules

clean:
	+$(Q)$(MAKE) -C $(KDIR) M=$(BENCH_PAGE_POOL_SIMPLE_TEST_DIR) clean

+276

tools/testing/selftests/net/bench/page_pool/bench_page_pool_simple.c

··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Benchmark module for page_pool. 4 + * 5 + */ 6 + #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 7 + 8 + #include <linux/module.h> 9 + #include <linux/mutex.h> 10 + 11 + #include <linux/version.h> 12 + #include <net/page_pool/helpers.h> 13 + 14 + #include <linux/interrupt.h> 15 + #include <linux/limits.h> 16 + 17 + #include "time_bench.h" 18 + 19 + static int verbose = 1; 20 + #define MY_POOL_SIZE 1024 21 + 22 + static void _page_pool_put_page(struct page_pool *pool, struct page *page, 23 + bool allow_direct) 24 + { 25 + page_pool_put_page(pool, page, -1, allow_direct); 26 + } 27 + 28 + /* Makes tests selectable. Useful for perf-record to analyze a single test. 29 + * Hint: Bash shells support writing binary number like: $((2#101010) 30 + * 31 + * # modprobe bench_page_pool_simple run_flags=$((2#100)) 32 + */ 33 + static unsigned long run_flags = 0xFFFFFFFF; 34 + module_param(run_flags, ulong, 0); 35 + MODULE_PARM_DESC(run_flags, "Limit which bench test that runs"); 36 + 37 + /* Count the bit number from the enum */ 38 + enum benchmark_bit { 39 + bit_run_bench_baseline, 40 + bit_run_bench_no_softirq01, 41 + bit_run_bench_no_softirq02, 42 + bit_run_bench_no_softirq03, 43 + }; 44 + 45 + #define bit(b) (1 << (b)) 46 + #define enabled(b) ((run_flags & (bit(b)))) 47 + 48 + /* notice time_bench is limited to U32_MAX nr loops */ 49 + static unsigned long loops = 10000000; 50 + module_param(loops, ulong, 0); 51 + MODULE_PARM_DESC(loops, "Specify loops bench will run"); 52 + 53 + /* Timing at the nanosec level, we need to know the overhead 54 + * introduced by the for loop itself 55 + */ 56 + static int time_bench_for_loop(struct time_bench_record *rec, void *data) 57 + { 58 + uint64_t loops_cnt = 0; 59 + int i; 60 + 61 + time_bench_start(rec); 62 + /** Loop to measure **/ 63 + for (i = 0; i < rec->loops; i++) { 64 + loops_cnt++; 65 + barrier(); /* avoid compiler to optimize this loop */ 66 + } 67 + 
time_bench_stop(rec, loops_cnt); 68 + return loops_cnt; 69 + } 70 + 71 + static int time_bench_atomic_inc(struct time_bench_record *rec, void *data) 72 + { 73 + uint64_t loops_cnt = 0; 74 + atomic_t cnt; 75 + int i; 76 + 77 + atomic_set(&cnt, 0); 78 + 79 + time_bench_start(rec); 80 + /** Loop to measure **/ 81 + for (i = 0; i < rec->loops; i++) { 82 + atomic_inc(&cnt); 83 + barrier(); /* avoid compiler to optimize this loop */ 84 + } 85 + loops_cnt = atomic_read(&cnt); 86 + time_bench_stop(rec, loops_cnt); 87 + return loops_cnt; 88 + } 89 + 90 + /* The ptr_ping in page_pool uses a spinlock. We need to know the minimum 91 + * overhead of taking+releasing a spinlock, to know the cycles that can be saved 92 + * by e.g. amortizing this via bulking. 93 + */ 94 + static int time_bench_lock(struct time_bench_record *rec, void *data) 95 + { 96 + uint64_t loops_cnt = 0; 97 + spinlock_t lock; 98 + int i; 99 + 100 + spin_lock_init(&lock); 101 + 102 + time_bench_start(rec); 103 + /** Loop to measure **/ 104 + for (i = 0; i < rec->loops; i++) { 105 + spin_lock(&lock); 106 + loops_cnt++; 107 + barrier(); /* avoid compiler to optimize this loop */ 108 + spin_unlock(&lock); 109 + } 110 + time_bench_stop(rec, loops_cnt); 111 + return loops_cnt; 112 + } 113 + 114 + /* Helper for filling some page's into ptr_ring */ 115 + static void pp_fill_ptr_ring(struct page_pool *pp, int elems) 116 + { 117 + /* GFP_ATOMIC needed when under run softirq */ 118 + gfp_t gfp_mask = GFP_ATOMIC; 119 + struct page **array; 120 + int i; 121 + 122 + array = kcalloc(elems, sizeof(struct page *), gfp_mask); 123 + 124 + for (i = 0; i < elems; i++) 125 + array[i] = page_pool_alloc_pages(pp, gfp_mask); 126 + for (i = 0; i < elems; i++) 127 + _page_pool_put_page(pp, array[i], false); 128 + 129 + kfree(array); 130 + } 131 + 132 + enum test_type { type_fast_path, type_ptr_ring, type_page_allocator }; 133 + 134 + /* Depends on compile optimizing this function */ 135 + static int time_bench_page_pool(struct 
time_bench_record *rec, void *data, 136 + enum test_type type, const char *func) 137 + { 138 + uint64_t loops_cnt = 0; 139 + gfp_t gfp_mask = GFP_ATOMIC; /* GFP_ATOMIC is not really needed */ 140 + int i, err; 141 + 142 + struct page_pool *pp; 143 + struct page *page; 144 + 145 + struct page_pool_params pp_params = { 146 + .order = 0, 147 + .flags = 0, 148 + .pool_size = MY_POOL_SIZE, 149 + .nid = NUMA_NO_NODE, 150 + .dev = NULL, /* Only use for DMA mapping */ 151 + .dma_dir = DMA_BIDIRECTIONAL, 152 + }; 153 + 154 + pp = page_pool_create(&pp_params); 155 + if (IS_ERR(pp)) { 156 + err = PTR_ERR(pp); 157 + pr_warn("%s: Error(%d) creating page_pool\n", func, err); 158 + goto out; 159 + } 160 + pp_fill_ptr_ring(pp, 64); 161 + 162 + if (in_serving_softirq()) 163 + pr_warn("%s(): in_serving_softirq fast-path\n", func); 164 + else 165 + pr_warn("%s(): Cannot use page_pool fast-path\n", func); 166 + 167 + time_bench_start(rec); 168 + /** Loop to measure **/ 169 + for (i = 0; i < rec->loops; i++) { 170 + /* Common fast-path alloc that depend on in_serving_softirq() */ 171 + page = page_pool_alloc_pages(pp, gfp_mask); 172 + if (!page) 173 + break; 174 + loops_cnt++; 175 + barrier(); /* avoid compiler to optimize this loop */ 176 + 177 + /* The benchmarks purpose it to test different return paths. 178 + * Compiler should inline optimize other function calls out 179 + */ 180 + if (type == type_fast_path) { 181 + /* Fast-path recycling e.g. 
XDP_DROP use-case */ 182 + page_pool_recycle_direct(pp, page); 183 + 184 + } else if (type == type_ptr_ring) { 185 + /* Normal return path */ 186 + _page_pool_put_page(pp, page, false); 187 + 188 + } else if (type == type_page_allocator) { 189 + /* Test if not pages are recycled, but instead 190 + * returned back into systems page allocator 191 + */ 192 + get_page(page); /* cause no-recycling */ 193 + _page_pool_put_page(pp, page, false); 194 + put_page(page); 195 + } else { 196 + BUILD_BUG(); 197 + } 198 + } 199 + time_bench_stop(rec, loops_cnt); 200 + out: 201 + page_pool_destroy(pp); 202 + return loops_cnt; 203 + } 204 + 205 + static int time_bench_page_pool01_fast_path(struct time_bench_record *rec, 206 + void *data) 207 + { 208 + return time_bench_page_pool(rec, data, type_fast_path, __func__); 209 + } 210 + 211 + static int time_bench_page_pool02_ptr_ring(struct time_bench_record *rec, 212 + void *data) 213 + { 214 + return time_bench_page_pool(rec, data, type_ptr_ring, __func__); 215 + } 216 + 217 + static int time_bench_page_pool03_slow(struct time_bench_record *rec, 218 + void *data) 219 + { 220 + return time_bench_page_pool(rec, data, type_page_allocator, __func__); 221 + } 222 + 223 + static int run_benchmark_tests(void) 224 + { 225 + uint32_t nr_loops = loops; 226 + 227 + /* Baseline tests */ 228 + if (enabled(bit_run_bench_baseline)) { 229 + time_bench_loop(nr_loops * 10, 0, "for_loop", NULL, 230 + time_bench_for_loop); 231 + time_bench_loop(nr_loops * 10, 0, "atomic_inc", NULL, 232 + time_bench_atomic_inc); 233 + time_bench_loop(nr_loops, 0, "lock", NULL, time_bench_lock); 234 + } 235 + 236 + /* This test cannot activate correct code path, due to no-softirq ctx */ 237 + if (enabled(bit_run_bench_no_softirq01)) 238 + time_bench_loop(nr_loops, 0, "no-softirq-page_pool01", NULL, 239 + time_bench_page_pool01_fast_path); 240 + if (enabled(bit_run_bench_no_softirq02)) 241 + time_bench_loop(nr_loops, 0, "no-softirq-page_pool02", NULL, 242 + 
time_bench_page_pool02_ptr_ring); 243 + if (enabled(bit_run_bench_no_softirq03)) 244 + time_bench_loop(nr_loops, 0, "no-softirq-page_pool03", NULL, 245 + time_bench_page_pool03_slow); 246 + 247 + return 0; 248 + } 249 + 250 + static int __init bench_page_pool_simple_module_init(void) 251 + { 252 + if (verbose) 253 + pr_info("Loaded\n"); 254 + 255 + if (loops > U32_MAX) { 256 + pr_err("Module param loops(%lu) exceeded U32_MAX(%u)\n", loops, 257 + U32_MAX); 258 + return -ECHRNG; 259 + } 260 + 261 + run_benchmark_tests(); 262 + 263 + return 0; 264 + } 265 + module_init(bench_page_pool_simple_module_init); 266 + 267 + static void __exit bench_page_pool_simple_module_exit(void) 268 + { 269 + if (verbose) 270 + pr_info("Unloaded\n"); 271 + } 272 + module_exit(bench_page_pool_simple_module_exit); 273 + 274 + MODULE_DESCRIPTION("Benchmark of page_pool simple cases"); 275 + MODULE_AUTHOR("Jesper Dangaard Brouer <netoptimizer@brouer.com>"); 276 + MODULE_LICENSE("GPL");

+394

tools/testing/selftests/net/bench/page_pool/time_bench.c

··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Benchmarking code execution time inside the kernel 4 + * 5 + * Copyright (C) 2014, Red Hat, Inc., Jesper Dangaard Brouer 6 + */ 7 + #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 8 + 9 + #include <linux/module.h> 10 + #include <linux/time.h> 11 + 12 + #include <linux/perf_event.h> /* perf_event_create_kernel_counter() */ 13 + 14 + /* For concurrency testing */ 15 + #include <linux/completion.h> 16 + #include <linux/sched.h> 17 + #include <linux/workqueue.h> 18 + #include <linux/kthread.h> 19 + 20 + #include "time_bench.h" 21 + 22 + static int verbose = 1; 23 + 24 + /** TSC (Time-Stamp Counter) based ** 25 + * See: linux/time_bench.h 26 + * tsc_start_clock() and tsc_stop_clock() 27 + */ 28 + 29 + /** Wall-clock based ** 30 + */ 31 + 32 + /** PMU (Performance Monitor Unit) based ** 33 + */ 34 + #define PERF_FORMAT \ 35 + (PERF_FORMAT_GROUP | PERF_FORMAT_ID | PERF_FORMAT_TOTAL_TIME_ENABLED | \ 36 + PERF_FORMAT_TOTAL_TIME_RUNNING) 37 + 38 + struct raw_perf_event { 39 + uint64_t config; /* event */ 40 + uint64_t config1; /* umask */ 41 + struct perf_event *save; 42 + char *desc; 43 + }; 44 + 45 + /* if HT is enable a maximum of 4 events (5 if one is instructions 46 + * retired can be specified, if HT is disabled a maximum of 8 (9 if 47 + * one is instructions retired) can be specified. 48 + * 49 + * From Table 19-1. Architectural Performance Events 50 + * Architectures Software Developer’s Manual Volume 3: System Programming 51 + * Guide 52 + */ 53 + struct raw_perf_event perf_events[] = { 54 + { 0x3c, 0x00, NULL, "Unhalted CPU Cycles" }, 55 + { 0xc0, 0x00, NULL, "Instruction Retired" } 56 + }; 57 + 58 + #define NUM_EVTS (ARRAY_SIZE(perf_events)) 59 + 60 + /* WARNING: PMU config is currently broken! 
61 + */ 62 + bool time_bench_PMU_config(bool enable) 63 + { 64 + int i; 65 + struct perf_event_attr perf_conf; 66 + struct perf_event *perf_event; 67 + int cpu; 68 + 69 + preempt_disable(); 70 + cpu = smp_processor_id(); 71 + pr_info("DEBUG: cpu:%d\n", cpu); 72 + preempt_enable(); 73 + 74 + memset(&perf_conf, 0, sizeof(struct perf_event_attr)); 75 + perf_conf.type = PERF_TYPE_RAW; 76 + perf_conf.size = sizeof(struct perf_event_attr); 77 + perf_conf.read_format = PERF_FORMAT; 78 + perf_conf.pinned = 1; 79 + perf_conf.exclude_user = 1; /* No userspace events */ 80 + perf_conf.exclude_kernel = 0; /* Only kernel events */ 81 + 82 + for (i = 0; i < NUM_EVTS; i++) { 83 + perf_conf.disabled = enable; 84 + //perf_conf.disabled = (i == 0) ? 1 : 0; 85 + perf_conf.config = perf_events[i].config; 86 + perf_conf.config1 = perf_events[i].config1; 87 + if (verbose) 88 + pr_info("%s() enable PMU counter: %s\n", 89 + __func__, perf_events[i].desc); 90 + perf_event = perf_event_create_kernel_counter(&perf_conf, cpu, 91 + NULL /* task */, 92 + NULL /* overflow_handler*/, 93 + NULL /* context */); 94 + if (perf_event) { 95 + perf_events[i].save = perf_event; 96 + pr_info("%s():DEBUG perf_event success\n", __func__); 97 + 98 + perf_event_enable(perf_event); 99 + } else { 100 + pr_info("%s():DEBUG perf_event is NULL\n", __func__); 101 + } 102 + } 103 + 104 + return true; 105 + } 106 + 107 + /** Generic functions ** 108 + */ 109 + 110 + /* Calculate stats, store results in record */ 111 + bool time_bench_calc_stats(struct time_bench_record *rec) 112 + { 113 + #define NANOSEC_PER_SEC 1000000000 /* 10^9 */ 114 + uint64_t ns_per_call_tmp_rem = 0; 115 + uint32_t ns_per_call_remainder = 0; 116 + uint64_t pmc_ipc_tmp_rem = 0; 117 + uint32_t pmc_ipc_remainder = 0; 118 + uint32_t pmc_ipc_div = 0; 119 + uint32_t invoked_cnt_precision = 0; 120 + uint32_t invoked_cnt = 0; /* 32-bit due to div_u64_rem() */ 121 + 122 + if (rec->flags & TIME_BENCH_LOOP) { 123 + if (rec->invoked_cnt < 1000) { 124 + 
pr_err("ERR: need more(>1000) loops(%llu) for timing\n", 125 + rec->invoked_cnt); 126 + return false; 127 + } 128 + if (rec->invoked_cnt > ((1ULL << 32) - 1)) { 129 + /* div_u64_rem() can only support div with 32bit*/ 130 + pr_err("ERR: Invoke cnt(%llu) too big overflow 32bit\n", 131 + rec->invoked_cnt); 132 + return false; 133 + } 134 + invoked_cnt = (uint32_t)rec->invoked_cnt; 135 + } 136 + 137 + /* TSC (Time-Stamp Counter) records */ 138 + if (rec->flags & TIME_BENCH_TSC) { 139 + rec->tsc_interval = rec->tsc_stop - rec->tsc_start; 140 + if (rec->tsc_interval == 0) { 141 + pr_err("ABORT: timing took ZERO TSC time\n"); 142 + return false; 143 + } 144 + /* Calculate stats */ 145 + if (rec->flags & TIME_BENCH_LOOP) 146 + rec->tsc_cycles = rec->tsc_interval / invoked_cnt; 147 + else 148 + rec->tsc_cycles = rec->tsc_interval; 149 + } 150 + 151 + /* Wall-clock time calc */ 152 + if (rec->flags & TIME_BENCH_WALLCLOCK) { 153 + rec->time_start = rec->ts_start.tv_nsec + 154 + (NANOSEC_PER_SEC * rec->ts_start.tv_sec); 155 + rec->time_stop = rec->ts_stop.tv_nsec + 156 + (NANOSEC_PER_SEC * rec->ts_stop.tv_sec); 157 + rec->time_interval = rec->time_stop - rec->time_start; 158 + if (rec->time_interval == 0) { 159 + pr_err("ABORT: timing took ZERO wallclock time\n"); 160 + return false; 161 + } 162 + /* Calculate stats */ 163 + /*** Division in kernel it tricky ***/ 164 + /* Orig: time_sec = (time_interval / NANOSEC_PER_SEC); */ 165 + /* remainder only correct because NANOSEC_PER_SEC is 10^9 */ 166 + rec->time_sec = div_u64_rem(rec->time_interval, NANOSEC_PER_SEC, 167 + &rec->time_sec_remainder); 168 + //TODO: use existing struct timespec records instead of div? 
169 + 170 + if (rec->flags & TIME_BENCH_LOOP) { 171 + /*** Division in kernel it tricky ***/ 172 + /* Orig: ns = ((double)time_interval / invoked_cnt); */ 173 + /* First get quotient */ 174 + rec->ns_per_call_quotient = 175 + div_u64_rem(rec->time_interval, invoked_cnt, 176 + &ns_per_call_remainder); 177 + /* Now get decimals .xxx precision (incorrect roundup)*/ 178 + ns_per_call_tmp_rem = ns_per_call_remainder; 179 + invoked_cnt_precision = invoked_cnt / 1000; 180 + if (invoked_cnt_precision > 0) { 181 + rec->ns_per_call_decimal = 182 + div_u64_rem(ns_per_call_tmp_rem, 183 + invoked_cnt_precision, 184 + &ns_per_call_remainder); 185 + } 186 + } 187 + } 188 + 189 + /* Performance Monitor Unit (PMU) counters */ 190 + if (rec->flags & TIME_BENCH_PMU) { 191 + //FIXME: Overflow handling??? 192 + rec->pmc_inst = rec->pmc_inst_stop - rec->pmc_inst_start; 193 + rec->pmc_clk = rec->pmc_clk_stop - rec->pmc_clk_start; 194 + 195 + /* Calc Instruction Per Cycle (IPC) */ 196 + /* First get quotient */ 197 + rec->pmc_ipc_quotient = div_u64_rem(rec->pmc_inst, rec->pmc_clk, 198 + &pmc_ipc_remainder); 199 + /* Now get decimals .xxx precision (incorrect roundup)*/ 200 + pmc_ipc_tmp_rem = pmc_ipc_remainder; 201 + pmc_ipc_div = rec->pmc_clk / 1000; 202 + if (pmc_ipc_div > 0) { 203 + rec->pmc_ipc_decimal = div_u64_rem(pmc_ipc_tmp_rem, 204 + pmc_ipc_div, 205 + &pmc_ipc_remainder); 206 + } 207 + } 208 + 209 + return true; 210 + } 211 + 212 + /* Generic function for invoking a loop function and calculating 213 + * execution time stats. The function being called/timed is assumed 214 + * to perform a tight loop, and update the timing record struct. 
215 + */ 216 + bool time_bench_loop(uint32_t loops, int step, char *txt, void *data, 217 + int (*func)(struct time_bench_record *record, void *data)) 218 + { 219 + struct time_bench_record rec; 220 + 221 + /* Setup record */ 222 + memset(&rec, 0, sizeof(rec)); /* zero func might not update all */ 223 + rec.version_abi = 1; 224 + rec.loops = loops; 225 + rec.step = step; 226 + rec.flags = (TIME_BENCH_LOOP | TIME_BENCH_TSC | TIME_BENCH_WALLCLOCK); 227 + 228 + /*** Loop function being timed ***/ 229 + if (!func(&rec, data)) { 230 + pr_err("ABORT: function being timed failed\n"); 231 + return false; 232 + } 233 + 234 + if (rec.invoked_cnt < loops) 235 + pr_warn("WARNING: Invoke count(%llu) smaller than loops(%d)\n", 236 + rec.invoked_cnt, loops); 237 + 238 + /* Calculate stats */ 239 + time_bench_calc_stats(&rec); 240 + 241 + pr_info("Type:%s Per elem: %llu cycles(tsc) %llu.%03llu ns (step:%d) - (measurement period time:%llu.%09u sec time_interval:%llu) - (invoke count:%llu tsc_interval:%llu)\n", 242 + txt, rec.tsc_cycles, rec.ns_per_call_quotient, 243 + rec.ns_per_call_decimal, rec.step, rec.time_sec, 244 + rec.time_sec_remainder, rec.time_interval, rec.invoked_cnt, 245 + rec.tsc_interval); 246 + if (rec.flags & TIME_BENCH_PMU) 247 + pr_info("Type:%s PMU inst/clock%llu/%llu = %llu.%03llu IPC (inst per cycle)\n", 248 + txt, rec.pmc_inst, rec.pmc_clk, rec.pmc_ipc_quotient, 249 + rec.pmc_ipc_decimal); 250 + return true; 251 + } 252 + 253 + /* Function getting invoked by kthread */ 254 + static int invoke_test_on_cpu_func(void *private) 255 + { 256 + struct time_bench_cpu *cpu = private; 257 + struct time_bench_sync *sync = cpu->sync; 258 + cpumask_t newmask = CPU_MASK_NONE; 259 + void *data = cpu->data; 260 + 261 + /* Restrict CPU */ 262 + cpumask_set_cpu(cpu->rec.cpu, &newmask); 263 + set_cpus_allowed_ptr(current, &newmask); 264 + 265 + /* Synchronize start of concurrency test */ 266 + atomic_inc(&sync->nr_tests_running); 267 + wait_for_completion(&sync->start_event); 
268 + 269 + /* Start benchmark function */ 270 + if (!cpu->bench_func(&cpu->rec, data)) { 271 + pr_err("ERROR: function being timed failed on CPU:%d(%d)\n", 272 + cpu->rec.cpu, smp_processor_id()); 273 + } else { 274 + if (verbose) 275 + pr_info("SUCCESS: ran on CPU:%d(%d)\n", cpu->rec.cpu, 276 + smp_processor_id()); 277 + } 278 + cpu->did_bench_run = true; 279 + 280 + /* End test */ 281 + atomic_dec(&sync->nr_tests_running); 282 + /* Wait for kthread_stop() telling us to stop */ 283 + while (!kthread_should_stop()) { 284 + set_current_state(TASK_INTERRUPTIBLE); 285 + schedule(); 286 + } 287 + __set_current_state(TASK_RUNNING); 288 + return 0; 289 + } 290 + 291 + void time_bench_print_stats_cpumask(const char *desc, 292 + struct time_bench_cpu *cpu_tasks, 293 + const struct cpumask *mask) 294 + { 295 + uint64_t average = 0; 296 + int cpu; 297 + int step = 0; 298 + struct sum { 299 + uint64_t tsc_cycles; 300 + int records; 301 + } sum = { 0 }; 302 + 303 + /* Get stats */ 304 + for_each_cpu(cpu, mask) { 305 + struct time_bench_cpu *c = &cpu_tasks[cpu]; 306 + struct time_bench_record *rec = &c->rec; 307 + 308 + /* Calculate stats */ 309 + time_bench_calc_stats(rec); 310 + 311 + pr_info("Type:%s CPU(%d) %llu cycles(tsc) %llu.%03llu ns (step:%d) - (measurement period time:%llu.%09u sec time_interval:%llu) - (invoke count:%llu tsc_interval:%llu)\n", 312 + desc, cpu, rec->tsc_cycles, rec->ns_per_call_quotient, 313 + rec->ns_per_call_decimal, rec->step, rec->time_sec, 314 + rec->time_sec_remainder, rec->time_interval, 315 + rec->invoked_cnt, rec->tsc_interval); 316 + 317 + /* Collect average */ 318 + sum.records++; 319 + sum.tsc_cycles += rec->tsc_cycles; 320 + step = rec->step; 321 + } 322 + 323 + if (sum.records) /* avoid div-by-zero */ 324 + average = sum.tsc_cycles / sum.records; 325 + pr_info("Sum Type:%s Average: %llu cycles(tsc) CPUs:%d step:%d\n", desc, 326 + average, sum.records, step); 327 + } 328 + 329 + void time_bench_run_concurrent(uint32_t loops, int step, 
void *data, 330 + const struct cpumask *mask, /* Support masking outsome CPUs*/ 331 + struct time_bench_sync *sync, 332 + struct time_bench_cpu *cpu_tasks, 333 + int (*func)(struct time_bench_record *record, void *data)) 334 + { 335 + int cpu, running = 0; 336 + 337 + if (verbose) // DEBUG 338 + pr_warn("%s() Started on CPU:%d\n", __func__, 339 + smp_processor_id()); 340 + 341 + /* Reset sync conditions */ 342 + atomic_set(&sync->nr_tests_running, 0); 343 + init_completion(&sync->start_event); 344 + 345 + /* Spawn off jobs on all CPUs */ 346 + for_each_cpu(cpu, mask) { 347 + struct time_bench_cpu *c = &cpu_tasks[cpu]; 348 + 349 + running++; 350 + c->sync = sync; /* Send sync variable along */ 351 + c->data = data; /* Send opaque along */ 352 + 353 + /* Init benchmark record */ 354 + memset(&c->rec, 0, sizeof(struct time_bench_record)); 355 + c->rec.version_abi = 1; 356 + c->rec.loops = loops; 357 + c->rec.step = step; 358 + c->rec.flags = (TIME_BENCH_LOOP | TIME_BENCH_TSC | 359 + TIME_BENCH_WALLCLOCK); 360 + c->rec.cpu = cpu; 361 + c->bench_func = func; 362 + c->task = kthread_run(invoke_test_on_cpu_func, c, 363 + "time_bench%d", cpu); 364 + if (IS_ERR(c->task)) { 365 + pr_err("%s(): Failed to start test func\n", __func__); 366 + return; /* Argh, what about cleanup?! 
*/ 367 + } 368 + } 369 + 370 + /* Wait until all processes are running */ 371 + while (atomic_read(&sync->nr_tests_running) < running) { 372 + set_current_state(TASK_UNINTERRUPTIBLE); 373 + schedule_timeout(10); 374 + } 375 + /* Kick off all CPU concurrently on completion event */ 376 + complete_all(&sync->start_event); 377 + 378 + /* Wait for CPUs to finish */ 379 + while (atomic_read(&sync->nr_tests_running)) { 380 + set_current_state(TASK_UNINTERRUPTIBLE); 381 + schedule_timeout(10); 382 + } 383 + 384 + /* Stop the kthreads */ 385 + for_each_cpu(cpu, mask) { 386 + struct time_bench_cpu *c = &cpu_tasks[cpu]; 387 + 388 + kthread_stop(c->task); 389 + } 390 + 391 + if (verbose) // DEBUG - happens often, finish on another CPU 392 + pr_warn("%s() Finished on CPU:%d\n", __func__, 393 + smp_processor_id()); 394 + }

+238

tools/testing/selftests/net/bench/page_pool/time_bench.h

··· 1 + /* SPDX-License-Identifier: GPL-2.0-only */ 2 + /* 3 + * Benchmarking code execution time inside the kernel 4 + * 5 + * Copyright (C) 2014, Red Hat, Inc., Jesper Dangaard Brouer 6 + * for licensing details see kernel-base/COPYING 7 + */ 8 + #ifndef _LINUX_TIME_BENCH_H 9 + #define _LINUX_TIME_BENCH_H 10 + 11 + /* Main structure used for recording a benchmark run */ 12 + struct time_bench_record { 13 + uint32_t version_abi; 14 + uint32_t loops; /* Requested loop invocations */ 15 + uint32_t step; /* option for e.g. bulk invocations */ 16 + 17 + uint32_t flags; /* Measurements types enabled */ 18 + #define TIME_BENCH_LOOP BIT(0) 19 + #define TIME_BENCH_TSC BIT(1) 20 + #define TIME_BENCH_WALLCLOCK BIT(2) 21 + #define TIME_BENCH_PMU BIT(3) 22 + 23 + uint32_t cpu; /* Used when embedded in time_bench_cpu */ 24 + 25 + /* Records */ 26 + uint64_t invoked_cnt; /* Returned actual invocations */ 27 + uint64_t tsc_start; 28 + uint64_t tsc_stop; 29 + struct timespec64 ts_start; 30 + struct timespec64 ts_stop; 31 + /* PMU counters for instruction and cycles 32 + * instructions counter including pipelined instructions 33 + */ 34 + uint64_t pmc_inst_start; 35 + uint64_t pmc_inst_stop; 36 + /* CPU unhalted clock counter */ 37 + uint64_t pmc_clk_start; 38 + uint64_t pmc_clk_stop; 39 + 40 + /* Result records */ 41 + uint64_t tsc_interval; 42 + uint64_t time_start, time_stop, time_interval; /* in nanosec */ 43 + uint64_t pmc_inst, pmc_clk; 44 + 45 + /* Derived result records */ 46 + uint64_t tsc_cycles; // +decimal? 47 + uint64_t ns_per_call_quotient, ns_per_call_decimal; 48 + uint64_t time_sec; 49 + uint32_t time_sec_remainder; 50 + uint64_t pmc_ipc_quotient, pmc_ipc_decimal; /* inst per cycle */ 51 + }; 52 + 53 + /* For synchronizing parallel CPUs to run concurrently */ 54 + struct time_bench_sync { 55 + atomic_t nr_tests_running; 56 + struct completion start_event; 57 + }; 58 + 59 + /* Keep track of CPUs executing our bench function. 
60 + * 61 + * Embed a time_bench_record for storing info per cpu 62 + */ 63 + struct time_bench_cpu { 64 + struct time_bench_record rec; 65 + struct time_bench_sync *sync; /* back ptr */ 66 + struct task_struct *task; 67 + /* "data" opaque could have been placed in time_bench_sync, 68 + * but to avoid any false sharing, place it per CPU 69 + */ 70 + void *data; 71 + /* Support masking outsome CPUs, mark if it ran */ 72 + bool did_bench_run; 73 + /* int cpu; // note CPU stored in time_bench_record */ 74 + int (*bench_func)(struct time_bench_record *record, void *data); 75 + }; 76 + 77 + /* 78 + * Below TSC assembler code is not compatible with other archs, and 79 + * can also fail on guests if cpu-flags are not correct. 80 + * 81 + * The way TSC reading is used, many iterations, does not require as 82 + * high accuracy as described below (in Intel Doc #324264). 83 + * 84 + * Considering changing to use get_cycles() (#include <asm/timex.h>). 85 + */ 86 + 87 + /** TSC (Time-Stamp Counter) based ** 88 + * Recommend reading, to understand details of reading TSC accurately: 89 + * Intel Doc #324264, "How to Benchmark Code Execution Times on Intel" 90 + * 91 + * Consider getting exclusive ownership of CPU by using: 92 + * unsigned long flags; 93 + * preempt_disable(); 94 + * raw_local_irq_save(flags); 95 + * _your_code_ 96 + * raw_local_irq_restore(flags); 97 + * preempt_enable(); 98 + * 99 + * Clobbered registers: "%rax", "%rbx", "%rcx", "%rdx" 100 + * RDTSC only change "%rax" and "%rdx" but 101 + * CPUID clears the high 32-bits of all (rax/rbx/rcx/rdx) 102 + */ 103 + static __always_inline uint64_t tsc_start_clock(void) 104 + { 105 + /* See: Intel Doc #324264 */ 106 + unsigned int hi, lo; 107 + 108 + asm volatile("CPUID\n\t" 109 + "RDTSC\n\t" 110 + "mov %%edx, %0\n\t" 111 + "mov %%eax, %1\n\t" 112 + : "=r"(hi), "=r"(lo)::"%rax", "%rbx", "%rcx", "%rdx"); 113 + //FIXME: on 32bit use clobbered %eax + %edx 114 + return ((uint64_t)lo) | (((uint64_t)hi) << 32); 115 + } 116 + 
117 + static __always_inline uint64_t tsc_stop_clock(void) 118 + { 119 + /* See: Intel Doc #324264 */ 120 + unsigned int hi, lo; 121 + 122 + asm volatile("RDTSCP\n\t" 123 + "mov %%edx, %0\n\t" 124 + "mov %%eax, %1\n\t" 125 + "CPUID\n\t" 126 + : "=r"(hi), "=r"(lo)::"%rax", "%rbx", "%rcx", "%rdx"); 127 + return ((uint64_t)lo) | (((uint64_t)hi) << 32); 128 + } 129 + 130 + /** Wall-clock based ** 131 + * 132 + * use: getnstimeofday() 133 + * getnstimeofday(&rec->ts_start); 134 + * getnstimeofday(&rec->ts_stop); 135 + * 136 + * API changed see: Documentation/core-api/timekeeping.rst 137 + * https://www.kernel.org/doc/html/latest/core-api/timekeeping.html#c.getnstimeofday 138 + * 139 + * We should instead use: ktime_get_real_ts64() is a direct 140 + * replacement, but consider using monotonic time (ktime_get_ts64()) 141 + * and/or a ktime_t based interface (ktime_get()/ktime_get_real()). 142 + */ 143 + 144 + /** PMU (Performance Monitor Unit) based ** 145 + * 146 + * Needed for calculating: Instructions Per Cycle (IPC) 147 + * - The IPC number tell how efficient the CPU pipelining were 148 + */ 149 + //lookup: perf_event_create_kernel_counter() 150 + 151 + bool time_bench_PMU_config(bool enable); 152 + 153 + /* Raw reading via rdpmc() using fixed counters 154 + * 155 + * From: https://github.com/andikleen/simple-pmu 156 + */ 157 + enum { 158 + FIXED_SELECT = (1U << 30), /* == 0x40000000 */ 159 + FIXED_INST_RETIRED_ANY = 0, 160 + FIXED_CPU_CLK_UNHALTED_CORE = 1, 161 + FIXED_CPU_CLK_UNHALTED_REF = 2, 162 + }; 163 + 164 + static __always_inline unsigned int long long p_rdpmc(unsigned int in) 165 + { 166 + unsigned int d, a; 167 + 168 + asm volatile("rdpmc" : "=d"(d), "=a"(a) : "c"(in) : "memory"); 169 + return ((unsigned long long)d << 32) | a; 170 + } 171 + 172 + /* These PMU counter needs to be enabled, but I don't have the 173 + * configure code implemented. 
My current hack is running: 174 + * sudo perf stat -e cycles:k -e instructions:k insmod lib/ring_queue_test.ko 175 + */ 176 + /* Reading all pipelined instruction */ 177 + static __always_inline unsigned long long pmc_inst(void) 178 + { 179 + return p_rdpmc(FIXED_SELECT | FIXED_INST_RETIRED_ANY); 180 + } 181 + 182 + /* Reading CPU clock cycles */ 183 + static __always_inline unsigned long long pmc_clk(void) 184 + { 185 + return p_rdpmc(FIXED_SELECT | FIXED_CPU_CLK_UNHALTED_CORE); 186 + } 187 + 188 + /* Raw reading via MSR rdmsr() is likely wrong 189 + * FIXME: How can I know which raw MSR registers are conf for what? 190 + */ 191 + #define MSR_IA32_PCM0 0x400000C1 /* PERFCTR0 */ 192 + #define MSR_IA32_PCM1 0x400000C2 /* PERFCTR1 */ 193 + #define MSR_IA32_PCM2 0x400000C3 194 + static inline uint64_t msr_inst(unsigned long long *msr_result) 195 + { 196 + return rdmsrq_safe(MSR_IA32_PCM0, msr_result); 197 + } 198 + 199 + /** Generic functions ** 200 + */ 201 + bool time_bench_loop(uint32_t loops, int step, char *txt, void *data, 202 + int (*func)(struct time_bench_record *rec, void *data)); 203 + bool time_bench_calc_stats(struct time_bench_record *rec); 204 + 205 + void time_bench_run_concurrent(uint32_t loops, int step, void *data, 206 + const struct cpumask *mask, /* Support masking outsome CPUs*/ 207 + struct time_bench_sync *sync, struct time_bench_cpu *cpu_tasks, 208 + int (*func)(struct time_bench_record *record, void *data)); 209 + void time_bench_print_stats_cpumask(const char *desc, 210 + struct time_bench_cpu *cpu_tasks, 211 + const struct cpumask *mask); 212 + 213 + //FIXME: use rec->flags to select measurement, should be MACRO 214 + static __always_inline void time_bench_start(struct time_bench_record *rec) 215 + { 216 + //getnstimeofday(&rec->ts_start); 217 + ktime_get_real_ts64(&rec->ts_start); 218 + if (rec->flags & TIME_BENCH_PMU) { 219 + rec->pmc_inst_start = pmc_inst(); 220 + rec->pmc_clk_start = pmc_clk(); 221 + } 222 + rec->tsc_start = 
tsc_start_clock(); 223 + } 224 + 225 + static __always_inline void time_bench_stop(struct time_bench_record *rec, 226 + uint64_t invoked_cnt) 227 + { 228 + rec->tsc_stop = tsc_stop_clock(); 229 + if (rec->flags & TIME_BENCH_PMU) { 230 + rec->pmc_inst_stop = pmc_inst(); 231 + rec->pmc_clk_stop = pmc_clk(); 232 + } 233 + //getnstimeofday(&rec->ts_stop); 234 + ktime_get_real_ts64(&rec->ts_stop); 235 + rec->invoked_cnt = invoked_cnt; 236 + } 237 + 238 + #endif /* _LINUX_TIME_BENCH_H */

+32

tools/testing/selftests/net/bench/test_bench_page_pool.sh

#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# Load the page_pool benchmark module (the benchmarks run at insmod time)
# and extract the per-element results from the kernel log.

set -e

DRIVER="./page_pool/bench_page_pool.ko"
result=""

# Print the "Per elem" figures the module logged for benchmark type $1.
# The parentheses in "cycles(tsc)" are literal, so they are escaped for
# grep -E.  With 'set -e' a missing result line fails the test.
function print_result()
{
	echo "${result}" | grep -o -E "$1 Per elem: ([0-9]+) cycles\(tsc\) ([0-9]+\.[0-9]+) ns"
}

function run_test()
{
	# A previous run may have left the module loaded; ignore "not loaded"
	rmmod "bench_page_pool.ko" || true
	insmod "$DRIVER" > /dev/null 2>&1
	result=$(dmesg | tail -10)
	echo "$result"

	echo
	echo "Fast path results:"
	print_result "no-softirq-page_pool01"

	echo
	echo "ptr_ring results:"
	print_result "no-softirq-page_pool02"

	echo
	echo "slow path results:"
	print_result "no-softirq-page_pool03"
}

run_test

exit 0

Configure Feed

Configure Feed