Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

delayacct: add timestamp of delay max

Problem
=======
Commit 658eb5ab916d ("delayacct: add delay max to record delay peak")
introduced the delay max for getdelays, which records abnormal latency
peaks and helps us understand the magnitude of such delays. However, the
peak latency value alone is insufficient for effective root cause
analysis. Without the precise timestamp of when the peak occurred, we
still lack the critical context needed to correlate it with other system
events.

Solution
========
To address this, we need to additionally record a precise timestamp when
the maximum latency occurs. By correlating this timestamp with system
logs and monitoring metrics, we can identify processes with abnormal
resource usage at the same moment, which can help us to pinpoint root
causes.

Use Case
========
bash-4.4# ./getdelays -d -t 227
print delayacct stats ON
TGID 227
CPU count real total virtual total delay total delay average delay max delay min delay max timestamp
46 188000000 192348334 4098012 0.089ms 0.429260ms 0.051205ms 2026-01-15T15:06:58
IO count delay total delay average delay max delay min delay max timestamp
0 0 0.000ms 0.000000ms 0.000000ms N/A
SWAP count delay total delay average delay max delay min delay max timestamp
0 0 0.000ms 0.000000ms 0.000000ms N/A
RECLAIM count delay total delay average delay max delay min delay max timestamp
0 0 0.000ms 0.000000ms 0.000000ms N/A
THRAS HING count delay total delay average delay max delay min delay max timestamp
0 0 0.000ms 0.000000ms 0.000000ms N/A
COMPACT count delay total delay average delay max delay min delay max timestamp
0 0 0.000ms 0.000000ms 0.000000ms N/A
WPCOPY count delay total delay average delay max delay min delay max timestamp
182 19413338 0.107ms 0.547353ms 0.022462ms 2026-01-15T15:05:24
IRQ count delay total delay average delay max delay min delay max timestamp
0 0 0.000ms 0.000000ms 0.000000ms N/A

Link: https://lkml.kernel.org/r/20260119100241520gWubW8-5QfhSf9gjqcc_E@zte.com.cn
Signed-off-by: Wang Yaxin <wang.yaxin@zte.com.cn>
Cc: Fan Yu <fan.yu9@zte.com.cn>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yang Yang <yang.yang29@zte.com.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Wang Yaxin and committed by
Andrew Morton
503efe85 cc20650a

+223 -55
+16 -16
Documentation/accounting/delay-accounting.rst
··· 107 107 TGID 242 108 108 109 109 110 - CPU count real total virtual total delay total delay average delay max delay min 111 - 39 156000000 156576579 2111069 0.054ms 0.212296ms 0.031307ms 112 - IO count delay total delay average delay max delay min 113 - 0 0 0.000ms 0.000000ms 0.000000ms 114 - SWAP count delay total delay average delay max delay min 115 - 0 0 0.000ms 0.000000ms 0.000000ms 116 - RECLAIM count delay total delay average delay max delay min 117 - 0 0 0.000ms 0.000000ms 0.000000ms 118 - THRASHING count delay total delay average delay max delay min 119 - 0 0 0.000ms 0.000000ms 0.000000ms 120 - COMPACT count delay total delay average delay max delay min 121 - 0 0 0.000ms 0.000000ms 0.000000ms 122 - WPCOPY count delay total delay average delay max delay min 123 - 156 11215873 0.072ms 0.207403ms 0.033913ms 124 - IRQ count delay total delay average delay max delay min 125 - 0 0 0.000ms 0.000000ms 0.000000ms 110 + CPU count real total virtual total delay total delay average delay max delay min delay max timestamp 111 + 46 188000000 192348334 4098012 0.089ms 0.429260ms 0.051205ms 2026-01-15T15:06:58 112 + IO count delay total delay average delay max delay min delay max timestamp 113 + 0 0 0.000ms 0.000000ms 0.000000ms N/A 114 + SWAP count delay total delay average delay max delay min delay max timestamp 115 + 0 0 0.000ms 0.000000ms 0.000000ms N/A 116 + RECLAIM count delay total delay average delay max delay min delay max timestamp 117 + 0 0 0.000ms 0.000000ms 0.000000ms N/A 118 + THRASHING count delay total delay average delay max delay min delay max timestamp 119 + 0 0 0.000ms 0.000000ms 0.000000ms N/A 120 + COMPACT count delay total delay average delay max delay min delay max timestamp 121 + 0 0 0.000ms 0.000000ms 0.000000ms N/A 122 + WPCOPY count delay total delay average delay max delay min delay max timestamp 123 + 182 19413338 0.107ms 0.547353ms 0.022462ms 2026-01-15T15:05:24 124 + IRQ count delay total delay average delay max delay min delay max timestamp 125 + 0 0 0.000ms 0.000000ms 0.000000ms N/A 126 126 127 127 Get IO accounting for pid 1, it works only with -p:: 128 128
+8
include/linux/delayacct.h
··· 69 69 u32 compact_count; /* total count of memory compact */ 70 70 u32 wpcopy_count; /* total count of write-protect copy */ 71 71 u32 irq_count; /* total count of IRQ/SOFTIRQ */ 72 + 73 + struct timespec64 blkio_delay_max_ts; 74 + struct timespec64 swapin_delay_max_ts; 75 + struct timespec64 freepages_delay_max_ts; 76 + struct timespec64 thrashing_delay_max_ts; 77 + struct timespec64 compact_delay_max_ts; 78 + struct timespec64 wpcopy_delay_max_ts; 79 + struct timespec64 irq_delay_max_ts; 72 80 }; 73 81 #endif 74 82
+5
include/linux/sched.h
··· 49 49 #include <linux/tracepoint-defs.h> 50 50 #include <linux/unwind_deferred_types.h> 51 51 #include <asm/kmap_size.h> 52 + #include <linux/time64.h> 52 53 #ifndef COMPILE_OFFSETS 53 54 #include <generated/rq-offsets.h> 54 55 #endif ··· 87 86 struct task_delay_info; 88 87 struct task_group; 89 88 struct task_struct; 89 + struct timespec64; 90 90 struct user_event_mm; 91 91 92 92 #include <linux/sched/ext.h> ··· 436 434 437 435 /* When were we last queued to run? */ 438 436 unsigned long long last_queued; 437 + 438 + /* Timestamp of max time spent waiting on a runqueue: */ 439 + struct timespec64 max_run_delay_ts; 439 440 440 441 #endif /* CONFIG_SCHED_INFO */ 441 442 };
+21 -1
include/uapi/linux/taskstats.h
··· 18 18 #define _LINUX_TASKSTATS_H 19 19 20 20 #include <linux/types.h> 21 + #ifdef __KERNEL__ 22 + #include <linux/time64.h> 23 + #else 24 + #ifndef _LINUX_TIME64_H 25 + struct timespec64 { 26 + __s64 tv_sec; /* seconds */ 27 + long tv_nsec; /* nanoseconds */ 28 + }; 29 + #endif 30 + #endif 21 31 22 32 /* Format for per-task data returned to userland when 23 33 * - a task exits ··· 44 34 */ 45 35 46 36 47 - #define TASKSTATS_VERSION 16 37 + #define TASKSTATS_VERSION 17 48 38 #define TS_COMM_LEN 32 /* should be >= TASK_COMM_LEN 49 39 * in linux/sched.h */ 50 40 ··· 240 230 241 231 __u64 irq_delay_max; 242 232 __u64 irq_delay_min; 233 + 234 + /*v17: delay max timestamp record*/ 235 + struct timespec64 cpu_delay_max_ts; 236 + struct timespec64 blkio_delay_max_ts; 237 + struct timespec64 swapin_delay_max_ts; 238 + struct timespec64 freepages_delay_max_ts; 239 + struct timespec64 thrashing_delay_max_ts; 240 + struct timespec64 compact_delay_max_ts; 241 + struct timespec64 wpcopy_delay_max_ts; 242 + struct timespec64 irq_delay_max_ts; 243 243 }; 244 244 245 245
+22 -9
kernel/delayacct.c
··· 18 18 do { \ 19 19 d->type##_delay_max = tsk->delays->type##_delay_max; \ 20 20 d->type##_delay_min = tsk->delays->type##_delay_min; \ 21 + d->type##_delay_max_ts = tsk->delays->type##_delay_max_ts; \ 21 22 tmp = d->type##_delay_total + tsk->delays->type##_delay; \ 22 23 d->type##_delay_total = (tmp < d->type##_delay_total) ? 0 : tmp; \ 23 24 d->type##_count += tsk->delays->type##_count; \ ··· 105 104 * Finish delay accounting for a statistic using its timestamps (@start), 106 105 * accumulator (@total) and @count 107 106 */ 108 - static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *count, u64 *max, u64 *min) 107 + static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *count, 108 + u64 *max, u64 *min, struct timespec64 *ts) 109 109 { 110 110 s64 ns = local_clock() - *start; 111 111 unsigned long flags; ··· 115 113 raw_spin_lock_irqsave(lock, flags); 116 114 *total += ns; 117 115 (*count)++; 118 - if (ns > *max) 116 + if (ns > *max) { 119 117 *max = ns; 118 + ktime_get_real_ts64(ts); 119 + } 120 120 if (*min == 0 || ns < *min) 121 121 *min = ns; 122 122 raw_spin_unlock_irqrestore(lock, flags); ··· 141 137 &p->delays->blkio_delay, 142 138 &p->delays->blkio_count, 143 139 &p->delays->blkio_delay_max, 144 - &p->delays->blkio_delay_min); 140 + &p->delays->blkio_delay_min, 141 + &p->delays->blkio_delay_max_ts); 145 142 } 146 143 147 144 int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) ··· 175 170 176 171 d->cpu_delay_max = tsk->sched_info.max_run_delay; 177 172 d->cpu_delay_min = tsk->sched_info.min_run_delay; 173 + d->cpu_delay_max_ts = tsk->sched_info.max_run_delay_ts; 178 174 tmp = (s64)d->cpu_delay_total + t2; 179 175 d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp; 180 176 tmp = (s64)d->cpu_run_virtual_total + t3; ··· 223 217 &current->delays->freepages_delay, 224 218 &current->delays->freepages_count, 225 219 &current->delays->freepages_delay_max, 226 - &current->delays->freepages_delay_min); 220 + &current->delays->freepages_delay_min, 221 + &current->delays->freepages_delay_max_ts); 227 222 } 228 223 229 224 void __delayacct_thrashing_start(bool *in_thrashing) ··· 248 241 &current->delays->thrashing_delay, 249 242 &current->delays->thrashing_count, 250 243 &current->delays->thrashing_delay_max, 251 - &current->delays->thrashing_delay_min); 244 + &current->delays->thrashing_delay_min, 245 + &current->delays->thrashing_delay_max_ts); 252 246 } 253 247 254 248 void __delayacct_swapin_start(void) ··· 264 256 &current->delays->swapin_delay, 265 257 &current->delays->swapin_count, 266 258 &current->delays->swapin_delay_max, 267 - &current->delays->swapin_delay_min); 259 + &current->delays->swapin_delay_min, 260 + &current->delays->swapin_delay_max_ts); 268 261 } 269 262 270 263 void __delayacct_compact_start(void) ··· 280 271 &current->delays->compact_delay, 281 272 &current->delays->compact_count, 282 273 &current->delays->compact_delay_max, 283 - &current->delays->compact_delay_min); 274 + &current->delays->compact_delay_min, 275 + &current->delays->compact_delay_max_ts); 284 276 } 285 277 286 278 void __delayacct_wpcopy_start(void) ··· 296 286 &current->delays->wpcopy_delay, 297 287 &current->delays->wpcopy_count, 298 288 &current->delays->wpcopy_delay_max, 299 - &current->delays->wpcopy_delay_min); 289 + &current->delays->wpcopy_delay_min, 290 + &current->delays->wpcopy_delay_max_ts); 300 291 } 301 292 302 293 void __delayacct_irq(struct task_struct *task, u32 delta) ··· 307 296 raw_spin_lock_irqsave(&task->delays->lock, flags); 308 297 task->delays->irq_delay += delta; 309 298 task->delays->irq_count++; 310 - if (delta > task->delays->irq_delay_max) 299 + if (delta > task->delays->irq_delay_max) { 311 300 task->delays->irq_delay_max = delta; 301 + ktime_get_real_ts64(&task->delays->irq_delay_max_ts); 302 + } 312 303 if (delta && (!task->delays->irq_delay_min || delta < task->delays->irq_delay_min)) 313 304 task->delays->irq_delay_min = delta; 314 305 raw_spin_unlock_irqrestore(&task->delays->lock, flags);
+6 -2
kernel/sched/stats.h
··· 253 253 delta = rq_clock(rq) - t->sched_info.last_queued; 254 254 t->sched_info.last_queued = 0; 255 255 t->sched_info.run_delay += delta; 256 - if (delta > t->sched_info.max_run_delay) 256 + if (delta > t->sched_info.max_run_delay) { 257 257 t->sched_info.max_run_delay = delta; 258 + ktime_get_real_ts64(&t->sched_info.max_run_delay_ts); 259 + } 258 260 if (delta && (!t->sched_info.min_run_delay || delta < t->sched_info.min_run_delay)) 259 261 t->sched_info.min_run_delay = delta; 260 262 rq_sched_info_dequeue(rq, delta); ··· 280 278 t->sched_info.run_delay += delta; 281 279 t->sched_info.last_arrival = now; 282 280 t->sched_info.pcount++; 283 - if (delta > t->sched_info.max_run_delay) 281 + if (delta > t->sched_info.max_run_delay) { 284 282 t->sched_info.max_run_delay = delta; 283 + ktime_get_real_ts64(&t->sched_info.max_run_delay_ts); 284 + } 285 285 if (delta && (!t->sched_info.min_run_delay || delta < t->sched_info.min_run_delay)) 286 286 t->sched_info.min_run_delay = delta; 287 287
+145 -27
tools/accounting/getdelays.c
··· 24 24 #include <sys/socket.h> 25 25 #include <sys/wait.h> 26 26 #include <signal.h> 27 + #include <time.h> 27 28 28 29 #include <linux/genetlink.h> 29 30 #include <linux/taskstats.h> ··· 196 195 #define delay_ms(t) (t / 1000000ULL) 197 196 198 197 /* 198 + * Format timespec64 to human readable string (YYYY-MM-DD HH:MM:SS) 199 + * Returns formatted string or "N/A" if timestamp is zero 200 + */ 201 + static const char *format_timespec64(struct timespec64 *ts) 202 + { 203 + static char buffer[32]; 204 + struct tm tm_info; 205 + time_t time_sec; 206 + 207 + /* Check if timestamp is zero (not set) */ 208 + if (ts->tv_sec == 0 && ts->tv_nsec == 0) 209 + return "N/A"; 210 + 211 + time_sec = (time_t)ts->tv_sec; 212 + 213 + /* Use thread-safe localtime_r */ 214 + if (localtime_r(&time_sec, &tm_info) == NULL) 215 + return "N/A"; 216 + 217 + snprintf(buffer, sizeof(buffer), "%04d-%02d-%02dT%02d:%02d:%02d", 218 + tm_info.tm_year + 1900, 219 + tm_info.tm_mon + 1, 220 + tm_info.tm_mday, 221 + tm_info.tm_hour, 222 + tm_info.tm_min, 223 + tm_info.tm_sec); 224 + 225 + return buffer; 226 + } 227 + 228 + /* 199 229 * Version compatibility note: 200 230 * Field availability depends on taskstats version (t->version), 201 231 * corresponding to TASKSTATS_VERSION in kernel headers ··· 237 205 * version >= 13 - supports WPCOPY statistics 238 206 * version >= 14 - supports IRQ statistics 239 207 * version >= 16 - supports *_max and *_min delay statistics 208 + * version >= 17 - supports delay max timestamp statistics 240 209 * 241 210 * Always verify version before accessing version-dependent fields 242 211 * to maintain backward compatibility. 243 212 */ 244 213 #define PRINT_CPU_DELAY(version, t) \ 245 214 do { \ 246 - if (version >= 16) { \ 215 + if (version >= 17) { \ 216 + printf("%-10s%15s%15s%15s%15s%15s%15s%15s%25s\n", \ 217 + "CPU", "count", "real total", "virtual total", \ 218 + "delay total", "delay average", "delay max", \ 219 + "delay min", "delay max timestamp"); \ 220 + printf(" %15llu%15llu%15llu%15llu%15.3fms%13.6fms%13.6fms%23s\n", \ 221 + (unsigned long long)(t)->cpu_count, \ 222 + (unsigned long long)(t)->cpu_run_real_total, \ 223 + (unsigned long long)(t)->cpu_run_virtual_total, \ 224 + (unsigned long long)(t)->cpu_delay_total, \ 225 + average_ms((double)(t)->cpu_delay_total, (t)->cpu_count), \ 226 + delay_ms((double)(t)->cpu_delay_max), \ 227 + delay_ms((double)(t)->cpu_delay_min), \ 228 + format_timespec64(&(t)->cpu_delay_max_ts)); \ 229 + } else if (version >= 16) { \ 247 230 printf("%-10s%15s%15s%15s%15s%15s%15s%15s\n", \ 248 231 "CPU", "count", "real total", "virtual total", \ 249 232 "delay total", "delay average", "delay max", "delay min"); \ ··· 304 257 } \ 305 258 } while (0) 306 259 260 + #define PRINT_FILED_DELAY_WITH_TS(name, version, t, count, total, max, min, max_ts) \ 261 + do { \ 262 + if (version >= 17) { \ 263 + printf("%-10s%15s%15s%15s%15s%15s%25s\n", \ 264 + name, "count", "delay total", "delay average", \ 265 + "delay max", "delay min", "delay max timestamp"); \ 266 + printf(" %15llu%15llu%15.3fms%13.6fms%13.6fms%23s\n", \ 267 + (unsigned long long)(t)->count, \ 268 + (unsigned long long)(t)->total, \ 269 + average_ms((double)(t)->total, (t)->count), \ 270 + delay_ms((double)(t)->max), \ 271 + delay_ms((double)(t)->min), \ 272 + format_timespec64(&(t)->max_ts)); \ 273 + } else if (version >= 16) { \ 274 + printf("%-10s%15s%15s%15s%15s%15s\n", \ 275 + name, "count", "delay total", "delay average", \ 276 + "delay max", "delay min"); \ 277 + printf(" %15llu%15llu%15.3fms%13.6fms%13.6fms\n", \ 278 + (unsigned long long)(t)->count, \ 279 + (unsigned long long)(t)->total, \ 280 + average_ms((double)(t)->total, (t)->count), \ 281 + delay_ms((double)(t)->max), \ 282 + delay_ms((double)(t)->min)); \ 283 + } else { \ 284 + printf("%-10s%15s%15s%15s\n", \ 285 + name, "count", "delay total", "delay average"); \ 286 + printf(" %15llu%15llu%15.3fms\n", \ 287 + (unsigned long long)(t)->count, \ 288 + (unsigned long long)(t)->total, \ 289 + average_ms((double)(t)->total, (t)->count)); \ 290 + } \ 291 + } while (0) 292 + 307 293 static void print_delayacct(struct taskstats *t) 308 294 { 309 295 printf("\n\n"); 310 296 311 297 PRINT_CPU_DELAY(t->version, t); 312 298 313 - PRINT_FILED_DELAY("IO", t->version, t, 314 - blkio_count, blkio_delay_total, 315 - blkio_delay_max, blkio_delay_min); 299 + /* Use new macro with timestamp support for version >= 17 */ 300 + if (t->version >= 17) { 301 + PRINT_FILED_DELAY_WITH_TS("IO", t->version, t, 302 + blkio_count, blkio_delay_total, 303 + blkio_delay_max, blkio_delay_min, blkio_delay_max_ts); 316 304 317 - PRINT_FILED_DELAY("SWAP", t->version, t, 318 - swapin_count, swapin_delay_total, 319 - swapin_delay_max, swapin_delay_min); 305 + PRINT_FILED_DELAY_WITH_TS("SWAP", t->version, t, 306 + swapin_count, swapin_delay_total, 307 + swapin_delay_max, swapin_delay_min, swapin_delay_max_ts); 320 308 321 - PRINT_FILED_DELAY("RECLAIM", t->version, t, 322 - freepages_count, freepages_delay_total, 323 - freepages_delay_max, freepages_delay_min); 309 + PRINT_FILED_DELAY_WITH_TS("RECLAIM", t->version, t, 310 + freepages_count, freepages_delay_total, 311 + freepages_delay_max, freepages_delay_min, freepages_delay_max_ts); 324 312 325 - PRINT_FILED_DELAY("THRASHING", t->version, t, 326 - thrashing_count, thrashing_delay_total, 327 - thrashing_delay_max, thrashing_delay_min); 313 + PRINT_FILED_DELAY_WITH_TS("THRASHING", t->version, t, 314 + thrashing_count, thrashing_delay_total, 315 + thrashing_delay_max, thrashing_delay_min, thrashing_delay_max_ts); 328 316 329 - if (t->version >= 11) { 330 - PRINT_FILED_DELAY("COMPACT", t->version, t, 331 - compact_count, compact_delay_total, 332 - compact_delay_max, compact_delay_min); 333 - } 317 + if (t->version >= 11) { 318 + PRINT_FILED_DELAY_WITH_TS("COMPACT", t->version, t, 319 + compact_count, compact_delay_total, 320 + compact_delay_max, compact_delay_min, compact_delay_max_ts); 321 + } 334 322 335 - if (t->version >= 13) { 336 - PRINT_FILED_DELAY("WPCOPY", t->version, t, 337 - wpcopy_count, wpcopy_delay_total, 338 - wpcopy_delay_max, wpcopy_delay_min); 339 - } 323 + if (t->version >= 13) { 324 + PRINT_FILED_DELAY_WITH_TS("WPCOPY", t->version, t, 325 + wpcopy_count, wpcopy_delay_total, 326 + wpcopy_delay_max, wpcopy_delay_min, wpcopy_delay_max_ts); 327 + } 340 328 341 - if (t->version >= 14) { 342 - PRINT_FILED_DELAY("IRQ", t->version, t, 343 - irq_count, irq_delay_total, 344 - irq_delay_max, irq_delay_min); 329 + if (t->version >= 14) { 330 + PRINT_FILED_DELAY_WITH_TS("IRQ", t->version, t, 331 + irq_count, irq_delay_total, 332 + irq_delay_max, irq_delay_min, irq_delay_max_ts); 333 + } 334 + } else { 335 + /* Use original macro for older versions */ 336 + PRINT_FILED_DELAY("IO", t->version, t, 337 + blkio_count, blkio_delay_total, 338 + blkio_delay_max, blkio_delay_min); 339 + 340 + PRINT_FILED_DELAY("SWAP", t->version, t, 341 + swapin_count, swapin_delay_total, 342 + swapin_delay_max, swapin_delay_min); 343 + 344 + PRINT_FILED_DELAY("RECLAIM", t->version, t, 345 + freepages_count, freepages_delay_total, 346 + freepages_delay_max, freepages_delay_min); 347 + 348 + PRINT_FILED_DELAY("THRASHING", t->version, t, 349 + thrashing_count, thrashing_delay_total, 350 + thrashing_delay_max, thrashing_delay_min); 351 + 352 + if (t->version >= 11) { 353 + PRINT_FILED_DELAY("COMPACT", t->version, t, 354 + compact_count, compact_delay_total, 355 + compact_delay_max, compact_delay_min); 356 + } 357 + 358 + if (t->version >= 13) { 359 + PRINT_FILED_DELAY("WPCOPY", t->version, t, 360 + wpcopy_count, wpcopy_delay_total, 361 + wpcopy_delay_max, wpcopy_delay_min); 362 + } 363 + 364 + if (t->version >= 14) { 365 + PRINT_FILED_DELAY("IRQ", t->version, t, 366 + irq_count, irq_delay_total, 367 + irq_delay_max, irq_delay_min); 368 + } 345 369 } 346 370 } 347 371