Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

tracing: Have syscall trace events read user space string

As of commit 654ced4a1377 ("tracing: Introduce tracepoint_is_faultable()")
system call trace events allow faulting in user space memory. Have some of
the system call trace events take advantage of this.

Use the trace_user_fault_read() logic to read the buffer from user space
and, instead of just saving the pointer to the buffer in the system call
event, also save the string that is passed in.

The syscall event has its nb_args shortened from an int to a short (where
even u8 is plenty big enough) and the freed two bytes are used for
"user_mask". The new "user_mask" field is a bitmask in which the set bit
gives the index into the "args" field array of the address to read from
user space. This value is set to 0 if the system call event does not need
to read user space for a field. This mask can be used to know if the event
may fault or not. Only one bit set in user_mask is supported at this time.

This allows the output to look like this:

sys_access(filename: 0x7f8c55368470 "/etc/ld.so.preload", mode: 4)
sys_execve(filename: 0x564ebcf5a6b8 "/usr/bin/emacs", argv: 0x7fff357c0300, envp: 0x564ebc4a4820)

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Takaya Saeki <takayas@google.com>
Cc: Tom Zanussi <zanussi@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ian Rogers <irogers@google.com>
Cc: Douglas Raillard <douglas.raillard@arm.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Link: https://lore.kernel.org/20251028231147.261867956@kernel.org
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>

+421 -21
+3 -1
include/trace/syscall.h
··· 16 16 * @name: name of the syscall 17 17 * @syscall_nr: number of the syscall 18 18 * @nb_args: number of parameters it takes 19 + * @user_mask: mask of @args that will read user space 19 20 * @types: list of types as strings 20 21 * @args: list of args as strings (args[i] matches types[i]) 21 22 * @enter_fields: list of fields for syscall_enter trace event ··· 26 25 struct syscall_metadata { 27 26 const char *name; 28 27 int syscall_nr; 29 - int nb_args; 28 + short nb_args; 29 + short user_mask; 30 30 const char **types; 31 31 const char **args; 32 32 struct list_head enter_fields;
+418 -20
kernel/trace/trace_syscalls.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 #include <trace/syscall.h> 3 3 #include <trace/events/syscalls.h> 4 + #include <linux/kernel_stat.h> 4 5 #include <linux/syscalls.h> 5 6 #include <linux/slab.h> 6 7 #include <linux/kernel.h> ··· 124 123 return entry->name; 125 124 } 126 125 126 + /* Added to user strings when max limit is reached */ 127 + #define EXTRA "..." 128 + 127 129 static enum print_line_t 128 130 print_syscall_enter(struct trace_iterator *iter, int flags, 129 131 struct trace_event *event) ··· 136 132 struct trace_entry *ent = iter->ent; 137 133 struct syscall_trace_enter *trace; 138 134 struct syscall_metadata *entry; 139 - int i, syscall; 135 + int i, syscall, val; 136 + unsigned char *ptr; 137 + int len; 140 138 141 139 trace = (typeof(trace))ent; 142 140 syscall = trace->nr; ··· 173 167 else 174 168 trace_seq_printf(s, "%s: 0x%lx", entry->args[i], 175 169 trace->args[i]); 170 + 171 + if (!(BIT(i) & entry->user_mask)) 172 + continue; 173 + 174 + /* This arg points to a user space string */ 175 + ptr = (void *)trace->args + sizeof(long) * entry->nb_args; 176 + val = *(int *)ptr; 177 + 178 + /* The value is a dynamic string (len << 16 | offset) */ 179 + ptr = (void *)ent + (val & 0xffff); 180 + len = val >> 16; 181 + 182 + trace_seq_printf(s, " \"%.*s\"", len, ptr); 176 183 } 177 184 178 185 trace_seq_putc(s, ')'); ··· 242 223 243 224 pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); 244 225 for (i = 0; i < entry->nb_args; i++) { 245 - pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s", 246 - entry->args[i], sizeof(unsigned long), 247 - i == entry->nb_args - 1 ? 
"" : ", "); 226 + if (i) 227 + pos += snprintf(buf + pos, LEN_OR_ZERO, ", "); 228 + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx", 229 + entry->args[i], sizeof(unsigned long)); 230 + 231 + if (!(BIT(i) & entry->user_mask)) 232 + continue; 233 + 234 + /* Add the format for the user space string */ 235 + pos += snprintf(buf + pos, LEN_OR_ZERO, " \\\"%%s\\\""); 248 236 } 249 237 pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); 250 238 251 239 for (i = 0; i < entry->nb_args; i++) { 252 240 pos += snprintf(buf + pos, LEN_OR_ZERO, 253 241 ", ((unsigned long)(REC->%s))", entry->args[i]); 242 + if (!(BIT(i) & entry->user_mask)) 243 + continue; 244 + /* The user space string for arg has name __<arg>_val */ 245 + pos += snprintf(buf + pos, LEN_OR_ZERO, ", __get_str(__%s_val)", 246 + entry->args[i]); 254 247 } 255 248 256 249 #undef LEN_OR_ZERO ··· 308 277 { 309 278 struct syscall_trace_enter trace; 310 279 struct syscall_metadata *meta = call->data; 280 + unsigned long mask; 281 + char *arg; 311 282 int offset = offsetof(typeof(trace), args); 283 + int idx; 312 284 int ret = 0; 285 + int len; 313 286 int i; 314 287 315 288 for (i = 0; i < meta->nb_args; i++) { ··· 326 291 offset += sizeof(unsigned long); 327 292 } 328 293 294 + if (ret || !meta->user_mask) 295 + return ret; 296 + 297 + mask = meta->user_mask; 298 + idx = ffs(mask) - 1; 299 + 300 + /* 301 + * User space strings are faulted into a temporary buffer and then 302 + * added as a dynamic string to the end of the event. 303 + * The user space string name for the arg pointer is "__<arg>_val". 
304 + */ 305 + len = strlen(meta->args[idx]) + sizeof("___val"); 306 + arg = kmalloc(len, GFP_KERNEL); 307 + if (WARN_ON_ONCE(!arg)) { 308 + meta->user_mask = 0; 309 + return -ENOMEM; 310 + } 311 + 312 + snprintf(arg, len, "__%s_val", meta->args[idx]); 313 + 314 + ret = trace_define_field(call, "__data_loc char[]", 315 + arg, offset, sizeof(int), 0, 316 + FILTER_OTHER); 317 + if (ret) 318 + kfree(arg); 329 319 return ret; 320 + } 321 + 322 + #define SYSCALL_FAULT_BUF_SZ 512 323 + 324 + /* Use the tracing per CPU buffer infrastructure to copy from user space */ 325 + struct syscall_user_buffer { 326 + struct trace_user_buf_info buf; 327 + struct rcu_head rcu; 328 + }; 329 + 330 + static struct syscall_user_buffer *syscall_buffer; 331 + 332 + static int syscall_fault_buffer_enable(void) 333 + { 334 + struct syscall_user_buffer *sbuf; 335 + int ret; 336 + 337 + lockdep_assert_held(&syscall_trace_lock); 338 + 339 + if (syscall_buffer) { 340 + trace_user_fault_get(&syscall_buffer->buf); 341 + return 0; 342 + } 343 + 344 + sbuf = kmalloc(sizeof(*sbuf), GFP_KERNEL); 345 + if (!sbuf) 346 + return -ENOMEM; 347 + 348 + ret = trace_user_fault_init(&sbuf->buf, SYSCALL_FAULT_BUF_SZ); 349 + if (ret < 0) { 350 + kfree(sbuf); 351 + return ret; 352 + } 353 + 354 + WRITE_ONCE(syscall_buffer, sbuf); 355 + 356 + return 0; 357 + } 358 + 359 + static void rcu_free_syscall_buffer(struct rcu_head *rcu) 360 + { 361 + struct syscall_user_buffer *sbuf = 362 + container_of(rcu, struct syscall_user_buffer, rcu); 363 + 364 + trace_user_fault_destroy(&sbuf->buf); 365 + kfree(sbuf); 366 + } 367 + 368 + 369 + static void syscall_fault_buffer_disable(void) 370 + { 371 + struct syscall_user_buffer *sbuf = syscall_buffer; 372 + 373 + lockdep_assert_held(&syscall_trace_lock); 374 + 375 + if (trace_user_fault_put(&sbuf->buf)) 376 + return; 377 + 378 + WRITE_ONCE(syscall_buffer, NULL); 379 + call_rcu_tasks_trace(&sbuf->rcu, rcu_free_syscall_buffer); 380 + } 381 + 382 + static int syscall_copy_user(char 
*buf, const char __user *ptr, 383 + size_t size, void *data) 384 + { 385 + unsigned long *ret_size = data; 386 + int ret; 387 + 388 + ret = strncpy_from_user(buf, ptr, size); 389 + if (ret < 0) 390 + return 1; 391 + *ret_size = ret; 392 + return 0; 393 + } 394 + 395 + static char *sys_fault_user(struct syscall_metadata *sys_data, 396 + struct syscall_user_buffer *sbuf, 397 + unsigned long *args, unsigned int *data_size) 398 + { 399 + unsigned long size = SYSCALL_FAULT_BUF_SZ - 1; 400 + unsigned long mask = sys_data->user_mask; 401 + int idx = ffs(mask) - 1; 402 + char *ptr; 403 + char *buf; 404 + 405 + /* Get the pointer to user space memory to read */ 406 + ptr = (char *)args[idx]; 407 + *data_size = 0; 408 + 409 + buf = trace_user_fault_read(&sbuf->buf, ptr, size, 410 + syscall_copy_user, &size); 411 + if (!buf) 412 + return NULL; 413 + 414 + /* Replace any non-printable characters with '.' */ 415 + for (int i = 0; i < size; i++) { 416 + if (!isprint(buf[i])) 417 + buf[i] = '.'; 418 + } 419 + 420 + /* 421 + * If the text was truncated due to our max limit, add "..." to 422 + * the string. 423 + */ 424 + if (size > SYSCALL_FAULT_BUF_SZ - sizeof(EXTRA)) { 425 + strscpy(buf + SYSCALL_FAULT_BUF_SZ - sizeof(EXTRA), 426 + EXTRA, sizeof(EXTRA)); 427 + size = SYSCALL_FAULT_BUF_SZ; 428 + } else { 429 + buf[size++] = '\0'; 430 + } 431 + 432 + *data_size = size; 433 + return buf; 330 434 } 331 435 332 436 static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) ··· 476 302 struct syscall_metadata *sys_data; 477 303 struct trace_event_buffer fbuffer; 478 304 unsigned long args[6]; 305 + char *user_ptr; 306 + int user_size = 0; 479 307 int syscall_nr; 480 - int size; 308 + int size = 0; 309 + bool mayfault; 481 310 482 311 /* 483 312 * Syscall probe called with preemption enabled, but the ring 484 313 * buffer and per-cpu data require preemption to be disabled. 
485 314 */ 486 315 might_fault(); 487 - guard(preempt_notrace)(); 488 316 489 317 syscall_nr = trace_get_syscall_nr(current, regs); 490 318 if (syscall_nr < 0 || syscall_nr >= NR_syscalls) ··· 503 327 if (!sys_data) 504 328 return; 505 329 506 - size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; 330 + /* Check if this syscall event faults in user space memory */ 331 + mayfault = sys_data->user_mask != 0; 332 + 333 + guard(preempt_notrace)(); 334 + 335 + syscall_get_arguments(current, regs, args); 336 + 337 + if (mayfault) { 338 + struct syscall_user_buffer *sbuf; 339 + 340 + /* If the syscall_buffer is NULL, tracing is being shutdown */ 341 + sbuf = READ_ONCE(syscall_buffer); 342 + if (!sbuf) 343 + return; 344 + 345 + user_ptr = sys_fault_user(sys_data, sbuf, args, &user_size); 346 + /* 347 + * user_size is the amount of data to append. 348 + * Need to add 4 for the meta field that points to 349 + * the user memory at the end of the event and also 350 + * stores its size. 351 + */ 352 + size = 4 + user_size; 353 + } 354 + 355 + size += sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; 507 356 508 357 entry = trace_event_buffer_reserve(&fbuffer, trace_file, size); 509 358 if (!entry) ··· 536 335 537 336 entry = ring_buffer_event_data(fbuffer.event); 538 337 entry->nr = syscall_nr; 539 - syscall_get_arguments(current, regs, args); 338 + 540 339 memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args); 340 + 341 + if (mayfault) { 342 + void *ptr; 343 + int val; 344 + 345 + /* 346 + * Set the pointer to point to the meta data of the event 347 + * that has information about the stored user space memory. 348 + */ 349 + ptr = (void *)entry->args + sizeof(unsigned long) * sys_data->nb_args; 350 + 351 + /* 352 + * The meta data will store the offset of the user data from 353 + * the beginning of the event. 
354 + */ 355 + val = (ptr - (void *)entry) + 4; 356 + 357 + /* Store the offset and the size into the meta data */ 358 + *(int *)ptr = val | (user_size << 16); 359 + 360 + /* Nothing to do if the user space was empty or faulted */ 361 + if (user_size) { 362 + /* Now store the user space data into the event */ 363 + ptr += 4; 364 + memcpy(ptr, user_ptr, user_size); 365 + } 366 + } 541 367 542 368 trace_event_buffer_commit(&fbuffer); 543 369 } ··· 614 386 static int reg_event_syscall_enter(struct trace_event_file *file, 615 387 struct trace_event_call *call) 616 388 { 389 + struct syscall_metadata *sys_data = call->data; 617 390 struct trace_array *tr = file->tr; 618 391 int ret = 0; 619 392 int num; 620 393 621 - num = ((struct syscall_metadata *)call->data)->syscall_nr; 394 + num = sys_data->syscall_nr; 622 395 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) 623 396 return -ENOSYS; 624 - mutex_lock(&syscall_trace_lock); 625 - if (!tr->sys_refcount_enter) 626 - ret = register_trace_sys_enter(ftrace_syscall_enter, tr); 627 - if (!ret) { 628 - WRITE_ONCE(tr->enter_syscall_files[num], file); 629 - tr->sys_refcount_enter++; 397 + guard(mutex)(&syscall_trace_lock); 398 + if (sys_data->user_mask) { 399 + ret = syscall_fault_buffer_enable(); 400 + if (ret < 0) 401 + return ret; 630 402 } 631 - mutex_unlock(&syscall_trace_lock); 632 - return ret; 403 + if (!tr->sys_refcount_enter) { 404 + ret = register_trace_sys_enter(ftrace_syscall_enter, tr); 405 + if (ret < 0) { 406 + if (sys_data->user_mask) 407 + syscall_fault_buffer_disable(); 408 + return ret; 409 + } 410 + } 411 + WRITE_ONCE(tr->enter_syscall_files[num], file); 412 + tr->sys_refcount_enter++; 413 + return 0; 633 414 } 634 415 635 416 static void unreg_event_syscall_enter(struct trace_event_file *file, 636 417 struct trace_event_call *call) 637 418 { 419 + struct syscall_metadata *sys_data = call->data; 638 420 struct trace_array *tr = file->tr; 639 421 int num; 640 422 641 - num = ((struct syscall_metadata 
*)call->data)->syscall_nr; 423 + num = sys_data->syscall_nr; 642 424 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) 643 425 return; 644 - mutex_lock(&syscall_trace_lock); 426 + guard(mutex)(&syscall_trace_lock); 645 427 tr->sys_refcount_enter--; 646 428 WRITE_ONCE(tr->enter_syscall_files[num], NULL); 647 429 if (!tr->sys_refcount_enter) 648 430 unregister_trace_sys_enter(ftrace_syscall_enter, tr); 649 - mutex_unlock(&syscall_trace_lock); 431 + if (sys_data->user_mask) 432 + syscall_fault_buffer_disable(); 650 433 } 651 434 652 435 static int reg_event_syscall_exit(struct trace_event_file *file, ··· 698 459 mutex_unlock(&syscall_trace_lock); 699 460 } 700 461 462 + /* 463 + * For system calls that reference user space memory that can 464 + * be recorded into the event, set the system call meta data's user_mask 465 + * to the "args" index that points to the user space memory to retrieve. 466 + */ 467 + static void check_faultable_syscall(struct trace_event_call *call, int nr) 468 + { 469 + struct syscall_metadata *sys_data = call->data; 470 + 471 + /* Only work on entry */ 472 + if (sys_data->enter_event != call) 473 + return; 474 + 475 + switch (nr) { 476 + /* user arg at position 0 */ 477 + #ifdef __NR_access 478 + case __NR_access: 479 + #endif 480 + case __NR_acct: 481 + case __NR_add_key: /* Just _type. TODO add _description */ 482 + case __NR_chdir: 483 + #ifdef __NR_chown 484 + case __NR_chown: 485 + #endif 486 + #ifdef __NR_chmod 487 + case __NR_chmod: 488 + #endif 489 + case __NR_chroot: 490 + #ifdef __NR_creat 491 + case __NR_creat: 492 + #endif 493 + case __NR_delete_module: 494 + case __NR_execve: 495 + case __NR_fsopen: 496 + case __NR_getxattr: /* Just pathname, TODO add name */ 497 + #ifdef __NR_lchown 498 + case __NR_lchown: 499 + #endif 500 + case __NR_lgetxattr: /* Just pathname, TODO add name */ 501 + case __NR_lremovexattr: /* Just pathname, TODO add name */ 502 + #ifdef __NR_link 503 + case __NR_link: /* Just oldname. 
TODO add newname */ 504 + #endif 505 + case __NR_listxattr: /* Just pathname, TODO add list */ 506 + case __NR_llistxattr: /* Just pathname, TODO add list */ 507 + case __NR_lsetxattr: /* Just pathname, TODO add list */ 508 + #ifdef __NR_open 509 + case __NR_open: 510 + #endif 511 + case __NR_memfd_create: 512 + case __NR_mount: /* Just dev_name, TODO add dir_name and type */ 513 + #ifdef __NR_mkdir 514 + case __NR_mkdir: 515 + #endif 516 + #ifdef __NR_mknod 517 + case __NR_mknod: 518 + #endif 519 + case __NR_mq_open: 520 + case __NR_mq_unlink: 521 + case __NR_pivot_root: /* Just new_root, TODO add old_root */ 522 + #ifdef __NR_readlink 523 + case __NR_readlink: 524 + #endif 525 + case __NR_removexattr: /* Just pathname, TODO add name */ 526 + #ifdef __NR_rename 527 + case __NR_rename: /* Just oldname. TODO add newname */ 528 + #endif 529 + case __NR_request_key: /* Just _type. TODO add _description */ 530 + #ifdef __NR_rmdir 531 + case __NR_rmdir: 532 + #endif 533 + case __NR_setxattr: /* Just pathname, TODO add list */ 534 + case __NR_shmdt: 535 + #ifdef __NR_statfs 536 + case __NR_statfs: 537 + #endif 538 + case __NR_swapon: 539 + case __NR_swapoff: 540 + #ifdef __NR_symlink 541 + case __NR_symlink: /* Just oldname. 
TODO add newname */ 542 + #endif 543 + #ifdef __NR_truncate 544 + case __NR_truncate: 545 + #endif 546 + #ifdef __NR_unlink 547 + case __NR_unlink: 548 + #endif 549 + case __NR_umount2: 550 + #ifdef __NR_utime 551 + case __NR_utime: 552 + #endif 553 + #ifdef __NR_utimes 554 + case __NR_utimes: 555 + #endif 556 + sys_data->user_mask = BIT(0); 557 + break; 558 + /* user arg at position 1 */ 559 + case __NR_execveat: 560 + case __NR_faccessat: 561 + case __NR_faccessat2: 562 + case __NR_finit_module: 563 + case __NR_fchmodat: 564 + case __NR_fchmodat2: 565 + case __NR_fchownat: 566 + case __NR_fgetxattr: 567 + case __NR_flistxattr: 568 + case __NR_fsetxattr: 569 + case __NR_fspick: 570 + case __NR_fremovexattr: 571 + #ifdef __NR_futimesat 572 + case __NR_futimesat: 573 + #endif 574 + case __NR_getxattrat: /* Just pathname, TODO add name */ 575 + case __NR_inotify_add_watch: 576 + case __NR_linkat: /* Just oldname. TODO add newname */ 577 + case __NR_listxattrat: /* Just pathname, TODO add list */ 578 + case __NR_mkdirat: 579 + case __NR_mknodat: 580 + case __NR_mount_setattr: 581 + case __NR_move_mount: /* Just from_pathname, TODO add to_pathname */ 582 + case __NR_name_to_handle_at: 583 + #ifdef __NR_newfstatat 584 + case __NR_newfstatat: 585 + #endif 586 + case __NR_openat: 587 + case __NR_openat2: 588 + case __NR_open_tree: 589 + case __NR_open_tree_attr: 590 + case __NR_readlinkat: 591 + #ifdef __NR_renameat 592 + case __NR_renameat: /* Just oldname. TODO add newname */ 593 + #endif 594 + case __NR_renameat2: /* Just oldname. TODO add newname */ 595 + case __NR_removexattrat: /* Just pathname, TODO add name */ 596 + case __NR_quotactl: 597 + case __NR_setxattrat: /* Just pathname, TODO add list */ 598 + case __NR_syslog: 599 + case __NR_symlinkat: /* Just oldname. 
TODO add newname */ 600 + case __NR_statx: 601 + case __NR_unlinkat: 602 + case __NR_utimensat: 603 + sys_data->user_mask = BIT(1); 604 + break; 605 + /* user arg at position 2 */ 606 + case __NR_init_module: 607 + case __NR_fsconfig: 608 + sys_data->user_mask = BIT(2); 609 + break; 610 + /* user arg at position 4 */ 611 + case __NR_fanotify_mark: 612 + sys_data->user_mask = BIT(4); 613 + break; 614 + default: 615 + sys_data->user_mask = 0; 616 + } 617 + } 618 + 701 619 static int __init init_syscall_trace(struct trace_event_call *call) 702 620 { 703 621 int id; ··· 866 470 ((struct syscall_metadata *)call->data)->name); 867 471 return -ENOSYS; 868 472 } 473 + 474 + check_faultable_syscall(call, num); 869 475 870 476 if (set_syscall_print_fmt(call) < 0) 871 477 return -ENOMEM;