Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

tracing: Allow syscall trace events to read more than one user parameter

Allow more than one field of a syscall trace event to read user space.
Build on top of the user_mask by allowing more than one bit to be set that
corresponds to the @args array of the syscall metadata. For each argument
in the @args array that is to be read, it will have a dynamic array/string
field associated to it.

Note that reading multiple fields from user space is not supported if
the user_arg_size field is set in the syscall metadata. That field can only
be used when a single field is being read from user space, as it is a
number identifying the argument of the syscall event that holds the
size of the data to read from user space. It becomes ambiguous if the
system call reads more than one field. Currently this is not an issue.

If a syscall event happens to enable more than one field to be read from
user space and also sets the user_arg_size field, it will trigger a warning
at boot and the user_arg_size field will be cleared.

The per CPU buffer that is used to read the user space addresses is now
broken up into 3 sections, each of 168 bytes. The reason for 168 is that
it is the biggest portion of 512 bytes divided by 3 that is 8 byte aligned.

The max amount copied into the ring buffer from user space is now only 128
bytes, which is plenty. When reading user space, it still reads up to 167
(168-1) bytes and uses the remaining bytes to know whether it should append
the extra "..." to the end or not.

This will allow the event to look like this:

sys_renameat2(olddfd: 0xffffff9c, oldname: 0x7ffe02facdff "/tmp/x", newdfd: 0xffffff9c, newname: 0x7ffe02face06 "/tmp/y", flags: 1)

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Takaya Saeki <takayas@google.com>
Cc: Tom Zanussi <zanussi@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ian Rogers <irogers@google.com>
Cc: Douglas Raillard <douglas.raillard@arm.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Link: https://lore.kernel.org/20251028231148.095789277@kernel.org
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>

+230 -109
+230 -109
kernel/trace/trace_syscalls.c
··· 138 138 struct syscall_metadata *entry; 139 139 int i, syscall, val, len; 140 140 unsigned char *ptr; 141 + int offset = 0; 141 142 142 143 trace = (typeof(trace))ent; 143 144 syscall = trace->nr; ··· 178 177 continue; 179 178 180 179 /* This arg points to a user space string */ 181 - ptr = (void *)trace->args + sizeof(long) * entry->nb_args; 180 + ptr = (void *)trace->args + sizeof(long) * entry->nb_args + offset; 182 181 val = *(int *)ptr; 183 182 184 183 /* The value is a dynamic string (len << 16 | offset) */ 185 184 ptr = (void *)ent + (val & 0xffff); 186 185 len = val >> 16; 186 + offset += 4; 187 187 188 188 if (entry->user_arg_size < 0 || entry->user_arg_is_str) { 189 189 trace_seq_printf(s, " \"%.*s\"", len, ptr); ··· 337 335 unsigned long mask; 338 336 char *arg; 339 337 int offset = offsetof(typeof(trace), args); 340 - int idx; 341 338 int ret = 0; 342 339 int len; 343 340 int i; ··· 355 354 return ret; 356 355 357 356 mask = meta->user_mask; 358 - idx = ffs(mask) - 1; 359 357 360 - /* 361 - * User space data is faulted into a temporary buffer and then 362 - * added as a dynamic string or array to the end of the event. 363 - * The user space data name for the arg pointer is "__<arg>_val". 364 - */ 365 - len = strlen(meta->args[idx]) + sizeof("___val"); 366 - arg = kmalloc(len, GFP_KERNEL); 367 - if (WARN_ON_ONCE(!arg)) { 368 - meta->user_mask = 0; 369 - return -ENOMEM; 358 + while (mask) { 359 + int idx = ffs(mask) - 1; 360 + mask &= ~BIT(idx); 361 + 362 + /* 363 + * User space data is faulted into a temporary buffer and then 364 + * added as a dynamic string or array to the end of the event. 365 + * The user space data name for the arg pointer is 366 + * "__<arg>_val". 
367 + */ 368 + len = strlen(meta->args[idx]) + sizeof("___val"); 369 + arg = kmalloc(len, GFP_KERNEL); 370 + if (WARN_ON_ONCE(!arg)) { 371 + meta->user_mask = 0; 372 + return -ENOMEM; 373 + } 374 + 375 + snprintf(arg, len, "__%s_val", meta->args[idx]); 376 + 377 + ret = trace_define_field(call, "__data_loc char[]", 378 + arg, offset, sizeof(int), 0, 379 + FILTER_OTHER); 380 + if (ret) { 381 + kfree(arg); 382 + break; 383 + } 384 + offset += 4; 370 385 } 371 - 372 - snprintf(arg, len, "__%s_val", meta->args[idx]); 373 - 374 - ret = trace_define_field(call, "__data_loc char[]", 375 - arg, offset, sizeof(int), 0, 376 - FILTER_OTHER); 377 - if (ret) 378 - kfree(arg); 379 386 return ret; 380 387 } 381 388 389 + /* 390 + * Create a per CPU temporary buffer to copy user space pointers into. 391 + * 392 + * SYSCALL_FAULT_BUF_SZ holds the size of the per CPU buffer to use 393 + * to copy memory from user space addresses into. 394 + * 395 + * SYSCALL_FAULT_ARG_SZ is the amount to copy from user space. 396 + * 397 + * SYSCALL_FAULT_USER_MAX is the amount to copy into the ring buffer. 398 + * It's slightly smaller than SYSCALL_FAULT_ARG_SZ to know if it 399 + * needs to append the EXTRA or not. 400 + * 401 + * This only allows up to 3 args from system calls. 
402 + */ 382 403 #define SYSCALL_FAULT_BUF_SZ 512 404 + #define SYSCALL_FAULT_ARG_SZ 168 405 + #define SYSCALL_FAULT_USER_MAX 128 406 + #define SYSCALL_FAULT_MAX_CNT 3 383 407 384 408 /* Use the tracing per CPU buffer infrastructure to copy from user space */ 385 409 struct syscall_user_buffer { ··· 464 438 call_rcu_tasks_trace(&sbuf->rcu, rcu_free_syscall_buffer); 465 439 } 466 440 441 + struct syscall_args { 442 + char *ptr_array[SYSCALL_FAULT_MAX_CNT]; 443 + int read[SYSCALL_FAULT_MAX_CNT]; 444 + int uargs; 445 + }; 446 + 467 447 static int syscall_copy_user(char *buf, const char __user *ptr, 468 448 size_t size, void *data) 469 449 { 470 - unsigned long *ret_size = data; 450 + struct syscall_args *args = data; 471 451 int ret; 472 452 473 - ret = strncpy_from_user(buf, ptr, size); 474 - if (ret < 0) 475 - return 1; 476 - *ret_size = ret; 453 + for (int i = 0; i < args->uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) { 454 + ptr = (char __user *)args->ptr_array[i]; 455 + ret = strncpy_from_user(buf, ptr, size); 456 + args->read[i] = ret; 457 + } 458 + return 0; 459 + } 460 + 461 + static int syscall_copy_user_array(char *buf, const char __user *ptr, 462 + size_t size, void *data) 463 + { 464 + struct syscall_args *args = data; 465 + int ret; 466 + 467 + for (int i = 0; i < args->uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) { 468 + ptr = (char __user *)args->ptr_array[i]; 469 + ret = __copy_from_user(buf, ptr, size); 470 + args->read[i] = ret ? 
-1 : size; 471 + } 477 472 return 0; 478 473 } 479 474 480 475 static char *sys_fault_user(struct syscall_metadata *sys_data, 481 476 struct syscall_user_buffer *sbuf, 482 - unsigned long *args, unsigned int *data_size) 477 + unsigned long *args, 478 + unsigned int data_size[SYSCALL_FAULT_MAX_CNT]) 483 479 { 484 480 trace_user_buf_copy syscall_copy = syscall_copy_user; 485 - unsigned long size = SYSCALL_FAULT_BUF_SZ - 1; 486 481 unsigned long mask = sys_data->user_mask; 487 - int idx = ffs(mask) - 1; 482 + unsigned long size = SYSCALL_FAULT_ARG_SZ - 1; 483 + struct syscall_args sargs; 488 484 bool array = false; 489 - char *ptr; 485 + char *buffer; 490 486 char *buf; 487 + int ret; 488 + int i = 0; 491 489 492 - /* Get the pointer to user space memory to read */ 493 - ptr = (char *)args[idx]; 494 - *data_size = 0; 490 + /* The extra is appended to the user data in the buffer */ 491 + BUILD_BUG_ON(SYSCALL_FAULT_USER_MAX + sizeof(EXTRA) >= 492 + SYSCALL_FAULT_ARG_SZ); 495 493 496 494 /* 497 495 * If this system call event has a size argument, use ··· 525 475 if (sys_data->user_arg_size >= 0) { 526 476 array = true; 527 477 size = args[sys_data->user_arg_size]; 528 - if (size > SYSCALL_FAULT_BUF_SZ - 1) 529 - size = SYSCALL_FAULT_BUF_SZ - 1; 530 - /* use normal copy_from_user() */ 531 - syscall_copy = NULL; 478 + if (size > SYSCALL_FAULT_ARG_SZ - 1) 479 + size = SYSCALL_FAULT_ARG_SZ - 1; 480 + syscall_copy = syscall_copy_user_array; 532 481 } 533 482 534 - buf = trace_user_fault_read(&sbuf->buf, ptr, size, 535 - syscall_copy, &size); 536 - if (!buf) 483 + while (mask) { 484 + int idx = ffs(mask) - 1; 485 + mask &= ~BIT(idx); 486 + 487 + if (WARN_ON_ONCE(i == SYSCALL_FAULT_MAX_CNT)) 488 + break; 489 + 490 + /* Get the pointer to user space memory to read */ 491 + sargs.ptr_array[i++] = (char *)args[idx]; 492 + } 493 + 494 + sargs.uargs = i; 495 + 496 + /* Clear the values that are not used */ 497 + for (; i < SYSCALL_FAULT_MAX_CNT; i++) { 498 + data_size[i] = -1; /* 
Denotes no pointer */ 499 + } 500 + 501 + buffer = trace_user_fault_read(&sbuf->buf, NULL, size, 502 + syscall_copy, &sargs); 503 + if (!buffer) 537 504 return NULL; 538 505 539 - /* For strings, replace any non-printable characters with '.' */ 540 - if (!array) { 541 - for (int i = 0; i < size; i++) { 542 - if (!isprint(buf[i])) 543 - buf[i] = '.'; 544 - } 506 + buf = buffer; 507 + for (i = 0; i < sargs.uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) { 545 508 546 - /* 547 - * If the text was truncated due to our max limit, add "..." to 548 - * the string. 549 - */ 550 - if (size > SYSCALL_FAULT_BUF_SZ - sizeof(EXTRA)) { 551 - strscpy(buf + SYSCALL_FAULT_BUF_SZ - sizeof(EXTRA), 552 - EXTRA, sizeof(EXTRA)); 553 - size = SYSCALL_FAULT_BUF_SZ; 509 + ret = sargs.read[i]; 510 + if (ret < 0) 511 + continue; 512 + buf[ret] = '\0'; 513 + 514 + /* For strings, replace any non-printable characters with '.' */ 515 + if (!array) { 516 + for (int x = 0; x < ret; x++) { 517 + if (!isprint(buf[x])) 518 + buf[x] = '.'; 519 + } 520 + 521 + /* 522 + * If the text was truncated due to our max limit, 523 + * add "..." to the string. 
524 + */ 525 + if (ret > SYSCALL_FAULT_USER_MAX) { 526 + strscpy(buf + SYSCALL_FAULT_USER_MAX, EXTRA, 527 + sizeof(EXTRA)); 528 + ret = SYSCALL_FAULT_USER_MAX + sizeof(EXTRA); 529 + } else { 530 + buf[ret++] = '\0'; 531 + } 554 532 } else { 555 - buf[size++] = '\0'; 533 + ret = min(ret, SYSCALL_FAULT_USER_MAX); 556 534 } 535 + data_size[i] = ret; 557 536 } 558 537 559 - *data_size = size; 560 - return buf; 538 + return buffer; 561 539 } 562 540 563 541 static int 564 542 syscall_get_data(struct syscall_metadata *sys_data, unsigned long *args, 565 - char **buffer, int *size, int *user_size) 543 + char **buffer, int *size, int *user_sizes, int *uargs) 566 544 { 567 545 struct syscall_user_buffer *sbuf; 546 + int i; 568 547 569 548 /* If the syscall_buffer is NULL, tracing is being shutdown */ 570 549 sbuf = READ_ONCE(syscall_buffer); 571 550 if (!sbuf) 572 551 return -1; 573 552 574 - *buffer = sys_fault_user(sys_data, sbuf, args, user_size); 553 + *buffer = sys_fault_user(sys_data, sbuf, args, user_sizes); 575 554 /* 576 555 * user_size is the amount of data to append. 577 556 * Need to add 4 for the meta field that points to 578 557 * the user memory at the end of the event and also 579 558 * stores its size. 580 559 */ 581 - *size = 4 + *user_size; 560 + for (i = 0; i < SYSCALL_FAULT_MAX_CNT; i++) { 561 + if (user_sizes[i] < 0) 562 + break; 563 + *size += user_sizes[i] + 4; 564 + } 565 + /* Save the number of user read arguments of this syscall */ 566 + *uargs = i; 582 567 return 0; 583 568 } 584 569 585 570 static void syscall_put_data(struct syscall_metadata *sys_data, 586 571 struct syscall_trace_enter *entry, 587 - char *buffer, int size, int user_size) 572 + char *buffer, int size, int *user_sizes, int uargs) 588 573 { 574 + char *buf = buffer; 589 575 void *ptr; 590 576 int val; 591 577 ··· 633 547 634 548 /* 635 549 * The meta data will store the offset of the user data from 636 - * the beginning of the event. 550 + * the beginning of the event. 
That is after the static arguments 551 + * and the meta data fields. 637 552 */ 638 - val = (ptr - (void *)entry) + 4; 553 + val = (ptr - (void *)entry) + 4 * uargs; 639 554 640 - /* Store the offset and the size into the meta data */ 641 - *(int *)ptr = val | (user_size << 16); 555 + for (int i = 0; i < uargs; i++) { 642 556 643 - if (WARN_ON_ONCE((ptr - (void *)entry + user_size) > size)) 644 - user_size = 0; 557 + if (i) 558 + val += user_sizes[i - 1]; 645 559 646 - /* Nothing to do if the user space was empty or faulted */ 647 - if (user_size) { 648 - /* Now store the user space data into the event */ 560 + /* Store the offset and the size into the meta data */ 561 + *(int *)ptr = val | (user_sizes[i] << 16); 562 + 563 + /* Skip the meta data */ 649 564 ptr += 4; 650 - memcpy(ptr, buffer, user_size); 565 + } 566 + 567 + for (int i = 0; i < uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) { 568 + /* Nothing to do if the user space was empty or faulted */ 569 + if (!user_sizes[i]) 570 + continue; 571 + 572 + memcpy(ptr, buf, user_sizes[i]); 573 + ptr += user_sizes[i]; 651 574 } 652 575 } 653 576 ··· 669 574 struct trace_event_buffer fbuffer; 670 575 unsigned long args[6]; 671 576 char *user_ptr; 672 - int user_size = 0; 577 + int user_sizes[SYSCALL_FAULT_MAX_CNT] = {}; 673 578 int syscall_nr; 674 579 int size = 0; 580 + int uargs = 0; 675 581 bool mayfault; 676 582 677 583 /* ··· 705 609 706 610 if (mayfault) { 707 611 if (syscall_get_data(sys_data, args, &user_ptr, 708 - &size, &user_size) < 0) 612 + &size, user_sizes, &uargs) < 0) 709 613 return; 710 614 } 711 615 ··· 721 625 memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args); 722 626 723 627 if (mayfault) 724 - syscall_put_data(sys_data, entry, user_ptr, size, user_size); 628 + syscall_put_data(sys_data, entry, user_ptr, size, user_sizes, uargs); 725 629 726 630 trace_event_buffer_commit(&fbuffer); 727 631 } ··· 863 767 static void check_faultable_syscall(struct trace_event_call *call, int nr) 864 
768 { 865 769 struct syscall_metadata *sys_data = call->data; 770 + unsigned long mask; 866 771 867 772 /* Only work on entry */ 868 773 if (sys_data->enter_event != call) ··· 899 802 case __NR_access: 900 803 #endif 901 804 case __NR_acct: 902 - case __NR_add_key: /* Just _type. TODO add _description */ 903 805 case __NR_chdir: 904 806 #ifdef __NR_chown 905 807 case __NR_chown: ··· 913 817 case __NR_delete_module: 914 818 case __NR_execve: 915 819 case __NR_fsopen: 916 - case __NR_getxattr: /* Just pathname, TODO add name */ 917 820 #ifdef __NR_lchown 918 821 case __NR_lchown: 919 822 #endif 920 - case __NR_lgetxattr: /* Just pathname, TODO add name */ 921 - case __NR_lremovexattr: /* Just pathname, TODO add name */ 922 - #ifdef __NR_link 923 - case __NR_link: /* Just oldname. TODO add newname */ 924 - #endif 925 - case __NR_listxattr: /* Just pathname, TODO add list */ 926 - case __NR_llistxattr: /* Just pathname, TODO add list */ 927 - case __NR_lsetxattr: /* Just pathname, TODO add list */ 928 823 #ifdef __NR_open 929 824 case __NR_open: 930 825 #endif 931 826 case __NR_memfd_create: 932 - case __NR_mount: /* Just dev_name, TODO add dir_name and type */ 933 827 #ifdef __NR_mkdir 934 828 case __NR_mkdir: 935 829 #endif ··· 928 842 #endif 929 843 case __NR_mq_open: 930 844 case __NR_mq_unlink: 931 - case __NR_pivot_root: /* Just new_root, TODO add old_root */ 932 845 #ifdef __NR_readlink 933 846 case __NR_readlink: 934 847 #endif 935 - case __NR_removexattr: /* Just pathname, TODO add name */ 936 - #ifdef __NR_rename 937 - case __NR_rename: /* Just oldname. TODO add newname */ 938 - #endif 939 - case __NR_request_key: /* Just _type. 
TODO add _description */ 940 848 #ifdef __NR_rmdir 941 849 case __NR_rmdir: 942 850 #endif 943 - case __NR_setxattr: /* Just pathname, TODO add list */ 944 851 case __NR_shmdt: 945 852 #ifdef __NR_statfs 946 853 case __NR_statfs: 947 854 #endif 948 855 case __NR_swapon: 949 856 case __NR_swapoff: 950 - #ifdef __NR_symlink 951 - case __NR_symlink: /* Just oldname. TODO add newname */ 952 - #endif 953 857 #ifdef __NR_truncate 954 858 case __NR_truncate: 955 859 #endif ··· 971 895 #ifdef __NR_futimesat 972 896 case __NR_futimesat: 973 897 #endif 974 - case __NR_getxattrat: /* Just pathname, TODO add name */ 975 898 case __NR_inotify_add_watch: 976 - case __NR_linkat: /* Just oldname. TODO add newname */ 977 - case __NR_listxattrat: /* Just pathname, TODO add list */ 978 899 case __NR_mkdirat: 979 900 case __NR_mknodat: 980 901 case __NR_mount_setattr: 981 - case __NR_move_mount: /* Just from_pathname, TODO add to_pathname */ 982 902 case __NR_name_to_handle_at: 983 903 #ifdef __NR_newfstatat 984 904 case __NR_newfstatat: ··· 984 912 case __NR_open_tree: 985 913 case __NR_open_tree_attr: 986 914 case __NR_readlinkat: 987 - #ifdef __NR_renameat 988 - case __NR_renameat: /* Just oldname. TODO add newname */ 989 - #endif 990 - case __NR_renameat2: /* Just oldname. TODO add newname */ 991 - case __NR_removexattrat: /* Just pathname, TODO add name */ 992 915 case __NR_quotactl: 993 - case __NR_setxattrat: /* Just pathname, TODO add list */ 994 916 case __NR_syslog: 995 - case __NR_symlinkat: /* Just oldname. 
TODO add newname */ 996 917 case __NR_statx: 997 918 case __NR_unlinkat: 998 919 case __NR_utimensat: ··· 1000 935 case __NR_fanotify_mark: 1001 936 sys_data->user_mask = BIT(4); 1002 937 break; 938 + /* 2 user args, 0 and 1 */ 939 + case __NR_add_key: 940 + case __NR_getxattr: 941 + case __NR_lgetxattr: 942 + case __NR_lremovexattr: 943 + #ifdef __NR_link 944 + case __NR_link: 945 + #endif 946 + case __NR_listxattr: 947 + case __NR_llistxattr: 948 + case __NR_lsetxattr: 949 + case __NR_pivot_root: 950 + case __NR_removexattr: 951 + #ifdef __NR_rename 952 + case __NR_rename: 953 + #endif 954 + case __NR_request_key: 955 + case __NR_setxattr: 956 + #ifdef __NR_symlink 957 + case __NR_symlink: 958 + #endif 959 + sys_data->user_mask = BIT(0) | BIT(1); 960 + break; 961 + /* 2 user args, 0 and 2 */ 962 + case __NR_symlinkat: 963 + sys_data->user_mask = BIT(0) | BIT(2); 964 + break; 965 + /* 2 user args, 1 and 3 */ 966 + case __NR_getxattrat: 967 + case __NR_linkat: 968 + case __NR_listxattrat: 969 + case __NR_move_mount: 970 + #ifdef __NR_renameat 971 + case __NR_renameat: 972 + #endif 973 + case __NR_renameat2: 974 + case __NR_removexattrat: 975 + case __NR_setxattrat: 976 + sys_data->user_mask = BIT(1) | BIT(3); 977 + break; 978 + case __NR_mount: /* Just dev_name and dir_name, TODO add type */ 979 + sys_data->user_mask = BIT(0) | BIT(1) | BIT(2); 980 + break; 1003 981 default: 1004 982 sys_data->user_mask = 0; 983 + return; 1005 984 } 985 + 986 + if (sys_data->user_arg_size < 0) 987 + return; 988 + 989 + /* 990 + * The user_arg_size can only be used when the system call 991 + * is reading only a single address from user space. 
992 + */ 993 + mask = sys_data->user_mask; 994 + if (WARN_ON(mask & (mask - 1))) 995 + sys_data->user_arg_size = -1; 1006 996 } 1007 997 1008 998 static int __init init_syscall_trace(struct trace_event_call *call) ··· 1203 1083 bool valid_prog_array; 1204 1084 bool mayfault; 1205 1085 char *user_ptr; 1086 + int user_sizes[SYSCALL_FAULT_MAX_CNT] = {}; 1206 1087 int syscall_nr; 1207 - int user_size; 1208 1088 int rctx; 1209 1089 int size = 0; 1090 + int uargs = 0; 1210 1091 1211 1092 /* 1212 1093 * Syscall probe called with preemption enabled, but the ring ··· 1233 1112 1234 1113 if (mayfault) { 1235 1114 if (syscall_get_data(sys_data, args, &user_ptr, 1236 - &size, &user_size) < 0) 1115 + &size, user_sizes, &uargs) < 0) 1237 1116 return; 1238 1117 } 1239 1118 ··· 1255 1134 memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args); 1256 1135 1257 1136 if (mayfault) 1258 - syscall_put_data(sys_data, rec, user_ptr, size, user_size); 1137 + syscall_put_data(sys_data, rec, user_ptr, size, user_sizes, uargs); 1259 1138 1260 1139 if ((valid_prog_array && 1261 1140 !perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) ||