Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

tracing: Make trace_user_fault_read() exposed to rest of tracing

A write to the trace_marker file runs in a critical section where it can
neither take locks nor allocate memory. To read from user space, a per
CPU buffer is allocated when the trace_marker file is opened, and then when
the write system call is performed, it uses the following method to read
from user space:

	preempt_disable();
	buffer = per_cpu_ptr(cpu_buffers, cpu);
	do {
		cnt = nr_context_switches_cpu();
		migrate_disable();
		preempt_enable();
		ret = copy_from_user(buffer, ptr, len);
		preempt_disable();
		migrate_enable();
	} while (!ret && cnt != nr_context_switches_cpu());
	if (!ret)
		ring_buffer_write(buffer);
	preempt_enable();

It records the number of context switches for the current CPU, enables
preemption, copies from user space, disables preemption and then checks if
the number of context switches changed. If it did not, then the buffer is
valid, otherwise the buffer may have been corrupted and the read from user
space must be tried again.

The system call trace events are now faultable and have the same
restrictions as the trace_marker write. For system calls to read the user
space buffer (for example to read the file name of the openat system call),
they need the same logic. Instead of copying the code over to the system call
trace events, make the code generic to allow the system call trace events to
use the same code. The following API is added internally to the tracing
subsystem (these are only exposed within the tracing subsystem and not to be
used outside of it):

trace_user_fault_init() - initializes a trace_user_buf_info descriptor
and allocates the per CPU buffers to copy from user space into.

trace_user_fault_destroy() - used to free the allocations made by
trace_user_fault_init().

trace_user_fault_get() - increment the ref count of the info descriptor to
allow more than one user to use the same descriptor.

trace_user_fault_put() - decrement the ref count.

trace_user_fault_read() - performs the above action to read user space
into the per CPU buffer. The preempt_disable() is expected before
calling this function and preemption must remain disabled while the
buffer returned is in use.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Takaya Saeki <takayas@google.com>
Cc: Tom Zanussi <zanussi@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ian Rogers <irogers@google.com>
Cc: Douglas Raillard <douglas.raillard@arm.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Link: https://lore.kernel.org/20251028231147.096570057@kernel.org
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>

+208 -65
+191 -65
kernel/trace/trace.c
··· 7223 7223 char *buf; 7224 7224 }; 7225 7225 7226 - struct trace_user_buf_info { 7227 - struct trace_user_buf __percpu *tbuf; 7228 - int ref; 7229 - }; 7230 - 7231 - 7232 7226 static DEFINE_MUTEX(trace_user_buffer_mutex); 7233 7227 static struct trace_user_buf_info *trace_user_buffer; 7234 7228 7235 - static void trace_user_fault_buffer_free(struct trace_user_buf_info *tinfo) 7229 + /** 7230 + * trace_user_fault_destroy - free up allocated memory of a trace user buffer 7231 + * @tinfo: The descriptor to free up 7232 + * 7233 + * Frees any data allocated in the trace info dsecriptor. 7234 + */ 7235 + void trace_user_fault_destroy(struct trace_user_buf_info *tinfo) 7236 7236 { 7237 7237 char *buf; 7238 7238 int cpu; 7239 + 7240 + if (!tinfo || !tinfo->tbuf) 7241 + return; 7239 7242 7240 7243 for_each_possible_cpu(cpu) { 7241 7244 buf = per_cpu_ptr(tinfo->tbuf, cpu)->buf; 7242 7245 kfree(buf); 7243 7246 } 7244 7247 free_percpu(tinfo->tbuf); 7245 - kfree(tinfo); 7246 7248 } 7247 7249 7248 - static int trace_user_fault_buffer_enable(void) 7250 + static int user_fault_buffer_enable(struct trace_user_buf_info *tinfo, size_t size) 7249 7251 { 7250 - struct trace_user_buf_info *tinfo; 7251 7252 char *buf; 7252 7253 int cpu; 7253 7254 7254 - guard(mutex)(&trace_user_buffer_mutex); 7255 - 7256 - if (trace_user_buffer) { 7257 - trace_user_buffer->ref++; 7258 - return 0; 7259 - } 7260 - 7261 - tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); 7262 - if (!tinfo) 7263 - return -ENOMEM; 7255 + lockdep_assert_held(&trace_user_buffer_mutex); 7264 7256 7265 7257 tinfo->tbuf = alloc_percpu(struct trace_user_buf); 7266 - if (!tinfo->tbuf) { 7267 - kfree(tinfo); 7258 + if (!tinfo->tbuf) 7268 7259 return -ENOMEM; 7269 - } 7270 7260 7271 7261 tinfo->ref = 1; 7262 + tinfo->size = size; 7272 7263 7273 7264 /* Clear each buffer in case of error */ 7274 7265 for_each_possible_cpu(cpu) { ··· 7267 7276 } 7268 7277 7269 7278 for_each_possible_cpu(cpu) { 7270 - buf = 
kmalloc_node(TRACE_MARKER_MAX_SIZE, GFP_KERNEL, 7279 + buf = kmalloc_node(size, GFP_KERNEL, 7271 7280 cpu_to_node(cpu)); 7272 - if (!buf) { 7273 - trace_user_fault_buffer_free(tinfo); 7281 + if (!buf) 7274 7282 return -ENOMEM; 7275 - } 7276 7283 per_cpu_ptr(tinfo->tbuf, cpu)->buf = buf; 7277 7284 } 7278 - 7279 - trace_user_buffer = tinfo; 7280 7285 7281 7286 return 0; 7282 7287 } 7283 7288 7284 - static void trace_user_fault_buffer_disable(void) 7289 + /* For internal use. Free and reinitialize */ 7290 + static void user_buffer_free(struct trace_user_buf_info **tinfo) 7285 7291 { 7286 - struct trace_user_buf_info *tinfo; 7292 + lockdep_assert_held(&trace_user_buffer_mutex); 7293 + 7294 + trace_user_fault_destroy(*tinfo); 7295 + kfree(*tinfo); 7296 + *tinfo = NULL; 7297 + } 7298 + 7299 + /* For internal use. Initialize and allocate */ 7300 + static int user_buffer_init(struct trace_user_buf_info **tinfo, size_t size) 7301 + { 7302 + bool alloc = false; 7303 + int ret; 7304 + 7305 + lockdep_assert_held(&trace_user_buffer_mutex); 7306 + 7307 + if (!*tinfo) { 7308 + alloc = true; 7309 + *tinfo = kzalloc(sizeof(**tinfo), GFP_KERNEL); 7310 + if (!*tinfo) 7311 + return -ENOMEM; 7312 + } 7313 + 7314 + ret = user_fault_buffer_enable(*tinfo, size); 7315 + if (ret < 0 && alloc) 7316 + user_buffer_free(tinfo); 7317 + 7318 + return ret; 7319 + } 7320 + 7321 + /* For internal use, derefrence and free if necessary */ 7322 + static void user_buffer_put(struct trace_user_buf_info **tinfo) 7323 + { 7324 + guard(mutex)(&trace_user_buffer_mutex); 7325 + 7326 + if (WARN_ON_ONCE(!*tinfo || !(*tinfo)->ref)) 7327 + return; 7328 + 7329 + if (--(*tinfo)->ref) 7330 + return; 7331 + 7332 + user_buffer_free(tinfo); 7333 + } 7334 + 7335 + /** 7336 + * trace_user_fault_init - Allocated or reference a per CPU buffer 7337 + * @tinfo: A pointer to the trace buffer descriptor 7338 + * @size: The size to allocate each per CPU buffer 7339 + * 7340 + * Create a per CPU buffer that can be used to copy 
from user space 7341 + * in a task context. When calling trace_user_fault_read(), preemption 7342 + * must be disabled, and it will enable preemption and copy user 7343 + * space data to the buffer. If any schedule switches occur, it will 7344 + * retry until it succeeds without a schedule switch knowing the buffer 7345 + * is still valid. 7346 + * 7347 + * Returns 0 on success, negative on failure. 7348 + */ 7349 + int trace_user_fault_init(struct trace_user_buf_info *tinfo, size_t size) 7350 + { 7351 + int ret; 7352 + 7353 + if (!tinfo) 7354 + return -EINVAL; 7287 7355 7288 7356 guard(mutex)(&trace_user_buffer_mutex); 7289 7357 7290 - tinfo = trace_user_buffer; 7358 + ret = user_buffer_init(&tinfo, size); 7359 + if (ret < 0) 7360 + trace_user_fault_destroy(tinfo); 7291 7361 7292 - if (WARN_ON_ONCE(!tinfo)) 7293 - return; 7294 - 7295 - if (--tinfo->ref) 7296 - return; 7297 - 7298 - trace_user_fault_buffer_free(tinfo); 7299 - trace_user_buffer = NULL; 7362 + return ret; 7300 7363 } 7301 7364 7302 - /* Must be called with preemption disabled */ 7303 - static char *trace_user_fault_read(struct trace_user_buf_info *tinfo, 7304 - const char __user *ptr, size_t size, 7305 - size_t *read_size) 7365 + /** 7366 + * trace_user_fault_get - up the ref count for the user buffer 7367 + * @tinfo: A pointer to a pointer to the trace buffer descriptor 7368 + * 7369 + * Ups the ref count of the trace buffer. 7370 + * 7371 + * Returns the new ref count. 7372 + */ 7373 + int trace_user_fault_get(struct trace_user_buf_info *tinfo) 7374 + { 7375 + if (!tinfo) 7376 + return -1; 7377 + 7378 + guard(mutex)(&trace_user_buffer_mutex); 7379 + 7380 + tinfo->ref++; 7381 + return tinfo->ref; 7382 + } 7383 + 7384 + /** 7385 + * trace_user_fault_put - dereference a per cpu trace buffer 7386 + * @tinfo: The @tinfo that was passed to trace_user_fault_get() 7387 + * 7388 + * Decrement the ref count of @tinfo. 7389 + * 7390 + * Returns the new refcount (negative on error). 
7391 + */ 7392 + int trace_user_fault_put(struct trace_user_buf_info *tinfo) 7393 + { 7394 + guard(mutex)(&trace_user_buffer_mutex); 7395 + 7396 + if (WARN_ON_ONCE(!tinfo || !tinfo->ref)) 7397 + return -1; 7398 + 7399 + --tinfo->ref; 7400 + return tinfo->ref; 7401 + } 7402 + 7403 + /** 7404 + * trace_user_fault_read - Read user space into a per CPU buffer 7405 + * @tinfo: The @tinfo allocated by trace_user_fault_get() 7406 + * @ptr: The user space pointer to read 7407 + * @size: The size of user space to read. 7408 + * @copy_func: Optional function to use to copy from user space 7409 + * @data: Data to pass to copy_func if it was supplied 7410 + * 7411 + * Preemption must be disabled when this is called, and must not 7412 + * be enabled while using the returned buffer. 7413 + * This does the copying from user space into a per CPU buffer. 7414 + * 7415 + * The @size must not be greater than the size passed in to 7416 + * trace_user_fault_init(). 7417 + * 7418 + * If @copy_func is NULL, trace_user_fault_read() will use copy_from_user(), 7419 + * otherwise it will call @copy_func. It will call @copy_func with: 7420 + * 7421 + * buffer: the per CPU buffer of the @tinfo. 7422 + * ptr: The pointer @ptr to user space to read 7423 + * size: The @size of the ptr to read 7424 + * data: The @data parameter 7425 + * 7426 + * It is expected that @copy_func will return 0 on success and non zero 7427 + * if there was a fault. 7428 + * 7429 + * Returns a pointer to the buffer with the content read from @ptr. 7430 + * Preemption must remain disabled while the caller accesses the 7431 + * buffer returned by this function. 7432 + * Returns NULL if there was a fault, or the size passed in is 7433 + * greater than the size passed to trace_user_fault_init(). 
7434 + */ 7435 + char *trace_user_fault_read(struct trace_user_buf_info *tinfo, 7436 + const char __user *ptr, size_t size, 7437 + trace_user_buf_copy copy_func, void *data) 7306 7438 { 7307 7439 int cpu = smp_processor_id(); 7308 7440 char *buffer = per_cpu_ptr(tinfo->tbuf, cpu)->buf; ··· 7433 7319 int trys = 0; 7434 7320 int ret; 7435 7321 7436 - if (size > TRACE_MARKER_MAX_SIZE) 7437 - size = TRACE_MARKER_MAX_SIZE; 7438 - *read_size = 0; 7322 + lockdep_assert_preemption_disabled(); 7323 + 7324 + /* 7325 + * It's up to the caller to not try to copy more than it said 7326 + * it would. 7327 + */ 7328 + if (size > tinfo->size) 7329 + return NULL; 7439 7330 7440 7331 /* 7441 7332 * This acts similar to a seqcount. The per CPU context switches are ··· 7480 7361 */ 7481 7362 preempt_enable_notrace(); 7482 7363 7483 - ret = __copy_from_user(buffer, ptr, size); 7364 + /* Make sure preemption is enabled here */ 7365 + lockdep_assert_preemption_enabled(); 7366 + 7367 + if (copy_func) { 7368 + ret = copy_func(buffer, ptr, size, data); 7369 + } else { 7370 + ret = __copy_from_user(buffer, ptr, size); 7371 + } 7484 7372 7485 7373 preempt_disable_notrace(); 7486 7374 migrate_enable(); ··· 7504 7378 */ 7505 7379 } while (nr_context_switches_cpu(cpu) != cnt); 7506 7380 7507 - *read_size = size; 7508 7381 return buffer; 7509 7382 } 7510 7383 ··· 7514 7389 struct trace_array *tr = filp->private_data; 7515 7390 ssize_t written = -ENODEV; 7516 7391 unsigned long ip; 7517 - size_t size; 7518 7392 char *buf; 7519 7393 7520 7394 if (tracing_disabled) ··· 7531 7407 /* Must have preemption disabled while having access to the buffer */ 7532 7408 guard(preempt_notrace)(); 7533 7409 7534 - buf = trace_user_fault_read(trace_user_buffer, ubuf, cnt, &size); 7410 + buf = trace_user_fault_read(trace_user_buffer, ubuf, cnt, NULL, NULL); 7535 7411 if (!buf) 7536 7412 return -EFAULT; 7537 - 7538 - if (cnt > size) 7539 - cnt = size; 7540 7413 7541 7414 /* The selftests expect this function to be 
the IP address */ 7542 7415 ip = _THIS_IP_; ··· 7594 7473 { 7595 7474 struct trace_array *tr = filp->private_data; 7596 7475 ssize_t written = -ENODEV; 7597 - size_t size; 7598 7476 char *buf; 7599 7477 7600 7478 if (tracing_disabled) ··· 7606 7486 if (cnt < sizeof(unsigned int)) 7607 7487 return -EINVAL; 7608 7488 7489 + /* raw write is all or nothing */ 7490 + if (cnt > TRACE_MARKER_MAX_SIZE) 7491 + return -EINVAL; 7492 + 7609 7493 /* Must have preemption disabled while having access to the buffer */ 7610 7494 guard(preempt_notrace)(); 7611 7495 7612 - buf = trace_user_fault_read(trace_user_buffer, ubuf, cnt, &size); 7496 + buf = trace_user_fault_read(trace_user_buffer, ubuf, cnt, NULL, NULL); 7613 7497 if (!buf) 7614 7498 return -EFAULT; 7615 - 7616 - /* raw write is all or nothing */ 7617 - if (cnt > size) 7618 - return -EINVAL; 7619 7499 7620 7500 /* The global trace_marker_raw can go to multiple instances */ 7621 7501 if (tr == &global_trace) { ··· 7636 7516 { 7637 7517 int ret; 7638 7518 7639 - ret = trace_user_fault_buffer_enable(); 7640 - if (ret < 0) 7641 - return ret; 7519 + scoped_guard(mutex, &trace_user_buffer_mutex) { 7520 + if (!trace_user_buffer) { 7521 + ret = user_buffer_init(&trace_user_buffer, TRACE_MARKER_MAX_SIZE); 7522 + if (ret < 0) 7523 + return ret; 7524 + } else { 7525 + trace_user_buffer->ref++; 7526 + } 7527 + } 7642 7528 7643 7529 stream_open(inode, filp); 7644 7530 ret = tracing_open_generic_tr(inode, filp); 7645 7531 if (ret < 0) 7646 - trace_user_fault_buffer_disable(); 7532 + user_buffer_put(&trace_user_buffer); 7647 7533 return ret; 7648 7534 } 7649 7535 7650 7536 static int tracing_mark_release(struct inode *inode, struct file *file) 7651 7537 { 7652 - trace_user_fault_buffer_disable(); 7538 + user_buffer_put(&trace_user_buffer); 7653 7539 return tracing_release_generic_tr(inode, file); 7654 7540 } 7655 7541
+17
kernel/trace/trace.h
··· 1531 1531 1532 1532 void early_enable_events(struct trace_array *tr, char *buf, bool disable_first); 1533 1533 1534 + struct trace_user_buf; 1535 + struct trace_user_buf_info { 1536 + struct trace_user_buf __percpu *tbuf; 1537 + size_t size; 1538 + int ref; 1539 + }; 1540 + 1541 + typedef int (*trace_user_buf_copy)(char *dst, const char __user *src, 1542 + size_t size, void *data); 1543 + int trace_user_fault_init(struct trace_user_buf_info *tinfo, size_t size); 1544 + int trace_user_fault_get(struct trace_user_buf_info *tinfo); 1545 + int trace_user_fault_put(struct trace_user_buf_info *tinfo); 1546 + void trace_user_fault_destroy(struct trace_user_buf_info *tinfo); 1547 + char *trace_user_fault_read(struct trace_user_buf_info *tinfo, 1548 + const char __user *ptr, size_t size, 1549 + trace_user_buf_copy copy_func, void *data); 1550 + 1534 1551 static inline void 1535 1552 __trace_event_discard_commit(struct trace_buffer *buffer, 1536 1553 struct ring_buffer_event *event)