Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

tracing/user_events: Use remote writes for event enablement

As part of the discussions for user_events aligned with user space
tracers, it was determined that user programs should register an aligned
value to set or clear a bit when an event becomes enabled. Currently a
shared page is being used that requires mmap(). Remove the shared page
implementation and move to a user-registered address implementation.

In this new model during the event registration from user programs 3 new
values are specified. The first is the address to update when the event
is either enabled or disabled. The second is the bit to set/clear to
reflect the event being enabled. The third is the size of the value at
the specified address.

This allows for a local 32/64-bit value in user programs to support
both kernel and user tracers. As an example, setting bit 31 for kernel
tracers when the event becomes enabled allows for user tracers to use
the other bits for ref counts or other flags. The kernel side updates
the bit atomically; user programs must also update these values
atomically.

User-provided addresses must be aligned on a natural boundary. This
allows for single-page checking and prevents odd behaviors such as an
enable value straddling 2 pages instead of a single page. Currently
page faults are only logged; future patches will handle these.

Link: https://lkml.kernel.org/r/20230328235219.203-4-beaub@linux.microsoft.com

Suggested-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Beau Belgrave <beaub@linux.microsoft.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>

authored by

Beau Belgrave and committed by
Steven Rostedt (Google)
72357590 fd593511

+517 -142
+52 -1
include/linux/user_events.h
··· 9 9 #ifndef _LINUX_USER_EVENTS_H 10 10 #define _LINUX_USER_EVENTS_H 11 11 12 + #include <linux/list.h> 13 + #include <linux/refcount.h> 14 + #include <linux/mm_types.h> 15 + #include <linux/workqueue.h> 12 16 #include <uapi/linux/user_events.h> 13 17 14 18 #ifdef CONFIG_USER_EVENTS 15 19 struct user_event_mm { 20 + struct list_head link; 21 + struct list_head enablers; 22 + struct mm_struct *mm; 23 + struct user_event_mm *next; 24 + refcount_t refcnt; 25 + refcount_t tasks; 26 + struct rcu_work put_rwork; 16 27 }; 17 - #endif 18 28 29 + extern void user_event_mm_dup(struct task_struct *t, 30 + struct user_event_mm *old_mm); 31 + 32 + extern void user_event_mm_remove(struct task_struct *t); 33 + 34 + static inline void user_events_fork(struct task_struct *t, 35 + unsigned long clone_flags) 36 + { 37 + struct user_event_mm *old_mm; 38 + 39 + if (!t || !current->user_event_mm) 40 + return; 41 + 42 + old_mm = current->user_event_mm; 43 + 44 + if (clone_flags & CLONE_VM) { 45 + t->user_event_mm = old_mm; 46 + refcount_inc(&old_mm->tasks); 47 + return; 48 + } 49 + 50 + user_event_mm_dup(t, old_mm); 51 + } 52 + 53 + static inline void user_events_execve(struct task_struct *t) 54 + { 55 + if (!t || !t->user_event_mm) 56 + return; 57 + 58 + user_event_mm_remove(t); 59 + } 60 + 61 + static inline void user_events_exit(struct task_struct *t) 62 + { 63 + if (!t || !t->user_event_mm) 64 + return; 65 + 66 + user_event_mm_remove(t); 67 + } 68 + #else 19 69 static inline void user_events_fork(struct task_struct *t, 20 70 unsigned long clone_flags) 21 71 { ··· 78 28 static inline void user_events_exit(struct task_struct *t) 79 29 { 80 30 } 31 + #endif /* CONFIG_USER_EVENTS */ 81 32 82 33 #endif /* _LINUX_USER_EVENTS_H */
+12 -3
include/uapi/linux/user_events.h
··· 27 27 /* Input: Size of the user_reg structure being used */ 28 28 __u32 size; 29 29 30 + /* Input: Bit in enable address to use */ 31 + __u8 enable_bit; 32 + 33 + /* Input: Enable size in bytes at address */ 34 + __u8 enable_size; 35 + 36 + /* Input: Flags for future use, set to 0 */ 37 + __u16 flags; 38 + 39 + /* Input: Address to update when enabled */ 40 + __u64 enable_addr; 41 + 30 42 /* Input: Pointer to string with event name, description and flags */ 31 43 __u64 name_args; 32 - 33 - /* Output: Bitwise index of the event within the status page */ 34 - __u32 status_bit; 35 44 36 45 /* Output: Index of the event to use when writing data */ 37 46 __u32 write_index;
+3 -2
kernel/trace/Kconfig
··· 798 798 can be used like an existing kernel trace event. User trace 799 799 events are generated by writing to a tracefs file. User 800 800 processes can determine if their tracing events should be 801 - generated by memory mapping a tracefs file and checking for 802 - an associated byte being non-zero. 801 + generated by registering a value and bit with the kernel 802 + that reflects when it is enabled or not. 803 803 804 + See Documentation/trace/user_events.rst. 804 805 If in doubt, say N. 805 806 806 807 config HIST_TRIGGERS
+450 -136
kernel/trace/trace_events_user.c
··· 19 19 #include <linux/tracefs.h> 20 20 #include <linux/types.h> 21 21 #include <linux/uaccess.h> 22 + #include <linux/highmem.h> 22 23 #include <linux/user_events.h> 23 24 #include "trace.h" 24 25 #include "trace_dynevent.h" ··· 30 29 #define FIELD_DEPTH_NAME 1 31 30 #define FIELD_DEPTH_SIZE 2 32 31 33 - /* 34 - * Limits how many trace_event calls user processes can create: 35 - * Must be a power of two of PAGE_SIZE. 36 - */ 37 - #define MAX_PAGE_ORDER 0 38 - #define MAX_PAGES (1 << MAX_PAGE_ORDER) 39 - #define MAX_BYTES (MAX_PAGES * PAGE_SIZE) 40 - #define MAX_EVENTS (MAX_BYTES * 8) 41 - 42 32 /* Limit how long of an event name plus args within the subsystem. */ 43 33 #define MAX_EVENT_DESC 512 44 34 #define EVENT_NAME(user_event) ((user_event)->tracepoint.name) 45 35 #define MAX_FIELD_ARRAY_SIZE 1024 46 - 47 - /* 48 - * The MAP_STATUS_* macros are used for taking a index and determining the 49 - * appropriate byte and the bit in the byte to set/reset for an event. 50 - * 51 - * The lower 3 bits of the index decide which bit to set. 52 - * The remaining upper bits of the index decide which byte to use for the bit. 53 - * 54 - * This is used when an event has a probe attached/removed to reflect live 55 - * status of the event wanting tracing or not to user-programs via shared 56 - * memory maps. 57 - */ 58 - #define MAP_STATUS_BYTE(index) ((index) >> 3) 59 - #define MAP_STATUS_MASK(index) BIT((index) & 7) 60 36 61 37 /* 62 38 * Internal bits (kernel side only) to keep track of connected probes: ··· 48 70 #define EVENT_STATUS_OTHER BIT(7) 49 71 50 72 /* 51 - * Stores the pages, tables, and locks for a group of events. 52 - * Each logical grouping of events has its own group, with a 53 - * matching page for status checks within user programs. This 54 - * allows for isolation of events to user programs by various 55 - * means. 73 + * Stores the system name, tables, and locks for a group of events. This 74 + * allows isolation for events by various means. 
56 75 */ 57 76 struct user_event_group { 58 - struct page *pages; 59 - char *register_page_data; 60 77 char *system_name; 61 78 struct hlist_node node; 62 79 struct mutex reg_mutex; 63 80 DECLARE_HASHTABLE(register_table, 8); 64 - DECLARE_BITMAP(page_bitmap, MAX_EVENTS); 65 81 }; 66 82 67 83 /* Group for init_user_ns mapping, top-most group */ ··· 78 106 struct list_head fields; 79 107 struct list_head validators; 80 108 refcount_t refcnt; 81 - int index; 82 - int flags; 83 109 int min_size; 84 110 char status; 85 111 }; 112 + 113 + /* 114 + * Stores per-mm/event properties that enable an address to be 115 + * updated properly for each task. As tasks are forked, we use 116 + * these to track enablement sites that are tied to an event. 117 + */ 118 + struct user_event_enabler { 119 + struct list_head link; 120 + struct user_event *event; 121 + unsigned long addr; 122 + 123 + /* Track enable bit, flags, etc. Aligned for bitops. */ 124 + unsigned int values; 125 + }; 126 + 127 + /* Bits 0-5 are for the bit to update upon enable/disable (0-63 allowed) */ 128 + #define ENABLE_VAL_BIT_MASK 0x3F 129 + 130 + /* Only duplicate the bit value */ 131 + #define ENABLE_VAL_DUP_MASK ENABLE_VAL_BIT_MASK 132 + 133 + /* Global list of memory descriptors using user_events */ 134 + static LIST_HEAD(user_event_mms); 135 + static DEFINE_SPINLOCK(user_event_mms_lock); 86 136 87 137 /* 88 138 * Stores per-file events references, as users register events ··· 139 145 char *args, char *flags, 140 146 struct user_event **newuser); 141 147 148 + static struct user_event_mm *user_event_mm_get(struct user_event_mm *mm); 149 + static struct user_event_mm *user_event_mm_get_all(struct user_event *user); 150 + static void user_event_mm_put(struct user_event_mm *mm); 151 + 142 152 static u32 user_event_key(char *name) 143 153 { 144 154 return jhash(name, strlen(name), 0); 145 155 } 146 156 147 - static void set_page_reservations(char *pages, bool set) 148 - { 149 - int page; 150 - 151 - for (page = 
0; page < MAX_PAGES; ++page) { 152 - void *addr = pages + (PAGE_SIZE * page); 153 - 154 - if (set) 155 - SetPageReserved(virt_to_page(addr)); 156 - else 157 - ClearPageReserved(virt_to_page(addr)); 158 - } 159 - } 160 - 161 157 static void user_event_group_destroy(struct user_event_group *group) 162 158 { 163 - if (group->register_page_data) 164 - set_page_reservations(group->register_page_data, false); 165 - 166 - if (group->pages) 167 - __free_pages(group->pages, MAX_PAGE_ORDER); 168 - 169 159 kfree(group->system_name); 170 160 kfree(group); 171 161 } ··· 220 242 if (!group->system_name) 221 243 goto error; 222 244 223 - group->pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, MAX_PAGE_ORDER); 224 - 225 - if (!group->pages) 226 - goto error; 227 - 228 - group->register_page_data = page_address(group->pages); 229 - 230 - set_page_reservations(group->register_page_data, true); 231 - 232 - /* Zero all bits beside 0 (which is reserved for failures) */ 233 - bitmap_zero(group->page_bitmap, MAX_EVENTS); 234 - set_bit(0, group->page_bitmap); 235 - 236 245 mutex_init(&group->reg_mutex); 237 246 hash_init(group->register_table); 238 247 ··· 231 266 return NULL; 232 267 }; 233 268 234 - static __always_inline 235 - void user_event_register_set(struct user_event *user) 269 + static void user_event_enabler_destroy(struct user_event_enabler *enabler) 236 270 { 237 - int i = user->index; 271 + list_del_rcu(&enabler->link); 238 272 239 - user->group->register_page_data[MAP_STATUS_BYTE(i)] |= MAP_STATUS_MASK(i); 273 + /* No longer tracking the event via the enabler */ 274 + refcount_dec(&enabler->event->refcnt); 275 + 276 + kfree(enabler); 240 277 } 241 278 242 - static __always_inline 243 - void user_event_register_clear(struct user_event *user) 279 + static int user_event_mm_fault_in(struct user_event_mm *mm, unsigned long uaddr) 244 280 { 245 - int i = user->index; 281 + bool unlocked; 282 + int ret; 246 283 247 - user->group->register_page_data[MAP_STATUS_BYTE(i)] &= 
~MAP_STATUS_MASK(i); 284 + mmap_read_lock(mm->mm); 285 + 286 + /* Ensure MM has tasks, cannot use after exit_mm() */ 287 + if (refcount_read(&mm->tasks) == 0) { 288 + ret = -ENOENT; 289 + goto out; 290 + } 291 + 292 + ret = fixup_user_fault(mm->mm, uaddr, FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE, 293 + &unlocked); 294 + out: 295 + mmap_read_unlock(mm->mm); 296 + 297 + return ret; 298 + } 299 + 300 + static int user_event_enabler_write(struct user_event_mm *mm, 301 + struct user_event_enabler *enabler) 302 + { 303 + unsigned long uaddr = enabler->addr; 304 + unsigned long *ptr; 305 + struct page *page; 306 + void *kaddr; 307 + int ret; 308 + 309 + lockdep_assert_held(&event_mutex); 310 + mmap_assert_locked(mm->mm); 311 + 312 + /* Ensure MM has tasks, cannot use after exit_mm() */ 313 + if (refcount_read(&mm->tasks) == 0) 314 + return -ENOENT; 315 + 316 + ret = pin_user_pages_remote(mm->mm, uaddr, 1, FOLL_WRITE | FOLL_NOFAULT, 317 + &page, NULL, NULL); 318 + 319 + if (ret <= 0) { 320 + pr_warn("user_events: Enable write failed\n"); 321 + return -EFAULT; 322 + } 323 + 324 + kaddr = kmap_local_page(page); 325 + ptr = kaddr + (uaddr & ~PAGE_MASK); 326 + 327 + /* Update bit atomically, user tracers must be atomic as well */ 328 + if (enabler->event && enabler->event->status) 329 + set_bit(enabler->values & ENABLE_VAL_BIT_MASK, ptr); 330 + else 331 + clear_bit(enabler->values & ENABLE_VAL_BIT_MASK, ptr); 332 + 333 + kunmap_local(kaddr); 334 + unpin_user_pages_dirty_lock(&page, 1, true); 335 + 336 + return 0; 337 + } 338 + 339 + static void user_event_enabler_update(struct user_event *user) 340 + { 341 + struct user_event_enabler *enabler; 342 + struct user_event_mm *mm = user_event_mm_get_all(user); 343 + struct user_event_mm *next; 344 + 345 + while (mm) { 346 + next = mm->next; 347 + mmap_read_lock(mm->mm); 348 + rcu_read_lock(); 349 + 350 + list_for_each_entry_rcu(enabler, &mm->enablers, link) 351 + if (enabler->event == user) 352 + user_event_enabler_write(mm, enabler); 
353 + 354 + rcu_read_unlock(); 355 + mmap_read_unlock(mm->mm); 356 + user_event_mm_put(mm); 357 + mm = next; 358 + } 359 + } 360 + 361 + static bool user_event_enabler_dup(struct user_event_enabler *orig, 362 + struct user_event_mm *mm) 363 + { 364 + struct user_event_enabler *enabler; 365 + 366 + enabler = kzalloc(sizeof(*enabler), GFP_NOWAIT); 367 + 368 + if (!enabler) 369 + return false; 370 + 371 + enabler->event = orig->event; 372 + enabler->addr = orig->addr; 373 + 374 + /* Only dup part of value (ignore future flags, etc) */ 375 + enabler->values = orig->values & ENABLE_VAL_DUP_MASK; 376 + 377 + refcount_inc(&enabler->event->refcnt); 378 + list_add_rcu(&enabler->link, &mm->enablers); 379 + 380 + return true; 381 + } 382 + 383 + static struct user_event_mm *user_event_mm_get(struct user_event_mm *mm) 384 + { 385 + refcount_inc(&mm->refcnt); 386 + 387 + return mm; 388 + } 389 + 390 + static struct user_event_mm *user_event_mm_get_all(struct user_event *user) 391 + { 392 + struct user_event_mm *found = NULL; 393 + struct user_event_enabler *enabler; 394 + struct user_event_mm *mm; 395 + 396 + /* 397 + * We do not want to block fork/exec while enablements are being 398 + * updated, so we use RCU to walk the current tasks that have used 399 + * user_events ABI for 1 or more events. Each enabler found in each 400 + * task that matches the event being updated has a write to reflect 401 + * the kernel state back into the process. Waits/faults must not occur 402 + * during this. So we scan the list under RCU for all the mm that have 403 + * the event within it. This is needed because mm_read_lock() can wait. 404 + * Each user mm returned has a ref inc to handle remove RCU races. 
405 + */ 406 + rcu_read_lock(); 407 + 408 + list_for_each_entry_rcu(mm, &user_event_mms, link) 409 + list_for_each_entry_rcu(enabler, &mm->enablers, link) 410 + if (enabler->event == user) { 411 + mm->next = found; 412 + found = user_event_mm_get(mm); 413 + break; 414 + } 415 + 416 + rcu_read_unlock(); 417 + 418 + return found; 419 + } 420 + 421 + static struct user_event_mm *user_event_mm_create(struct task_struct *t) 422 + { 423 + struct user_event_mm *user_mm; 424 + unsigned long flags; 425 + 426 + user_mm = kzalloc(sizeof(*user_mm), GFP_KERNEL); 427 + 428 + if (!user_mm) 429 + return NULL; 430 + 431 + user_mm->mm = t->mm; 432 + INIT_LIST_HEAD(&user_mm->enablers); 433 + refcount_set(&user_mm->refcnt, 1); 434 + refcount_set(&user_mm->tasks, 1); 435 + 436 + spin_lock_irqsave(&user_event_mms_lock, flags); 437 + list_add_rcu(&user_mm->link, &user_event_mms); 438 + spin_unlock_irqrestore(&user_event_mms_lock, flags); 439 + 440 + t->user_event_mm = user_mm; 441 + 442 + /* 443 + * The lifetime of the memory descriptor can slightly outlast 444 + * the task lifetime if a ref to the user_event_mm is taken 445 + * between list_del_rcu() and call_rcu(). Therefore we need 446 + * to take a reference to it to ensure it can live this long 447 + * under this corner case. This can also occur in clones that 448 + * outlast the parent. 
449 + */ 450 + mmgrab(user_mm->mm); 451 + 452 + return user_mm; 453 + } 454 + 455 + static struct user_event_mm *current_user_event_mm(void) 456 + { 457 + struct user_event_mm *user_mm = current->user_event_mm; 458 + 459 + if (user_mm) 460 + goto inc; 461 + 462 + user_mm = user_event_mm_create(current); 463 + 464 + if (!user_mm) 465 + goto error; 466 + inc: 467 + refcount_inc(&user_mm->refcnt); 468 + error: 469 + return user_mm; 470 + } 471 + 472 + static void user_event_mm_destroy(struct user_event_mm *mm) 473 + { 474 + struct user_event_enabler *enabler, *next; 475 + 476 + list_for_each_entry_safe(enabler, next, &mm->enablers, link) 477 + user_event_enabler_destroy(enabler); 478 + 479 + mmdrop(mm->mm); 480 + kfree(mm); 481 + } 482 + 483 + static void user_event_mm_put(struct user_event_mm *mm) 484 + { 485 + if (mm && refcount_dec_and_test(&mm->refcnt)) 486 + user_event_mm_destroy(mm); 487 + } 488 + 489 + static void delayed_user_event_mm_put(struct work_struct *work) 490 + { 491 + struct user_event_mm *mm; 492 + 493 + mm = container_of(to_rcu_work(work), struct user_event_mm, put_rwork); 494 + user_event_mm_put(mm); 495 + } 496 + 497 + void user_event_mm_remove(struct task_struct *t) 498 + { 499 + struct user_event_mm *mm; 500 + unsigned long flags; 501 + 502 + might_sleep(); 503 + 504 + mm = t->user_event_mm; 505 + t->user_event_mm = NULL; 506 + 507 + /* Clone will increment the tasks, only remove if last clone */ 508 + if (!refcount_dec_and_test(&mm->tasks)) 509 + return; 510 + 511 + /* Remove the mm from the list, so it can no longer be enabled */ 512 + spin_lock_irqsave(&user_event_mms_lock, flags); 513 + list_del_rcu(&mm->link); 514 + spin_unlock_irqrestore(&user_event_mms_lock, flags); 515 + 516 + /* 517 + * We need to wait for currently occurring writes to stop within 518 + * the mm. This is required since exit_mm() snaps the current rss 519 + * stats and clears them. On the final mmdrop(), check_mm() will 520 + * report a bug if these increment. 
521 + * 522 + * All writes/pins are done under mmap_read lock, take the write 523 + * lock to ensure in-progress faults have completed. Faults that 524 + * are pending but yet to run will check the task count and skip 525 + * the fault since the mm is going away. 526 + */ 527 + mmap_write_lock(mm->mm); 528 + mmap_write_unlock(mm->mm); 529 + 530 + /* 531 + * Put for mm must be done after RCU delay to handle new refs in 532 + * between the list_del_rcu() and now. This ensures any get refs 533 + * during rcu_read_lock() are accounted for during list removal. 534 + * 535 + * CPU A | CPU B 536 + * --------------------------------------------------------------- 537 + * user_event_mm_remove() | rcu_read_lock(); 538 + * list_del_rcu() | list_for_each_entry_rcu(); 539 + * call_rcu() | refcount_inc(); 540 + * . | rcu_read_unlock(); 541 + * schedule_work() | . 542 + * user_event_mm_put() | . 543 + * 544 + * mmdrop() cannot be called in the softirq context of call_rcu() 545 + * so we use a work queue after call_rcu() to run within. 
546 + */ 547 + INIT_RCU_WORK(&mm->put_rwork, delayed_user_event_mm_put); 548 + queue_rcu_work(system_wq, &mm->put_rwork); 549 + } 550 + 551 + void user_event_mm_dup(struct task_struct *t, struct user_event_mm *old_mm) 552 + { 553 + struct user_event_mm *mm = user_event_mm_create(t); 554 + struct user_event_enabler *enabler; 555 + 556 + if (!mm) 557 + return; 558 + 559 + rcu_read_lock(); 560 + 561 + list_for_each_entry_rcu(enabler, &old_mm->enablers, link) 562 + if (!user_event_enabler_dup(enabler, mm)) 563 + goto error; 564 + 565 + rcu_read_unlock(); 566 + 567 + return; 568 + error: 569 + rcu_read_unlock(); 570 + user_event_mm_remove(t); 571 + } 572 + 573 + static struct user_event_enabler 574 + *user_event_enabler_create(struct user_reg *reg, struct user_event *user, 575 + int *write_result) 576 + { 577 + struct user_event_enabler *enabler; 578 + struct user_event_mm *user_mm; 579 + unsigned long uaddr = (unsigned long)reg->enable_addr; 580 + 581 + user_mm = current_user_event_mm(); 582 + 583 + if (!user_mm) 584 + return NULL; 585 + 586 + enabler = kzalloc(sizeof(*enabler), GFP_KERNEL); 587 + 588 + if (!enabler) 589 + goto out; 590 + 591 + enabler->event = user; 592 + enabler->addr = uaddr; 593 + enabler->values = reg->enable_bit; 594 + retry: 595 + /* Prevents state changes from racing with new enablers */ 596 + mutex_lock(&event_mutex); 597 + 598 + /* Attempt to reflect the current state within the process */ 599 + mmap_read_lock(user_mm->mm); 600 + *write_result = user_event_enabler_write(user_mm, enabler); 601 + mmap_read_unlock(user_mm->mm); 602 + 603 + /* 604 + * If the write works, then we will track the enabler. A ref to the 605 + * underlying user_event is held by the enabler to prevent it going 606 + * away while the enabler is still in use by a process. The ref is 607 + * removed when the enabler is destroyed. 
This means a event cannot 608 + * be forcefully deleted from the system until all tasks using it 609 + * exit or run exec(), which includes forks and clones. 610 + */ 611 + if (!*write_result) { 612 + refcount_inc(&enabler->event->refcnt); 613 + list_add_rcu(&enabler->link, &user_mm->enablers); 614 + } 615 + 616 + mutex_unlock(&event_mutex); 617 + 618 + if (*write_result) { 619 + /* Attempt to fault-in and retry if it worked */ 620 + if (!user_event_mm_fault_in(user_mm, uaddr)) 621 + goto retry; 622 + 623 + kfree(enabler); 624 + enabler = NULL; 625 + } 626 + out: 627 + user_event_mm_put(user_mm); 628 + 629 + return enabler; 248 630 } 249 631 250 632 static __always_inline __must_check ··· 1136 824 return ret; 1137 825 1138 826 dyn_event_remove(&user->devent); 1139 - 1140 - user_event_register_clear(user); 1141 - clear_bit(user->index, user->group->page_bitmap); 1142 827 hash_del(&user->node); 1143 828 1144 829 user_event_destroy_validators(user); ··· 1281 972 #endif 1282 973 1283 974 /* 1284 - * Update the register page that is shared between user processes. 975 + * Update the enabled bit among all user processes. 
1285 976 */ 1286 - static void update_reg_page_for(struct user_event *user) 977 + static void update_enable_bit_for(struct user_event *user) 1287 978 { 1288 979 struct tracepoint *tp = &user->tracepoint; 1289 980 char status = 0; ··· 1314 1005 rcu_read_unlock_sched(); 1315 1006 } 1316 1007 1317 - if (status) 1318 - user_event_register_set(user); 1319 - else 1320 - user_event_register_clear(user); 1321 - 1322 1008 user->status = status; 1009 + 1010 + user_event_enabler_update(user); 1323 1011 } 1324 1012 1325 1013 /* ··· 1373 1067 return ret; 1374 1068 inc: 1375 1069 refcount_inc(&user->refcnt); 1376 - update_reg_page_for(user); 1070 + update_enable_bit_for(user); 1377 1071 return 0; 1378 1072 dec: 1379 - update_reg_page_for(user); 1073 + update_enable_bit_for(user); 1380 1074 refcount_dec(&user->refcnt); 1381 1075 return 0; 1382 1076 } ··· 1572 1266 struct user_event **newuser) 1573 1267 { 1574 1268 int ret; 1575 - int index; 1576 1269 u32 key; 1577 1270 struct user_event *user; 1578 1271 ··· 1589 1284 kfree(name); 1590 1285 return 0; 1591 1286 } 1592 - 1593 - index = find_first_zero_bit(group->page_bitmap, MAX_EVENTS); 1594 - 1595 - if (index == MAX_EVENTS) 1596 - return -EMFILE; 1597 1287 1598 1288 user = kzalloc(sizeof(*user), GFP_KERNEL); 1599 1289 ··· 1635 1335 if (ret) 1636 1336 goto put_user_lock; 1637 1337 1638 - user->index = index; 1639 - 1640 1338 /* Ensure we track self ref and caller ref (2) */ 1641 1339 refcount_set(&user->refcnt, 2); 1642 1340 1643 1341 dyn_event_init(&user->devent, &user_event_dops); 1644 1342 dyn_event_add(&user->devent, &user->call); 1645 - set_bit(user->index, group->page_bitmap); 1646 1343 hash_add(group->register_table, &user->node, key); 1647 1344 1648 1345 mutex_unlock(&event_mutex); ··· 1856 1559 if (ret) 1857 1560 return ret; 1858 1561 1562 + /* Ensure no flags, since we don't support any yet */ 1563 + if (kreg->flags != 0) 1564 + return -EINVAL; 1565 + 1566 + /* Ensure supported size */ 1567 + switch (kreg->enable_size) { 
1568 + case 4: 1569 + /* 32-bit */ 1570 + break; 1571 + #if BITS_PER_LONG >= 64 1572 + case 8: 1573 + /* 64-bit */ 1574 + break; 1575 + #endif 1576 + default: 1577 + return -EINVAL; 1578 + } 1579 + 1580 + /* Ensure natural alignment */ 1581 + if (kreg->enable_addr % kreg->enable_size) 1582 + return -EINVAL; 1583 + 1584 + /* Ensure bit range for size */ 1585 + if (kreg->enable_bit > (kreg->enable_size * BITS_PER_BYTE) - 1) 1586 + return -EINVAL; 1587 + 1588 + /* Ensure accessible */ 1589 + if (!access_ok((const void __user *)(uintptr_t)kreg->enable_addr, 1590 + kreg->enable_size)) 1591 + return -EFAULT; 1592 + 1859 1593 kreg->size = size; 1860 1594 1861 1595 return 0; ··· 1901 1573 struct user_reg __user *ureg = (struct user_reg __user *)uarg; 1902 1574 struct user_reg reg; 1903 1575 struct user_event *user; 1576 + struct user_event_enabler *enabler; 1904 1577 char *name; 1905 1578 long ret; 1579 + int write_result; 1906 1580 1907 1581 ret = user_reg_get(ureg, &reg); 1908 1582 ··· 1935 1605 if (ret < 0) 1936 1606 return ret; 1937 1607 1608 + /* 1609 + * user_events_ref_add succeeded: 1610 + * At this point we have a user_event, it's lifetime is bound by the 1611 + * reference count, not this file. If anything fails, the user_event 1612 + * still has a reference until the file is released. During release 1613 + * any remaining references (from user_events_ref_add) are decremented. 1614 + * 1615 + * Attempt to create an enabler, which too has a lifetime tied in the 1616 + * same way for the event. Once the task that caused the enabler to be 1617 + * created exits or issues exec() then the enablers it has created 1618 + * will be destroyed and the ref to the event will be decremented. 
1619 + */ 1620 + enabler = user_event_enabler_create(&reg, user, &write_result); 1621 + 1622 + if (!enabler) 1623 + return -ENOMEM; 1624 + 1625 + /* Write failed/faulted, give error back to caller */ 1626 + if (write_result) 1627 + return write_result; 1628 + 1938 1629 put_user((u32)ret, &ureg->write_index); 1939 - put_user(user->index, &ureg->status_bit); 1940 1630 1941 1631 return 0; 1942 1632 } ··· 2070 1720 .release = user_events_release, 2071 1721 }; 2072 1722 2073 - static struct user_event_group *user_status_group(struct file *file) 2074 - { 2075 - struct seq_file *m = file->private_data; 2076 - 2077 - if (!m) 2078 - return NULL; 2079 - 2080 - return m->private; 2081 - } 2082 - 2083 - /* 2084 - * Maps the shared page into the user process for checking if event is enabled. 2085 - */ 2086 - static int user_status_mmap(struct file *file, struct vm_area_struct *vma) 2087 - { 2088 - char *pages; 2089 - struct user_event_group *group = user_status_group(file); 2090 - unsigned long size = vma->vm_end - vma->vm_start; 2091 - 2092 - if (size != MAX_BYTES) 2093 - return -EINVAL; 2094 - 2095 - if (!group) 2096 - return -EINVAL; 2097 - 2098 - pages = group->register_page_data; 2099 - 2100 - return remap_pfn_range(vma, vma->vm_start, 2101 - virt_to_phys(pages) >> PAGE_SHIFT, 2102 - size, vm_get_page_prot(VM_READ)); 2103 - } 2104 - 2105 1723 static void *user_seq_start(struct seq_file *m, loff_t *pos) 2106 1724 { 2107 1725 if (*pos) ··· 2093 1775 struct user_event_group *group = m->private; 2094 1776 struct user_event *user; 2095 1777 char status; 2096 - int i, active = 0, busy = 0, flags; 1778 + int i, active = 0, busy = 0; 2097 1779 2098 1780 if (!group) 2099 1781 return -EINVAL; ··· 2102 1784 2103 1785 hash_for_each(group->register_table, i, user, node) { 2104 1786 status = user->status; 2105 - flags = user->flags; 2106 1787 2107 - seq_printf(m, "%d:%s", user->index, EVENT_NAME(user)); 1788 + seq_printf(m, "%s", EVENT_NAME(user)); 2108 1789 2109 - if (flags != 0 || 
status != 0) 1790 + if (status != 0) 2110 1791 seq_puts(m, " #"); 2111 1792 2112 1793 if (status != 0) { ··· 2128 1811 seq_puts(m, "\n"); 2129 1812 seq_printf(m, "Active: %d\n", active); 2130 1813 seq_printf(m, "Busy: %d\n", busy); 2131 - seq_printf(m, "Max: %ld\n", MAX_EVENTS); 2132 1814 2133 1815 return 0; 2134 1816 } ··· 2163 1847 2164 1848 static const struct file_operations user_status_fops = { 2165 1849 .open = user_status_open, 2166 - .mmap = user_status_mmap, 2167 1850 .read = seq_read, 2168 1851 .llseek = seq_lseek, 2169 1852 .release = seq_release, ··· 2183 1868 goto err; 2184 1869 } 2185 1870 2186 - /* mmap with MAP_SHARED requires writable fd */ 2187 - emmap = tracefs_create_file("user_events_status", TRACE_MODE_WRITE, 1871 + emmap = tracefs_create_file("user_events_status", TRACE_MODE_READ, 2188 1872 NULL, NULL, &user_status_fops); 2189 1873 2190 1874 if (!emmap) {